Added check during scraping to deal with PDFs
Checks whether the scraped URL returns a PDF (by inspecting the response's content type) during the scraping process (single_url.ts). TODO: Run integration tests. Does this strategy affect the running time? P.S. Some comments need to be removed if we decide to proceed with this strategy.
This commit is contained in:
@@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import { excludeNonMainTags } from "./utils/excludeTags";
|
||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
@@ -66,9 +67,17 @@ export async function scrapWithScrapingBee(
|
||||
);
|
||||
return "";
|
||||
}
|
||||
const decoder = new TextDecoder();
|
||||
const text = decoder.decode(response.data);
|
||||
return text;
|
||||
// Check the content type of the response
|
||||
const contentType = response.headers['content-type'];
|
||||
if (contentType && contentType.includes('application/pdf')) {
|
||||
// Handle PDF content type
|
||||
return fetchAndProcessPdf(url);
|
||||
} else {
|
||||
// Assume the content is text and decode it
|
||||
const decoder = new TextDecoder();
|
||||
const text = decoder.decode(response.data);
|
||||
return text;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error scraping with Scraping Bee: ${error}`);
|
||||
return "";
|
||||
|
||||
Reference in New Issue
Block a user