Added check during scraping to deal with PDFs

Checks if the URL is a PDF during the scraping process (single_url.ts).

TODO: Run integration tests - does this strategy affect the running time?

P.S. Some comments need to be removed if we decide to proceed with this strategy.
This commit is contained in:
rafaelsideguide
2024-05-13 09:13:42 -03:00
parent 5a2712fa5a
commit f4348024c6
4 changed files with 49 additions and 15 deletions
+12 -3
View File
@@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
dotenv.config();
@@ -66,9 +67,17 @@ export async function scrapWithScrapingBee(
);
return "";
}
const decoder = new TextDecoder();
const text = decoder.decode(response.data);
return text;
// Check the content type of the response
const contentType = response.headers['content-type'];
if (contentType && contentType.includes('application/pdf')) {
// Handle PDF content type
return fetchAndProcessPdf(url);
} else {
// Assume the content is text and decode it
const decoder = new TextDecoder();
const text = decoder.decode(response.data);
return text;
}
} catch (error) {
console.error(`Error scraping with Scraping Bee: ${error}`);
return "";