Added check during scraping to deal with PDFs

Checks whether the URL points to a PDF during the scraping process (single_url.ts). TODO: Run integration tests to find out whether this strategy affects the running time. P.S. Some comments need to be removed if we decide to proceed with this strategy.
2024-05-13 09:13:42 -03:00
parent 5a2712fa5a
commit f4348024c6
4 changed files with 49 additions and 15 deletions
@@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";

 dotenv.config();

@@ -66,9 +67,17 @@ export async function scrapWithScrapingBee(
      );
      return "";
    }
-    const decoder = new TextDecoder();
-    const text = decoder.decode(response.data);
-    return text;
+    // Check the content type of the response
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      // Handle PDF content type
+      return fetchAndProcessPdf(url);
+    } else {
+      // Assume the content is text and decode it
+      const decoder = new TextDecoder();
+      const text = decoder.decode(response.data);
+      return text;
+    }
  } catch (error) {
    console.error(`Error scraping with Scraping Bee: ${error}`);
    return "";