Merge branch 'main' into detect-pdfs

2024-05-17 09:55:51 -07:00
parent 8eb2e95f19 5c1e6d188c
commit df6c3d1e7d
28 changed files with 1403 additions and 118 deletions
@@ -118,7 +118,8 @@ export async function scrapWithPlaywright(url: string): Promise<string> {

 export async function scrapSingleUrl(
  urlToScrap: string,
-  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
+  existingHtml: string = ""
 ): Promise<Document> {
  urlToScrap = urlToScrap.trim();

@@ -215,8 +216,15 @@ export async function scrapSingleUrl(
      : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];

    for (const scraper of scrapersInOrder) {
+      // If the crawler already supplied HTML for this page, use it instead of scraping
+      if (existingHtml && existingHtml.trim().length >= 100) {
+        let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
+        text = await parseMarkdown(cleanedHtml);
+        html = existingHtml;
+        break;
+      }
      [text, html] = await attemptScraping(urlToScrap, scraper);
-      if (text && text.length >= 100) break;
+      if (text && text.trim().length >= 100) break;
      console.log(`Falling back to ${scraper}`);
    }