Added check during scraping to deal with PDFs

Checks whether the URL is a PDF during the scraping process (single_url.ts). TODO: Run integration tests — does this strategy affect the running time? P.S. Some comments need to be removed if we decide to proceed with this strategy.
2024-05-13 09:13:42 -03:00
parent 5a2712fa5a
commit f4348024c6
4 changed files with 49 additions and 15 deletions
@@ -144,14 +144,23 @@ export class WebScraperDataProvider {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }

+    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);    
+
    let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
    return this.cacheAndFinalizeDocuments(documents, links);
  }

  private async handleSingleUrlsMode(
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
-    let documents = await this.processLinks(this.urls, inProgress);
+    const links = this.urls;
+    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+
+    let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
    return documents;
  }

@@ -163,7 +172,11 @@ export class WebScraperDataProvider {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }

+    // let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+
    let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
    return this.cacheAndFinalizeDocuments(documents, links);
  }

@@ -220,6 +233,19 @@ export class WebScraperDataProvider {
    );
  }

+  private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
+    const checks = links.map(async (link) => ({
+      link,
+      isPdf: await isUrlAPdf({ url: link })
+    }));
+  
+    const results = await Promise.all(checks);
+    const pdfLinks = results.filter(result => result.isPdf).map(result => result.link);
+    const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link);
+  
+    return [pdfLinks, notPdfLinks];
+  }
+
  private applyPathReplacements(documents: Document[]): Document[] {
    return this.replaceAllPathsWithAbsolutePaths
      ? replacePathsWithAbsolutePaths(documents)