Cleaned up

This commit is contained in:
rafaelsideguide
2024-05-13 16:13:10 -03:00
parent f4348024c6
commit 8eb2e95f19
3 changed files with 18 additions and 65 deletions
+1 -25
View File
@@ -10,7 +10,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
import { WebCrawler } from "./crawler";
import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/imageDescription";
import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import {
replaceImgPathsWithAbsolutePaths,
replacePathsWithAbsolutePaths,
@@ -144,11 +144,7 @@ export class WebScraperDataProvider {
return this.returnOnlyUrlsResponse(links, inProgress);
}
// const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
let documents = await this.processLinks(links, inProgress);
// documents.push(...pdfDocuments);
return this.cacheAndFinalizeDocuments(documents, links);
}
@@ -156,11 +152,8 @@ export class WebScraperDataProvider {
inProgress?: (progress: Progress) => void
): Promise<Document[]> {
const links = this.urls;
// const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
let documents = await this.processLinks(links, inProgress);
// documents.push(...pdfDocuments);
return documents;
}
@@ -172,11 +165,7 @@ export class WebScraperDataProvider {
return this.returnOnlyUrlsResponse(links, inProgress);
}
// let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
let documents = await this.processLinks(links, inProgress);
// documents.push(...pdfDocuments);
return this.cacheAndFinalizeDocuments(documents, links);
}
@@ -233,19 +222,6 @@ export class WebScraperDataProvider {
);
}
/**
 * Partitions a list of URLs into PDF links and non-PDF links.
 *
 * Each URL is passed to `isUrlAPdf`; all checks are started up front and
 * awaited together, so a single rejected check rejects the whole call.
 *
 * @param links - URLs to classify.
 * @returns A tuple `[pdfLinks, notPdfLinks]`, each keeping the relative
 *          order the URLs had in `links`.
 */
private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
  // Start every PDF check before awaiting any of them.
  const pending = links.map(async (url) => {
    const isPdf = await isUrlAPdf({ url: url });
    return { url, isPdf };
  });
  const outcomes = await Promise.all(pending);

  // Single pass: route each URL into the bucket matching its check result.
  const pdfLinks: string[] = [];
  const notPdfLinks: string[] = [];
  for (const outcome of outcomes) {
    if (outcome.isPdf) {
      pdfLinks.push(outcome.url);
    } else {
      notPdfLinks.push(outcome.url);
    }
  }
  return [pdfLinks, notPdfLinks];
}
private applyPathReplacements(documents: Document[]): Document[] {
return this.replaceAllPathsWithAbsolutePaths
? replacePathsWithAbsolutePaths(documents)