Added check during scraping to deal with PDFs
Checks whether the URL is a PDF during the scraping process (single_url.ts). TODO: Run integration tests — does this strategy affect the running time? P.S. Some comments need to be removed if we decide to proceed with this strategy.
This commit is contained in:
@@ -144,14 +144,23 @@ export class WebScraperDataProvider {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
}
|
||||
|
||||
// const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
|
||||
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
// documents.push(...pdfDocuments);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
}
|
||||
|
||||
/**
 * Handles "single URLs" mode: processes the provider's configured URLs
 * and returns the resulting documents.
 *
 * Unlike the sibling mode handlers, the result is returned directly and is
 * not passed through `cacheAndFinalizeDocuments`.
 *
 * @param inProgress Optional progress callback, forwarded unchanged to
 *   `processLinks`.
 * @returns The documents produced by `processLinks` for `this.urls`.
 */
private async handleSingleUrlsMode(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  const links = this.urls;

  // NOTE(review): PDF links are no longer split off and fetched up front;
  // per the commit description the PDF check now happens during scraping
  // (single_url.ts) — confirm before deleting the alternative kept below.
  //   const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
  //   const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
  //   documents.push(...pdfDocuments);

  // Declared exactly once: the earlier duplicate `let documents = ...`
  // (the pre-change line) redeclared the same block-scoped name.
  const documents = await this.processLinks(links, inProgress);
  return documents;
}
|
||||
|
||||
@@ -163,7 +172,11 @@ export class WebScraperDataProvider {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
}
|
||||
|
||||
// let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
|
||||
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
// documents.push(...pdfDocuments);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
}
|
||||
|
||||
@@ -220,6 +233,19 @@ export class WebScraperDataProvider {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Partitions `links` into two lists according to what `isUrlAPdf` reports
 * for each one.
 *
 * All links are probed concurrently; if any probe rejects, the returned
 * promise rejects as well. Within each list the links keep their relative
 * order from the input.
 *
 * @param links URLs to classify.
 * @returns A tuple `[pdfLinks, notPdfLinks]`.
 */
private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
  // One probe per link, started in input order; flags line up with `links`
  // by index once everything has settled.
  const pdfFlags = await Promise.all(
    links.map(async (url) => isUrlAPdf({ url }))
  );

  const pdfLinks: string[] = [];
  const notPdfLinks: string[] = [];

  links.forEach((url, position) => {
    const bucket = pdfFlags[position] ? pdfLinks : notPdfLinks;
    bucket.push(url);
  });

  return [pdfLinks, notPdfLinks];
}
|
||||
|
||||
private applyPathReplacements(documents: Document[]): Document[] {
|
||||
return this.replaceAllPathsWithAbsolutePaths
|
||||
? replacePathsWithAbsolutePaths(documents)
|
||||
|
||||
Reference in New Issue
Block a user