Merge branch 'main' into detect-pdfs

This commit is contained in:
Nicolas
2024-05-17 09:55:51 -07:00
28 changed files with 1403 additions and 118 deletions
+10 -2
View File
@@ -118,7 +118,8 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
export async function scrapSingleUrl(
urlToScrap: string,
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
existingHtml: string = ""
): Promise<Document> {
urlToScrap = urlToScrap.trim();
@@ -215,8 +216,15 @@ export async function scrapSingleUrl(
: ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
for (const scraper of scrapersInOrder) {
// If HTML was already provided by the crawler, use it and skip scraping
if (existingHtml && existingHtml.trim().length >= 100) {
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
text = await parseMarkdown(cleanedHtml);
html = existingHtml;
break;
}
[text, html] = await attemptScraping(urlToScrap, scraper);
if (text && text.length >= 100) break;
if (text && text.trim().length >= 100) break;
console.log(`Falling back to ${scraper}`);
}