From a150aa820c28bfb5439b8761ddf5d5752c31007b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 3 Oct 2024 15:21:42 -0300 Subject: [PATCH] Nick: shouldn't fall back on a 400 + error code should be correct on page status code --- apps/api/src/scraper/WebScraper/single_url.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index db8c46b1..13ca7dd2 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -410,6 +410,7 @@ export async function scrapSingleUrl( if (attempt.pageStatusCode) { pageStatusCode = attempt.pageStatusCode; } + if (attempt.pageError && (attempt.pageStatusCode >= 400 || scrapersInOrder.indexOf(scraper) === scrapersInOrder.length - 1)) { // force pageError if it's the last scraper and it failed too pageError = attempt.pageError; @@ -424,8 +425,8 @@ export async function scrapSingleUrl( Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`); break; } - if (pageStatusCode && (pageStatusCode == 404)) { - Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`); + if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400)) { + Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`); break; } // const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; @@ -477,6 +478,7 @@ export async function scrapSingleUrl( message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error), stack: error.stack, }); + return { content: "", markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,