From 90c54c32fdef78f37169a6a9b82db2baf676d7f8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 3 Jul 2024 18:01:17 -0300 Subject: [PATCH] Nick: refactor --- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- apps/api/src/scraper/WebScraper/global.ts | 1 + .../src/scraper/WebScraper/scrapers/fetch.ts | 70 ++++ .../scraper/WebScraper/scrapers/fireEngine.ts | 119 ++++++ .../scraper/WebScraper/scrapers/playwright.ts | 98 +++++ .../WebScraper/scrapers/scrapingBee.ts | 80 ++++ apps/api/src/scraper/WebScraper/single_url.ts | 380 +----------------- 7 files changed, 391 insertions(+), 359 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/global.ts create mode 100644 apps/api/src/scraper/WebScraper/scrapers/fetch.ts create mode 100644 apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts create mode 100644 apps/api/src/scraper/WebScraper/scrapers/playwright.ts create mode 100644 apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 831970ea..99fff9e4 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -4,7 +4,7 @@ import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import async from "async"; import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities"; -import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; +import { scrapSingleUrl } from "./single_url"; import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; import { axiosTimeout } from "../../../src/lib/timeout"; diff --git a/apps/api/src/scraper/WebScraper/global.ts b/apps/api/src/scraper/WebScraper/global.ts new file mode 100644 index 00000000..7233fe78 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/global.ts @@ -0,0 +1 @@ +export const universalTimeout = 15000; \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts new file mode 100644 index 00000000..562fa6e7 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts @@ -0,0 +1,70 @@ +import axios from "axios"; +import { logScrape } from "../../../services/logging/scrape_log"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; + +export async function scrapWithFetch( + url: string, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } +): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: "fetch", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + + try { + const response = await axios.get(url, { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout, + transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically + }); + + if (response.status !== 200) { + console.error( + `[Axios] Error fetching url: ${url} with status: ${response.status}` + ); + logParams.error_message = response.statusText; + logParams.response_code = response.status; + return { + content: "", + pageStatusCode: response.status, + pageError: response.statusText, + }; + } + + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; + return await fetchAndProcessPdf(url, pageOptions?.parsePDF); + } else { + const text = response.data; + const result = { content: text, pageStatusCode: 200 }; + logParams.success = true; + logParams.html = text; + logParams.response_code = 200; + logParams.error_message = null; + return result; + } + } catch (error) { + if (error.code === "ECONNABORTED") { + logParams.error_message = "Request timed out"; + console.log(`[Axios] Request timed out for ${url}`); + } else { + logParams.error_message = error.message || error; + console.error(`[Axios] Error fetching url: ${url} -> ${error}`); + } + return { content: "" }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); + } +} diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts new file mode 100644 index 00000000..f6121861 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -0,0 +1,119 @@ +import axios from "axios"; +import { FireEngineResponse } from "../../../lib/entities"; +import { logScrape } from "../../../services/logging/scrape_log"; +import { generateRequestParams } from "../single_url"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; + +export async function scrapWithFireEngine({ + url, + waitFor = 0, + screenshot = false, + pageOptions = { parsePDF: true }, + headers, + options, +}: { + url: string; + waitFor?: number; + screenshot?: boolean; + pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; + headers?: Record; + options?: any; +}): Promise { + const logParams = { + url, + scraper: "fire-engine", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + + try { + const reqParams = await generateRequestParams(url); + const waitParam = reqParams["params"]?.wait ?? waitFor; + const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; + console.log( + `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` + ); + + const response = await axios.post( + process.env.FIRE_ENGINE_BETA_URL + "/scrape", + { + url: url, + wait: waitParam, + screenshot: screenshotParam, + headers: headers, + pageOptions: pageOptions, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout + waitParam, + } + ); + + if (response.status !== 200) { + console.error( + `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` + ); + logParams.error_message = response.data?.pageError; + logParams.response_code = response.data?.pageStatusCode; + return { + html: "", + screenshot: "", + pageStatusCode: response.data?.pageStatusCode, + pageError: response.data?.pageError, + }; + } + + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( + url, + pageOptions?.parsePDF + ); + logParams.success = true; + // We shouldnt care about the pdf logging here I believe + return { html: content, screenshot: "", pageStatusCode, pageError }; + } else { + const data = response.data; + logParams.success = + (data.pageStatusCode >= 200 && data.pageStatusCode < 300) || + data.pageStatusCode === 404; + logParams.html = data.content ?? ""; + logParams.response_code = data.pageStatusCode; + logParams.error_message = data.pageError; + return { + html: data.content ?? "", + screenshot: data.screenshot ?? "", + pageStatusCode: data.pageStatusCode, + pageError: data.pageError, + }; + } + } catch (error) { + if (error.code === "ECONNABORTED") { + console.log(`[Fire-Engine] Request timed out for ${url}`); + logParams.error_message = "Request timed out"; + } else { + console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + logParams.error_message = error.message || error; + } + return { html: "", screenshot: "" }; + } finally { + const endTime = Date.now(); + const time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape({ + url: logParams.url, + scraper: logParams.scraper, + success: logParams.success, + response_code: logParams.response_code, + time_taken_seconds, + error_message: logParams.error_message, + html: logParams.html, + }); + } +} diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts new file mode 100644 index 00000000..fd1aef53 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts @@ -0,0 +1,98 @@ +import axios from "axios"; +import { logScrape } from "../../../services/logging/scrape_log"; +import { generateRequestParams } from "../single_url"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; + +export async function scrapWithPlaywright( + url: string, + waitFor: number = 0, + headers?: Record, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } +): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: "playwright", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + + try { + const reqParams = await generateRequestParams(url); + // If the user has passed a wait parameter in the request, use that + const waitParam = reqParams["params"]?.wait ?? waitFor; + + const response = await axios.post( + process.env.PLAYWRIGHT_MICROSERVICE_URL, + { + url: url, + wait_after_load: waitParam, + headers: headers, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time + transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically + } + ); + + if (response.status !== 200) { + console.error( + `[Playwright] Error fetching url: ${url} with status: ${response.status}` + ); + logParams.error_message = response.data?.pageError; + logParams.response_code = response.data?.pageStatusCode; + return { + content: "", + pageStatusCode: response.data?.pageStatusCode, + pageError: response.data?.pageError, + }; + } + + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; + return await fetchAndProcessPdf(url, pageOptions?.parsePDF); + } else { + const textData = response.data; + try { + const data = JSON.parse(textData); + const html = data.content; + logParams.success = true; + logParams.html = html; + logParams.response_code = data.pageStatusCode; + logParams.error_message = data.pageError; + return { + content: html ?? "", + pageStatusCode: data.pageStatusCode, + pageError: data.pageError, + }; + } catch (jsonError) { + logParams.error_message = jsonError.message || jsonError; + console.error( + `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` + ); + return { content: "" }; + } + } + } catch (error) { + if (error.code === "ECONNABORTED") { + logParams.error_message = "Request timed out"; + console.log(`[Playwright] Request timed out for ${url}`); + } else { + logParams.error_message = error.message || error; + console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + } + return { content: "" }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); + } +} diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts new file mode 100644 index 00000000..5ab0e061 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts @@ -0,0 +1,80 @@ +import { logScrape } from "../../../services/logging/scrape_log"; +import { generateRequestParams } from "../single_url"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; +import { ScrapingBeeClient } from "scrapingbee"; + + +export async function scrapWithScrapingBee( + url: string, + wait_browser: string = "domcontentloaded", + timeout: number = universalTimeout, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } + ): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + try { + const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); + const clientParams = await generateRequestParams( + url, + wait_browser, + timeout + ); + const response = await client.get({ + ...clientParams, + params: { + ...clientParams.params, + transparent_status_code: "True", + }, + }); + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + return { content, pageStatusCode, pageError }; + } else { + let text = ""; + try { + const decoder = new TextDecoder(); + text = decoder.decode(response.data); + logParams.success = true; + } catch (decodeError) { + console.error( + `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` + ); + logParams.error_message = decodeError.message || decodeError; + } + logParams.response_code = response.status; + logParams.html = text; + logParams.success = response.status >= 200 && response.status < 300 || response.status === 404; + logParams.error_message = response.statusText != "OK" ? response.statusText : undefined; + return { + content: text, + pageStatusCode: response.status, + pageError: + response.statusText != "OK" ? response.statusText : undefined, + }; + } + } catch (error) { + console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); + logParams.error_message = error.message || error; + logParams.response_code = error.response?.status; + return { + content: "", + pageStatusCode: error.response?.status, + pageError: error.response?.statusText, + }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); + } + } \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c7a74552..cc162456 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -1,14 +1,21 @@ import * as cheerio from "cheerio"; -import { ScrapingBeeClient } from "scrapingbee"; import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; -import { Document, PageOptions, FireEngineResponse, ExtractorOptions } from "../../lib/entities"; +import { + Document, + PageOptions, + FireEngineResponse, + ExtractorOptions, +} from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { urlSpecificParams } from "./utils/custom/website_params"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { handleCustomScraping } from "./custom/handleCustomScraping"; import { removeUnwantedElements } from "./utils/removeUnwantedElements"; -import axios from "axios"; +import { scrapWithFetch } from "./scrapers/fetch"; +import { scrapWithFireEngine } from "./scrapers/fireEngine"; +import { scrapWithPlaywright } from "./scrapers/playwright"; +import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; dotenv.config(); @@ -20,8 +27,6 @@ const baseScrapers = [ "fetch", ] as const; -const universalTimeout = 15000; - export async function generateRequestParams( url: string, wait_browser: string = "domcontentloaded", @@ -45,355 +50,6 @@ export async function generateRequestParams( return defaultParams; } } -import { logScrape } from "../../services/logging/scrape_log"; - -export async function scrapWithFireEngine({ - url, - waitFor = 0, - screenshot = false, - pageOptions = { parsePDF: true }, - headers, - options, -}: { - url: string; - waitFor?: number; - screenshot?: boolean; - pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; - headers?: Record; - options?: any; -}): Promise { - - const logParams = { - url, - scraper: "fire-engine", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: "", - html: "", - startTime: Date.now(), - }; - - - try { - const reqParams = await generateRequestParams(url); - const waitParam = reqParams["params"]?.wait ?? waitFor; - const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; - console.log( - `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` - ); - - const response = await axios.post( - process.env.FIRE_ENGINE_BETA_URL + "/scrape", - { - url: url, - wait: waitParam, - screenshot: screenshotParam, - headers: headers, - pageOptions: pageOptions, - }, - { - headers: { - "Content-Type": "application/json", - }, - timeout: universalTimeout + waitParam, - } - ); - - if (response.status !== 200) { - console.error( - `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` - ); - logParams.error_message = response.data?.pageError; - logParams.response_code = response.data?.pageStatusCode; - return { - html: "", - screenshot: "", - pageStatusCode: response.data?.pageStatusCode, - pageError: response.data?.pageError, - }; - } - - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( - url, - pageOptions?.parsePDF - ); - logParams.success = true; - // We shouldnt care about the pdf logging here I believe - return { html: content, screenshot: "", pageStatusCode, pageError }; - } else { - const data = response.data; - logParams.success = data.pageStatusCode >= 200 && data.pageStatusCode < 300 || data.pageStatusCode === 404; - logParams.html = data.content ?? ""; - logParams.response_code = data.pageStatusCode; - logParams.error_message = data.pageError; - return { - html: data.content ?? "", - screenshot: data.screenshot ?? "", - pageStatusCode: data.pageStatusCode, - pageError: data.pageError, - }; - } - } catch (error) { - if (error.code === "ECONNABORTED") { - console.log(`[Fire-Engine] Request timed out for ${url}`); - logParams.error_message = "Request timed out"; - } else { - console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); - logParams.error_message = error.message || error; - } - return { html: "", screenshot: "" }; - } finally { - const endTime = Date.now(); - const time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape({ - url: logParams.url, - scraper: logParams.scraper, - success: logParams.success, - response_code: logParams.response_code, - time_taken_seconds, - error_message: logParams.error_message, - html: logParams.html, - }); - } -} - -export async function scrapWithScrapingBee( - url: string, - wait_browser: string = "domcontentloaded", - timeout: number = universalTimeout, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { - const logParams = { - url, - scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: "", - html: "", - startTime: Date.now(), - }; - try { - const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); - const clientParams = await generateRequestParams( - url, - wait_browser, - timeout - ); - const response = await client.get({ - ...clientParams, - params: { - ...clientParams.params, - transparent_status_code: "True", - }, - }); - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - logParams.success = true; - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); - return { content, pageStatusCode, pageError }; - } else { - let text = ""; - try { - const decoder = new TextDecoder(); - text = decoder.decode(response.data); - logParams.success = true; - } catch (decodeError) { - console.error( - `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` - ); - logParams.error_message = decodeError.message || decodeError; - } - logParams.response_code = response.status; - logParams.html = text; - logParams.success = response.status >= 200 && response.status < 300 || response.status === 404; - logParams.error_message = response.statusText != "OK" ? response.statusText : undefined; - return { - content: text, - pageStatusCode: response.status, - pageError: - response.statusText != "OK" ? response.statusText : undefined, - }; - } - } catch (error) { - console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); - logParams.error_message = error.message || error; - logParams.response_code = error.response?.status; - return { - content: "", - pageStatusCode: error.response?.status, - pageError: error.response?.statusText, - }; - } finally { - const endTime = Date.now(); - logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape(logParams); - } -} - -export async function scrapWithPlaywright( - url: string, - waitFor: number = 0, - headers?: Record, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { - const logParams = { - url, - scraper: "playwright", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: "", - html: "", - startTime: Date.now(), - }; - - - try { - const reqParams = await generateRequestParams(url); - // If the user has passed a wait parameter in the request, use that - const waitParam = reqParams["params"]?.wait ?? waitFor; - - const response = await axios.post( - process.env.PLAYWRIGHT_MICROSERVICE_URL, - { - url: url, - wait_after_load: waitParam, - headers: headers, - }, - { - headers: { - "Content-Type": "application/json", - }, - timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time - transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically - } - ); - - if (response.status !== 200) { - console.error( - `[Playwright] Error fetching url: ${url} with status: ${response.status}` - ); - logParams.error_message = response.data?.pageError; - logParams.response_code = response.data?.pageStatusCode; - return { - content: "", - pageStatusCode: response.data?.pageStatusCode, - pageError: response.data?.pageError, - }; - } - - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - logParams.success = true; - return await fetchAndProcessPdf(url, pageOptions?.parsePDF); - } else { - const textData = response.data; - try { - const data = JSON.parse(textData); - const html = data.content; - logParams.success = true; - logParams.html = html; - logParams.response_code = data.pageStatusCode; - logParams.error_message = data.pageError; - return { - content: html ?? "", - pageStatusCode: data.pageStatusCode, - pageError: data.pageError, - }; - } catch (jsonError) { - logParams.error_message = jsonError.message || jsonError; - console.error( - `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` - ); - return { content: "" }; - } - } - } catch (error) { - if (error.code === "ECONNABORTED") { - logParams.error_message = "Request timed out"; - console.log(`[Playwright] Request timed out for ${url}`); - } else { - logParams.error_message = error.message || error; - console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); - } - return { content: "" }; - } finally { - const endTime = Date.now(); - logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape(logParams); - } -} - -export async function scrapWithFetch( - url: string, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { - const logParams = { - url, - scraper: "fetch", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: "", - html: "", - startTime: Date.now(), - }; - - - try { - const response = await axios.get(url, { - headers: { - "Content-Type": "application/json", - }, - timeout: universalTimeout, - transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically - }); - - if (response.status !== 200) { - console.error( - `[Axios] Error fetching url: ${url} with status: ${response.status}` - ); - logParams.error_message = response.statusText; - logParams.response_code = response.status; - return { - content: "", - pageStatusCode: response.status, - pageError: response.statusText, - }; - } - - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - logParams.success = true; - return await fetchAndProcessPdf(url, pageOptions?.parsePDF); - } else { - const text = response.data; - const result = { content: text, pageStatusCode: 200 }; - logParams.success = true; - logParams.html = text; - logParams.response_code = 200; - logParams.error_message = null; - return result; - } - } catch (error) { - if (error.code === "ECONNABORTED") { - logParams.error_message = "Request timed out"; - console.log(`[Axios] Request timed out for ${url}`); - } else { - logParams.error_message = error.message || error; - console.error(`[Axios] Error fetching url: ${url} -> ${error}`); - } - return { content: "" }; - } finally { - const endTime = Date.now(); - logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape(logParams); - } -} /** * Get the order of scrapers to be used for scraping a URL @@ -464,7 +120,7 @@ export async function scrapSingleUrl( headers: undefined, }, extractorOptions: ExtractorOptions = { - mode: "llm-extraction-from-markdown" + mode: "llm-extraction-from-markdown", }, existingHtml: string = "" ): Promise { @@ -628,7 +284,7 @@ export async function scrapSingleUrl( html = attempt.html ?? ""; rawHtml = attempt.rawHtml ?? ""; screenshot = attempt.screenshot ?? ""; - + if (attempt.pageStatusCode) { pageStatusCode = attempt.pageStatusCode; } @@ -659,7 +315,11 @@ export async function scrapSingleUrl( content: text, markdown: text, html: pageOptions.includeHtml ? html : undefined, - rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, + rawHtml: + pageOptions.includeRawHtml || + extractorOptions.mode === "llm-extraction-from-raw-html" + ? rawHtml + : undefined, metadata: { ...metadata, screenshot: screenshot, @@ -673,7 +333,11 @@ export async function scrapSingleUrl( content: text, markdown: text, html: pageOptions.includeHtml ? html : undefined, - rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, + rawHtml: + pageOptions.includeRawHtml || + extractorOptions.mode === "llm-extraction-from-raw-html" + ? rawHtml + : undefined, metadata: { ...metadata, sourceURL: urlToScrap,