fix(scrapeURL/engines): better timeouts

This commit is contained in:
Gergő Móricz
2024-12-15 18:58:29 +01:00
parent a5256827c0
commit 0f3a27bf27
7 changed files with 31 additions and 23 deletions
@@ -5,8 +5,9 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export async function scrapeURLWithFetch( export async function scrapeURLWithFetch(
meta: Meta, meta: Meta,
timeToRun: number | undefined
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = 20000; const timeout = timeToRun ?? 300000;
const response = await Promise.race([ const response = await Promise.race([
fetch(meta.url, { fetch(meta.url, {
@@ -18,8 +18,6 @@ import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities"; import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler"; import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export const defaultTimeout = 10000;
// This function does not take `Meta` on purpose. It may not access any // This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the // meta values to construct the request -- that must be done by the
// `scrapeURLWithFireEngine*` functions. // `scrapeURLWithFireEngine*` functions.
@@ -31,7 +29,7 @@ async function performFireEngineScrape<
>( >(
logger: Logger, logger: Logger,
request: FireEngineScrapeRequestCommon & Engine, request: FireEngineScrapeRequestCommon & Engine,
timeout = defaultTimeout, timeout: number,
): Promise<FireEngineCheckStatusSuccess> { ): Promise<FireEngineCheckStatusSuccess> {
const scrape = await fireEngineScrape( const scrape = await fireEngineScrape(
logger.child({ method: "fireEngineScrape" }), logger.child({ method: "fireEngineScrape" }),
@@ -51,11 +49,7 @@ async function performFireEngineScrape<
}); });
} }
const userParam = request.timeout ?? 0; if (Date.now() - startTime > timeout) {
// Use 70% of the user-provided timeout as the timeout for fire-engine check status
const fireEngineTimeout = timeout + Math.round(userParam * 0.7);
const fullTimeout = Math.max(fireEngineTimeout, timeout);
if (Date.now() - startTime > fullTimeout) {
logger.info( logger.info(
"Fire-engine was unable to scrape the page before timing out.", "Fire-engine was unable to scrape the page before timing out.",
{ errors, timeout }, { errors, timeout },
@@ -98,6 +92,7 @@ async function performFireEngineScrape<
export async function scrapeURLWithFireEngineChromeCDP( export async function scrapeURLWithFireEngineChromeCDP(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const actions: Action[] = [ const actions: Action[] = [
// Transform waitFor option into an action (unsupported by chrome-cdp) // Transform waitFor option into an action (unsupported by chrome-cdp)
@@ -125,7 +120,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
...(meta.options.actions ?? []), ...(meta.options.actions ?? []),
]; ];
const timeout = (meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3)); const timeout = timeToRun ?? 300000;
const request: FireEngineScrapeRequestCommon & const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestChromeCDP = { FireEngineScrapeRequestChromeCDP = {
@@ -212,8 +207,9 @@ export async function scrapeURLWithFireEngineChromeCDP(
export async function scrapeURLWithFireEnginePlaywright( export async function scrapeURLWithFireEnginePlaywright(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3); const timeout = timeToRun ?? 300000;
const request: FireEngineScrapeRequestCommon & const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = { FireEngineScrapeRequestPlaywright = {
@@ -271,8 +267,9 @@ export async function scrapeURLWithFireEnginePlaywright(
export async function scrapeURLWithFireEngineTLSClient( export async function scrapeURLWithFireEngineTLSClient(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = meta.options.timeout === undefined ? 30000 : Math.round(meta.options.timeout / 3); const timeout = timeToRun ?? 30000;
const request: FireEngineScrapeRequestCommon & const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestTLSClient = { FireEngineScrapeRequestTLSClient = {
@@ -105,7 +105,7 @@ export type EngineScrapeResult = {
}; };
const engineHandlers: { const engineHandlers: {
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>; [E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult>;
} = { } = {
cache: scrapeCache, cache: scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
@@ -372,6 +372,7 @@ export function buildFallbackList(meta: Meta): {
export async function scrapeURLWithEngine( export async function scrapeURLWithEngine(
meta: Meta, meta: Meta,
engine: Engine, engine: Engine,
timeToRun: number | undefined
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const fn = engineHandlers[engine]; const fn = engineHandlers[engine];
const logger = meta.logger.child({ const logger = meta.logger.child({
@@ -383,5 +384,5 @@ export async function scrapeURLWithEngine(
logger, logger,
}; };
return await fn(_meta); return await fn(_meta, timeToRun);
} }
@@ -15,6 +15,7 @@ type PDFProcessorResult = { html: string; markdown?: string };
async function scrapePDFWithLlamaParse( async function scrapePDFWithLlamaParse(
meta: Meta, meta: Meta,
tempFilePath: string, tempFilePath: string,
timeToRun: number | undefined,
): Promise<PDFProcessorResult> { ): Promise<PDFProcessorResult> {
meta.logger.debug("Processing PDF document with LlamaIndex", { meta.logger.debug("Processing PDF document with LlamaIndex", {
tempFilePath, tempFilePath,
@@ -63,8 +64,9 @@ async function scrapePDFWithLlamaParse(
// TODO: timeout, retries // TODO: timeout, retries
const startedAt = Date.now(); const startedAt = Date.now();
const timeout = timeToRun ?? 300000;
while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) { while (Date.now() <= startedAt + timeout) {
try { try {
const result = await robustFetch({ const result = await robustFetch({
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`, url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
@@ -122,7 +124,7 @@ async function scrapePDFWithParsePDF(
}; };
} }
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> { export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
if (!meta.options.parsePDF) { if (!meta.options.parsePDF) {
const file = await fetchFileToBuffer(meta.url); const file = await fetchFileToBuffer(meta.url);
const content = file.buffer.toString("base64"); const content = file.buffer.toString("base64");
@@ -148,6 +150,7 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
}), }),
}, },
tempFilePath, tempFilePath,
timeToRun,
); );
} catch (error) { } catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") { if (error instanceof Error && error.message === "LlamaParse timed out") {
@@ -6,8 +6,9 @@ import { robustFetch } from "../../lib/fetch";
export async function scrapeURLWithPlaywright( export async function scrapeURLWithPlaywright(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = 20000 + meta.options.waitFor; const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
const response = await Promise.race([ const response = await Promise.race([
await robustFetch({ await robustFetch({
@@ -30,7 +31,7 @@ export async function scrapeURLWithPlaywright(
}), }),
}), }),
(async () => { (async () => {
await new Promise((resolve) => setTimeout(() => resolve(null), 20000)); await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
throw new TimeoutError( throw new TimeoutError(
"Playwright was unable to scrape the page before timing out", "Playwright was unable to scrape the page before timing out",
{ cause: { timeout } }, { cause: { timeout } },
@@ -9,16 +9,17 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
export function scrapeURLWithScrapingBee( export function scrapeURLWithScrapingBee(
wait_browser: "domcontentloaded" | "networkidle2", wait_browser: "domcontentloaded" | "networkidle2",
): (meta: Meta) => Promise<EngineScrapeResult> { ): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
return async (meta: Meta): Promise<EngineScrapeResult> => { return async (meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> => {
let response: AxiosResponse<any>; let response: AxiosResponse<any>;
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
try { try {
response = await client.get({ response = await client.get({
url: meta.url, url: meta.url,
params: { params: {
timeout: 15000, // TODO: dynamic timeout based on request timeout timeout,
wait_browser: wait_browser, wait_browser: wait_browser,
wait: Math.min(meta.options.waitFor, 35000), wait: meta.options.waitFor,
transparent_status_code: true, transparent_status_code: true,
json_response: true, json_response: true,
screenshot: meta.options.formats.includes("screenshot"), screenshot: meta.options.formats.includes("screenshot"),
+5 -1
View File
@@ -202,11 +202,15 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
const results: EngineResultsTracker = {}; const results: EngineResultsTracker = {};
let result: EngineScrapeResultWithContext | null = null; let result: EngineScrapeResultWithContext | null = null;
const timeToRun = meta.options.timeout !== undefined
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3))
: undefined
for (const { engine, unsupportedFeatures } of fallbackList) { for (const { engine, unsupportedFeatures } of fallbackList) {
const startedAt = Date.now(); const startedAt = Date.now();
try { try {
meta.logger.info("Scraping via " + engine + "..."); meta.logger.info("Scraping via " + engine + "...");
const _engineResult = await scrapeURLWithEngine(meta, engine); const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
if (_engineResult.markdown === undefined) { if (_engineResult.markdown === undefined) {
// Some engines emit Markdown directly. // Some engines emit Markdown directly.
_engineResult.markdown = await parseMarkdown(_engineResult.html); _engineResult.markdown = await parseMarkdown(_engineResult.html);