Files
firecrawl/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
T

59 lines
1.5 KiB
TypeScript
Raw Normal View History

import * as undici from "undici";
2024-11-07 20:57:33 +01:00
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { TimeoutError } from "../../error";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
2025-01-10 18:35:10 -03:00
import {
InsecureConnectionError,
makeSecureDispatcher,
} from "../utils/safeFetch";
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
/**
 * Scrapes `meta.url` with a single HTTP request issued through undici's
 * `fetch`, following redirects and sending `meta.options.headers`.
 *
 * The request is raced against a timer. Note that the timer only bounds the
 * wait for `fetch` to resolve: the secure dispatcher is awaited before the
 * timer is started, and the body read (`response.text()`) happens after the
 * race has settled.
 *
 * @param meta - Scrape context; supplies the URL, request headers, the
 *   optional abort signal and the logger.
 * @param timeToRun - Time budget in milliseconds. Falls back to 300000 when
 *   `undefined`.
 * @returns The final response URL, the response body as text, and the HTTP
 *   status code.
 * @throws TimeoutError when the timer fires before `fetch` resolves.
 * @throws InsecureConnectionError when `fetch` rejects with a `TypeError`
 *   whose `cause` is an `InsecureConnectionError`; the cause is rethrown
 *   directly.
 * @throws Any other error raised by `fetch`, rethrown unchanged, and anything
 *   thrown by `specialtyScrapeCheck`.
 */
export async function scrapeURLWithFetch(
  meta: Meta,
  timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
  const timeout = timeToRun ?? 300000;

  // Kept so the timer can be cancelled once the race settles; otherwise every
  // call would leave a pending timer (up to `timeout` ms) behind.
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;

  let response: undici.Response;
  try {
    response = await Promise.race([
      undici.fetch(meta.url, {
        dispatcher: await makeSecureDispatcher(meta.url),
        redirect: "follow",
        headers: meta.options.headers,
        signal: meta.internalOptions.abort,
      }),
      new Promise<never>((_resolve, reject) => {
        timeoutHandle = setTimeout(() => {
          reject(
            new TimeoutError(
              "Fetch was unable to scrape the page before timing out",
              { cause: { timeout } },
            ),
          );
        }, timeout);
      }),
    ]);
  } catch (error) {
    // undici reports request failures as a TypeError wrapping the real
    // reason; surface the insecure-connection error itself to callers.
    if (
      error instanceof TypeError &&
      error.cause instanceof InsecureConnectionError
    ) {
      throw error.cause;
    }
    throw error;
  } finally {
    // Safe to call with `undefined` (timer never started) or after firing.
    clearTimeout(timeoutHandle);
  }

  // Hands the response headers (as a plain object) to the specialty handler;
  // presumably it throws when the response needs a different engine — confirm
  // against ../utils/specialtyHandler.
  specialtyScrapeCheck(
    meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
    Object.fromEntries(response.headers as any),
  );

  return {
    url: response.url,
    html: await response.text(),
    statusCode: response.status,
    // TODO: error?
  };
}