Files
firecrawl/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
T

38 lines
1012 B
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { TimeoutError } from "../../error";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
2024-12-11 19:46:11 -03:00
/**
 * Scrapes `meta.url` with a plain `fetch` request, following redirects and
 * sending `meta.options.headers`.
 *
 * The wait for the response (status line and headers) is bounded by a timer.
 * Reading the response body is not covered by that timer.
 *
 * @param meta - Scrape context; this function reads `meta.url`,
 *   `meta.options.headers` and `meta.logger`.
 * @param timeToRun - Time budget in milliseconds. When `undefined`, a default
 *   of 300000 ms is used.
 * @returns The final response URL, the response body as text, and the HTTP
 *   status code.
 * @throws TimeoutError if no response arrives within the time budget.
 */
export async function scrapeURLWithFetch(
  meta: Meta,
  timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
  const timeout = timeToRun ?? 300000;

  // Lets the in-flight request be cancelled once the time budget is spent,
  // instead of leaving it running in the background.
  const controller = new AbortController();
  let timer: ReturnType<typeof setTimeout> | undefined;

  const timedOut = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => {
      // Reject before aborting so the race settles with TimeoutError rather
      // than with the abort error that fetch raises afterwards.
      reject(
        new TimeoutError(
          "Fetch was unable to scrape the page before timing out",
          { cause: { timeout } },
        ),
      );
      controller.abort();
    }, timeout);
  });

  let response: Response;
  try {
    response = await Promise.race([
      fetch(meta.url, {
        redirect: "follow",
        headers: meta.options.headers,
        signal: controller.signal,
      }),
      timedOut,
    ]);
  } finally {
    // Always disarm the timer; otherwise it stays pending for up to `timeout`
    // ms after the request has already settled.
    clearTimeout(timer);
  }

  // Hand the response headers, as a plain object, to the specialty handler.
  // NOTE(review): the `as any` cast is carried over from the original —
  // presumably needed because the configured typings do not treat Headers as
  // an iterable of entries; confirm before removing.
  specialtyScrapeCheck(
    meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
    Object.fromEntries(response.headers as any),
  );

  return {
    url: response.url,
    html: await response.text(),
    statusCode: response.status,
    // TODO: error?
  };
}