Files
firecrawl/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
T

49 lines
1.4 KiB
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import { z } from "zod";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { TimeoutError } from "../../error";
import { robustFetch } from "../../lib/fetch";
2024-12-11 19:46:11 -03:00
/**
 * Scrapes `meta.url` by POSTing a job to the Playwright microservice
 * (`PLAYWRIGHT_MICROSERVICE_URL`) and racing that request against a local
 * timer.
 *
 * The effective time budget is `timeToRun` (falling back to 300000 ms when
 * undefined) plus `meta.options.waitFor`. The same value is forwarded to the
 * microservice as `timeout` and used for the local timer.
 *
 * @param meta - Scrape context; supplies the target URL, the `waitFor` and
 *   `headers` options, and the logger.
 * @param timeToRun - Time budget in milliseconds, or `undefined` to use the
 *   300000 ms default.
 * @returns The page content, status code and optional page error reported by
 *   the microservice.
 * @throws TimeoutError if the microservice has not answered within the
 *   computed budget.
 */
export async function scrapeURLWithPlaywright(
  meta: Meta,
  timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
  const timeout = (timeToRun ?? 300000) + meta.options.waitFor;

  // Keep the handle so the timer can be cancelled once the race is settled;
  // otherwise every call would leave a timer pending for up to `timeout` ms.
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<never>((_resolve, reject) => {
    timeoutHandle = setTimeout(() => {
      reject(
        new TimeoutError(
          "Playwright was unable to scrape the page before timing out",
          { cause: { timeout } },
        ),
      );
    }, timeout);
  });

  try {
    const response = await Promise.race([
      // Deliberately NOT awaited here: awaiting inside the array literal would
      // block until the fetch finished and the timeout could never win.
      robustFetch({
        url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
        headers: {
          "Content-Type": "application/json",
        },
        body: {
          url: meta.url,
          wait_after_load: meta.options.waitFor,
          timeout,
          headers: meta.options.headers,
        },
        method: "POST",
        logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
        schema: z.object({
          content: z.string(),
          pageStatusCode: z.number(),
          pageError: z.string().optional(),
        }),
      }),
      timeoutPromise,
    ]);

    return {
      url: meta.url, // TODO: improve redirect following
      html: response.content,
      statusCode: response.pageStatusCode,
      error: response.pageError,
    };
  } finally {
    // Runs on success, fetch failure and timeout alike.
    clearTimeout(timeoutHandle);
  }
}