// Source file: firecrawl/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
// (48 lines, 1.3 KiB, TypeScript; blame timestamp for the imports below: 2024-11-07 20:57:33 +01:00)
import { z } from "zod";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { TimeoutError } from "../../error";
import { robustFetch } from "../../lib/fetch";
// (blame timestamp: 2024-12-11 19:46:11 -03:00)
/**
 * Scrapes `meta.url` by delegating to the Playwright microservice.
 *
 * Sends a POST request to `process.env.PLAYWRIGHT_MICROSERVICE_URL` carrying
 * the target URL, the post-load wait (`meta.options.waitFor`), the overall
 * timeout and the request headers, and races that request against a local
 * timer of the same duration (`20000 + meta.options.waitFor` ms).
 *
 * @param meta - Scrape context; this function reads `url`, `options.waitFor`,
 *   `options.headers` and `logger` from it.
 * @returns The HTML content, page status code and optional page error taken
 *   from the microservice response. `url` is always the requested URL.
 * @throws TimeoutError if the local timer fires before the request settles.
 *   The in-flight request is not aborted by this function when that happens.
 */
export async function scrapeURLWithPlaywright(
  meta: Meta
): Promise<EngineScrapeResult> {
  const timeout = 20000 + meta.options.waitFor;

  // Local deadline. It uses the same `timeout` value that is sent to the
  // service and reported in the error cause, so the two cannot disagree.
  let timer: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      reject(
        new TimeoutError(
          "Playwright was unable to scrape the page before timing out",
          { cause: { timeout } }
        )
      );
    }, timeout);
  });

  try {
    // The fetch promise must NOT be awaited inside the array literal:
    // awaiting it there would finish the request before the race starts,
    // making the deadline ineffective.
    const response = await Promise.race([
      robustFetch({
        url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
        headers: {
          "Content-Type": "application/json"
        },
        body: {
          url: meta.url,
          wait_after_load: meta.options.waitFor,
          timeout,
          headers: meta.options.headers
        },
        method: "POST",
        logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
        // Expected shape of the microservice response.
        schema: z.object({
          content: z.string(),
          pageStatusCode: z.number(),
          pageError: z.string().optional()
        })
      }),
      deadline
    ]);

    return {
      url: meta.url, // TODO: improve redirect following
      html: response.content,
      statusCode: response.pageStatusCode,
      error: response.pageError
    };
  } finally {
    // Always release the timer so a finished scrape does not leave a pending
    // timeout behind.
    clearTimeout(timer);
  }
}