2024-11-07 20:57:33 +01:00
|
|
|
import { z } from "zod";
|
|
|
|
|
import { EngineScrapeResult } from "..";
|
|
|
|
|
import { Meta } from "../..";
|
|
|
|
|
import { TimeoutError } from "../../error";
|
|
|
|
|
import { robustFetch } from "../../lib/fetch";
|
|
|
|
|
|
2024-12-11 19:46:11 -03:00
|
|
|
/**
 * Scrapes `meta.url` by delegating the page load to the Playwright
 * microservice located at `process.env.PLAYWRIGHT_MICROSERVICE_URL`.
 *
 * The request is raced against a local deadline of
 * `20000 + meta.options.waitFor` milliseconds, the same value that is sent to
 * the microservice as its own `timeout`.
 *
 * @param meta - Scrape context; this function reads `meta.url`,
 *   `meta.options.waitFor`, `meta.options.headers` and `meta.logger`.
 * @returns The page HTML, the page status code, and the optional page error
 *   reported by the microservice, together with the requested URL.
 * @throws TimeoutError when the microservice has not answered before the
 *   local deadline elapses. Errors raised by `robustFetch` propagate unchanged.
 */
export async function scrapeURLWithPlaywright(
  meta: Meta
): Promise<EngineScrapeResult> {
  const timeout = 20000 + meta.options.waitFor;

  // Local deadline. The handle is kept so the timer can be cancelled as soon
  // as the race settles instead of lingering in the event loop.
  let timer: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(
        new TimeoutError(
          "Playwright was unable to scrape the page before timing out",
          { cause: { timeout } }
        )
      );
    }, timeout);
  });

  try {
    const response = await Promise.race([
      // NOTE: the request promise must NOT be awaited here. Awaiting it inside
      // the array literal would finish the fetch before the race even starts,
      // so the deadline could never win.
      robustFetch({
        url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
        headers: {
          "Content-Type": "application/json"
        },
        body: {
          url: meta.url,
          wait_after_load: meta.options.waitFor,
          timeout,
          headers: meta.options.headers
        },
        method: "POST",
        logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
        // Expected response shape; presumably enforced by robustFetch.
        schema: z.object({
          content: z.string(),
          pageStatusCode: z.number(),
          pageError: z.string().optional()
        })
      }),
      deadline
    ]);

    return {
      url: meta.url, // TODO: improve redirect following
      html: response.content,
      statusCode: response.pageStatusCode,
      error: response.pageError
    };
  } finally {
    // Runs on success, fetch failure and timeout alike.
    clearTimeout(timer);
  }
}
|