// firecrawl/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts
import { Logger } from "winston";
import * as Sentry from "@sentry/node";
import { z } from "zod";
import { Action } from "../../../../lib/entities";
import { robustFetch } from "../../lib/fetch";
import { MockState } from "../../lib/mock";
/**
 * Fields accepted on a fire-engine scrape request regardless of which engine
 * is selected. `fireEngineScrape` intersects this type with one of the
 * engine-specific request types.
 *
 * The "default" notes on individual fields are carried over from the original
 * inline annotations; nothing in this file applies them.
 */
export type FireEngineScrapeRequestCommon = {
  /** URL of the page to scrape. */
  url: string;

  /** Header name -> header value map. */
  headers?: { [K: string]: string };

  blockMedia?: boolean; // default: true
  blockAds?: boolean; // default: true
  // pageOptions?: any; // unused, .scrollXPaths is considered on FE side

  // useProxy?: boolean; // unused, default: true
  // customProxy?: string; // unused

  // disableSmartWaitCache?: boolean; // unused, default: false
  // skipDnsCheck?: boolean; // unused, default: false

  priority?: number; // default: 1
  // team_id?: string; // unused
  logRequest?: boolean; // default: true
  instantReturn?: boolean; // default: false
  geolocation?: { country?: string; languages?: string[] };
  // NOTE(review): the unit of `timeout` is not stated in this file — confirm
  // against the fire-engine API before relying on it.
  timeout?: number;
};
/**
 * Request fields specific to the `"chrome-cdp"` engine. The `engine` literal
 * is the tag that distinguishes this variant from the other engine request
 * types.
 */
export type FireEngineScrapeRequestChromeCDP = {
  engine: "chrome-cdp";
  skipTlsVerification?: boolean;
  /** Actions to include in the request; `Action` is declared in lib/entities. */
  actions?: Action[];
  // Typed as the literal `true`, so when intersected with the common request
  // type (where it is `boolean`) the field can no longer be set to `false`.
  blockMedia?: true; // cannot be false
  mobile?: boolean;
  disableSmartWaitCache?: boolean;
};
/**
 * Request fields specific to the `"playwright"` engine. The `engine` literal
 * is the tag that distinguishes this variant from the other engine request
 * types.
 */
export type FireEngineScrapeRequestPlaywright = {
  engine: "playwright";
  blockAds?: boolean; // default: true

  // mutually exclusive, default: false
  // (the exclusivity is a documented convention only; the type itself allows
  // both flags to be set)
  screenshot?: boolean;
  fullPageScreenshot?: boolean;

  // NOTE(review): the unit of `wait` is not stated in this file — confirm.
  wait?: number; // default: 0
};
/**
 * Request fields specific to the `"tlsclient"` engine. The `engine` literal
 * is the tag that distinguishes this variant from the other engine request
 * types.
 */
export type FireEngineScrapeRequestTLSClient = {
  engine: "tlsclient";
  atsv?: boolean; // v0 only, default: false
  disableJsDom?: boolean; // v0 only, default: false
  // blockAds?: boolean; // default: true
};
/**
 * Zod schema handed to `robustFetch` for the `/scrape` response; its inferred
 * type is also the resolved type of `fireEngineScrape`.
 */
const schema = z.object({
  // NOTE(review): presumably the identifier of the job created by fire-engine — confirm.
  jobId: z.string(),
  processing: z.boolean(),
});
/**
 * POSTs a scrape request to `${FIRE_ENGINE_BETA_URL}/scrape` inside a Sentry
 * span named "fire-engine: Scrape" and resolves with whatever `robustFetch`
 * returns for the response schema (`{ jobId, processing }`).
 *
 * @param logger - Parent logger; a child logger tagged with
 *   `method: "fireEngineScrape/robustFetch"` is passed to `robustFetch`.
 * @param request - Common request fields combined with the fields of exactly
 *   one engine variant. Sent as the request body unchanged.
 * @param mock - Forwarded to `robustFetch` as-is; may be `null`.
 * @returns The value produced by `robustFetch`, typed as the schema's
 *   inferred shape.
 */
export async function fireEngineScrape<
  Engine extends
    | FireEngineScrapeRequestChromeCDP
    | FireEngineScrapeRequestPlaywright
    | FireEngineScrapeRequestTLSClient,
>(
  logger: Logger,
  request: FireEngineScrapeRequestCommon & Engine,
  mock: MockState | null,
): Promise<z.infer<typeof schema>> {
  // The non-null assertion is compile-time only: if the variable is unset the
  // request URL below becomes "undefined/scrape".
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  // TODO: retries
  // NOTE(review): `tryCount: 3` is already passed to robustFetch below —
  // confirm whether this TODO is stale.
  const scrapeRequest = await Sentry.startSpan(
    {
      name: "fire-engine: Scrape",
      attributes: {
        url: request.url,
      },
    },
    async (span) => {
      return await robustFetch({
        url: `${fireEngineURL}/scrape`,
        method: "POST",
        headers: {
          // Attach the span's trace headers only when Sentry is initialized.
          ...(Sentry.isInitialized()
            ? {
                "sentry-trace": Sentry.spanToTraceHeader(span),
                baggage: Sentry.spanToBaggageHeader(span),
              }
            : {}),
        },
        body: request,
        logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
        schema,
        tryCount: 3,
        mock,
      });
    },
  );

  return scrapeRequest;
}