// File: firecrawl/apps/api/src/scraper/scrapeURL/engines/index.ts
// (TypeScript, 480 lines, 12 KiB)

2024-11-07 20:57:33 +01:00
import { ScrapeActionContent } from "../../../lib/entities";
import { Meta } from "..";
import { scrapeDOCX } from "./docx";
2024-12-11 19:46:11 -03:00
import {
scrapeURLWithFireEngineChromeCDP,
scrapeURLWithFireEnginePlaywright,
2024-12-11 19:51:08 -03:00
scrapeURLWithFireEngineTLSClient,
2024-12-11 19:46:11 -03:00
} from "./fire-engine";
2024-11-07 20:57:33 +01:00
import { scrapePDF } from "./pdf";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
2024-11-14 19:47:12 +01:00
import { scrapeCache } from "./cache";
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
/**
 * Identifier of every scrape engine known to this module.
 *
 * These string literals are the keys of `engineHandlers` and `engineOptions`
 * and the element type of `engines`. The `fire-engine...` names are
 * semicolon-separated and come in plain, `(retry)` and `;stealth` variants.
 */
export type Engine =
  | "fire-engine;chrome-cdp"
  | "fire-engine(retry);chrome-cdp"
  | "fire-engine;chrome-cdp;stealth"
  | "fire-engine(retry);chrome-cdp;stealth"
  | "fire-engine;playwright"
  | "fire-engine;playwright;stealth"
  | "fire-engine;tlsclient"
  | "fire-engine;tlsclient;stealth"
  | "playwright"
  | "fetch"
  | "pdf"
  | "docx"
  | "cache";
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
// Environment-derived switches, evaluated once at module load. Each one is
// true only when its variable is both defined and non-empty.
const useFireEngine = (process.env.FIRE_ENGINE_BETA_URL ?? "") !== "";
const usePlaywright = (process.env.PLAYWRIGHT_MICROSERVICE_URL ?? "") !== "";
const useCache = (process.env.CACHE_REDIS_URL ?? "") !== "";
2024-12-03 21:07:45 -03:00
2024-11-07 20:57:33 +01:00
/**
 * Engines enabled in this process, assembled once at module load.
 *
 * `cache`, the `fire-engine` family and `playwright` are included only when
 * `useCache`, `useFireEngine` and `usePlaywright` respectively are true;
 * `fetch`, `pdf` and `docx` are always present.
 */
export const engines: Engine[] = [
  ...(useCache ? ["cache" as const] : []),
  ...(useFireEngine
    ? [
        "fire-engine;chrome-cdp" as const,
        "fire-engine;chrome-cdp;stealth" as const,
        "fire-engine(retry);chrome-cdp" as const,
        "fire-engine(retry);chrome-cdp;stealth" as const,
        "fire-engine;playwright" as const,
        "fire-engine;playwright;stealth" as const,
        "fire-engine;tlsclient" as const,
        "fire-engine;tlsclient;stealth" as const,
      ]
    : []),
  ...(usePlaywright ? ["playwright" as const] : []),
  "fetch",
  "pdf",
  "docx",
];
/**
 * Names of all feature flags this module knows about.
 *
 * Declared `as const` so that the `FeatureFlag` union can be derived from the
 * tuple's element type.
 */
export const featureFlags = [
  "actions",
  "waitFor",
  "screenshot",
  "screenshot@fullScreen",
  "pdf",
  "docx",
  "atsv",
  "location",
  "mobile",
  "skipTlsVerification",
  "useFastMode",
  "stealthProxy",
] as const;
2024-12-11 19:46:11 -03:00
/** Union of the string literals listed in `featureFlags`. */
export type FeatureFlag = (typeof featureFlags)[number];
2024-11-07 20:57:33 +01:00
/**
 * Per-flag weighting used by `buildFallbackList`.
 *
 * `priority` is summed over the requested flags to form the total, and over
 * the flags an engine supports to form that engine's support score.
 */
export const featureFlagOptions: {
  [F in FeatureFlag]: {
    priority: number;
  };
} = {
  actions: { priority: 20 },
  waitFor: { priority: 1 },
  screenshot: { priority: 10 },
  "screenshot@fullScreen": { priority: 10 },
  pdf: { priority: 100 },
  docx: { priority: 100 },
  atsv: { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not
  useFastMode: { priority: 90 },
  location: { priority: 10 },
  mobile: { priority: 10 },
  skipTlsVerification: { priority: 10 },
  stealthProxy: { priority: 20 },
} as const;
/**
 * Result shape that every engine handler resolves to.
 *
 * `url`, `html` and `statusCode` are always present; the remaining fields are
 * optional.
 */
export type EngineScrapeResult = {
  url: string;

  html: string;
  markdown?: string;
  statusCode: number;
  error?: string;

  screenshot?: string;
  actions?: {
    screenshots: string[];
    scrapes: ScrapeActionContent[];
    javascriptReturns: {
      type: string;
      value: unknown;
    }[];
  };
};
2024-11-07 20:57:33 +01:00
/**
 * Maps each engine to the function that performs the scrape.
 *
 * The `(retry)` and `;stealth` variants reuse the handler of their base
 * fire-engine entry; only the key differs.
 */
const engineHandlers: {
  [E in Engine]: (
    meta: Meta,
    timeToRun: number | undefined,
  ) => Promise<EngineScrapeResult>;
} = {
  cache: scrapeCache,
  "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
  "fire-engine(retry);chrome-cdp": scrapeURLWithFireEngineChromeCDP,
  "fire-engine;chrome-cdp;stealth": scrapeURLWithFireEngineChromeCDP,
  "fire-engine(retry);chrome-cdp;stealth": scrapeURLWithFireEngineChromeCDP,
  "fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
  "fire-engine;playwright;stealth": scrapeURLWithFireEnginePlaywright,
  "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
  "fire-engine;tlsclient;stealth": scrapeURLWithFireEngineTLSClient,
  playwright: scrapeURLWithPlaywright,
  fetch: scrapeURLWithFetch,
  pdf: scrapePDF,
  docx: scrapeDOCX,
};
/**
 * Static capability table for every engine.
 *
 * `buildFallbackList` reads `features` to compute how well an engine covers
 * the requested flags, and `quality` to filter and order the candidates.
 */
export const engineOptions: {
  [E in Engine]: {
    // A list of feature flags the engine supports.
    features: { [F in FeatureFlag]: boolean };

    // This defines the order of engines in general. The engine with the highest quality will be used the most.
    // Negative quality numbers are reserved for specialty engines, e.g. PDF, DOCX, stealth proxies
    quality: number;
  };
} = {
  cache: {
    features: {
      actions: false,
      waitFor: true,
      screenshot: false,
      "screenshot@fullScreen": false,
      pdf: false, // TODO: figure this out
      docx: false, // TODO: figure this out
      atsv: false,
      location: false,
      mobile: false,
      skipTlsVerification: false,
      useFastMode: false,
      stealthProxy: false,
    },
    quality: 1000, // cache should always be tried first
  },
  "fire-engine;chrome-cdp": {
    features: {
      actions: true,
      waitFor: true, // through actions transform
      screenshot: true, // through actions transform
      "screenshot@fullScreen": true, // through actions transform
      pdf: false,
      docx: false,
      atsv: false,
      location: true,
      mobile: true,
      skipTlsVerification: true,
      useFastMode: false,
      stealthProxy: false,
    },
    quality: 50,
  },
  "fire-engine(retry);chrome-cdp": {
    features: {
      actions: true,
      waitFor: true, // through actions transform
      screenshot: true, // through actions transform
      "screenshot@fullScreen": true, // through actions transform
      pdf: false,
      docx: false,
      atsv: false,
      location: true,
      mobile: true,
      skipTlsVerification: true,
      useFastMode: false,
      stealthProxy: false,
    },
    quality: 45,
  },
  "fire-engine;chrome-cdp;stealth": {
    features: {
      actions: true,
      waitFor: true, // through actions transform
      screenshot: true, // through actions transform
      "screenshot@fullScreen": true, // through actions transform
      pdf: false,
      docx: false,
      atsv: false,
      location: true,
      mobile: true,
      skipTlsVerification: true,
      useFastMode: false,
      stealthProxy: true,
    },
    quality: -1,
  },
  "fire-engine(retry);chrome-cdp;stealth": {
    features: {
      actions: true,
      waitFor: true, // through actions transform
      screenshot: true, // through actions transform
      "screenshot@fullScreen": true, // through actions transform
      pdf: false,
      docx: false,
      atsv: false,
      location: true,
      mobile: true,
      skipTlsVerification: true,
      useFastMode: false,
      stealthProxy: true,
    },
    quality: -5,
  },
  "fire-engine;playwright": {
    features: {
      actions: false,
      waitFor: true,
      screenshot: true,
      "screenshot@fullScreen": true,
      pdf: false,
      docx: false,
      atsv: false,
      location: false,
      mobile: false,
      skipTlsVerification: false,
      useFastMode: false,
      stealthProxy: false,
    },
    quality: 40,
  },
  "fire-engine;playwright;stealth": {
    features: {
      actions: false,
      waitFor: true,
      screenshot: true,
      "screenshot@fullScreen": true,
      pdf: false,
      docx: false,
      atsv: false,
      location: false,
      mobile: false,
      skipTlsVerification: false,
      useFastMode: false,
      stealthProxy: true,
    },
    quality: -10,
  },
  playwright: {
    features: {
      actions: false,
      waitFor: true,
      screenshot: false,
      "screenshot@fullScreen": false,
      pdf: false,
      docx: false,
      atsv: false,
      location: false,
      mobile: false,
      skipTlsVerification: false,
      useFastMode: false,
      stealthProxy: false,
    },
    quality: 20,
  },
  "fire-engine;tlsclient": {
    features: {
      actions: false,
      waitFor: false,
      screenshot: false,
      "screenshot@fullScreen": false,
      pdf: false,
      docx: false,
      atsv: true,
      location: true,
      mobile: false,
      skipTlsVerification: false,
      useFastMode: true,
      stealthProxy: false,
    },
    quality: 10,
  },
  "fire-engine;tlsclient;stealth": {
    features: {
      actions: false,
      waitFor: false,
      screenshot: false,
      "screenshot@fullScreen": false,
      pdf: false,
      docx: false,
      atsv: true,
      location: true,
      mobile: false,
      skipTlsVerification: false,
      useFastMode: true,
      stealthProxy: true,
    },
    quality: -15,
  },
  fetch: {
    features: {
      actions: false,
      waitFor: false,
      screenshot: false,
      "screenshot@fullScreen": false,
      pdf: false,
      docx: false,
      atsv: false,
      location: false,
      mobile: false,
      skipTlsVerification: false,
      useFastMode: true,
      stealthProxy: false,
    },
    quality: 5,
  },
  pdf: {
    features: {
      actions: false,
      waitFor: false,
      screenshot: false,
      "screenshot@fullScreen": false,
      pdf: true,
      docx: false,
      atsv: false,
      location: false,
      mobile: false,
      skipTlsVerification: false,
      useFastMode: true,
      stealthProxy: true, // kinda...
    },
    quality: -20,
  },
  docx: {
    features: {
      actions: false,
      waitFor: false,
      screenshot: false,
      "screenshot@fullScreen": false,
      pdf: false,
      docx: true,
      atsv: false,
      location: false,
      mobile: false,
      skipTlsVerification: false,
      useFastMode: true,
      stealthProxy: true, // kinda...
    },
    quality: -20,
  },
};
/**
 * Builds the list of engines to try for a scrape, together with the requested
 * feature flags each one cannot satisfy.
 *
 * Steps:
 *  1. Start from `engines`; when fire-engine is not configured but
 *     `meta.mock` is not null, append the fire-engine entries as well.
 *  2. Drop `cache` unless `meta.internalOptions.useCache` is exactly `true`.
 *  3. If `meta.internalOptions.forceEngine` is set, consider only those
 *     engines instead (a single value is wrapped into an array).
 *  4. Keep engines whose support score (sum of priorities of the requested
 *     flags they support) reaches half of the total requested priority,
 *     rounded down.
 *  5. If any kept engine has positive quality, discard those that do not.
 *  6. Unless engines were forced, sort by support score, then by quality,
 *     both descending.
 *
 * @param meta Scrape context; reads `featureFlags`, `internalOptions`, `mock`
 *   and `logger`.
 * @returns The selected engines in the order they should be attempted.
 */
export function buildFallbackList(meta: Meta): {
  engine: Engine;
  unsupportedFeatures: Set<FeatureFlag>;
}[] {
  const _engines: Engine[] = [
    ...engines,
    // enable fire-engine in self-hosted testing environment when mocks are supplied
    ...(!useFireEngine && meta.mock !== null
      ? ([
          "fire-engine;chrome-cdp",
          "fire-engine(retry);chrome-cdp",
          "fire-engine;chrome-cdp;stealth",
          "fire-engine(retry);chrome-cdp;stealth",
          "fire-engine;playwright",
          "fire-engine;tlsclient",
          "fire-engine;playwright;stealth",
          "fire-engine;tlsclient;stealth",
        ] as Engine[])
      : []),
  ];

  if (meta.internalOptions.useCache !== true) {
    const cacheIndex = _engines.indexOf("cache");
    if (cacheIndex !== -1) {
      _engines.splice(cacheIndex, 1);
    }
  } else {
    meta.logger.debug("Cache engine enabled by useCache option");
  }

  const prioritySum = [...meta.featureFlags].reduce(
    (a, x) => a + featureFlagOptions[x].priority,
    0,
  );
  const priorityThreshold = Math.floor(prioritySum / 2);

  let selectedEngines: {
    engine: Engine;
    supportScore: number;
    unsupportedFeatures: Set<FeatureFlag>;
  }[] = [];

  const forced = meta.internalOptions.forceEngine;
  const currentEngines =
    forced !== undefined ? (Array.isArray(forced) ? forced : [forced]) : _engines;

  for (const engine of currentEngines) {
    const features = engineOptions[engine].features;

    // Single pass over the requested flags: supported ones add to the score,
    // the rest are recorded as unsupported (in request order).
    let supportScore = 0;
    const unsupportedFeatures = new Set<FeatureFlag>();
    for (const flag of meta.featureFlags) {
      if (features[flag] === true) {
        supportScore += featureFlagOptions[flag].priority;
      } else {
        unsupportedFeatures.add(flag);
      }
    }

    const logContext = {
      supportScore,
      prioritySum,
      priorityThreshold,
      featureFlags: [...meta.featureFlags],
      unsupportedFeatures,
    };

    if (supportScore >= priorityThreshold) {
      selectedEngines.push({ engine, supportScore, unsupportedFeatures });
      meta.logger.debug(
        `Engine ${engine} meets feature priority threshold`,
        logContext,
      );
    } else {
      meta.logger.debug(
        `Engine ${engine} does not meet feature priority threshold`,
        logContext,
      );
    }
  }

  // Non-positive-quality engines are kept only when no positive-quality
  // engine qualified.
  if (selectedEngines.some((x) => engineOptions[x.engine].quality > 0)) {
    selectedEngines = selectedEngines.filter(
      (x) => engineOptions[x.engine].quality > 0,
    );
  }

  if (meta.internalOptions.forceEngine === undefined) {
    // retain force engine order
    selectedEngines.sort(
      (a, b) =>
        b.supportScore - a.supportScore ||
        engineOptions[b.engine].quality - engineOptions[a.engine].quality,
    );
  }

  return selectedEngines;
}
2024-12-11 19:46:11 -03:00
/**
 * Runs a single engine's handler for the given scrape.
 *
 * The handler receives a shallow copy of `meta` whose `logger` is a child
 * logger tagged with the handler's function name and the engine identifier.
 *
 * @param meta Scrape context passed through to the handler.
 * @param engine Engine whose handler is looked up in `engineHandlers`.
 * @param timeToRun Forwarded unchanged to the handler.
 * @returns Whatever the engine handler resolves to.
 */
export async function scrapeURLWithEngine(
  meta: Meta,
  engine: Engine,
  timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
  const fn = engineHandlers[engine];
  const logger = meta.logger.child({
    // Function.prototype.name is always a string ("" for anonymous functions),
    // so `??` would never fall back; `||` does.
    method: fn.name || "scrapeURLWithEngine",
    engine,
  });
  const _meta = {
    ...meta,
    logger,
  };

  return await fn(_meta, timeToRun);
}