Files
firecrawl/apps/api/src/scraper/scrapeURL/engines/index.ts
T

388 lines
9.2 KiB
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import { ScrapeActionContent } from "../../../lib/entities";
import { Meta } from "..";
import { scrapeDOCX } from "./docx";
2024-12-11 19:46:11 -03:00
import {
scrapeURLWithFireEngineChromeCDP,
scrapeURLWithFireEnginePlaywright,
scrapeURLWithFireEngineTLSClient
} from "./fire-engine";
2024-11-07 20:57:33 +01:00
import { scrapePDF } from "./pdf";
import { scrapeURLWithScrapingBee } from "./scrapingbee";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
2024-11-14 19:47:12 +01:00
import { scrapeCache } from "./cache";
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
export type Engine =
| "fire-engine;chrome-cdp"
| "fire-engine;playwright"
| "fire-engine;tlsclient"
| "scrapingbee"
| "scrapingbeeLoad"
| "playwright"
| "fetch"
| "pdf"
| "docx"
| "cache";
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
const useScrapingBee =
process.env.SCRAPING_BEE_API_KEY !== "" &&
process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine =
process.env.FIRE_ENGINE_BETA_URL !== "" &&
process.env.FIRE_ENGINE_BETA_URL !== undefined;
const usePlaywright =
process.env.PLAYWRIGHT_MICROSERVICE_URL !== "" &&
process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
const useCache =
process.env.CACHE_REDIS_URL !== "" &&
process.env.CACHE_REDIS_URL !== undefined;
2024-12-03 21:07:45 -03:00
2024-11-07 20:57:33 +01:00
export const engines: Engine[] = [
2024-12-11 19:46:11 -03:00
// ...(useCache ? [ "cache" as const ] : []),
...(useFireEngine
? [
"fire-engine;chrome-cdp" as const,
"fire-engine;playwright" as const,
"fire-engine;tlsclient" as const
]
: []),
...(useScrapingBee
? ["scrapingbee" as const, "scrapingbeeLoad" as const]
: []),
...(usePlaywright ? ["playwright" as const] : []),
"fetch",
"pdf",
"docx"
2024-11-07 20:57:33 +01:00
];
export const featureFlags = [
2024-12-11 19:46:11 -03:00
"actions",
"waitFor",
"screenshot",
"screenshot@fullScreen",
"pdf",
"docx",
"atsv",
"location",
"mobile",
"skipTlsVerification",
"useFastMode"
2024-11-07 20:57:33 +01:00
] as const;
2024-12-11 19:46:11 -03:00
export type FeatureFlag = (typeof featureFlags)[number];
2024-11-07 20:57:33 +01:00
export const featureFlagOptions: {
2024-12-11 19:46:11 -03:00
[F in FeatureFlag]: {
priority: number;
};
2024-11-07 20:57:33 +01:00
} = {
2024-12-11 19:46:11 -03:00
actions: { priority: 20 },
waitFor: { priority: 1 },
screenshot: { priority: 10 },
"screenshot@fullScreen": { priority: 10 },
pdf: { priority: 100 },
docx: { priority: 100 },
atsv: { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not
useFastMode: { priority: 90 },
location: { priority: 10 },
mobile: { priority: 10 },
skipTlsVerification: { priority: 10 }
2024-11-07 20:57:33 +01:00
} as const;
export type EngineScrapeResult = {
2024-12-11 19:46:11 -03:00
url: string;
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
html: string;
markdown?: string;
statusCode: number;
error?: string;
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
screenshot?: string;
actions?: {
screenshots: string[];
scrapes: ScrapeActionContent[];
};
};
2024-11-07 20:57:33 +01:00
const engineHandlers: {
2024-12-11 19:46:11 -03:00
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>;
2024-11-07 20:57:33 +01:00
} = {
2024-12-11 19:46:11 -03:00
cache: scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
scrapingbee: scrapeURLWithScrapingBee("domcontentloaded"),
scrapingbeeLoad: scrapeURLWithScrapingBee("networkidle2"),
playwright: scrapeURLWithPlaywright,
fetch: scrapeURLWithFetch,
pdf: scrapePDF,
docx: scrapeDOCX
2024-11-07 20:57:33 +01:00
};
export const engineOptions: {
2024-12-11 19:46:11 -03:00
[E in Engine]: {
// A list of feature flags the engine supports.
features: { [F in FeatureFlag]: boolean };
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
// This defines the order of engines in general. The engine with the highest quality will be used the most.
// Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX
quality: number;
};
2024-11-07 20:57:33 +01:00
} = {
2024-12-11 19:46:11 -03:00
cache: {
features: {
actions: false,
waitFor: true,
screenshot: false,
"screenshot@fullScreen": false,
pdf: false, // TODO: figure this out
docx: false, // TODO: figure this out
atsv: false,
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false
2024-11-14 19:47:12 +01:00
},
2024-12-11 19:46:11 -03:00
quality: 1000 // cache should always be tried first
},
"fire-engine;chrome-cdp": {
features: {
actions: true,
waitFor: true, // through actions transform
screenshot: true, // through actions transform
"screenshot@fullScreen": true, // through actions transform
pdf: false,
docx: false,
atsv: false,
location: true,
mobile: true,
skipTlsVerification: true,
useFastMode: false
2024-11-07 20:57:33 +01:00
},
2024-12-11 19:46:11 -03:00
quality: 50
},
"fire-engine;playwright": {
features: {
actions: false,
waitFor: true,
screenshot: true,
"screenshot@fullScreen": true,
pdf: false,
docx: false,
atsv: false,
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false
2024-11-07 20:57:33 +01:00
},
2024-12-11 19:46:11 -03:00
quality: 40
},
scrapingbee: {
features: {
actions: false,
waitFor: true,
screenshot: true,
"screenshot@fullScreen": true,
pdf: false,
docx: false,
atsv: false,
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false
2024-11-07 20:57:33 +01:00
},
2024-12-11 19:46:11 -03:00
quality: 30
},
scrapingbeeLoad: {
features: {
actions: false,
waitFor: true,
screenshot: true,
"screenshot@fullScreen": true,
pdf: false,
docx: false,
atsv: false,
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false
2024-11-07 20:57:33 +01:00
},
2024-12-11 19:46:11 -03:00
quality: 29
},
playwright: {
features: {
actions: false,
waitFor: true,
screenshot: false,
"screenshot@fullScreen": false,
pdf: false,
docx: false,
atsv: false,
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false
2024-11-07 20:57:33 +01:00
},
2024-12-11 19:46:11 -03:00
quality: 20
},
"fire-engine;tlsclient": {
features: {
actions: false,
waitFor: false,
screenshot: false,
"screenshot@fullScreen": false,
pdf: false,
docx: false,
atsv: true,
location: true,
mobile: false,
skipTlsVerification: false,
useFastMode: true
2024-11-07 20:57:33 +01:00
},
2024-12-11 19:46:11 -03:00
quality: 10
},
fetch: {
features: {
actions: false,
waitFor: false,
screenshot: false,
"screenshot@fullScreen": false,
pdf: false,
docx: false,
atsv: false,
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: true
2024-11-07 20:57:33 +01:00
},
2024-12-11 19:46:11 -03:00
quality: 5
},
pdf: {
features: {
actions: false,
waitFor: false,
screenshot: false,
"screenshot@fullScreen": false,
pdf: true,
docx: false,
atsv: false,
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: true
2024-11-07 20:57:33 +01:00
},
2024-12-11 19:46:11 -03:00
quality: -10
},
docx: {
features: {
actions: false,
waitFor: false,
screenshot: false,
"screenshot@fullScreen": false,
pdf: false,
docx: true,
atsv: false,
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: true
2024-11-07 20:57:33 +01:00
},
2024-12-11 19:46:11 -03:00
quality: -10
}
2024-11-07 20:57:33 +01:00
};
export function buildFallbackList(meta: Meta): {
2024-12-11 19:46:11 -03:00
engine: Engine;
unsupportedFeatures: Set<FeatureFlag>;
2024-11-07 20:57:33 +01:00
}[] {
2024-12-11 19:46:11 -03:00
const prioritySum = [...meta.featureFlags].reduce(
(a, x) => a + featureFlagOptions[x].priority,
0
);
const priorityThreshold = Math.floor(prioritySum / 2);
let selectedEngines: {
engine: Engine;
supportScore: number;
unsupportedFeatures: Set<FeatureFlag>;
}[] = [];
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
const currentEngines =
meta.internalOptions.forceEngine !== undefined
? [meta.internalOptions.forceEngine]
: engines;
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
for (const engine of currentEngines) {
const supportedFlags = new Set([
...Object.entries(engineOptions[engine].features)
.filter(
([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true
)
.map(([k, _]) => k)
]);
const supportScore = [...supportedFlags].reduce(
(a, x) => a + featureFlagOptions[x].priority,
0
);
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
const unsupportedFeatures = new Set([...meta.featureFlags]);
for (const flag of meta.featureFlags) {
if (supportedFlags.has(flag)) {
unsupportedFeatures.delete(flag);
}
}
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
if (supportScore >= priorityThreshold) {
selectedEngines.push({ engine, supportScore, unsupportedFeatures });
meta.logger.debug(`Engine ${engine} meets feature priority threshold`, {
supportScore,
prioritySum,
priorityThreshold,
featureFlags: [...meta.featureFlags],
unsupportedFeatures
});
} else {
meta.logger.debug(
`Engine ${engine} does not meet feature priority threshold`,
{
supportScore,
prioritySum,
priorityThreshold,
featureFlags: [...meta.featureFlags],
unsupportedFeatures
2024-11-07 20:57:33 +01:00
}
2024-12-11 19:46:11 -03:00
);
2024-11-07 20:57:33 +01:00
}
2024-12-11 19:46:11 -03:00
}
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
if (selectedEngines.some((x) => engineOptions[x.engine].quality > 0)) {
selectedEngines = selectedEngines.filter(
(x) => engineOptions[x.engine].quality > 0
);
}
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
selectedEngines.sort(
(a, b) =>
b.supportScore - a.supportScore ||
engineOptions[b.engine].quality - engineOptions[a.engine].quality
);
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
return selectedEngines;
2024-11-07 20:57:33 +01:00
}
2024-12-11 19:46:11 -03:00
export async function scrapeURLWithEngine(
meta: Meta,
engine: Engine
): Promise<EngineScrapeResult> {
const fn = engineHandlers[engine];
const logger = meta.logger.child({
method: fn.name ?? "scrapeURLWithEngine",
engine
});
const _meta = {
...meta,
logger
};
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
return await fn(_meta);
2024-11-07 20:57:33 +01:00
}