Files
firecrawl/apps/api/src/scraper/scrapeURL/index.ts
T

493 lines
14 KiB
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import { Logger } from "winston";
import * as Sentry from "@sentry/node";
import { Document, ScrapeOptions, TimeoutSignal } from "../../controllers/v1/types";
import { logger as _logger } from "../../lib/logger";
2024-12-11 19:46:11 -03:00
import {
buildFallbackList,
Engine,
EngineScrapeResult,
FeatureFlag,
2024-12-11 19:51:08 -03:00
scrapeURLWithEngine,
2024-12-11 19:46:11 -03:00
} from "./engines";
2024-11-07 20:57:33 +01:00
import { parseMarkdown } from "../../lib/html-to-markdown";
2024-12-11 19:46:11 -03:00
import {
2024-12-15 15:43:12 -03:00
ActionError,
2024-12-11 19:46:11 -03:00
AddFeatureError,
EngineError,
NoEnginesLeftError,
PDFAntibotError,
2024-12-11 19:46:11 -03:00
RemoveFeatureError,
SiteError,
2024-12-11 19:51:08 -03:00
TimeoutError,
UnsupportedFileError,
2024-12-11 19:46:11 -03:00
} from "./error";
2024-11-07 20:57:33 +01:00
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
import { urlSpecificParams } from "./lib/urlSpecificParams";
import { loadMock, MockState } from "./lib/mock";
2025-04-17 09:23:53 -07:00
import { CostTracking } from "../../lib/extract/extraction-service";
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
/**
 * Outcome of a `scrapeURL` call.
 *
 * Discriminated on `success`: a successful scrape carries the final
 * `document`, a failed one carries the `error` that stopped it. Both variants
 * also carry the scrape's `logs` array and the per-engine results tracker.
 */
export type ScrapeUrlResponse = (
  | {
      success: true;
      document: Document;
    }
  | {
      success: false;
      error: any;
    }
) & {
  logs: any[];
  engines: EngineResultsTracker;
};
2024-11-07 20:57:33 +01:00
/**
 * Everything a single scrape needs, bundled into one object that is passed to
 * the engine loop and to the transformers.
 */
export type Meta = {
  /** Scrape ID; also attached to the child logger as `scrapeId`. */
  id: string;
  /** URL being scraped. */
  url: string;
  /** Scrape options (after URL-specific overrides have been merged in). */
  options: ScrapeOptions;
  /** Internal options (after URL-specific overrides have been merged in). */
  internalOptions: InternalOptions;
  /** Child logger scoped to this scrape. */
  logger: Logger;
  /** Log entries collected while scraping; returned in the response. */
  logs: any[];
  /**
   * Features the scrape requires. `scrapeURL` replaces this set when an engine
   * reports that features must be added or removed.
   */
  featureFlags: Set<FeatureFlag>;
  /** Mock state loaded when `options.useMock` is set, otherwise `null`. */
  mock: MockState | null;
  pdfPrefetch: {
    filePath: string;
    url?: string;
    status: number;
  } | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
  /** Cost tracker handed in by the caller of `scrapeURL`. */
  costTracking: CostTracking;
};
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
/**
 * Derives the set of feature flags a scrape requires from its URL and options.
 *
 * @param url - Absolute URL to scrape; parsed with `new URL`, so an invalid
 *   URL throws a `TypeError`.
 * @param options - Scrape options; `formats` must be present.
 * @param internalOptions - Internal options; only `atsv` is consulted here.
 * @returns A fresh set containing every flag implied by the inputs.
 */
function buildFeatureFlags(
  url: string,
  options: ScrapeOptions,
  internalOptions: InternalOptions,
): Set<FeatureFlag> {
  const flags: Set<FeatureFlag> = new Set();

  if (options.actions !== undefined) {
    flags.add("actions");
  }

  if (options.formats.includes("screenshot")) {
    flags.add("screenshot");
  }

  // The "screenshot@fullPage" format maps onto the "screenshot@fullScreen" flag.
  if (options.formats.includes("screenshot@fullPage")) {
    flags.add("screenshot@fullScreen");
  }

  // A missing waitFor means "no wait", same as an explicit 0.
  if ((options.waitFor ?? 0) !== 0) {
    flags.add("waitFor");
  }

  if (internalOptions.atsv) {
    flags.add("atsv");
  }

  if (options.location || options.geolocation) {
    flags.add("location");
  }

  if (options.mobile) {
    flags.add("mobile");
  }

  if (options.skipTlsVerification) {
    flags.add("skipTlsVerification");
  }

  if (options.fastMode) {
    flags.add("useFastMode");
  }

  if (options.proxy === "stealth") {
    flags.add("stealthProxy");
  }

  // File-type detection looks only at the path (query string and fragment are
  // ignored) and is case-insensitive, so "/Report.PDF" is recognised too.
  const pathname = new URL(url).pathname.toLowerCase();

  if (pathname.endsWith(".pdf")) {
    flags.add("pdf");
  }

  if (pathname.endsWith(".docx")) {
    flags.add("docx");
  }

  return flags;
}
// The meta object contains all required information to perform a scrape.
// For example, the scrape ID, URL, options, feature flags, logs that occur while scraping.
// The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required)
// Having a meta object that is treated as immutable helps the code stay clean and easily traceable,
// while also retaining the benefits that WebScraper had from its OOP design.
/**
 * Builds the `Meta` object for one scrape.
 *
 * Looks up per-host overrides in `urlSpecificParams` (hostname with a leading
 * "www." stripped) and merges them over the supplied options, creates a child
 * logger tagged with the scrape ID and URL, computes the initial feature
 * flags and, when `options.useMock` is set, loads the mock.
 *
 * NOTE: host-specific overrides are merged with `Object.assign` directly into
 * the `options` / `internalOptions` objects passed in, so the caller's objects
 * are modified in place.
 *
 * @param id - Scrape ID.
 * @param url - Absolute URL to scrape; an invalid URL makes `new URL` throw.
 * @param options - Scrape options.
 * @param internalOptions - Internal options.
 * @param costTracking - Cost tracker stored on the meta object as-is.
 * @returns The assembled meta object, with `pdfPrefetch` set to `undefined`.
 */
async function buildMetaObject(
  id: string,
  url: string,
  options: ScrapeOptions,
  internalOptions: InternalOptions,
  costTracking: CostTracking,
): Promise<Meta> {
  const specParams =
    urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
  if (specParams !== undefined) {
    options = Object.assign(options, specParams.scrapeOptions);
    internalOptions = Object.assign(
      internalOptions,
      specParams.internalOptions,
    );
  }

  const logger = _logger.child({
    module: "ScrapeURL",
    scrapeId: id,
    scrapeURL: url,
  });
  const logs: any[] = [];

  return {
    id,
    url,
    options,
    internalOptions,
    logger,
    logs,
    // Computed after the merge above so host-specific overrides are reflected.
    featureFlags: buildFeatureFlags(url, options, internalOptions),
    mock:
      options.useMock !== undefined
        ? await loadMock(options.useMock, _logger)
        : null,
    pdfPrefetch: undefined,
    costTracking,
  };
}
/**
 * Options set by the service itself rather than taken from the scrape request.
 */
export type InternalOptions = {
  teamId: string;

  priority?: number; // Passed along to fire-engine
  /**
   * When set, `scrapeURL` no longer retries on feature-flag or PDF anti-bot
   * errors raised by an engine; such errors fail the scrape instead.
   */
  forceEngine?: Engine | Engine[];
  atsv?: boolean; // anti-bot solver, beta

  v0CrawlOnlyUrls?: boolean;
  v0DisableJsDom?: boolean;
  useCache?: boolean;
  disableSmartWaitCache?: boolean; // Passed along to fire-engine
  isBackgroundIndex?: boolean;
  fromCache?: boolean; // Indicates if the document was retrieved from cache
  /** Checked before each engine attempt; an aborted signal stops the scrape. */
  abort?: AbortSignal;
  urlInvisibleInCurrentCrawl?: boolean;
};
2024-12-11 19:46:11 -03:00
/**
 * Per-engine record of what happened during a scrape, keyed by engine name.
 *
 * Each entry is one of:
 * - `"error"`: the engine threw; `unexpected` is `false` for an `EngineError`
 *   and `true` for anything else that gets recorded.
 * - `"success"`: the engine returned a result; `factors` holds the boolean
 *   checks evaluated on it and `unsupportedFeatures` the requested features
 *   this engine does not support.
 * - `"timeout"`: the engine timed out.
 *
 * Every entry carries `startedAt` / `finishedAt` as `Date.now()` timestamps.
 */
export type EngineResultsTracker = {
  [E in Engine]?: (
    | {
        state: "error";
        error: any;
        unexpected: boolean;
      }
    | {
        state: "success";
        result: EngineScrapeResult & { markdown: string };
        factors: Record<string, boolean>;
        unsupportedFeatures: Set<FeatureFlag>;
      }
    | {
        state: "timeout";
      }
  ) & {
    startedAt: number;
    finishedAt: number;
  };
};
2024-11-07 20:57:33 +01:00
/**
 * The engine result that was accepted for a scrape, together with the engine
 * that produced it and the requested features that engine does not support.
 */
export type EngineScrapeResultWithContext = {
  engine: Engine;
  unsupportedFeatures: Set<FeatureFlag>;
  /** Engine output; `markdown` is always populated at this point. */
  result: EngineScrapeResult & { markdown: string };
};
/**
 * Returns a copy of `error` without its `results` property, so the error can
 * be stored inside a results tracker without pointing back at that tracker.
 *
 * Values that are not objects, are `null`, or have no truthy `results` are
 * returned unchanged (same reference). The input is never modified.
 *
 * The copy is made with `structuredClone`. If the value cannot be cloned that
 * way (for example because it carries a function), a fallback copy with the
 * same prototype and the same own properties minus `results` is returned
 * instead of letting a `DataCloneError` escape from an error handler.
 *
 * @param error - Any thrown value.
 * @returns `error` itself, or a copy of it without `results`.
 */
function safeguardCircularError<T>(error: T): T {
  if (typeof error !== "object" || error === null || !(error as any).results) {
    return error;
  }

  let copy: any;
  try {
    copy = structuredClone(error);
    delete copy.results;
  } catch {
    // structuredClone rejected the value; copy the own properties by hand,
    // leaving `results` out. Nested values are shared, not cloned.
    const { results: _dropped, ...descriptors } = Object.getOwnPropertyDescriptors(
      error as object,
    ) as Record<string, PropertyDescriptor>;
    copy = Object.create(Object.getPrototypeOf(error), descriptors);
  }
  return copy;
}
2024-12-11 19:46:11 -03:00
/**
 * Runs one pass over the engine fallback list for `meta` and returns the
 * transformed document produced from the first engine result that is accepted.
 *
 * Each engine attempt is recorded in a results tracker. Engine errors,
 * timeouts and unexpected errors are recorded and the next engine is tried;
 * feature-flag errors, LLM refusals, site/action/unsupported-file/PDF
 * anti-bot errors and timeout signals are rethrown to the caller.
 *
 * @param meta - Scrape context; `meta.featureFlags` drives the fallback list.
 * @returns A successful `ScrapeUrlResponse`.
 * @throws NoEnginesLeftError when no engine produced an accepted result.
 */
async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
  meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);

  // TODO: handle sitemap data, see WebScraper/index.ts:280
  // TODO: ScrapeEvents

  const fallbackList = buildFallbackList(meta);

  const results: EngineResultsTracker = {};
  let result: EngineScrapeResultWithContext | null = null;

  // The time budget is split across at most two engines. Clamp to at least 1
  // so an empty fallback list cannot cause a division by zero.
  const timeSlices = Math.max(1, Math.min(fallbackList.length, 2));
  let timeToRun: number | undefined;
  if (meta.options.timeout !== undefined) {
    timeToRun = Math.round(meta.options.timeout / timeSlices);
  } else if (
    !meta.options.actions &&
    !meta.options.jsonOptions &&
    !meta.options.extract
  ) {
    // No explicit timeout: default budget of 120000 ms, unless actions or
    // extraction are requested, in which case no per-engine limit is passed.
    timeToRun = Math.round(120000 / timeSlices);
  }

  for (const { engine, unsupportedFeatures } of fallbackList) {
    meta.internalOptions.abort?.throwIfAborted();
    const startedAt = Date.now();
    try {
      meta.logger.info("Scraping via " + engine + "...");
      const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
      if (_engineResult.markdown === undefined) {
        // Some engines emit Markdown directly.
        _engineResult.markdown = await parseMarkdown(_engineResult.html);
      }
      const engineResult = _engineResult as EngineScrapeResult & {
        markdown: string;
      };

      // Success factors
      const isLongEnough = engineResult.markdown.length > 0;
      const isGoodStatusCode =
        (engineResult.statusCode >= 200 && engineResult.statusCode < 300) ||
        engineResult.statusCode === 304;
      const hasNoPageError = engineResult.error === undefined;
      const factors = { isLongEnough, isGoodStatusCode, hasNoPageError };

      results[engine] = {
        state: "success",
        result: engineResult,
        factors,
        unsupportedFeatures,
        startedAt,
        finishedAt: Date.now(),
      };

      // NOTE: TODO: what to do when status code is bad is tough...
      // we cannot just rely on text because error messages can be brief and not hit the limit
      // should we just use all the fallbacks and pick the one with the longest text? - mogery
      if (isLongEnough || !isGoodStatusCode) {
        meta.logger.info("Scrape via " + engine + " deemed successful.", {
          factors,
        });
        result = {
          engine,
          unsupportedFeatures,
          result: engineResult,
        };
        break;
      }
      // Otherwise (empty markdown with a good status code) try the next engine.
    } catch (error) {
      if (error instanceof EngineError) {
        meta.logger.info("Engine " + engine + " could not scrape the page.", {
          error,
        });
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: false,
          startedAt,
          finishedAt: Date.now(),
        };
      } else if (error instanceof TimeoutError) {
        meta.logger.info("Engine " + engine + " timed out while scraping.", {
          error,
        });
        results[engine] = {
          state: "timeout",
          startedAt,
          finishedAt: Date.now(),
        };
      } else if (
        error instanceof AddFeatureError ||
        error instanceof RemoveFeatureError
      ) {
        // Handled by scrapeURL, which adjusts the feature flags and retries.
        throw error;
      } else if (error instanceof LLMRefusalError) {
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: true,
          startedAt,
          finishedAt: Date.now(),
        };
        // Attach the tracker so the caller can report per-engine results.
        error.results = results;
        meta.logger.warn("LLM refusal encountered", { error });
        throw error;
      } else if (
        error instanceof SiteError ||
        error instanceof ActionError ||
        error instanceof UnsupportedFileError ||
        error instanceof PDFAntibotError ||
        error instanceof TimeoutSignal
      ) {
        // These end the whole pass; no further engine is tried.
        throw error;
      } else {
        Sentry.captureException(error);
        meta.logger.warn(
          "An unexpected error happened while scraping with " + engine + ".",
          { error },
        );
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: true,
          startedAt,
          finishedAt: Date.now(),
        };
      }
    }
  }

  if (result === null) {
    throw new NoEnginesLeftError(
      fallbackList.map((x) => x.engine),
      results,
    );
  }

  let document: Document = {
    markdown: result.result.markdown,
    rawHtml: result.result.html,
    screenshot: result.result.screenshot,
    actions: result.result.actions,
    metadata: {
      sourceURL: meta.url,
      url: result.result.url,
      statusCode: result.result.statusCode,
      error: result.result.error,
    },
  };

  if (result.unsupportedFeatures.size > 0) {
    const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`;
    meta.logger.warn(warning, {
      engine: result.engine,
      unsupportedFeatures: result.unsupportedFeatures,
    });
    document.warning =
      document.warning !== undefined
        ? document.warning + " " + warning
        : warning;
  }

  // Runs outside the per-engine try/catch: errors thrown here propagate
  // straight to the caller.
  document = await executeTransformers(meta, document);

  return {
    success: true,
    document,
    logs: meta.logs,
    engines: results,
  };
}
/**
 * Scrapes `url` and returns either the resulting document or the error that
 * stopped the scrape, together with the logs and per-engine results.
 *
 * The engine loop is re-run whenever an engine reports that the feature flags
 * need to change (`AddFeatureError` / `RemoveFeatureError`) or that a PDF was
 * blocked by anti-bot measures and has not been prefetched yet. These retries
 * only happen when `internalOptions.forceEngine` is not set.
 *
 * @param id - Scrape ID.
 * @param url - Absolute URL to scrape.
 * @param options - Scrape options.
 * @param internalOptions - Internal options.
 * @param costTracking - Cost tracker stored on the scrape's meta object.
 * @returns A `ScrapeUrlResponse`; failures are reported via `success: false`
 *   rather than thrown.
 * @throws TimeoutSignal - the only error that is rethrown instead of being
 *   wrapped in a failed response. Errors raised while building the meta object
 *   also propagate, because that happens before the `try`.
 */
export async function scrapeURL(
  id: string,
  url: string,
  options: ScrapeOptions,
  internalOptions: InternalOptions,
  costTracking: CostTracking,
): Promise<ScrapeUrlResponse> {
  const meta = await buildMetaObject(id, url, options, internalOptions, costTracking);
  try {
    while (true) {
      try {
        return await scrapeURLLoop(meta);
      } catch (error) {
        if (
          error instanceof AddFeatureError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          meta.logger.debug(
            "More feature flags requested by scraper: adding " +
              error.featureFlags.join(", "),
            { error, existingFlags: meta.featureFlags },
          );
          meta.featureFlags = new Set(
            [...meta.featureFlags].concat(error.featureFlags),
          );
          if (error.pdfPrefetch) {
            meta.pdfPrefetch = error.pdfPrefetch;
          }
        } else if (
          error instanceof RemoveFeatureError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          meta.logger.debug(
            "Incorrect feature flags reported by scraper: removing " +
              error.featureFlags.join(","),
            { error, existingFlags: meta.featureFlags },
          );
          meta.featureFlags = new Set(
            [...meta.featureFlags].filter(
              (x) => !error.featureFlags.includes(x),
            ),
          );
        } else if (
          error instanceof PDFAntibotError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          if (meta.pdfPrefetch !== undefined) {
            meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
            throw error;
          } else {
            // Drop the "pdf" flag so the next pass builds its fallback list
            // without it.
            meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
            meta.featureFlags = new Set(
              [...meta.featureFlags].filter((x) => x !== "pdf"),
            );
          }
        } else {
          throw error;
        }
      }
    }
  } catch (error) {
    let results: EngineResultsTracker = {};

    if (error instanceof NoEnginesLeftError) {
      meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
      results = error.results;
    } else if (error instanceof LLMRefusalError) {
      meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
      // `results` is only attached when the refusal was caught inside the
      // engine loop; a refusal raised by the transformers has none.
      results = error.results ?? {};
    } else if (
      error instanceof Error &&
      error.message.includes("Invalid schema for response_format")
    ) {
      // TODO: separate into custom error
      meta.logger.warn("scrapeURL: LLM schema error", { error });
      // TODO: results?
    } else if (error instanceof SiteError) {
      meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
    } else if (error instanceof ActionError) {
      meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
    } else if (error instanceof UnsupportedFileError) {
      meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
        error,
      });
    } else if (error instanceof TimeoutSignal) {
      throw error;
    } else {
      Sentry.captureException(error);
      meta.logger.error("scrapeURL: Unexpected error happened", { error });
      // TODO: results?
    }

    return {
      success: false,
      error,
      logs: meta.logs,
      engines: results,
    };
  }
}