@@ -285,11 +285,19 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(error);
|
Logger.error(error);
|
||||||
return res.status(500).json({
|
if (typeof error === "string" && error.startsWith("{\"type\":\"all\",")) {
|
||||||
error:
|
return res.status(500).json({
|
||||||
|
success: false,
|
||||||
|
error: "All scraping methods failed for URL: " + req.body.url,
|
||||||
|
details: JSON.parse(error).errors as string[],
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
return res.status(500).json({
|
||||||
|
error:
|
||||||
typeof error === "string"
|
typeof error === "string"
|
||||||
? error
|
? error
|
||||||
: error?.message ?? "Internal Server Error",
|
: error?.message ?? "Internal Server Error",
|
||||||
});
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -64,22 +64,21 @@ export async function scrapeController(
|
|||||||
success: false,
|
success: false,
|
||||||
error: "Request timed out",
|
error: "Request timed out",
|
||||||
});
|
});
|
||||||
} else {
|
} else if (typeof e === "string" && e.startsWith("{\"type\":\"all\",")) {
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
|
error: "All scraping methods failed for URL: " + req.body.url,
|
||||||
extractorOptions && extractorOptions.mode !== "markdown"
|
details: JSON.parse(e).errors as string[],
|
||||||
? " - Could be due to LLM parsing issues"
|
|
||||||
: ""
|
|
||||||
}`,
|
|
||||||
});
|
});
|
||||||
|
} else {
|
||||||
|
throw e;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
await job.remove();
|
await job.remove();
|
||||||
|
|
||||||
if (!doc) {
|
if (!doc) {
|
||||||
console.error("!!! PANIC DOC IS", doc, job);
|
// console.error("!!! PANIC DOC IS", doc, job);
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
warning: "No page found",
|
warning: "No page found",
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||||
import { ExtractorOptions, PageOptions } from "../../lib/entities";
|
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
|
||||||
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||||
import { PlanType } from "../../types";
|
import { PlanType } from "../../types";
|
||||||
|
|
||||||
@@ -57,6 +57,33 @@ export const extractOptions = z.object({
|
|||||||
|
|
||||||
export type ExtractOptions = z.infer<typeof extractOptions>;
|
export type ExtractOptions = z.infer<typeof extractOptions>;
|
||||||
|
|
||||||
|
/**
 * Zod schema for a list of page actions.
 *
 * The value must be an array; every element must match exactly one of the six
 * object shapes below, each tagged by a literal `type`:
 * "wait", "click", "screenshot", "typeText", "pressKey" or "scroll".
 *
 * NOTE(review): presumably the actions are meant to run in array order on the
 * scraping backend — confirm against the consumer.
 */
export const actionsSchema = z.array(z.union([
|
||||||
|
z.object({
|
||||||
|
type: z.literal("wait"),
|
||||||
|
// Must be a finite integer strictly greater than zero.
milliseconds: z.number().int().positive().finite(),
|
||||||
|
}),
|
||||||
|
z.object({
|
||||||
|
type: z.literal("click"),
|
||||||
|
selector: z.string(),
|
||||||
|
}),
|
||||||
|
z.object({
|
||||||
|
type: z.literal("screenshot"),
|
||||||
|
// Optional on input; parsed output is `false` when the caller omits it.
fullPage: z.boolean().default(false),
|
||||||
|
}),
|
||||||
|
z.object({
|
||||||
|
type: z.literal("typeText"),
|
||||||
|
text: z.string(),
|
||||||
|
}),
|
||||||
|
z.object({
|
||||||
|
type: z.literal("pressKey"),
|
||||||
|
key: z.string(),
|
||||||
|
}),
|
||||||
|
z.object({
|
||||||
|
type: z.literal("scroll"),
|
||||||
|
// Only the two literal strings "up" and "down" are accepted.
direction: z.enum(["up", "down"]),
|
||||||
|
}),
|
||||||
|
]));
|
||||||
|
|
||||||
export const scrapeOptions = z.object({
|
export const scrapeOptions = z.object({
|
||||||
formats: z
|
formats: z
|
||||||
.enum([
|
.enum([
|
||||||
@@ -80,6 +107,7 @@ export const scrapeOptions = z.object({
|
|||||||
waitFor: z.number().int().nonnegative().finite().safe().default(0),
|
waitFor: z.number().int().nonnegative().finite().safe().default(0),
|
||||||
extract: extractOptions.optional(),
|
extract: extractOptions.optional(),
|
||||||
parsePDF: z.boolean().default(true),
|
parsePDF: z.boolean().default(true),
|
||||||
|
actions: actionsSchema.optional(),
|
||||||
}).strict(strictMessage)
|
}).strict(strictMessage)
|
||||||
|
|
||||||
|
|
||||||
@@ -185,6 +213,9 @@ export type Document = {
|
|||||||
rawHtml?: string;
|
rawHtml?: string;
|
||||||
links?: string[];
|
links?: string[];
|
||||||
screenshot?: string;
|
screenshot?: string;
|
||||||
|
actions?: {
|
||||||
|
screenshots: string[];
|
||||||
|
};
|
||||||
metadata: {
|
metadata: {
|
||||||
title?: string;
|
title?: string;
|
||||||
description?: string;
|
description?: string;
|
||||||
@@ -336,6 +367,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
|||||||
screenshot: x.formats.includes("screenshot"),
|
screenshot: x.formats.includes("screenshot"),
|
||||||
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
||||||
parsePDF: x.parsePDF,
|
parsePDF: x.parsePDF,
|
||||||
|
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -370,6 +402,7 @@ export function legacyDocumentConverter(doc: any): Document {
|
|||||||
html: doc.html,
|
html: doc.html,
|
||||||
extract: doc.llm_extraction,
|
extract: doc.llm_extraction,
|
||||||
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
||||||
|
actions: doc.actions ?? undefined,
|
||||||
metadata: {
|
metadata: {
|
||||||
...doc.metadata,
|
...doc.metadata,
|
||||||
pageError: undefined,
|
pageError: undefined,
|
||||||
|
|||||||
@@ -10,6 +10,26 @@ export interface Progress {
|
|||||||
currentDocument?: Document;
|
currentDocument?: Document;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * A single page action, modelled as a union discriminated by `type`.
 *
 * Each variant carries only the fields relevant to it:
 * - "wait": `milliseconds`
 * - "click": `selector`
 * - "screenshot": optional `fullPage`
 * - "typeText": `text`
 * - "pressKey": `key`
 * - "scroll": `direction` ("up" or "down")
 *
 * NOTE(review): `selector` is presumably a CSS selector and `key` a keyboard
 * key name — neither is validated by this type; confirm with the backend.
 */
export type Action = {
|
||||||
|
type: "wait",
|
||||||
|
milliseconds: number,
|
||||||
|
} | {
|
||||||
|
type: "click",
|
||||||
|
selector: string,
|
||||||
|
} | {
|
||||||
|
type: "screenshot",
|
||||||
|
// Optional at the type level; callers may leave it undefined.
fullPage?: boolean,
|
||||||
|
} | {
|
||||||
|
type: "typeText",
|
||||||
|
text: string,
|
||||||
|
} | {
|
||||||
|
type: "pressKey",
|
||||||
|
key: string,
|
||||||
|
} | {
|
||||||
|
type: "scroll",
|
||||||
|
direction: "up" | "down"
|
||||||
|
};
|
||||||
|
|
||||||
export type PageOptions = {
|
export type PageOptions = {
|
||||||
includeMarkdown?: boolean;
|
includeMarkdown?: boolean;
|
||||||
includeExtract?: boolean;
|
includeExtract?: boolean;
|
||||||
@@ -29,7 +49,8 @@ export type PageOptions = {
|
|||||||
includeLinks?: boolean;
|
includeLinks?: boolean;
|
||||||
useFastMode?: boolean; // beta
|
useFastMode?: boolean; // beta
|
||||||
disableJsDom?: boolean; // beta
|
disableJsDom?: boolean; // beta
|
||||||
atsv?: boolean; // beta
|
atsv?: boolean; // anti-bot solver, beta
|
||||||
|
actions?: Action[]; // beta
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
@@ -98,6 +119,9 @@ export class Document {
|
|||||||
childrenLinks?: string[];
|
childrenLinks?: string[];
|
||||||
provider?: string;
|
provider?: string;
|
||||||
warning?: string;
|
warning?: string;
|
||||||
|
actions?: {
|
||||||
|
screenshots: string[];
|
||||||
|
}
|
||||||
|
|
||||||
index?: number;
|
index?: number;
|
||||||
linksOnPage?: string[]; // Add this new field as a separate property
|
linksOnPage?: string[]; // Add this new field as a separate property
|
||||||
@@ -137,7 +161,7 @@ export class SearchResult {
|
|||||||
|
|
||||||
export interface FireEngineResponse {
|
export interface FireEngineResponse {
|
||||||
html: string;
|
html: string;
|
||||||
screenshot: string;
|
screenshots?: string[];
|
||||||
pageStatusCode?: number;
|
pageStatusCode?: number;
|
||||||
pageError?: string;
|
pageError?: string;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -591,7 +591,8 @@ export class WebScraperDataProvider {
|
|||||||
screenshot: options.pageOptions?.screenshot ?? false,
|
screenshot: options.pageOptions?.screenshot ?? false,
|
||||||
useFastMode: options.pageOptions?.useFastMode ?? false,
|
useFastMode: options.pageOptions?.useFastMode ?? false,
|
||||||
disableJsDom: options.pageOptions?.disableJsDom ?? false,
|
disableJsDom: options.pageOptions?.disableJsDom ?? false,
|
||||||
atsv: options.pageOptions?.atsv ?? false
|
atsv: options.pageOptions?.atsv ?? false,
|
||||||
|
actions: options.pageOptions?.actions ?? undefined,
|
||||||
};
|
};
|
||||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
||||||
this.replaceAllPathsWithAbsolutePaths =
|
this.replaceAllPathsWithAbsolutePaths =
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
||||||
import { logScrape } from "../../../services/logging/scrape_log";
|
import { logScrape } from "../../../services/logging/scrape_log";
|
||||||
import { generateRequestParams } from "../single_url";
|
import { generateRequestParams } from "../single_url";
|
||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||||
@@ -20,6 +20,7 @@ import * as Sentry from "@sentry/node";
|
|||||||
*/
|
*/
|
||||||
export async function scrapWithFireEngine({
|
export async function scrapWithFireEngine({
|
||||||
url,
|
url,
|
||||||
|
actions,
|
||||||
waitFor = 0,
|
waitFor = 0,
|
||||||
screenshot = false,
|
screenshot = false,
|
||||||
fullPageScreenshot = false,
|
fullPageScreenshot = false,
|
||||||
@@ -31,6 +32,7 @@ export async function scrapWithFireEngine({
|
|||||||
teamId,
|
teamId,
|
||||||
}: {
|
}: {
|
||||||
url: string;
|
url: string;
|
||||||
|
actions?: Action[];
|
||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
fullPageScreenshot?: boolean;
|
fullPageScreenshot?: boolean;
|
||||||
@@ -75,7 +77,7 @@ export async function scrapWithFireEngine({
|
|||||||
}
|
}
|
||||||
|
|
||||||
Logger.info(
|
Logger.info(
|
||||||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||||
);
|
);
|
||||||
|
|
||||||
// atsv is only available for beta customers
|
// atsv is only available for beta customers
|
||||||
@@ -101,10 +103,10 @@ export async function scrapWithFireEngine({
|
|||||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||||
{
|
{
|
||||||
url: url,
|
url: url,
|
||||||
|
headers: headers,
|
||||||
wait: waitParam,
|
wait: waitParam,
|
||||||
screenshot: screenshotParam,
|
screenshot: screenshotParam,
|
||||||
fullPageScreenshot: fullPageScreenshotParam,
|
fullPageScreenshot: fullPageScreenshotParam,
|
||||||
headers: headers,
|
|
||||||
disableJsDom: pageOptions?.disableJsDom ?? false,
|
disableJsDom: pageOptions?.disableJsDom ?? false,
|
||||||
priority,
|
priority,
|
||||||
engine,
|
engine,
|
||||||
@@ -112,6 +114,7 @@ export async function scrapWithFireEngine({
|
|||||||
...fireEngineOptionsParam,
|
...fireEngineOptionsParam,
|
||||||
atsv: pageOptions?.atsv ?? false,
|
atsv: pageOptions?.atsv ?? false,
|
||||||
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
||||||
|
actions: actions,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
@@ -125,8 +128,10 @@ export async function scrapWithFireEngine({
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0);
|
||||||
|
|
||||||
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||||
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
|
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal) {
|
||||||
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
|
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
|
||||||
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||||
}
|
}
|
||||||
@@ -143,12 +148,12 @@ export async function scrapWithFireEngine({
|
|||||||
|
|
||||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
|
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
|
||||||
logParams.error_message = "Request timed out";
|
logParams.error_message = "Request timed out";
|
||||||
return { html: "", screenshot: "", pageStatusCode: null, pageError: "" };
|
return { html: "", pageStatusCode: null, pageError: "" };
|
||||||
}
|
}
|
||||||
|
|
||||||
if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
|
if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
|
||||||
Logger.debug(
|
Logger.debug(
|
||||||
`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}`
|
`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}\t ${checkStatusResponse.data.error}`
|
||||||
);
|
);
|
||||||
|
|
||||||
logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error;
|
logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error;
|
||||||
@@ -162,7 +167,6 @@ export async function scrapWithFireEngine({
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
html: "",
|
html: "",
|
||||||
screenshot: "",
|
|
||||||
pageStatusCode,
|
pageStatusCode,
|
||||||
pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
|
pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
|
||||||
};
|
};
|
||||||
@@ -178,7 +182,7 @@ export async function scrapWithFireEngine({
|
|||||||
logParams.success = true;
|
logParams.success = true;
|
||||||
logParams.response_code = pageStatusCode;
|
logParams.response_code = pageStatusCode;
|
||||||
logParams.error_message = pageError;
|
logParams.error_message = pageError;
|
||||||
return { html: content, screenshot: "", pageStatusCode, pageError };
|
return { html: content, pageStatusCode, pageError };
|
||||||
} else {
|
} else {
|
||||||
const data = checkStatusResponse.data;
|
const data = checkStatusResponse.data;
|
||||||
|
|
||||||
@@ -190,7 +194,7 @@ export async function scrapWithFireEngine({
|
|||||||
logParams.error_message = data.pageError ?? data.error;
|
logParams.error_message = data.pageError ?? data.error;
|
||||||
return {
|
return {
|
||||||
html: data.content ?? "",
|
html: data.content ?? "",
|
||||||
screenshot: data.screenshot ?? "",
|
screenshots: data.screenshots,
|
||||||
pageStatusCode: data.pageStatusCode,
|
pageStatusCode: data.pageStatusCode,
|
||||||
pageError: data.pageError ?? data.error,
|
pageError: data.pageError ?? data.error,
|
||||||
};
|
};
|
||||||
@@ -203,7 +207,7 @@ export async function scrapWithFireEngine({
|
|||||||
Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
|
Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
|
||||||
logParams.error_message = error.message || error;
|
logParams.error_message = error.message || error;
|
||||||
}
|
}
|
||||||
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
|
return { html: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||||
} finally {
|
} finally {
|
||||||
const endTime = Date.now();
|
const endTime = Date.now();
|
||||||
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
||||||
|
|||||||
@@ -69,8 +69,13 @@ function getScrapingFallbackOrder(
|
|||||||
defaultScraper?: string,
|
defaultScraper?: string,
|
||||||
isWaitPresent: boolean = false,
|
isWaitPresent: boolean = false,
|
||||||
isScreenshotPresent: boolean = false,
|
isScreenshotPresent: boolean = false,
|
||||||
isHeadersPresent: boolean = false
|
isHeadersPresent: boolean = false,
|
||||||
|
isActionsPresent: boolean = false,
|
||||||
) {
|
) {
|
||||||
|
if (isActionsPresent) {
|
||||||
|
return useFireEngine ? ["fire-engine;chrome-cdp"] : [];
|
||||||
|
}
|
||||||
|
|
||||||
const availableScrapers = baseScrapers.filter((scraper) => {
|
const availableScrapers = baseScrapers.filter((scraper) => {
|
||||||
switch (scraper) {
|
switch (scraper) {
|
||||||
case "scrapingBee":
|
case "scrapingBee":
|
||||||
@@ -148,7 +153,8 @@ export async function scrapSingleUrl(
|
|||||||
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
|
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
|
||||||
useFastMode: pageOptions.useFastMode ?? false,
|
useFastMode: pageOptions.useFastMode ?? false,
|
||||||
disableJsDom: pageOptions.disableJsDom ?? false,
|
disableJsDom: pageOptions.disableJsDom ?? false,
|
||||||
atsv: pageOptions.atsv ?? false
|
atsv: pageOptions.atsv ?? false,
|
||||||
|
actions: pageOptions.actions ?? undefined,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (extractorOptions) {
|
if (extractorOptions) {
|
||||||
@@ -170,6 +176,9 @@ export async function scrapSingleUrl(
|
|||||||
let scraperResponse: {
|
let scraperResponse: {
|
||||||
text: string;
|
text: string;
|
||||||
screenshot: string;
|
screenshot: string;
|
||||||
|
actions?: {
|
||||||
|
screenshots: string[];
|
||||||
|
};
|
||||||
metadata: { pageStatusCode?: number; pageError?: string | null };
|
metadata: { pageStatusCode?: number; pageError?: string | null };
|
||||||
} = { text: "", screenshot: "", metadata: {} };
|
} = { text: "", screenshot: "", metadata: {} };
|
||||||
let screenshot = "";
|
let screenshot = "";
|
||||||
@@ -195,9 +204,23 @@ export async function scrapSingleUrl(
|
|||||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||||
const response = await scrapWithFireEngine({
|
const response = await scrapWithFireEngine({
|
||||||
url,
|
url,
|
||||||
waitFor: pageOptions.waitFor,
|
...(engine === "chrome-cdp" ? ({
|
||||||
screenshot: pageOptions.screenshot,
|
actions: [
|
||||||
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
...(pageOptions.waitFor ? [{
|
||||||
|
type: "wait" as const,
|
||||||
|
milliseconds: pageOptions.waitFor,
|
||||||
|
}] : []),
|
||||||
|
...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
|
||||||
|
type: "screenshot" as const,
|
||||||
|
fullPage: !!pageOptions.fullPageScreenshot,
|
||||||
|
}] : []),
|
||||||
|
...(pageOptions.actions ?? []),
|
||||||
|
],
|
||||||
|
}) : ({
|
||||||
|
waitFor: pageOptions.waitFor,
|
||||||
|
screenshot: pageOptions.screenshot,
|
||||||
|
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
||||||
|
})),
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
headers: pageOptions.headers,
|
headers: pageOptions.headers,
|
||||||
fireEngineOptions: {
|
fireEngineOptions: {
|
||||||
@@ -209,7 +232,14 @@ export async function scrapSingleUrl(
|
|||||||
teamId,
|
teamId,
|
||||||
});
|
});
|
||||||
scraperResponse.text = response.html;
|
scraperResponse.text = response.html;
|
||||||
scraperResponse.screenshot = response.screenshot;
|
if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
|
||||||
|
scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
|
||||||
|
}
|
||||||
|
if (pageOptions.actions) {
|
||||||
|
scraperResponse.actions = {
|
||||||
|
screenshots: response.screenshots ?? [],
|
||||||
|
};
|
||||||
|
}
|
||||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||||
scraperResponse.metadata.pageError = response.pageError;
|
scraperResponse.metadata.pageError = response.pageError;
|
||||||
}
|
}
|
||||||
@@ -267,13 +297,14 @@ export async function scrapSingleUrl(
|
|||||||
case "fire-engine":
|
case "fire-engine":
|
||||||
customScrapedContent = await scrapWithFireEngine({
|
customScrapedContent = await scrapWithFireEngine({
|
||||||
url: customScraperResult.url,
|
url: customScraperResult.url,
|
||||||
waitFor: customScraperResult.waitAfterLoad,
|
actions: customScraperResult.waitAfterLoad ? ([
|
||||||
screenshot: false,
|
{
|
||||||
|
type: "wait",
|
||||||
|
milliseconds: customScraperResult.waitAfterLoad,
|
||||||
|
}
|
||||||
|
]) : ([]),
|
||||||
pageOptions: customScraperResult.pageOptions,
|
pageOptions: customScraperResult.pageOptions,
|
||||||
});
|
});
|
||||||
if (screenshot) {
|
|
||||||
customScrapedContent.screenshot = screenshot;
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
case "pdf":
|
case "pdf":
|
||||||
const { content, pageStatusCode, pageError } =
|
const { content, pageStatusCode, pageError } =
|
||||||
@@ -283,7 +314,6 @@ export async function scrapSingleUrl(
|
|||||||
);
|
);
|
||||||
customScrapedContent = {
|
customScrapedContent = {
|
||||||
html: content,
|
html: content,
|
||||||
screenshot,
|
|
||||||
pageStatusCode,
|
pageStatusCode,
|
||||||
pageError,
|
pageError,
|
||||||
};
|
};
|
||||||
@@ -293,7 +323,6 @@ export async function scrapSingleUrl(
|
|||||||
|
|
||||||
if (customScrapedContent) {
|
if (customScrapedContent) {
|
||||||
scraperResponse.text = customScrapedContent.html;
|
scraperResponse.text = customScrapedContent.html;
|
||||||
screenshot = customScrapedContent.screenshot;
|
|
||||||
}
|
}
|
||||||
//* TODO: add an optional to return markdown or structured/extracted content
|
//* TODO: add an optional to return markdown or structured/extracted content
|
||||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
||||||
@@ -313,19 +342,24 @@ export async function scrapSingleUrl(
|
|||||||
html: cleanedHtml,
|
html: cleanedHtml,
|
||||||
rawHtml: scraperResponse.text,
|
rawHtml: scraperResponse.text,
|
||||||
screenshot: scraperResponse.screenshot,
|
screenshot: scraperResponse.screenshot,
|
||||||
|
actions: scraperResponse.actions,
|
||||||
pageStatusCode: scraperResponse.metadata.pageStatusCode,
|
pageStatusCode: scraperResponse.metadata.pageStatusCode,
|
||||||
pageError: scraperResponse.metadata.pageError || undefined,
|
pageError: scraperResponse.metadata.pageError || undefined,
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
|
let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = {
|
||||||
text: "",
|
text: "",
|
||||||
html: "",
|
html: "",
|
||||||
rawHtml: "",
|
rawHtml: "",
|
||||||
screenshot: "",
|
screenshot: "",
|
||||||
|
actions: undefined,
|
||||||
pageStatusCode: 200,
|
pageStatusCode: 200,
|
||||||
pageError: undefined,
|
pageError: undefined,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const errors: Record<string, string> = {};
|
||||||
|
|
||||||
try {
|
try {
|
||||||
let urlKey = urlToScrap;
|
let urlKey = urlToScrap;
|
||||||
try {
|
try {
|
||||||
@@ -338,7 +372,8 @@ export async function scrapSingleUrl(
|
|||||||
defaultScraper,
|
defaultScraper,
|
||||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||||
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
||||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
|
pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
|
||||||
|
pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
|
||||||
);
|
);
|
||||||
|
|
||||||
for (const scraper of scrapersInOrder) {
|
for (const scraper of scrapersInOrder) {
|
||||||
@@ -355,6 +390,7 @@ export async function scrapSingleUrl(
|
|||||||
html = attempt.html ?? "";
|
html = attempt.html ?? "";
|
||||||
rawHtml = attempt.rawHtml ?? "";
|
rawHtml = attempt.rawHtml ?? "";
|
||||||
screenshot = attempt.screenshot ?? "";
|
screenshot = attempt.screenshot ?? "";
|
||||||
|
actions = attempt.actions ?? undefined;
|
||||||
|
|
||||||
if (attempt.pageStatusCode) {
|
if (attempt.pageStatusCode) {
|
||||||
pageStatusCode = attempt.pageStatusCode;
|
pageStatusCode = attempt.pageStatusCode;
|
||||||
@@ -365,6 +401,12 @@ export async function scrapSingleUrl(
|
|||||||
pageError = undefined;
|
pageError = undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (attempt.pageError) {
|
||||||
|
errors[scraper] = attempt.pageError;
|
||||||
|
} else {
|
||||||
|
errors[scraper] = null;
|
||||||
|
}
|
||||||
|
|
||||||
if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
|
if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
|
||||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
|
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
|
||||||
break;
|
break;
|
||||||
@@ -392,54 +434,41 @@ export async function scrapSingleUrl(
|
|||||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||||
}
|
}
|
||||||
|
|
||||||
let document: Document;
|
let document: Document = {
|
||||||
if (screenshot && screenshot.length > 0) {
|
content: text,
|
||||||
document = {
|
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||||
content: text,
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
rawHtml:
|
||||||
html: pageOptions.includeHtml ? html : undefined,
|
pageOptions.includeRawHtml ||
|
||||||
rawHtml:
|
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||||
pageOptions.includeRawHtml ||
|
? rawHtml
|
||||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
: undefined,
|
||||||
? rawHtml
|
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||||
: undefined,
|
actions,
|
||||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
metadata: {
|
||||||
metadata: {
|
...metadata,
|
||||||
...metadata,
|
...(screenshot && screenshot.length > 0 ? ({
|
||||||
screenshot: screenshot,
|
screenshot,
|
||||||
sourceURL: urlToScrap,
|
}) : {}),
|
||||||
pageStatusCode: pageStatusCode,
|
sourceURL: urlToScrap,
|
||||||
pageError: pageError,
|
pageStatusCode: pageStatusCode,
|
||||||
},
|
pageError: pageError,
|
||||||
};
|
},
|
||||||
} else {
|
};
|
||||||
document = {
|
|
||||||
content: text,
|
|
||||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
|
||||||
html: pageOptions.includeHtml ? html : undefined,
|
|
||||||
rawHtml:
|
|
||||||
pageOptions.includeRawHtml ||
|
|
||||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
|
||||||
? rawHtml
|
|
||||||
: undefined,
|
|
||||||
metadata: {
|
|
||||||
...metadata,
|
|
||||||
sourceURL: urlToScrap,
|
|
||||||
pageStatusCode: pageStatusCode,
|
|
||||||
pageError: pageError,
|
|
||||||
},
|
|
||||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
return document;
|
return document;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
|
Logger.error(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
|
||||||
ScrapeEvents.insert(jobId, {
|
ScrapeEvents.insert(jobId, {
|
||||||
type: "error",
|
type: "error",
|
||||||
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
|
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
|
||||||
stack: error.stack,
|
stack: error.stack,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if (error instanceof Error && error.message.startsWith("All scraping methods failed")) {
|
||||||
|
throw new Error(JSON.stringify({"type": "all", "errors": Object.values(errors)}));
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
content: "",
|
content: "",
|
||||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
|
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ export interface FirecrawlDocumentMetadata {
|
|||||||
* Document interface for Firecrawl.
|
* Document interface for Firecrawl.
|
||||||
* Represents a document retrieved or processed by Firecrawl.
|
* Represents a document retrieved or processed by Firecrawl.
|
||||||
*/
|
*/
|
||||||
export interface FirecrawlDocument<T = any> {
|
export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | never) = never> {
|
||||||
url?: string;
|
url?: string;
|
||||||
markdown?: string;
|
markdown?: string;
|
||||||
html?: string;
|
html?: string;
|
||||||
@@ -67,6 +67,7 @@ export interface FirecrawlDocument<T = any> {
|
|||||||
extract?: T;
|
extract?: T;
|
||||||
screenshot?: string;
|
screenshot?: string;
|
||||||
metadata?: FirecrawlDocumentMetadata;
|
metadata?: FirecrawlDocumentMetadata;
|
||||||
|
actions: ActionsSchema;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -83,19 +84,44 @@ export interface CrawlScrapeOptions {
|
|||||||
timeout?: number;
|
timeout?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any> extends CrawlScrapeOptions {
|
/**
 * A single page action that a caller can request, modelled as a union
 * discriminated by `type`.
 *
 * Variants and their payload fields:
 * - "wait": `milliseconds`
 * - "click": `selector`
 * - "screenshot": optional `fullPage`
 * - "typeText": `text`
 * - "pressKey": `key`
 * - "scroll": `direction` ("up" or "down")
 *
 * NOTE(review): the units and accepted values of `selector` and `key` are not
 * constrained here — presumably validated server-side; confirm.
 */
export type Action = {
|
||||||
|
type: "wait",
|
||||||
|
milliseconds: number,
|
||||||
|
} | {
|
||||||
|
type: "click",
|
||||||
|
selector: string,
|
||||||
|
} | {
|
||||||
|
type: "screenshot",
|
||||||
|
// Optional at the type level; callers may leave it undefined.
fullPage?: boolean,
|
||||||
|
} | {
|
||||||
|
type: "typeText",
|
||||||
|
text: string,
|
||||||
|
} | {
|
||||||
|
type: "pressKey",
|
||||||
|
key: string,
|
||||||
|
} | {
|
||||||
|
type: "scroll",
|
||||||
|
direction: "up" | "down",
|
||||||
|
};
|
||||||
|
|
||||||
|
export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchema extends (Action[] | undefined) = undefined> extends CrawlScrapeOptions {
|
||||||
extract?: {
|
extract?: {
|
||||||
prompt?: string;
|
prompt?: string;
|
||||||
schema?: LLMSchema;
|
schema?: LLMSchema;
|
||||||
systemPrompt?: string;
|
systemPrompt?: string;
|
||||||
};
|
};
|
||||||
|
actions?: ActionsSchema;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ActionsResult {
|
||||||
|
screenshots: string[];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Response interface for scraping operations.
|
* Response interface for scraping operations.
|
||||||
* Defines the structure of the response received after a scraping operation.
|
* Defines the structure of the response received after a scraping operation.
|
||||||
*/
|
*/
|
||||||
export interface ScrapeResponse<LLMResult = any> extends FirecrawlDocument<LLMResult> {
|
export interface ScrapeResponse<LLMResult = any, ActionsSchema extends (ActionsResult | never) = never> extends FirecrawlDocument<LLMResult, ActionsSchema> {
|
||||||
success: true;
|
success: true;
|
||||||
warning?: string;
|
warning?: string;
|
||||||
error?: string;
|
error?: string;
|
||||||
@@ -200,10 +226,10 @@ export default class FirecrawlApp {
|
|||||||
* @param params - Additional parameters for the scrape request.
|
* @param params - Additional parameters for the scrape request.
|
||||||
* @returns The response from the scrape operation.
|
* @returns The response from the scrape operation.
|
||||||
*/
|
*/
|
||||||
async scrapeUrl<T extends zt.ZodSchema>(
|
async scrapeUrl<T extends zt.ZodSchema, ActionsSchema extends (Action[] | undefined) = undefined>(
|
||||||
url: string,
|
url: string,
|
||||||
params?: ScrapeParams<T>
|
params?: ScrapeParams<T, ActionsSchema>
|
||||||
): Promise<ScrapeResponse<zt.infer<T>> | ErrorResponse> {
|
): Promise<ScrapeResponse<zt.infer<T>, ActionsSchema extends Action[] ? ActionsResult : never> | ErrorResponse> {
|
||||||
const headers: AxiosRequestHeaders = {
|
const headers: AxiosRequestHeaders = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
Authorization: `Bearer ${this.apiKey}`,
|
Authorization: `Bearer ${this.apiKey}`,
|
||||||
|
|||||||
Reference in New Issue
Block a user