feat(fire-engine): port waitFor and screenshot to use actions

This commit is contained in:
Gergő Móricz
2024-09-18 20:04:54 +02:00
parent c28e1e2959
commit 42d677fe3c
3 changed files with 38 additions and 21 deletions
+13 -1
View File
@@ -10,6 +10,17 @@ export interface Progress {
currentDocument?: Document; currentDocument?: Document;
} }
/**
 * One step in a page's `actions` list, modelled as a discriminated union
 * tagged by `type`. Narrow on `type` before reading variant-specific fields.
 */
export type Action =
  | {
      /** Pause step. */
      type: "wait";
      /** Length of the pause, in milliseconds. */
      milliseconds: number;
    }
  | {
      /** Click step. */
      type: "click";
      /** Selector of the element to click (presumably a CSS selector; confirm against the engine). */
      selector: string;
    }
  | {
      /** Screenshot step. */
      type: "screenshot";
      /** Full-page flag; optional, and what the engine does when it is omitted is not shown here. */
      fullPage?: boolean;
    };
export type PageOptions = { export type PageOptions = {
includeMarkdown?: boolean; includeMarkdown?: boolean;
includeExtract?: boolean; includeExtract?: boolean;
@@ -29,7 +40,8 @@ export type PageOptions = {
includeLinks?: boolean; includeLinks?: boolean;
useFastMode?: boolean; // beta useFastMode?: boolean; // beta
disableJsDom?: boolean; // beta disableJsDom?: boolean; // beta
atsv?: boolean; // beta atsv?: boolean; // anti-bot solver, beta
actions?: Action[]; // beta
}; };
export type ExtractorOptions = { export type ExtractorOptions = {
@@ -1,5 +1,5 @@
import axios from "axios"; import axios from "axios";
import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities"; import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
import { logScrape } from "../../../services/logging/scrape_log"; import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url"; import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { fetchAndProcessPdf } from "../utils/pdfProcessor";
@@ -20,9 +20,7 @@ import * as Sentry from "@sentry/node";
*/ */
export async function scrapWithFireEngine({ export async function scrapWithFireEngine({
url, url,
waitFor = 0, actions,
screenshot = false,
fullPageScreenshot = false,
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false }, pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
fireEngineOptions = {}, fireEngineOptions = {},
headers, headers,
@@ -31,9 +29,7 @@ export async function scrapWithFireEngine({
teamId, teamId,
}: { }: {
url: string; url: string;
waitFor?: number; actions?: Action[];
screenshot?: boolean;
fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean }; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
fireEngineOptions?: FireEngineOptions; fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>; headers?: Record<string, string>;
@@ -54,10 +50,7 @@ export async function scrapWithFireEngine({
try { try {
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
let waitParam = reqParams["params"]?.wait ?? waitFor;
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp"; let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
@@ -75,7 +68,7 @@ export async function scrapWithFireEngine({
} }
Logger.info( Logger.info(
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
); );
// atsv is only available for beta customers // atsv is only available for beta customers
@@ -101,9 +94,6 @@ export async function scrapWithFireEngine({
process.env.FIRE_ENGINE_BETA_URL + endpoint, process.env.FIRE_ENGINE_BETA_URL + endpoint,
{ {
url: url, url: url,
wait: waitParam,
screenshot: screenshotParam,
fullPageScreenshot: fullPageScreenshotParam,
headers: headers, headers: headers,
disableJsDom: pageOptions?.disableJsDom ?? false, disableJsDom: pageOptions?.disableJsDom ?? false,
priority, priority,
@@ -112,6 +102,7 @@ export async function scrapWithFireEngine({
...fireEngineOptionsParam, ...fireEngineOptionsParam,
atsv: pageOptions?.atsv ?? false, atsv: pageOptions?.atsv ?? false,
scrollXPaths: pageOptions?.scrollXPaths ?? [], scrollXPaths: pageOptions?.scrollXPaths ?? [],
actions: actions,
}, },
{ {
headers: { headers: {
@@ -125,8 +116,10 @@ export async function scrapWithFireEngine({
); );
}); });
const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => x.milliseconds + a, 0);
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) { while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal) {
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
} }
+17 -5
View File
@@ -195,9 +195,17 @@ export async function scrapSingleUrl(
if (process.env.FIRE_ENGINE_BETA_URL) { if (process.env.FIRE_ENGINE_BETA_URL) {
const response = await scrapWithFireEngine({ const response = await scrapWithFireEngine({
url, url,
waitFor: pageOptions.waitFor, actions: [
screenshot: pageOptions.screenshot, ...(pageOptions.waitFor ? [{
fullPageScreenshot: pageOptions.fullPageScreenshot, type: "wait" as const,
milliseconds: pageOptions.waitFor,
}] : []),
...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
type: "screenshot" as const,
fullPage: !!pageOptions.fullPageScreenshot,
}] : []),
...(pageOptions.actions ?? []),
],
pageOptions: pageOptions, pageOptions: pageOptions,
headers: pageOptions.headers, headers: pageOptions.headers,
fireEngineOptions: { fireEngineOptions: {
@@ -267,8 +275,12 @@ export async function scrapSingleUrl(
case "fire-engine": case "fire-engine":
customScrapedContent = await scrapWithFireEngine({ customScrapedContent = await scrapWithFireEngine({
url: customScraperResult.url, url: customScraperResult.url,
waitFor: customScraperResult.waitAfterLoad, actions: customScraperResult.waitAfterLoad ? ([
screenshot: false, {
type: "wait",
milliseconds: customScraperResult.waitAfterLoad,
}
]) : ([]),
pageOptions: customScraperResult.pageOptions, pageOptions: customScraperResult.pageOptions,
}); });
if (screenshot) { if (screenshot) {