feat(fire-engine): port waitFor and screenshot to use actions
This commit is contained in:
@@ -10,6 +10,17 @@ export interface Progress {
|
|||||||
currentDocument?: Document;
|
currentDocument?: Document;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * One step in a list of page actions, discriminated by the `type` field.
 *
 * Variants:
 * - `"wait"`: carries `milliseconds` (number). The Fire-Engine scraper sums the
 *   `milliseconds` of all wait actions and adds that total to its
 *   status-polling timeout.
 * - `"click"`: carries `selector` (string). Presumably a CSS selector for the
 *   element to click; confirm against the Fire-Engine API.
 * - `"screenshot"`: carries optional `fullPage` (boolean). Callers in this
 *   change set it from the `fullPageScreenshot` page option.
 *
 * Exposed to callers through the optional, beta `actions` field on
 * `PageOptions` and forwarded as `actions` in the Fire-Engine request body.
 */
export type Action = {
|
||||||
|
type: "wait",
|
||||||
|
milliseconds: number,
|
||||||
|
} | {
|
||||||
|
type: "click",
|
||||||
|
selector: string,
|
||||||
|
} | {
|
||||||
|
type: "screenshot",
|
||||||
|
fullPage?: boolean,
|
||||||
|
};
|
||||||
|
|
||||||
export type PageOptions = {
|
export type PageOptions = {
|
||||||
includeMarkdown?: boolean;
|
includeMarkdown?: boolean;
|
||||||
includeExtract?: boolean;
|
includeExtract?: boolean;
|
||||||
@@ -29,7 +40,8 @@ export type PageOptions = {
|
|||||||
includeLinks?: boolean;
|
includeLinks?: boolean;
|
||||||
useFastMode?: boolean; // beta
|
useFastMode?: boolean; // beta
|
||||||
disableJsDom?: boolean; // beta
|
disableJsDom?: boolean; // beta
|
||||||
atsv?: boolean; // beta
|
atsv?: boolean; // anti-bot solver, beta
|
||||||
|
actions?: Action[]; // beta
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
||||||
import { logScrape } from "../../../services/logging/scrape_log";
|
import { logScrape } from "../../../services/logging/scrape_log";
|
||||||
import { generateRequestParams } from "../single_url";
|
import { generateRequestParams } from "../single_url";
|
||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||||
@@ -20,9 +20,7 @@ import * as Sentry from "@sentry/node";
|
|||||||
*/
|
*/
|
||||||
export async function scrapWithFireEngine({
|
export async function scrapWithFireEngine({
|
||||||
url,
|
url,
|
||||||
waitFor = 0,
|
actions,
|
||||||
screenshot = false,
|
|
||||||
fullPageScreenshot = false,
|
|
||||||
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
|
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
|
||||||
fireEngineOptions = {},
|
fireEngineOptions = {},
|
||||||
headers,
|
headers,
|
||||||
@@ -31,9 +29,7 @@ export async function scrapWithFireEngine({
|
|||||||
teamId,
|
teamId,
|
||||||
}: {
|
}: {
|
||||||
url: string;
|
url: string;
|
||||||
waitFor?: number;
|
actions?: Action[];
|
||||||
screenshot?: boolean;
|
|
||||||
fullPageScreenshot?: boolean;
|
|
||||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
|
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
|
||||||
fireEngineOptions?: FireEngineOptions;
|
fireEngineOptions?: FireEngineOptions;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
@@ -54,10 +50,7 @@ export async function scrapWithFireEngine({
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
const reqParams = await generateRequestParams(url);
|
const reqParams = await generateRequestParams(url);
|
||||||
let waitParam = reqParams["params"]?.wait ?? waitFor;
|
|
||||||
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
|
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
|
||||||
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
|
||||||
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
|
||||||
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||||
|
|
||||||
|
|
||||||
@@ -75,7 +68,7 @@ export async function scrapWithFireEngine({
|
|||||||
}
|
}
|
||||||
|
|
||||||
Logger.info(
|
Logger.info(
|
||||||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||||
);
|
);
|
||||||
|
|
||||||
// atsv is only available for beta customers
|
// atsv is only available for beta customers
|
||||||
@@ -101,9 +94,6 @@ export async function scrapWithFireEngine({
|
|||||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||||
{
|
{
|
||||||
url: url,
|
url: url,
|
||||||
wait: waitParam,
|
|
||||||
screenshot: screenshotParam,
|
|
||||||
fullPageScreenshot: fullPageScreenshotParam,
|
|
||||||
headers: headers,
|
headers: headers,
|
||||||
disableJsDom: pageOptions?.disableJsDom ?? false,
|
disableJsDom: pageOptions?.disableJsDom ?? false,
|
||||||
priority,
|
priority,
|
||||||
@@ -112,6 +102,7 @@ export async function scrapWithFireEngine({
|
|||||||
...fireEngineOptionsParam,
|
...fireEngineOptionsParam,
|
||||||
atsv: pageOptions?.atsv ?? false,
|
atsv: pageOptions?.atsv ?? false,
|
||||||
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
||||||
|
actions: actions,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
@@ -125,8 +116,10 @@ export async function scrapWithFireEngine({
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => x.milliseconds + a, 0);
|
||||||
|
|
||||||
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||||
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
|
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal) {
|
||||||
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
|
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
|
||||||
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -195,9 +195,17 @@ export async function scrapSingleUrl(
|
|||||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||||
const response = await scrapWithFireEngine({
|
const response = await scrapWithFireEngine({
|
||||||
url,
|
url,
|
||||||
waitFor: pageOptions.waitFor,
|
actions: [
|
||||||
screenshot: pageOptions.screenshot,
|
...(pageOptions.waitFor ? [{
|
||||||
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
type: "wait" as const,
|
||||||
|
milliseconds: pageOptions.waitFor,
|
||||||
|
}] : []),
|
||||||
|
...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
|
||||||
|
type: "screenshot" as const,
|
||||||
|
fullPage: !!pageOptions.fullPageScreenshot,
|
||||||
|
}] : []),
|
||||||
|
...(pageOptions.actions ?? []),
|
||||||
|
],
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
headers: pageOptions.headers,
|
headers: pageOptions.headers,
|
||||||
fireEngineOptions: {
|
fireEngineOptions: {
|
||||||
@@ -267,8 +275,12 @@ export async function scrapSingleUrl(
|
|||||||
case "fire-engine":
|
case "fire-engine":
|
||||||
customScrapedContent = await scrapWithFireEngine({
|
customScrapedContent = await scrapWithFireEngine({
|
||||||
url: customScraperResult.url,
|
url: customScraperResult.url,
|
||||||
waitFor: customScraperResult.waitAfterLoad,
|
actions: customScraperResult.waitAfterLoad ? ([
|
||||||
screenshot: false,
|
{
|
||||||
|
type: "wait",
|
||||||
|
milliseconds: customScraperResult.waitAfterLoad,
|
||||||
|
}
|
||||||
|
]) : ([]),
|
||||||
pageOptions: customScraperResult.pageOptions,
|
pageOptions: customScraperResult.pageOptions,
|
||||||
});
|
});
|
||||||
if (screenshot) {
|
if (screenshot) {
|
||||||
|
|||||||
Reference in New Issue
Block a user