Merge pull request #399 from mendableai/nsc/sitemap-fix-fire-engine

Sitemap fallback fixes with fire-engine
This commit is contained in:
Nicolas
2024-07-12 23:24:15 -04:00
committed by GitHub
5 changed files with 32 additions and 5 deletions
+8
View File
@@ -129,3 +129,11 @@ export interface FireEngineResponse {
pageError?: string; pageError?: string;
} }
/**
 * Optional per-request settings for the fire-engine scraper.
 *
 * `scrapWithFireEngine` spreads this object into the JSON body it POSTs to
 * fire-engine, so every key set here is sent to the service as-is.
 * All fields are optional; an empty object is valid.
 */
export interface FireEngineOptions {
  /**
   * Request method hint. When this is exactly `"get"`, `scrapWithFireEngine`
   * posts to the `/request` endpoint; any other value (or none) uses `/scrape`.
   */
  method?: string;

  /**
   * Name of the fire-engine engine to use. `"request"` is the only value
   * used by the visible callers.
   * NOTE(review): the full set of accepted values is defined by the
   * fire-engine service — confirm before adding others.
   */
  engine?: string;

  /**
   * Forwarded to fire-engine unchanged.
   * NOTE(review): presumably routes the request through a mobile proxy — confirm.
   */
  mobileProxy?: boolean;

  /**
   * Forwarded to fire-engine unchanged.
   * NOTE(review): presumably skips loading media resources — confirm.
   */
  blockMedia?: boolean;

  /**
   * Forwarded to fire-engine unchanged.
   * NOTE(review): presumably enables ad blocking — confirm.
   */
  blockAds?: boolean;
}
@@ -8,7 +8,6 @@ import { scrapSingleUrl } from "./single_url";
import robotsParser from "robots-parser"; import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils"; import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout"; import { axiosTimeout } from "../../../src/lib/timeout";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
export class WebCrawler { export class WebCrawler {
private initialUrl: string; private initialUrl: string;
@@ -1,5 +1,5 @@
import axios from "axios"; import axios from "axios";
import { FireEngineResponse } from "../../../lib/entities"; import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
import { logScrape } from "../../../services/logging/scrape_log"; import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url"; import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { fetchAndProcessPdf } from "../utils/pdfProcessor";
@@ -20,6 +20,7 @@ export async function scrapWithFireEngine({
waitFor = 0, waitFor = 0,
screenshot = false, screenshot = false,
pageOptions = { parsePDF: true }, pageOptions = { parsePDF: true },
fireEngineOptions = {},
headers, headers,
options, options,
}: { }: {
@@ -27,6 +28,7 @@ export async function scrapWithFireEngine({
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>; headers?: Record<string, string>;
options?: any; options?: any;
}): Promise<FireEngineResponse> { }): Promise<FireEngineResponse> {
@@ -45,18 +47,25 @@ export async function scrapWithFireEngine({
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
const waitParam = reqParams["params"]?.wait ?? waitFor; const waitParam = reqParams["params"]?.wait ?? waitFor;
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
let endpoint = fireEngineOptionsParam.method === "get" ? "/request" : "/scrape";
console.log( console.log(
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}`
); );
console.log(fireEngineOptionsParam)
const response = await axios.post( const response = await axios.post(
process.env.FIRE_ENGINE_BETA_URL + "/scrape", process.env.FIRE_ENGINE_BETA_URL + endpoint,
{ {
url: url, url: url,
wait: waitParam, wait: waitParam,
screenshot: screenshotParam, screenshot: screenshotParam,
headers: headers, headers: headers,
pageOptions: pageOptions, pageOptions: pageOptions,
...fireEngineOptionsParam,
}, },
{ {
headers: { headers: {
@@ -70,6 +79,7 @@ export async function scrapWithFireEngine({
console.error( console.error(
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
); );
logParams.error_message = response.data?.pageError; logParams.error_message = response.data?.pageError;
logParams.response_code = response.data?.pageStatusCode; logParams.response_code = response.data?.pageStatusCode;
+1 -1
View File
@@ -21,7 +21,7 @@ export async function getLinksFromSitemap(
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data; content = response.data;
} else if (mode === 'fire-engine') { } else if (mode === 'fire-engine') {
const response = await scrapWithFireEngine({ url: sitemapUrl }); const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } });
content = response.html; content = response.html;
} }
} catch (error) { } catch (error) {
@@ -192,4 +192,14 @@ export const urlSpecificParams = {
"ir.veeva.com":{ "ir.veeva.com":{
defaultScraper: "fire-engine", defaultScraper: "fire-engine",
}, },
"eonhealth.com":{
defaultScraper: "fire-engine",
params:{
fireEngineOptions:{
mobileProxy: true,
method: "get",
engine: "request",
},
},
},
}; };