Merge pull request #399 from mendableai/nsc/sitemap-fix-fire-engine
Sitemap fallback fixes w/ fire-engine
This commit is contained in:
@@ -129,3 +129,11 @@ export interface FireEngineResponse {
|
|||||||
pageError?: string;
|
pageError?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
export interface FireEngineOptions{
|
||||||
|
mobileProxy?: boolean;
|
||||||
|
method?: string;
|
||||||
|
engine?: string;
|
||||||
|
blockMedia?: boolean;
|
||||||
|
blockAds?: boolean;
|
||||||
|
}
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ import { scrapSingleUrl } from "./single_url";
|
|||||||
import robotsParser from "robots-parser";
|
import robotsParser from "robots-parser";
|
||||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
|
||||||
|
|
||||||
export class WebCrawler {
|
export class WebCrawler {
|
||||||
private initialUrl: string;
|
private initialUrl: string;
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
import { FireEngineResponse } from "../../../lib/entities";
|
import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
||||||
import { logScrape } from "../../../services/logging/scrape_log";
|
import { logScrape } from "../../../services/logging/scrape_log";
|
||||||
import { generateRequestParams } from "../single_url";
|
import { generateRequestParams } from "../single_url";
|
||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||||
@@ -20,6 +20,7 @@ export async function scrapWithFireEngine({
|
|||||||
waitFor = 0,
|
waitFor = 0,
|
||||||
screenshot = false,
|
screenshot = false,
|
||||||
pageOptions = { parsePDF: true },
|
pageOptions = { parsePDF: true },
|
||||||
|
fireEngineOptions = {},
|
||||||
headers,
|
headers,
|
||||||
options,
|
options,
|
||||||
}: {
|
}: {
|
||||||
@@ -27,6 +28,7 @@ export async function scrapWithFireEngine({
|
|||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
||||||
|
fireEngineOptions?: FireEngineOptions;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
options?: any;
|
options?: any;
|
||||||
}): Promise<FireEngineResponse> {
|
}): Promise<FireEngineResponse> {
|
||||||
@@ -45,18 +47,25 @@ export async function scrapWithFireEngine({
|
|||||||
const reqParams = await generateRequestParams(url);
|
const reqParams = await generateRequestParams(url);
|
||||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||||
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||||
|
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||||
|
|
||||||
|
let endpoint = fireEngineOptionsParam.method === "get" ? "/request" : "/scrape";
|
||||||
|
|
||||||
console.log(
|
console.log(
|
||||||
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
|
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
console.log(fireEngineOptionsParam)
|
||||||
|
|
||||||
const response = await axios.post(
|
const response = await axios.post(
|
||||||
process.env.FIRE_ENGINE_BETA_URL + "/scrape",
|
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||||
{
|
{
|
||||||
url: url,
|
url: url,
|
||||||
wait: waitParam,
|
wait: waitParam,
|
||||||
screenshot: screenshotParam,
|
screenshot: screenshotParam,
|
||||||
headers: headers,
|
headers: headers,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
|
...fireEngineOptionsParam,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
@@ -70,6 +79,7 @@ export async function scrapWithFireEngine({
|
|||||||
console.error(
|
console.error(
|
||||||
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
|
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
|
||||||
);
|
);
|
||||||
|
|
||||||
logParams.error_message = response.data?.pageError;
|
logParams.error_message = response.data?.pageError;
|
||||||
logParams.response_code = response.data?.pageStatusCode;
|
logParams.response_code = response.data?.pageStatusCode;
|
||||||
|
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ export async function getLinksFromSitemap(
|
|||||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||||
content = response.data;
|
content = response.data;
|
||||||
} else if (mode === 'fire-engine') {
|
} else if (mode === 'fire-engine') {
|
||||||
const response = await scrapWithFireEngine({ url: sitemapUrl });
|
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } });
|
||||||
content = response.html;
|
content = response.html;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|||||||
@@ -192,4 +192,14 @@ export const urlSpecificParams = {
|
|||||||
"ir.veeva.com":{
|
"ir.veeva.com":{
|
||||||
defaultScraper: "fire-engine",
|
defaultScraper: "fire-engine",
|
||||||
},
|
},
|
||||||
|
"eonhealth.com":{
|
||||||
|
defaultScraper: "fire-engine",
|
||||||
|
params:{
|
||||||
|
fireEngineOptions:{
|
||||||
|
mobileProxy: true,
|
||||||
|
method: "get",
|
||||||
|
engine: "request",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user