From e098e88ea741f11ac5e2d2cfda28f32c93f1c8d2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 12 Jul 2024 22:02:08 -0400 Subject: [PATCH 1/3] Nick: --- apps/api/src/lib/entities.ts | 8 ++++++++ apps/api/src/scraper/WebScraper/crawler.ts | 1 - apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 5 ++++- apps/api/src/scraper/WebScraper/sitemap.ts | 2 +- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 3cd59b6c..089d373c 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -129,3 +129,11 @@ export interface FireEngineResponse { pageError?: string; } + +export interface FireEngineOptions{ + mobileProxy?: boolean; + method?: string; + engine?: string; + blockMedia?: boolean; + blockAds?: boolean; +} diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index acaa432e..80705dbd 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -8,7 +8,6 @@ import { scrapSingleUrl } from "./single_url"; import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; import { axiosTimeout } from "../../../src/lib/timeout"; -import { scrapWithFireEngine } from "./scrapers/fireEngine"; export class WebCrawler { private initialUrl: string; diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 2e971139..cb7783a6 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -1,5 +1,5 @@ import axios from "axios"; -import { FireEngineResponse } from "../../../lib/entities"; +import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities"; import { logScrape } from "../../../services/logging/scrape_log"; import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; @@ -20,6 +20,7 @@ export async function scrapWithFireEngine({ waitFor = 0, screenshot = false, pageOptions = { parsePDF: true }, + fireEngineOptions = {}, headers, options, }: { @@ -27,6 +28,7 @@ export async function scrapWithFireEngine({ waitFor?: number; screenshot?: boolean; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; + fireEngineOptions?: FireEngineOptions; headers?: Record; options?: any; }): Promise { @@ -57,6 +59,7 @@ export async function scrapWithFireEngine({ screenshot: screenshotParam, headers: headers, pageOptions: pageOptions, + ...fireEngineOptions, }, { headers: { diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 6d1d28be..1184ef27 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -21,7 +21,7 @@ export async function getLinksFromSitemap( const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; } else if (mode === 'fire-engine') { - const response = await scrapWithFireEngine({ url: sitemapUrl }); + const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } }); content = response.html; } } catch (error) { From a3b1703b6821a18b4c987aa9cf53e1f96a24c88a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 12 Jul 2024 22:15:00 -0400 Subject: [PATCH 2/3] Update fireEngine.ts --- apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index cb7783a6..f63f73a9 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -47,6 +47,7 @@ export async function scrapWithFireEngine({ const reqParams = await generateRequestParams(url); const waitParam = reqParams["params"]?.wait ?? waitFor; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; + const fireEngineOptionsParam = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; console.log( `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` ); @@ -59,7 +60,7 @@ export async function scrapWithFireEngine({ screenshot: screenshotParam, headers: headers, pageOptions: pageOptions, - ...fireEngineOptions, + ...fireEngineOptionsParam, }, { headers: { From 949791049f722f4cd93e986378813ffb7a9aea2a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 12 Jul 2024 23:20:26 -0400 Subject: [PATCH 3/3] Nick: --- .../src/scraper/WebScraper/scrapers/fireEngine.ts | 12 +++++++++--- .../WebScraper/utils/custom/website_params.ts | 10 ++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 94d49598..30412f40 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -47,13 +47,18 @@ export async function scrapWithFireEngine({ const reqParams = await generateRequestParams(url); const waitParam = reqParams["params"]?.wait ?? waitFor; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; - const fireEngineOptionsParam = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; + const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; + + let endpoint = fireEngineOptionsParam.method === "get" ? "/request" : "/scrape"; + console.log( - `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` + `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` ); + console.log(fireEngineOptionsParam) + const response = await axios.post( - process.env.FIRE_ENGINE_BETA_URL + "/scrape", + process.env.FIRE_ENGINE_BETA_URL + endpoint, { url: url, wait: waitParam, @@ -74,6 +79,7 @@ export async function scrapWithFireEngine({ console.error( `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` ); + logParams.error_message = response.data?.pageError; logParams.response_code = response.data?.pageStatusCode; diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index f86f085f..a1c256cc 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -192,4 +192,14 @@ export const urlSpecificParams = { "ir.veeva.com":{ defaultScraper: "fire-engine", }, + "eonhealth.com":{ + defaultScraper: "fire-engine", + params:{ + fireEngineOptions:{ + mobileProxy: true, + method: "get", + engine: "request", + }, + }, + }, };