From 3edc3a3d1580b7ca10a51dbd852a37def6103c0c Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 5 Aug 2024 18:17:37 -0300 Subject: [PATCH 1/2] added fullpagescreenshot capabilities, wip on fire-engine side --- apps/api/openapi.json | 10 ++++++++++ apps/api/src/lib/default-values.ts | 1 + apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 4 ++++ apps/api/src/scraper/WebScraper/single_url.ts | 2 ++ 5 files changed, 18 insertions(+) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index e0b583f0..fb0c4305 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -84,6 +84,11 @@ "description": "Include a screenshot of the top of the page that you are scraping.", "default": false }, + "fullPageScreenshot": { + "type": "boolean", + "description": "Include a full page screenshot of the page that you are scraping.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", @@ -317,6 +322,11 @@ "description": "Include a screenshot of the top of the page that you are scraping.", "default": false }, + "fullPageScreenshot": { + "type": "boolean", + "description": "Include a full page screenshot of the page that you are scraping.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts index 3b303781..152f47d7 100644 --- a/apps/api/src/lib/default-values.ts +++ b/apps/api/src/lib/default-values.ts @@ -7,6 +7,7 @@ export const defaultPageOptions = { includeHtml: false, waitFor: 0, screenshot: false, + fullPageScreenshot: false, parsePDF: true }; diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 9ffa4810..4dc2050d 100644 --- a/apps/api/src/lib/entities.ts +++ 
b/apps/api/src/lib/entities.ts @@ -18,6 +18,7 @@ export type PageOptions = { fetchPageContent?: boolean; waitFor?: number; screenshot?: boolean; + fullPageScreenshot?: boolean; headers?: Record<string, string>; replaceAllPathsWithAbsolutePaths?: boolean; parsePDF?: boolean; diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index ba67043c..dfe23a89 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -11,6 +11,7 @@ import { Logger } from "../../../lib/logger"; * @param url The URL to scrape * @param waitFor The time to wait for the page to load * @param screenshot Whether to take a screenshot + * @param fullPageScreenshot Whether to take a full page screenshot * @param pageOptions The options for the page * @param headers The headers to send with the request * @param options The options for the request @@ -20,6 +21,7 @@ export async function scrapWithFireEngine({ url, waitFor = 0, screenshot = false, + fullPageScreenshot = false, pageOptions = { parsePDF: true }, fireEngineOptions = {}, headers, @@ -28,6 +30,7 @@ export async function scrapWithFireEngine({ url: string; waitFor?: number; screenshot?: boolean; + fullPageScreenshot?: boolean; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; fireEngineOptions?: FireEngineOptions; headers?: Record<string, string>; @@ -71,6 +74,7 @@ export async function scrapWithFireEngine({ url: url, wait: waitParam, screenshot: screenshotParam, + fullPageScreenshot: fullPageScreenshot, headers: headers, pageOptions: pageOptions, ...fireEngineOptionsParam, diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 4a44b23f..0fa2fc8b 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -128,6 +128,7 @@ export async function scrapSingleUrl( includeRawHtml: false, waitFor: 0, screenshot: 
false, + fullPageScreenshot: false, headers: undefined, }, extractorOptions: ExtractorOptions = { @@ -171,6 +172,7 @@ export async function scrapSingleUrl( url, waitFor: pageOptions.waitFor, screenshot: pageOptions.screenshot, + fullPageScreenshot: pageOptions.fullPageScreenshot, pageOptions: pageOptions, headers: pageOptions.headers, fireEngineOptions: { From 4d24a99d50358a31a5aa30fde79309545227c3dc Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 6 Aug 2024 09:34:43 -0300 Subject: [PATCH 2/2] fix params --- apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 5 +++-- apps/api/src/scraper/WebScraper/single_url.ts | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index dfe23a89..0bb9986f 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -52,6 +52,7 @@ export async function scrapWithFireEngine({ const waitParam = reqParams["params"]?.wait ?? waitFor; const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; + const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; @@ -64,7 +65,7 @@ export async function scrapWithFireEngine({ let engine = engineParam; // do we want fireEngineOptions as first choice? Logger.info( - `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? 
"null"} }` + `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` ); @@ -74,7 +75,7 @@ export async function scrapWithFireEngine({ url: url, wait: waitParam, screenshot: screenshotParam, - fullPageScreenshot: fullPageScreenshot, + fullPageScreenshot: fullPageScreenshotParam, headers: headers, pageOptions: pageOptions, ...fireEngineOptionsParam, diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0fa2fc8b..12e075fd 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -308,7 +308,7 @@ export async function scrapSingleUrl( const scrapersInOrder = getScrapingFallbackOrder( defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, - pageOptions && pageOptions.screenshot && pageOptions.screenshot === true, + pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true), pageOptions && pageOptions.headers && pageOptions.headers !== undefined );