Merge pull request #504 from mendableai/feat/fullpage-screenshot

[Feat] Added full-page screenshot capabilities
This commit is contained in:
Nicolas
2024-08-06 13:52:29 -04:00
committed by GitHub
5 changed files with 21 additions and 2 deletions
+10
View File
@@ -84,6 +84,11 @@
"description": "Include a screenshot of the top of the page that you are scraping.", "description": "Include a screenshot of the top of the page that you are scraping.",
"default": false "default": false
}, },
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": { "waitFor": {
"type": "integer", "type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content", "description": "Wait x amount of milliseconds for the page to load to fetch content",
@@ -317,6 +322,11 @@
"description": "Include a screenshot of the top of the page that you are scraping.", "description": "Include a screenshot of the top of the page that you are scraping.",
"default": false "default": false
}, },
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": { "waitFor": {
"type": "integer", "type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content", "description": "Wait x amount of milliseconds for the page to load to fetch content",
+1
View File
@@ -7,6 +7,7 @@ export const defaultPageOptions = {
includeHtml: false, includeHtml: false,
waitFor: 0, waitFor: 0,
screenshot: false, screenshot: false,
fullPageScreenshot: false,
parsePDF: true parsePDF: true
}; };
+1
View File
@@ -18,6 +18,7 @@ export type PageOptions = {
fetchPageContent?: boolean; fetchPageContent?: boolean;
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
fullPageScreenshot?: boolean;
headers?: Record<string, string>; headers?: Record<string, string>;
replaceAllPathsWithAbsolutePaths?: boolean; replaceAllPathsWithAbsolutePaths?: boolean;
parsePDF?: boolean; parsePDF?: boolean;
@@ -11,6 +11,7 @@ import { Logger } from "../../../lib/logger";
* @param url The URL to scrape * @param url The URL to scrape
* @param waitFor The time to wait for the page to load * @param waitFor The time to wait for the page to load
* @param screenshot Whether to take a screenshot * @param screenshot Whether to take a screenshot
* @param fullPageScreenshot Whether to take a full page screenshot
* @param pageOptions The options for the page * @param pageOptions The options for the page
* @param headers The headers to send with the request * @param headers The headers to send with the request
* @param options The options for the request * @param options The options for the request
@@ -20,6 +21,7 @@ export async function scrapWithFireEngine({
url, url,
waitFor = 0, waitFor = 0,
screenshot = false, screenshot = false,
fullPageScreenshot = false,
pageOptions = { parsePDF: true }, pageOptions = { parsePDF: true },
fireEngineOptions = {}, fireEngineOptions = {},
headers, headers,
@@ -28,6 +30,7 @@ export async function scrapWithFireEngine({
url: string; url: string;
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
fireEngineOptions?: FireEngineOptions; fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>; headers?: Record<string, string>;
@@ -49,6 +52,7 @@ export async function scrapWithFireEngine({
const waitParam = reqParams["params"]?.wait ?? waitFor; const waitParam = reqParams["params"]?.wait ?? waitFor;
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
@@ -61,7 +65,7 @@ export async function scrapWithFireEngine({
let engine = engineParam; // do we want fireEngineOptions as first choice? let engine = engineParam; // do we want fireEngineOptions as first choice?
Logger.info( Logger.info(
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }` `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
); );
@@ -71,6 +75,7 @@ export async function scrapWithFireEngine({
url: url, url: url,
wait: waitParam, wait: waitParam,
screenshot: screenshotParam, screenshot: screenshotParam,
fullPageScreenshot: fullPageScreenshotParam,
headers: headers, headers: headers,
pageOptions: pageOptions, pageOptions: pageOptions,
...fireEngineOptionsParam, ...fireEngineOptionsParam,
@@ -128,6 +128,7 @@ export async function scrapSingleUrl(
includeRawHtml: false, includeRawHtml: false,
waitFor: 0, waitFor: 0,
screenshot: false, screenshot: false,
fullPageScreenshot: false,
headers: undefined, headers: undefined,
}, },
extractorOptions: ExtractorOptions = { extractorOptions: ExtractorOptions = {
@@ -171,6 +172,7 @@ export async function scrapSingleUrl(
url, url,
waitFor: pageOptions.waitFor, waitFor: pageOptions.waitFor,
screenshot: pageOptions.screenshot, screenshot: pageOptions.screenshot,
fullPageScreenshot: pageOptions.fullPageScreenshot,
pageOptions: pageOptions, pageOptions: pageOptions,
headers: pageOptions.headers, headers: pageOptions.headers,
fireEngineOptions: { fireEngineOptions: {
@@ -306,7 +308,7 @@ export async function scrapSingleUrl(
const scrapersInOrder = getScrapingFallbackOrder( const scrapersInOrder = getScrapingFallbackOrder(
defaultScraper, defaultScraper,
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
pageOptions && pageOptions.screenshot && pageOptions.screenshot === true, pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
pageOptions && pageOptions.headers && pageOptions.headers !== undefined pageOptions && pageOptions.headers && pageOptions.headers !== undefined
); );