Merge pull request #504 from mendableai/feat/fullpage-screenshot
[Feat] Added fullpagescreenshot capabilities
This commit is contained in:
@@ -84,6 +84,11 @@
|
|||||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||||
"default": false
|
"default": false
|
||||||
},
|
},
|
||||||
|
"fullPageScreenshot": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||||
|
"default": false
|
||||||
|
},
|
||||||
"waitFor": {
|
"waitFor": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||||
@@ -317,6 +322,11 @@
|
|||||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||||
"default": false
|
"default": false
|
||||||
},
|
},
|
||||||
|
"fullPageScreenshot": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||||
|
"default": false
|
||||||
|
},
|
||||||
"waitFor": {
|
"waitFor": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ export const defaultPageOptions = {
|
|||||||
includeHtml: false,
|
includeHtml: false,
|
||||||
waitFor: 0,
|
waitFor: 0,
|
||||||
screenshot: false,
|
screenshot: false,
|
||||||
|
fullPageScreenshot: false,
|
||||||
parsePDF: true
|
parsePDF: true
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ export type PageOptions = {
|
|||||||
fetchPageContent?: boolean;
|
fetchPageContent?: boolean;
|
||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
|
fullPageScreenshot?: boolean;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||||
parsePDF?: boolean;
|
parsePDF?: boolean;
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import { Logger } from "../../../lib/logger";
|
|||||||
* @param url The URL to scrape
|
* @param url The URL to scrape
|
||||||
* @param waitFor The time to wait for the page to load
|
* @param waitFor The time to wait for the page to load
|
||||||
* @param screenshot Whether to take a screenshot
|
* @param screenshot Whether to take a screenshot
|
||||||
|
* @param fullPageScreenshot Whether to take a full page screenshot
|
||||||
* @param pageOptions The options for the page
|
* @param pageOptions The options for the page
|
||||||
* @param headers The headers to send with the request
|
* @param headers The headers to send with the request
|
||||||
* @param options The options for the request
|
* @param options The options for the request
|
||||||
@@ -20,6 +21,7 @@ export async function scrapWithFireEngine({
|
|||||||
url,
|
url,
|
||||||
waitFor = 0,
|
waitFor = 0,
|
||||||
screenshot = false,
|
screenshot = false,
|
||||||
|
fullPageScreenshot = false,
|
||||||
pageOptions = { parsePDF: true },
|
pageOptions = { parsePDF: true },
|
||||||
fireEngineOptions = {},
|
fireEngineOptions = {},
|
||||||
headers,
|
headers,
|
||||||
@@ -28,6 +30,7 @@ export async function scrapWithFireEngine({
|
|||||||
url: string;
|
url: string;
|
||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
|
fullPageScreenshot?: boolean;
|
||||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
||||||
fireEngineOptions?: FireEngineOptions;
|
fireEngineOptions?: FireEngineOptions;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
@@ -49,6 +52,7 @@ export async function scrapWithFireEngine({
|
|||||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||||
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
||||||
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||||
|
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||||
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||||
|
|
||||||
|
|
||||||
@@ -61,7 +65,7 @@ export async function scrapWithFireEngine({
|
|||||||
let engine = engineParam; // do we want fireEngineOptions as first choice?
|
let engine = engineParam; // do we want fireEngineOptions as first choice?
|
||||||
|
|
||||||
Logger.info(
|
Logger.info(
|
||||||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
// Log the resolved fullPageScreenshotParam (request params take precedence over the
// function argument) so the log reflects the value actually sent to Fire-Engine.
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
@@ -71,6 +75,7 @@ export async function scrapWithFireEngine({
|
|||||||
url: url,
|
url: url,
|
||||||
wait: waitParam,
|
wait: waitParam,
|
||||||
screenshot: screenshotParam,
|
screenshot: screenshotParam,
|
||||||
|
fullPageScreenshot: fullPageScreenshotParam,
|
||||||
headers: headers,
|
headers: headers,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
...fireEngineOptionsParam,
|
...fireEngineOptionsParam,
|
||||||
|
|||||||
@@ -128,6 +128,7 @@ export async function scrapSingleUrl(
|
|||||||
includeRawHtml: false,
|
includeRawHtml: false,
|
||||||
waitFor: 0,
|
waitFor: 0,
|
||||||
screenshot: false,
|
screenshot: false,
|
||||||
|
fullPageScreenshot: false,
|
||||||
headers: undefined,
|
headers: undefined,
|
||||||
},
|
},
|
||||||
extractorOptions: ExtractorOptions = {
|
extractorOptions: ExtractorOptions = {
|
||||||
@@ -171,6 +172,7 @@ export async function scrapSingleUrl(
|
|||||||
url,
|
url,
|
||||||
waitFor: pageOptions.waitFor,
|
waitFor: pageOptions.waitFor,
|
||||||
screenshot: pageOptions.screenshot,
|
screenshot: pageOptions.screenshot,
|
||||||
|
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
headers: pageOptions.headers,
|
headers: pageOptions.headers,
|
||||||
fireEngineOptions: {
|
fireEngineOptions: {
|
||||||
@@ -306,7 +308,7 @@ export async function scrapSingleUrl(
|
|||||||
const scrapersInOrder = getScrapingFallbackOrder(
|
const scrapersInOrder = getScrapingFallbackOrder(
|
||||||
defaultScraper,
|
defaultScraper,
|
||||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||||
pageOptions && pageOptions.screenshot && pageOptions.screenshot === true,
|
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
||||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
|
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user