Merge pull request #808 from mendableai/feat/skipTlsVerification
feat: skipTlsVerification
This commit is contained in:
@@ -78,7 +78,7 @@ export async function crawlController(
|
|||||||
const crawler = crawlToCrawler(id, sc);
|
const crawler = crawlToCrawler(id, sc);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
sc.robots = await crawler.getRobotsTxt();
|
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
Logger.debug(
|
Logger.debug(
|
||||||
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
||||||
|
|||||||
@@ -117,6 +117,7 @@ export const scrapeOptions = z.object({
|
|||||||
}
|
}
|
||||||
).transform(val => val ? val.toUpperCase() : 'US')
|
).transform(val => val ? val.toUpperCase() : 'US')
|
||||||
}).optional(),
|
}).optional(),
|
||||||
|
skipTlsVerification: z.boolean().default(false),
|
||||||
}).strict(strictMessage)
|
}).strict(strictMessage)
|
||||||
|
|
||||||
|
|
||||||
@@ -433,6 +434,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
|||||||
parsePDF: x.parsePDF,
|
parsePDF: x.parsePDF,
|
||||||
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
||||||
geolocation: x.geolocation,
|
geolocation: x.geolocation,
|
||||||
|
skipTlsVerification: x.skipTlsVerification
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -54,6 +54,7 @@ export type PageOptions = {
|
|||||||
geolocation?: {
|
geolocation?: {
|
||||||
country?: string;
|
country?: string;
|
||||||
};
|
};
|
||||||
|
skipTlsVerification?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
|
|||||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||||
import { Logger } from "../../../src/lib/logger";
|
import { Logger } from "../../../src/lib/logger";
|
||||||
|
import https from "https";
|
||||||
export class WebCrawler {
|
export class WebCrawler {
|
||||||
private jobId: string;
|
private jobId: string;
|
||||||
private initialUrl: string;
|
private initialUrl: string;
|
||||||
@@ -145,8 +145,14 @@ export class WebCrawler {
|
|||||||
.slice(0, limit);
|
.slice(0, limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
public async getRobotsTxt(): Promise<string> {
|
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
|
||||||
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
|
let extraArgs = {};
|
||||||
|
if(skipTlsVerification) {
|
||||||
|
extraArgs["httpsAgent"] = new https.Agent({
|
||||||
|
rejectUnauthorized: false
|
||||||
|
});
|
||||||
|
}
|
||||||
|
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs });
|
||||||
return response.data;
|
return response.data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -594,6 +594,7 @@ export class WebScraperDataProvider {
|
|||||||
atsv: options.pageOptions?.atsv ?? false,
|
atsv: options.pageOptions?.atsv ?? false,
|
||||||
actions: options.pageOptions?.actions ?? undefined,
|
actions: options.pageOptions?.actions ?? undefined,
|
||||||
geolocation: options.pageOptions?.geolocation ?? undefined,
|
geolocation: options.pageOptions?.geolocation ?? undefined,
|
||||||
|
skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
|
||||||
};
|
};
|
||||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
||||||
this.replaceAllPathsWithAbsolutePaths =
|
this.replaceAllPathsWithAbsolutePaths =
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
|
|||||||
waitFor = 0,
|
waitFor = 0,
|
||||||
screenshot = false,
|
screenshot = false,
|
||||||
fullPageScreenshot = false,
|
fullPageScreenshot = false,
|
||||||
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" } },
|
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
|
||||||
fireEngineOptions = {},
|
fireEngineOptions = {},
|
||||||
headers,
|
headers,
|
||||||
options,
|
options,
|
||||||
@@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
|
|||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
fullPageScreenshot?: boolean;
|
fullPageScreenshot?: boolean;
|
||||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string } };
|
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
|
||||||
fireEngineOptions?: FireEngineOptions;
|
fireEngineOptions?: FireEngineOptions;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
options?: any;
|
options?: any;
|
||||||
@@ -119,6 +119,7 @@ export async function scrapWithFireEngine({
|
|||||||
atsv: pageOptions?.atsv ?? false,
|
atsv: pageOptions?.atsv ?? false,
|
||||||
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
||||||
geolocation: pageOptions?.geolocation,
|
geolocation: pageOptions?.geolocation,
|
||||||
|
skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
|
||||||
actions: actions,
|
actions: actions,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -157,6 +157,7 @@ export async function scrapSingleUrl(
|
|||||||
atsv: pageOptions.atsv ?? false,
|
atsv: pageOptions.atsv ?? false,
|
||||||
actions: pageOptions.actions ?? undefined,
|
actions: pageOptions.actions ?? undefined,
|
||||||
geolocation: pageOptions.geolocation ?? undefined,
|
geolocation: pageOptions.geolocation ?? undefined,
|
||||||
|
skipTlsVerification: pageOptions.skipTlsVerification ?? false,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (extractorOptions) {
|
if (extractorOptions) {
|
||||||
|
|||||||
Reference in New Issue
Block a user