Nick: ignoreSitemap, better crawling algo

This commit is contained in:
Nicolas
2024-06-10 18:12:41 -07:00
parent 1bd0327e1a
commit f6b06ac27a
4 changed files with 57 additions and 41 deletions
+6
View File
@@ -31,6 +31,7 @@ export class WebScraperDataProvider {
private limit: number = 10000;
private concurrentRequests: number = 20;
private generateImgAltText: boolean = false;
// Crawl option, off by default. It is overwritten from
// `options.crawlerOptions?.ignoreSitemap` when options are applied and is
// forwarded to `crawler.start(...)` as `{ ignoreSitemap }`.
// NOTE(review): presumably makes the crawler skip sitemap-based link
// discovery — confirm against the crawler implementation.
private ignoreSitemap: boolean = false;
private pageOptions?: PageOptions;
private extractorOptions?: ExtractorOptions;
private replaceAllPathsWithAbsolutePaths?: boolean = false;
@@ -38,6 +39,7 @@ export class WebScraperDataProvider {
"gpt-4-turbo";
private crawlerMode: string = "default";
/**
 * Not implemented for this provider: calling it always throws.
 *
 * @throws Error with the message "Method not implemented." on every call.
 */
authorize(): void {
throw new Error("Method not implemented.");
}
@@ -174,6 +176,9 @@ export class WebScraperDataProvider {
let links = await crawler.start(
inProgress,
this.pageOptions,
{
ignoreSitemap: this.ignoreSitemap,
},
5,
this.limit,
this.maxCrawledDepth
@@ -474,6 +479,7 @@ export class WebScraperDataProvider {
//! @nicolas: for some reason an empty string was being injected into `excludes` and breaking everything. No time yet to track down the source of the issue, so empty entries are filtered out here as a stopgap.
this.excludes = this.excludes.filter((item) => item !== "");
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
// make sure all urls start with https://
this.urls = this.urls.map((url) => {