Merge remote-tracking branch 'origin/main' into feat/allowbackwardcrawling-option

This commit is contained in:
rafaelsideguide
2024-06-12 11:01:05 -03:00
12 changed files with 95 additions and 28 deletions
+6 -5
View File
@@ -303,9 +303,10 @@ export class WebScraperDataProvider {
}
private applyPathReplacements(documents: Document[]): Document[] {
return this.replaceAllPathsWithAbsolutePaths
? replacePathsWithAbsolutePaths(documents)
: replaceImgPathsWithAbsolutePaths(documents);
if (this.replaceAllPathsWithAbsolutePaths) {
documents = replacePathsWithAbsolutePaths(documents);
}
return replaceImgPathsWithAbsolutePaths(documents);
}
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
@@ -474,9 +475,9 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== "");
this.crawlerMode = options.crawlerOptions?.mode ?? "default";