Merge branch 'main' into feat/maxDepthRelative

This commit is contained in:
Eric Ciarla
2024-06-14 11:49:12 -04:00
committed by GitHub
22 changed files with 824 additions and 128 deletions
+14 -8
View File
@@ -245,7 +245,7 @@ export class WebScraperDataProvider {
content: "",
html: this.pageOptions?.includeHtml ? "" : undefined,
markdown: "",
metadata: { sourceURL: url },
metadata: { sourceURL: url, pageStatusCode: 200 },
}));
}
@@ -284,10 +284,10 @@ export class WebScraperDataProvider {
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
return Promise.all(
pdfLinks.map(async (pdfLink) => {
const pdfContent = await fetchAndProcessPdf(pdfLink);
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
return {
content: pdfContent,
metadata: { sourceURL: pdfLink },
content: content,
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
provider: "web-scraper",
};
})
@@ -296,10 +296,10 @@ export class WebScraperDataProvider {
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
return Promise.all(
docxLinks.map(async (p) => {
const docXDocument = await fetchAndProcessDocx(p);
const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
return {
content: docXDocument,
metadata: { sourceURL: p },
content,
metadata: { sourceURL: p, pageStatusCode, pageError },
provider: "web-scraper",
};
})
@@ -479,7 +479,13 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
this.pageOptions = options.pageOptions ?? {
onlyMainContent: false,
includeHtml: false,
replaceAllPathsWithAbsolutePaths: false,
parsePDF: true,
removeTags: []
};
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check