Added metadata.pageStatusCode and metadata.pageError properties to the responses

This commit is contained in:
rafaelsideguide
2024-06-13 17:08:40 -03:00
parent d48c0df6c5
commit bb859ae9a7
13 changed files with 494 additions and 75 deletions
+7 -7
View File
@@ -241,7 +241,7 @@ export class WebScraperDataProvider {
content: "",
html: this.pageOptions?.includeHtml ? "" : undefined,
markdown: "",
metadata: { sourceURL: url },
metadata: { sourceURL: url, pageStatusCode: 200 },
}));
}
@@ -280,10 +280,10 @@ export class WebScraperDataProvider {
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
return Promise.all(
pdfLinks.map(async (pdfLink) => {
const pdfContent = await fetchAndProcessPdf(pdfLink);
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink);
return {
content: pdfContent,
metadata: { sourceURL: pdfLink },
content: content,
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
provider: "web-scraper",
};
})
@@ -292,10 +292,10 @@ export class WebScraperDataProvider {
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
return Promise.all(
docxLinks.map(async (p) => {
const docXDocument = await fetchAndProcessDocx(p);
const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
return {
content: docXDocument,
metadata: { sourceURL: p },
content,
metadata: { sourceURL: p, pageStatusCode, pageError },
provider: "web-scraper",
};
})