Merge branch 'main' into patch-1
This commit is contained in:
@@ -1,24 +1,40 @@
|
||||
import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
|
||||
import {
|
||||
Document,
|
||||
ExtractorOptions,
|
||||
PageOptions,
|
||||
WebScraperOptions,
|
||||
} from "../../lib/entities";
|
||||
import { Progress } from "../../lib/entities";
|
||||
import { scrapSingleUrl } from "./single_url";
|
||||
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
||||
import { WebCrawler } from "./crawler";
|
||||
import { getValue, setValue } from "../../services/redis";
|
||||
import { getImageDescription } from "./utils/gptVision";
|
||||
import { getImageDescription } from "./utils/imageDescription";
|
||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||
|
||||
import {
|
||||
replaceImgPathsWithAbsolutePaths,
|
||||
replacePathsWithAbsolutePaths,
|
||||
} from "./utils/replacePaths";
|
||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
||||
|
||||
export class WebScraperDataProvider {
|
||||
private bullJobId: string;
|
||||
private urls: string[] = [""];
|
||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
||||
private includes: string[];
|
||||
private excludes: string[];
|
||||
private maxCrawledLinks: number;
|
||||
private maxCrawledDepth: number = 10;
|
||||
private returnOnlyUrls: boolean;
|
||||
private limit: number = 10000;
|
||||
private concurrentRequests: number = 20;
|
||||
private generateImgAltText: boolean = false;
|
||||
private pageOptions?: PageOptions;
|
||||
private extractorOptions?: ExtractorOptions;
|
||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
||||
"gpt-4-turbo";
|
||||
|
||||
authorize(): void {
|
||||
throw new Error("Method not implemented.");
|
||||
@@ -34,14 +50,13 @@ export class WebScraperDataProvider {
|
||||
): Promise<Document[]> {
|
||||
const totalUrls = urls.length;
|
||||
let processedUrls = 0;
|
||||
console.log("Converting urls to documents");
|
||||
console.log("Total urls", urls);
|
||||
|
||||
const results: (Document | null)[] = new Array(urls.length).fill(null);
|
||||
for (let i = 0; i < urls.length; i += this.concurrentRequests) {
|
||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
||||
await Promise.all(
|
||||
batchUrls.map(async (url, index) => {
|
||||
const result = await scrapSingleUrl(url, true, this.pageOptions);
|
||||
const result = await scrapSingleUrl(url, this.pageOptions);
|
||||
processedUrls++;
|
||||
if (inProgress) {
|
||||
inProgress({
|
||||
@@ -49,11 +64,26 @@ export class WebScraperDataProvider {
|
||||
total: totalUrls,
|
||||
status: "SCRAPING",
|
||||
currentDocumentUrl: url,
|
||||
currentDocument: result,
|
||||
});
|
||||
}
|
||||
|
||||
results[i + index] = result;
|
||||
})
|
||||
);
|
||||
try {
|
||||
if (this.mode === "crawl" && this.bullJobId) {
|
||||
const job = await getWebScraperQueue().getJob(this.bullJobId);
|
||||
const jobStatus = await job.getState();
|
||||
if (jobStatus === "failed") {
|
||||
throw new Error(
|
||||
"Job has failed or has been cancelled by the user. Stopping the job..."
|
||||
);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
}
|
||||
}
|
||||
return results.filter((result) => result !== null) as Document[];
|
||||
}
|
||||
@@ -62,156 +92,158 @@ export class WebScraperDataProvider {
|
||||
useCaching: boolean = false,
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
this.validateInitialUrl();
|
||||
|
||||
if (!useCaching) {
|
||||
return this.processDocumentsWithoutCache(inProgress);
|
||||
}
|
||||
|
||||
return this.processDocumentsWithCache(inProgress);
|
||||
}
|
||||
|
||||
private validateInitialUrl(): void {
|
||||
if (this.urls[0].trim() === "") {
|
||||
throw new Error("Url is required");
|
||||
}
|
||||
}
|
||||
|
||||
if (!useCaching) {
|
||||
if (this.mode === "crawl") {
|
||||
const crawler = new WebCrawler({
|
||||
initialUrl: this.urls[0],
|
||||
includes: this.includes,
|
||||
excludes: this.excludes,
|
||||
maxCrawledLinks: this.maxCrawledLinks,
|
||||
limit: this.limit,
|
||||
generateImgAltText: this.generateImgAltText,
|
||||
});
|
||||
let links = await crawler.start(inProgress, 5, this.limit);
|
||||
if (this.returnOnlyUrls) {
|
||||
return links.map((url) => ({
|
||||
content: "",
|
||||
metadata: { sourceURL: url },
|
||||
provider: "web",
|
||||
type: "text",
|
||||
}));
|
||||
}
|
||||
/**
|
||||
* Process documents without cache handling each mode
|
||||
* @param inProgress inProgress
|
||||
* @returns documents
|
||||
*/
|
||||
private async processDocumentsWithoutCache(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
switch (this.mode) {
|
||||
case "crawl":
|
||||
return this.handleCrawlMode(inProgress);
|
||||
case "single_urls":
|
||||
return this.handleSingleUrlsMode(inProgress);
|
||||
case "sitemap":
|
||||
return this.handleSitemapMode(inProgress);
|
||||
default:
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
||||
let pdfDocuments: Document[] = [];
|
||||
for (let pdfLink of pdfLinks) {
|
||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||
pdfDocuments.push({
|
||||
content: pdfContent,
|
||||
metadata: { sourceURL: pdfLink },
|
||||
provider: "web-scraper"
|
||||
});
|
||||
}
|
||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
||||
if (this.generateImgAltText) {
|
||||
documents = await this.generatesImgAltText(documents);
|
||||
}
|
||||
documents = documents.concat(pdfDocuments);
|
||||
|
||||
// CACHING DOCUMENTS
|
||||
// - parent document
|
||||
const cachedParentDocumentString = await getValue(
|
||||
"web-scraper-cache:" + this.normalizeUrl(this.urls[0])
|
||||
);
|
||||
if (cachedParentDocumentString != null) {
|
||||
let cachedParentDocument = JSON.parse(cachedParentDocumentString);
|
||||
if (
|
||||
!cachedParentDocument.childrenLinks ||
|
||||
cachedParentDocument.childrenLinks.length < links.length - 1
|
||||
) {
|
||||
cachedParentDocument.childrenLinks = links.filter(
|
||||
(link) => link !== this.urls[0]
|
||||
);
|
||||
await setValue(
|
||||
"web-scraper-cache:" + this.normalizeUrl(this.urls[0]),
|
||||
JSON.stringify(cachedParentDocument),
|
||||
60 * 60 * 24 * 10
|
||||
); // 10 days
|
||||
}
|
||||
} else {
|
||||
let parentDocument = documents.filter(
|
||||
(document) =>
|
||||
this.normalizeUrl(document.metadata.sourceURL) ===
|
||||
this.normalizeUrl(this.urls[0])
|
||||
);
|
||||
await this.setCachedDocuments(parentDocument, links);
|
||||
}
|
||||
|
||||
await this.setCachedDocuments(
|
||||
documents.filter(
|
||||
(document) =>
|
||||
this.normalizeUrl(document.metadata.sourceURL) !==
|
||||
this.normalizeUrl(this.urls[0])
|
||||
),
|
||||
[]
|
||||
);
|
||||
documents = this.removeChildLinks(documents);
|
||||
documents = documents.splice(0, this.limit);
|
||||
return documents;
|
||||
}
|
||||
|
||||
if (this.mode === "single_urls") {
|
||||
let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
|
||||
let pdfDocuments: Document[] = [];
|
||||
for (let pdfLink of pdfLinks) {
|
||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||
pdfDocuments.push({
|
||||
content: pdfContent,
|
||||
metadata: { sourceURL: pdfLink },
|
||||
provider: "web-scraper"
|
||||
});
|
||||
}
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(
|
||||
this.urls.filter((link) => !link.endsWith(".pdf")),
|
||||
inProgress
|
||||
);
|
||||
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
||||
if (this.generateImgAltText) {
|
||||
documents = await this.generatesImgAltText(documents);
|
||||
}
|
||||
const baseUrl = new URL(this.urls[0]).origin;
|
||||
documents = await this.getSitemapData(baseUrl, documents);
|
||||
documents = documents.concat(pdfDocuments);
|
||||
|
||||
await this.setCachedDocuments(documents);
|
||||
documents = this.removeChildLinks(documents);
|
||||
documents = documents.splice(0, this.limit);
|
||||
return documents;
|
||||
}
|
||||
if (this.mode === "sitemap") {
|
||||
let links = await getLinksFromSitemap(this.urls[0]);
|
||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
||||
let pdfDocuments: Document[] = [];
|
||||
for (let pdfLink of pdfLinks) {
|
||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||
pdfDocuments.push({
|
||||
content: pdfContent,
|
||||
metadata: { sourceURL: pdfLink },
|
||||
provider: "web-scraper"
|
||||
});
|
||||
}
|
||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(
|
||||
links.slice(0, this.limit),
|
||||
inProgress
|
||||
);
|
||||
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
||||
if (this.generateImgAltText) {
|
||||
documents = await this.generatesImgAltText(documents);
|
||||
}
|
||||
documents = documents.concat(pdfDocuments);
|
||||
|
||||
await this.setCachedDocuments(documents);
|
||||
documents = this.removeChildLinks(documents);
|
||||
documents = documents.splice(0, this.limit);
|
||||
return documents;
|
||||
}
|
||||
|
||||
return [];
|
||||
private async handleCrawlMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
const crawler = new WebCrawler({
|
||||
initialUrl: this.urls[0],
|
||||
includes: this.includes,
|
||||
excludes: this.excludes,
|
||||
maxCrawledLinks: this.maxCrawledLinks,
|
||||
maxCrawledDepth: this.maxCrawledDepth,
|
||||
limit: this.limit,
|
||||
generateImgAltText: this.generateImgAltText,
|
||||
});
|
||||
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
|
||||
if (this.returnOnlyUrls) {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
}
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
}
|
||||
|
||||
private async handleSingleUrlsMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let documents = await this.processLinks(this.urls, inProgress);
|
||||
return documents;
|
||||
}
|
||||
|
||||
private async handleSitemapMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let links = await getLinksFromSitemap(this.urls[0]);
|
||||
if (this.returnOnlyUrls) {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
}
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
}
|
||||
|
||||
private async returnOnlyUrlsResponse(
|
||||
links: string[],
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
inProgress?.({
|
||||
current: links.length,
|
||||
total: links.length,
|
||||
status: "COMPLETED",
|
||||
currentDocumentUrl: this.urls[0],
|
||||
});
|
||||
return links.map((url) => ({
|
||||
content: "",
|
||||
html: this.pageOptions?.includeHtml ? "" : undefined,
|
||||
markdown: "",
|
||||
metadata: { sourceURL: url },
|
||||
}));
|
||||
}
|
||||
|
||||
private async processLinks(
|
||||
links: string[],
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
||||
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
documents = this.applyPathReplacements(documents);
|
||||
// documents = await this.applyImgAltText(documents);
|
||||
|
||||
if (
|
||||
this.extractorOptions.mode === "llm-extraction" &&
|
||||
this.mode === "single_urls"
|
||||
) {
|
||||
documents = await generateCompletions(documents, this.extractorOptions);
|
||||
}
|
||||
return documents.concat(pdfDocuments);
|
||||
}
|
||||
|
||||
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
pdfLinks.map(async (pdfLink) => {
|
||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||
return {
|
||||
content: pdfContent,
|
||||
metadata: { sourceURL: pdfLink },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
private applyPathReplacements(documents: Document[]): Document[] {
|
||||
return this.replaceAllPathsWithAbsolutePaths
|
||||
? replacePathsWithAbsolutePaths(documents)
|
||||
: replaceImgPathsWithAbsolutePaths(documents);
|
||||
}
|
||||
|
||||
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
|
||||
return this.generateImgAltText
|
||||
? this.generatesImgAltText(documents)
|
||||
: documents;
|
||||
}
|
||||
|
||||
private async cacheAndFinalizeDocuments(
|
||||
documents: Document[],
|
||||
links: string[]
|
||||
): Promise<Document[]> {
|
||||
await this.setCachedDocuments(documents, links);
|
||||
documents = this.removeChildLinks(documents);
|
||||
return documents.splice(0, this.limit);
|
||||
}
|
||||
|
||||
private async processDocumentsWithCache(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let documents = await this.getCachedDocuments(
|
||||
this.urls.slice(0, this.limit)
|
||||
);
|
||||
@@ -220,22 +252,30 @@ export class WebScraperDataProvider {
|
||||
false,
|
||||
inProgress
|
||||
);
|
||||
newDocuments.forEach((doc) => {
|
||||
if (
|
||||
!documents.some(
|
||||
(d) =>
|
||||
this.normalizeUrl(d.metadata.sourceURL) ===
|
||||
this.normalizeUrl(doc.metadata?.sourceURL)
|
||||
)
|
||||
) {
|
||||
documents.push(doc);
|
||||
}
|
||||
});
|
||||
documents = this.mergeNewDocuments(documents, newDocuments);
|
||||
}
|
||||
documents = this.filterDocsExcludeInclude(documents);
|
||||
documents = this.filterDepth(documents);
|
||||
documents = this.removeChildLinks(documents);
|
||||
documents = documents.splice(0, this.limit);
|
||||
return documents;
|
||||
return documents.splice(0, this.limit);
|
||||
}
|
||||
|
||||
private mergeNewDocuments(
|
||||
existingDocuments: Document[],
|
||||
newDocuments: Document[]
|
||||
): Document[] {
|
||||
newDocuments.forEach((doc) => {
|
||||
if (
|
||||
!existingDocuments.some(
|
||||
(d) =>
|
||||
this.normalizeUrl(d.metadata.sourceURL) ===
|
||||
this.normalizeUrl(doc.metadata?.sourceURL)
|
||||
)
|
||||
) {
|
||||
existingDocuments.push(doc);
|
||||
}
|
||||
});
|
||||
return existingDocuments;
|
||||
}
|
||||
|
||||
private filterDocsExcludeInclude(documents: Document[]): Document[] {
|
||||
@@ -312,7 +352,7 @@ export class WebScraperDataProvider {
|
||||
documents.push(cachedDocument);
|
||||
|
||||
// get children documents
|
||||
for (const childUrl of cachedDocument.childrenLinks) {
|
||||
for (const childUrl of cachedDocument.childrenLinks || []) {
|
||||
const normalizedChildUrl = this.normalizeUrl(childUrl);
|
||||
const childCachedDocumentString = await getValue(
|
||||
"web-scraper-cache:" + normalizedChildUrl
|
||||
@@ -340,18 +380,21 @@ export class WebScraperDataProvider {
|
||||
throw new Error("Urls are required");
|
||||
}
|
||||
|
||||
this.bullJobId = options.bullJobId;
|
||||
this.urls = options.urls;
|
||||
this.mode = options.mode;
|
||||
this.concurrentRequests = options.concurrentRequests ?? 20;
|
||||
this.includes = options.crawlerOptions?.includes ?? [];
|
||||
this.excludes = options.crawlerOptions?.excludes ?? [];
|
||||
this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
|
||||
this.maxCrawledDepth = options.crawlerOptions?.maxDepth ?? 10;
|
||||
this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
|
||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||
this.generateImgAltText =
|
||||
options.crawlerOptions?.generateImgAltText ?? false;
|
||||
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
|
||||
|
||||
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
||||
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||
this.excludes = this.excludes.filter((item) => item !== "");
|
||||
|
||||
@@ -421,7 +464,8 @@ export class WebScraperDataProvider {
|
||||
altText = await getImageDescription(
|
||||
imageUrl,
|
||||
backText,
|
||||
frontText
|
||||
frontText,
|
||||
this.generateImgAltTextModel
|
||||
);
|
||||
}
|
||||
|
||||
@@ -437,39 +481,11 @@ export class WebScraperDataProvider {
|
||||
return documents;
|
||||
};
|
||||
|
||||
replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
|
||||
try {
|
||||
documents.forEach((document) => {
|
||||
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
||||
const images =
|
||||
document.content.match(
|
||||
/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g
|
||||
) || [];
|
||||
|
||||
images.forEach((image: string) => {
|
||||
let imageUrl = image.match(/\(([^)]+)\)/)[1];
|
||||
let altText = image.match(/\[(.*?)\]/)[1];
|
||||
|
||||
if (!imageUrl.startsWith("data:image")) {
|
||||
if (!imageUrl.startsWith("http")) {
|
||||
if (imageUrl.startsWith("/")) {
|
||||
imageUrl = imageUrl.substring(1);
|
||||
}
|
||||
imageUrl = new URL(imageUrl, baseUrl).toString();
|
||||
}
|
||||
}
|
||||
|
||||
document.content = document.content.replace(
|
||||
image,
|
||||
``
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
return documents;
|
||||
} catch (error) {
|
||||
console.error("Error replacing img paths with absolute paths", error);
|
||||
return documents;
|
||||
}
|
||||
};
|
||||
filterDepth(documents: Document[]): Document[] {
|
||||
return documents.filter((document) => {
|
||||
const url = new URL(document.metadata.sourceURL);
|
||||
const path = url.pathname;
|
||||
return path.split("/").length <= this.maxCrawledDepth;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user