init
This commit is contained in:
Generated
+5812
-4636
File diff suppressed because it is too large
Load Diff
@@ -58,19 +58,26 @@ export async function scrapeHelper(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// make sure doc.content is not empty
|
// make sure doc.content is not empty
|
||||||
const filteredDocs = docs.filter(
|
let filteredDocs = docs.filter(
|
||||||
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
|
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
|
||||||
);
|
);
|
||||||
if (filteredDocs.length === 0) {
|
if (filteredDocs.length === 0) {
|
||||||
return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
|
return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Remove rawHtml unless pageOptions.rawHtml is enabled (it may have been fetched only for raw-HTML LLM extraction)
|
||||||
|
if (!pageOptions.rawHtml) {
|
||||||
|
filteredDocs.forEach(doc => {
|
||||||
|
delete doc.rawHtml;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
let creditsToBeBilled = filteredDocs.length;
|
let creditsToBeBilled = filteredDocs.length;
|
||||||
const creditsPerLLMExtract = 50;
|
const creditsPerLLMExtract = 50;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if (extractorOptions.mode === "llm-extraction") {
|
if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") {
|
||||||
creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,8 @@ import { Document, ExtractorOptions } from "../entities";
|
|||||||
// Generate completion using OpenAI
|
// Generate completion using OpenAI
|
||||||
export async function generateCompletions(
|
export async function generateCompletions(
|
||||||
documents: Document[],
|
documents: Document[],
|
||||||
extractionOptions: ExtractorOptions
|
extractionOptions: ExtractorOptions,
|
||||||
|
mode: "markdown" | "raw-html"
|
||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
// const schema = zodToJsonSchema(options.schema)
|
// const schema = zodToJsonSchema(options.schema)
|
||||||
|
|
||||||
@@ -28,6 +29,7 @@ export async function generateCompletions(
|
|||||||
document: document,
|
document: document,
|
||||||
schema: schema,
|
schema: schema,
|
||||||
prompt: prompt,
|
prompt: prompt,
|
||||||
|
mode: mode,
|
||||||
});
|
});
|
||||||
// Validate the JSON output against the schema using AJV
|
// Validate the JSON output against the schema using AJV
|
||||||
const validate = ajv.compile(schema);
|
const validate = ajv.compile(schema);
|
||||||
|
|||||||
@@ -13,26 +13,37 @@ const defaultPrompt =
|
|||||||
"You are a professional web scraper. Extract the contents of the webpage";
|
"You are a professional web scraper. Extract the contents of the webpage";
|
||||||
|
|
||||||
function prepareOpenAIDoc(
|
function prepareOpenAIDoc(
|
||||||
document: Document
|
document: Document,
|
||||||
|
mode: "markdown" | "raw-html"
|
||||||
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
|
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
|
||||||
|
|
||||||
let markdown = document.markdown;
|
let markdown = document.markdown;
|
||||||
|
|
||||||
// Check if the markdown content exists in the document
|
let extractionTarget = document.markdown;
|
||||||
if (!markdown) {
|
|
||||||
|
if (mode === "raw-html") {
|
||||||
|
extractionTarget = document.rawHtml;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that the extraction target (markdown, or raw HTML in "raw-html" mode) exists in the document
|
||||||
|
if (!extractionTarget) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
|
`${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// count number of tokens
|
// count number of tokens
|
||||||
const numTokens = numTokensFromString(document.markdown, "gpt-4");
|
const numTokens = numTokensFromString(extractionTarget, "gpt-4");
|
||||||
|
|
||||||
if (numTokens > maxTokens) {
|
if (numTokens > maxTokens) {
|
||||||
// trim the document to the maximum number of tokens, tokens != characters
|
// trim the document to the maximum number of tokens, tokens != characters
|
||||||
markdown = markdown.slice(0, (maxTokens * modifier));
|
extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
|
||||||
}
|
}
|
||||||
|
|
||||||
return [[{ type: "text", text: markdown }], numTokens];
|
return [[{ type: "text", text: extractionTarget }], numTokens];
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function generateOpenAICompletions({
|
export async function generateOpenAICompletions({
|
||||||
@@ -42,6 +53,7 @@ export async function generateOpenAICompletions({
|
|||||||
schema, //TODO - add zod dynamic type checking
|
schema, //TODO - add zod dynamic type checking
|
||||||
prompt = defaultPrompt,
|
prompt = defaultPrompt,
|
||||||
temperature,
|
temperature,
|
||||||
|
mode
|
||||||
}: {
|
}: {
|
||||||
client: OpenAI;
|
client: OpenAI;
|
||||||
model?: string;
|
model?: string;
|
||||||
@@ -49,9 +61,10 @@ export async function generateOpenAICompletions({
|
|||||||
schema: any; // This should be replaced with a proper Zod schema type when available
|
schema: any; // This should be replaced with a proper Zod schema type when available
|
||||||
prompt?: string;
|
prompt?: string;
|
||||||
temperature?: number;
|
temperature?: number;
|
||||||
|
mode: "markdown" | "raw-html";
|
||||||
}): Promise<Document> {
|
}): Promise<Document> {
|
||||||
const openai = client as OpenAI;
|
const openai = client as OpenAI;
|
||||||
const [content, numTokens] = prepareOpenAIDoc(document);
|
const [content, numTokens] = prepareOpenAIDoc(document, mode);
|
||||||
|
|
||||||
const completion = await openai.chat.completions.create({
|
const completion = await openai.chat.completions.create({
|
||||||
model,
|
model,
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ export interface Progress {
|
|||||||
export type PageOptions = {
|
export type PageOptions = {
|
||||||
onlyMainContent?: boolean;
|
onlyMainContent?: boolean;
|
||||||
includeHtml?: boolean;
|
includeHtml?: boolean;
|
||||||
|
rawHtml?: boolean;
|
||||||
fallback?: boolean;
|
fallback?: boolean;
|
||||||
fetchPageContent?: boolean;
|
fetchPageContent?: boolean;
|
||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
@@ -25,7 +26,7 @@ export type PageOptions = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
mode: "markdown" | "llm-extraction";
|
mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
|
||||||
extractionPrompt?: string;
|
extractionPrompt?: string;
|
||||||
extractionSchema?: Record<string, any>;
|
extractionSchema?: Record<string, any>;
|
||||||
}
|
}
|
||||||
@@ -73,6 +74,7 @@ export class Document {
|
|||||||
content: string;
|
content: string;
|
||||||
markdown?: string;
|
markdown?: string;
|
||||||
html?: string;
|
html?: string;
|
||||||
|
rawHtml?: string;
|
||||||
llm_extraction?: Record<string, any>;
|
llm_extraction?: Record<string, any>;
|
||||||
createdAt?: Date;
|
createdAt?: Date;
|
||||||
updatedAt?: Date;
|
updatedAt?: Date;
|
||||||
|
|||||||
@@ -66,6 +66,7 @@ export class WebScraperDataProvider {
|
|||||||
const result = await scrapSingleUrl(
|
const result = await scrapSingleUrl(
|
||||||
url,
|
url,
|
||||||
this.pageOptions,
|
this.pageOptions,
|
||||||
|
this.extractorOptions,
|
||||||
existingHTML
|
existingHTML
|
||||||
);
|
);
|
||||||
processedUrls++;
|
processedUrls++;
|
||||||
@@ -269,10 +270,16 @@ export class WebScraperDataProvider {
|
|||||||
// documents = await this.applyImgAltText(documents);
|
// documents = await this.applyImgAltText(documents);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
this.extractorOptions.mode === "llm-extraction" &&
|
(this.extractorOptions.mode === "llm-extraction" || this.extractorOptions.mode === "llm-extraction-from-markdown") &&
|
||||||
this.mode === "single_urls"
|
this.mode === "single_urls"
|
||||||
) {
|
) {
|
||||||
documents = await generateCompletions(documents, this.extractorOptions);
|
documents = await generateCompletions(documents, this.extractorOptions, "markdown");
|
||||||
|
}
|
||||||
|
if (
|
||||||
|
(this.extractorOptions.mode === "llm-extraction-from-raw-html") &&
|
||||||
|
this.mode === "single_urls"
|
||||||
|
) {
|
||||||
|
documents = await generateCompletions(documents, this.extractorOptions, "raw-html");
|
||||||
}
|
}
|
||||||
return documents.concat(pdfDocuments).concat(docxDocuments);
|
return documents.concat(pdfDocuments).concat(docxDocuments);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
|
|||||||
import { ScrapingBeeClient } from "scrapingbee";
|
import { ScrapingBeeClient } from "scrapingbee";
|
||||||
import { extractMetadata } from "./utils/metadata";
|
import { extractMetadata } from "./utils/metadata";
|
||||||
import dotenv from "dotenv";
|
import dotenv from "dotenv";
|
||||||
import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
|
import { Document, PageOptions, FireEngineResponse, ExtractorOptions } from "../../lib/entities";
|
||||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||||
@@ -303,10 +303,14 @@ export async function scrapSingleUrl(
|
|||||||
pageOptions: PageOptions = {
|
pageOptions: PageOptions = {
|
||||||
onlyMainContent: true,
|
onlyMainContent: true,
|
||||||
includeHtml: false,
|
includeHtml: false,
|
||||||
|
rawHtml: false,
|
||||||
waitFor: 0,
|
waitFor: 0,
|
||||||
screenshot: false,
|
screenshot: false,
|
||||||
headers: undefined
|
headers: undefined
|
||||||
},
|
},
|
||||||
|
extractorOptions: ExtractorOptions = {
|
||||||
|
mode: "llm-extraction-from-markdown"
|
||||||
|
},
|
||||||
existingHtml: string = ""
|
existingHtml: string = ""
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
urlToScrap = urlToScrap.trim();
|
urlToScrap = urlToScrap.trim();
|
||||||
@@ -465,6 +469,7 @@ export async function scrapSingleUrl(
|
|||||||
content: text,
|
content: text,
|
||||||
markdown: text,
|
markdown: text,
|
||||||
html: pageOptions.includeHtml ? html : undefined,
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
|
rawHtml: pageOptions.rawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
|
||||||
metadata: {
|
metadata: {
|
||||||
...metadata,
|
...metadata,
|
||||||
screenshot: screenshot,
|
screenshot: screenshot,
|
||||||
@@ -478,6 +483,7 @@ export async function scrapSingleUrl(
|
|||||||
content: text,
|
content: text,
|
||||||
markdown: text,
|
markdown: text,
|
||||||
html: pageOptions.includeHtml ? html : undefined,
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
|
rawHtml: pageOptions.rawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
|
||||||
metadata: {
|
metadata: {
|
||||||
...metadata,
|
...metadata,
|
||||||
sourceURL: urlToScrap,
|
sourceURL: urlToScrap,
|
||||||
|
|||||||
Reference in New Issue
Block a user