This commit is contained in:
Eric Ciarla
2024-06-28 16:39:09 -04:00
parent dbfae2d9bf
commit 70fcf2ce03
7 changed files with 5864 additions and 4651 deletions
+5812 -4636
View File
File diff suppressed because it is too large Load Diff
+9 -2
View File
@@ -58,19 +58,26 @@ export async function scrapeHelper(
} }
// make sure doc.content is not empty // make sure doc.content is not empty
const filteredDocs = docs.filter( let filteredDocs = docs.filter(
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0 (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
); );
if (filteredDocs.length === 0) { if (filteredDocs.length === 0) {
return { success: true, error: "No page found", returnCode: 200, data: docs[0] }; return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
} }
// Remove rawHtml if pageOptions.rawHtml is false
if (!pageOptions.rawHtml) {
filteredDocs.forEach(doc => {
delete doc.rawHtml;
});
}
let creditsToBeBilled = filteredDocs.length; let creditsToBeBilled = filteredDocs.length;
const creditsPerLLMExtract = 50; const creditsPerLLMExtract = 50;
if (extractorOptions.mode === "llm-extraction") { if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") {
creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length); creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
} }
+3 -1
View File
@@ -8,7 +8,8 @@ import { Document, ExtractorOptions } from "../entities";
// Generate completion using OpenAI // Generate completion using OpenAI
export async function generateCompletions( export async function generateCompletions(
documents: Document[], documents: Document[],
extractionOptions: ExtractorOptions extractionOptions: ExtractorOptions,
mode: "markdown" | "raw-html"
): Promise<Document[]> { ): Promise<Document[]> {
// const schema = zodToJsonSchema(options.schema) // const schema = zodToJsonSchema(options.schema)
@@ -28,6 +29,7 @@ export async function generateCompletions(
document: document, document: document,
schema: schema, schema: schema,
prompt: prompt, prompt: prompt,
mode: mode,
}); });
// Validate the JSON output against the schema using AJV // Validate the JSON output against the schema using AJV
const validate = ajv.compile(schema); const validate = ajv.compile(schema);
+21 -8
View File
@@ -13,26 +13,37 @@ const defaultPrompt =
"You are a professional web scraper. Extract the contents of the webpage"; "You are a professional web scraper. Extract the contents of the webpage";
function prepareOpenAIDoc( function prepareOpenAIDoc(
document: Document document: Document,
mode: "markdown" | "raw-html"
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] { ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
let markdown = document.markdown; let markdown = document.markdown;
// Check if the markdown content exists in the document let extractionTarget = document.markdown;
if (!markdown) {
if (mode === "raw-html") {
extractionTarget = document.rawHtml;
}
// Check if the markdown content exists in the document
if (!extractionTarget) {
throw new Error( throw new Error(
"Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai" `${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
); );
} }
// count number of tokens // count number of tokens
const numTokens = numTokensFromString(document.markdown, "gpt-4"); const numTokens = numTokensFromString(extractionTarget, "gpt-4");
if (numTokens > maxTokens) { if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters // trim the document to the maximum number of tokens, tokens != characters
markdown = markdown.slice(0, (maxTokens * modifier)); extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
} }
return [[{ type: "text", text: markdown }], numTokens]; return [[{ type: "text", text: extractionTarget }], numTokens];
} }
export async function generateOpenAICompletions({ export async function generateOpenAICompletions({
@@ -42,6 +53,7 @@ export async function generateOpenAICompletions({
schema, //TODO - add zod dynamic type checking schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt, prompt = defaultPrompt,
temperature, temperature,
mode
}: { }: {
client: OpenAI; client: OpenAI;
model?: string; model?: string;
@@ -49,9 +61,10 @@ export async function generateOpenAICompletions({
schema: any; // This should be replaced with a proper Zod schema type when available schema: any; // This should be replaced with a proper Zod schema type when available
prompt?: string; prompt?: string;
temperature?: number; temperature?: number;
mode: "markdown" | "raw-html";
}): Promise<Document> { }): Promise<Document> {
const openai = client as OpenAI; const openai = client as OpenAI;
const [content, numTokens] = prepareOpenAIDoc(document); const [content, numTokens] = prepareOpenAIDoc(document, mode);
const completion = await openai.chat.completions.create({ const completion = await openai.chat.completions.create({
model, model,
+3 -1
View File
@@ -13,6 +13,7 @@ export interface Progress {
export type PageOptions = { export type PageOptions = {
onlyMainContent?: boolean; onlyMainContent?: boolean;
includeHtml?: boolean; includeHtml?: boolean;
rawHtml?: boolean;
fallback?: boolean; fallback?: boolean;
fetchPageContent?: boolean; fetchPageContent?: boolean;
waitFor?: number; waitFor?: number;
@@ -25,7 +26,7 @@ export type PageOptions = {
}; };
export type ExtractorOptions = { export type ExtractorOptions = {
mode: "markdown" | "llm-extraction"; mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
extractionPrompt?: string; extractionPrompt?: string;
extractionSchema?: Record<string, any>; extractionSchema?: Record<string, any>;
} }
@@ -73,6 +74,7 @@ export class Document {
content: string; content: string;
markdown?: string; markdown?: string;
html?: string; html?: string;
rawHtml?: string;
llm_extraction?: Record<string, any>; llm_extraction?: Record<string, any>;
createdAt?: Date; createdAt?: Date;
updatedAt?: Date; updatedAt?: Date;
+9 -2
View File
@@ -66,6 +66,7 @@ export class WebScraperDataProvider {
const result = await scrapSingleUrl( const result = await scrapSingleUrl(
url, url,
this.pageOptions, this.pageOptions,
this.extractorOptions,
existingHTML existingHTML
); );
processedUrls++; processedUrls++;
@@ -269,10 +270,16 @@ export class WebScraperDataProvider {
// documents = await this.applyImgAltText(documents); // documents = await this.applyImgAltText(documents);
if ( if (
this.extractorOptions.mode === "llm-extraction" && (this.extractorOptions.mode === "llm-extraction" || this.extractorOptions.mode === "llm-extraction-from-markdown") &&
this.mode === "single_urls" this.mode === "single_urls"
) { ) {
documents = await generateCompletions(documents, this.extractorOptions); documents = await generateCompletions(documents, this.extractorOptions, "markdown");
}
if (
(this.extractorOptions.mode === "llm-extraction-from-raw-html") &&
this.mode === "single_urls"
) {
documents = await generateCompletions(documents, this.extractorOptions, "raw-html");
} }
return documents.concat(pdfDocuments).concat(docxDocuments); return documents.concat(pdfDocuments).concat(docxDocuments);
} }
@@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
import { ScrapingBeeClient } from "scrapingbee"; import { ScrapingBeeClient } from "scrapingbee";
import { extractMetadata } from "./utils/metadata"; import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv"; import dotenv from "dotenv";
import { Document, PageOptions, FireEngineResponse } from "../../lib/entities"; import { Document, PageOptions, FireEngineResponse, ExtractorOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown"; import { parseMarkdown } from "../../lib/html-to-markdown";
import { urlSpecificParams } from "./utils/custom/website_params"; import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { fetchAndProcessPdf } from "./utils/pdfProcessor";
@@ -303,10 +303,14 @@ export async function scrapSingleUrl(
pageOptions: PageOptions = { pageOptions: PageOptions = {
onlyMainContent: true, onlyMainContent: true,
includeHtml: false, includeHtml: false,
rawHtml: false,
waitFor: 0, waitFor: 0,
screenshot: false, screenshot: false,
headers: undefined headers: undefined
}, },
extractorOptions: ExtractorOptions = {
mode: "llm-extraction-from-markdown"
},
existingHtml: string = "" existingHtml: string = ""
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
@@ -465,6 +469,7 @@ export async function scrapSingleUrl(
content: text, content: text,
markdown: text, markdown: text,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml: pageOptions.rawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
metadata: { metadata: {
...metadata, ...metadata,
screenshot: screenshot, screenshot: screenshot,
@@ -478,6 +483,7 @@ export async function scrapSingleUrl(
content: text, content: text,
markdown: text, markdown: text,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml: pageOptions.rawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
metadata: { metadata: {
...metadata, ...metadata,
sourceURL: urlToScrap, sourceURL: urlToScrap,