Files
firecrawl/apps/api/src/lib/LLM-extraction/models.ts
T

92 lines
2.6 KiB
TypeScript
Raw Normal View History

2024-04-30 12:19:43 -07:00
import OpenAI from "openai";
import { Document } from "../../lib/entities";
import { numTokensFromString } from "./helpers";
2024-04-28 15:52:09 -07:00
2024-04-30 12:19:43 -07:00
/**
 * Pairs a loosely typed `data` payload with a `url` string.
 *
 * NOTE(review): this type is not referenced in the visible part of this file;
 * confirm where it is consumed before tightening the `any`.
 */
export type ScraperCompletionResult = {
// Payload of the completion; `null` is explicitly permitted.
data: any | null;
// Presumably the URL of the page the payload belongs to — confirm against callers.
url: string;
};
// Token budget for a document: markdown whose token count exceeds this is
// trimmed before it is sent to the model.
const maxTokens = 32000;
// Characters allowed per token when the token budget is converted into a
// character cut-off for trimming (tokens != characters).
const modifier = 4;
// System prompt used when the caller does not supply one.
const defaultPrompt =
  "You are a professional web scraper. Extract the contents of the webpage";
2024-04-28 15:52:09 -07:00
/**
 * Builds the user-message content for the chat completion from a document's
 * markdown, trimming it when it exceeds the token budget.
 *
 * @param document - Document whose `markdown` field is sent to the model.
 * @returns A tuple of the text content part(s) and the token count of the
 *   ORIGINAL (untrimmed) markdown, so callers can tell whether trimming happened.
 * @throws Error when the document has no markdown content.
 */
function prepareOpenAIDoc(
  document: Document
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
  let markdown = document.markdown;

  // Check if the markdown content exists in the document
  if (!markdown) {
    throw new Error(
      "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
    );
  }

  // Count tokens on the full text; this count is what gets reported back.
  const numTokens = numTokensFromString(markdown, "gpt-4");

  if (numTokens > maxTokens) {
    // Trim the document to the maximum number of tokens, tokens != characters,
    // so the budget is approximated as `maxTokens * modifier` characters.
    let end = maxTokens * modifier;

    // `slice` works on UTF-16 code units. If the cut would land between the two
    // halves of a surrogate pair, back off by one unit so the trimmed text never
    // ends in a lone high surrogate (which is not valid Unicode text).
    if (end < markdown.length) {
      const lastUnit = markdown.charCodeAt(end - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
      }
    }

    markdown = markdown.slice(0, end);
  }

  return [[{ type: "text", text: markdown }], numTokens];
}
2024-04-29 12:12:55 -07:00
/**
 * Runs a structured extraction over a document with the OpenAI chat
 * completions API and returns the document enriched with the result.
 *
 * The model is forced (via `tool_choice`) to call a single function tool named
 * `extract_content` whose parameters are the supplied `schema`; the JSON
 * arguments of that call become `llm_extraction` on the returned document.
 *
 * @param client - OpenAI client used to issue the request.
 * @param model - Chat model name; defaults to `"gpt-4o"`.
 * @param document - Document whose markdown is sent as the user message.
 * @param schema - JSON schema passed as the function tool's `parameters`.
 * @param prompt - System prompt; defaults to the module's default prompt.
 * @param temperature - Sampling temperature, forwarded unchanged.
 * @returns A copy of `document` with `llm_extraction` set, plus a `warning`
 *   when the markdown exceeded the token budget and was trimmed
 *   (`undefined` otherwise).
 * @throws Error when the document has no markdown, when the response carries no
 *   tool call, or when the tool call arguments are not valid JSON. Errors from
 *   the OpenAI client itself propagate unchanged.
 */
export async function generateOpenAICompletions({
  client,
  model = "gpt-4o",
  document,
  schema, //TODO - add zod dynamic type checking
  prompt = defaultPrompt,
  temperature,
}: {
  client: OpenAI;
  model?: string;
  document: Document;
  schema: any; // This should be replaced with a proper Zod schema type when available
  prompt?: string;
  temperature?: number;
}): Promise<Document> {
  const [content, numTokens] = prepareOpenAIDoc(document);

  const completion = await client.chat.completions.create({
    model,
    messages: [
      {
        role: "system",
        content: prompt,
      },
      { role: "user", content },
    ],
    tools: [
      {
        type: "function",
        function: {
          name: "extract_content",
          description: "Extracts the content from the given webpage(s)",
          parameters: schema,
        },
      },
    ],
    tool_choice: { type: "function", function: { name: "extract_content" } },
    temperature,
  });

  // `choices` can be empty and `tool_calls` is optional on the response message,
  // so guard the whole chain instead of failing with an opaque TypeError.
  const rawArguments =
    completion.choices[0]?.message?.tool_calls?.[0]?.function?.arguments;
  if (typeof rawArguments !== "string") {
    throw new Error(
      "LLM extraction failed: the model response did not contain an extract_content tool call."
    );
  }

  // Extract the LLM extraction content from the completion response.
  // The arguments are model-generated text and may be malformed or truncated,
  // so surface a descriptive error rather than a bare SyntaxError.
  let llmExtraction: any; // shape is dictated by the caller-supplied schema
  try {
    llmExtraction = JSON.parse(rawArguments);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(
      `LLM extraction failed: the tool call arguments were not valid JSON (${reason}).`
    );
  }

  // Return the document with the LLM extraction content added
  return {
    ...document,
    llm_extraction: llmExtraction,
    warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
  };
}
2024-04-30 18:36:21 -07:00