Files
firecrawl/apps/api/src/lib/LLM-extraction/models.ts
T

146 lines
4.0 KiB
TypeScript
Raw Normal View History

2024-04-30 12:19:43 -07:00
import OpenAI from "openai";
import { Document } from "../../lib/entities";
import { numTokensFromString } from "./helpers";
2024-04-28 15:52:09 -07:00
2024-04-30 12:19:43 -07:00
/**
 * Result record pairing extracted data with a URL.
 *
 * NOTE(review): nothing in this file constructs or consumes this type; the
 * field descriptions below are inferred from the names — confirm with callers.
 */
export type ScraperCompletionResult = {
  /** Extracted payload, or `null` when there is none. */
  data: any | null;
  /** URL associated with `data` (presumably the scraped page). */
  url: string;
};
/**
 * Token budget for the page content sent to the model. Documents whose token
 * count exceeds this are trimmed and reported with a warning.
 */
const maxTokens = 32000;

/**
 * Characters-per-token factor: `maxTokens * modifier` is the character budget
 * used when trimming an over-long document.
 */
const modifier = 4;

/** System prompt used when the caller does not supply `systemPrompt`. */
const defaultPrompt =
  "You are a professional web scraper. Extract the contents of the webpage";
2024-04-28 15:52:09 -07:00
/**
 * Builds the user-message content for a chat completion from a scraped document.
 *
 * @param document - Scraped document; `markdown` or `rawHtml` is read depending on `mode`.
 * @param mode - `"raw-html"` selects `document.rawHtml`; `"markdown"` selects `document.markdown`.
 * @returns A tuple of `[content parts, token count of the UNTRIMMED text]`, or
 *   `null` when the selected content is empty or missing. `null` is returned
 *   instead of throwing so the caller can attach a warning to the document.
 */
function prepareOpenAIDoc(
  document: Document,
  mode: "markdown" | "raw-html"
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
  let extractionTarget =
    mode === "raw-html" ? document.rawHtml : document.markdown;

  // Nothing to extract from (undefined, null or empty string).
  if (!extractionTarget) {
    return null;
  }

  // Tokens are always counted with the "gpt-4" encoding, whatever model is used later.
  const numTokens = numTokensFromString(extractionTarget, "gpt-4");

  if (numTokens > maxTokens) {
    // Tokens != characters. The fixed `maxTokens * modifier` budget assumes
    // ~`modifier` characters per token; token-dense input (raw HTML, CJK text)
    // would stay over the limit — or not be cut at all — under that budget
    // alone. Also cap by this document's measured chars-per-token ratio so an
    // over-budget document is always actually shortened. The result is never
    // longer than the fixed budget allows.
    const proportionalBudget = Math.floor(
      extractionTarget.length * (maxTokens / numTokens)
    );
    const charBudget = Math.min(maxTokens * modifier, proportionalBudget);
    extractionTarget = extractionTarget.slice(0, charBudget);
  }

  return [[{ type: "text", text: extractionTarget }], numTokens];
}
2024-04-29 12:12:55 -07:00
/**
 * Runs LLM extraction over a scraped document and returns the document with
 * the parsed result attached as `llm_extraction`.
 *
 * Two request shapes are used:
 * - `prompt` given and no `schema`: JSON mode (`response_format: json_object`)
 *   with the prompt appended as a second user message.
 * - otherwise: a forced `extract_content` function/tool call whose parameters
 *   are `schema`. NOTE: `prompt` is not sent to the model in this branch.
 *
 * @param client - OpenAI client used to issue the chat completion.
 * @param model - Model name; defaults to `process.env.MODEL_NAME`, then `"gpt-4o-mini"`.
 * @param document - Scraped document to extract from.
 * @param schema - JSON schema passed as the tool's `parameters`.
 * @param systemPrompt - System message; defaults to the module's default prompt.
 * @param prompt - Free-form extraction request (used only when `schema` is falsy).
 * @param temperature - Sampling temperature forwarded to the API.
 * @param mode - Which document field to extract from (`markdown` or `rawHtml`).
 * @returns The input document plus `llm_extraction`. If the content is empty or
 *   missing, the document is returned with a warning and no request is made. If
 *   the content exceeded the token budget, a trimming warning is set.
 * @throws Error("Invalid JSON") when the model output cannot be parsed as JSON.
 * @throws Error when the tool-call branch receives a response without a tool call.
 *   Errors raised by the OpenAI client are not caught here and propagate.
 */
export async function generateOpenAICompletions({
  client,
  model = process.env.MODEL_NAME || "gpt-4o-mini",
  document,
  schema, //TODO - add zod dynamic type checking
  systemPrompt = defaultPrompt,
  prompt,
  temperature,
  mode,
}: {
  client: OpenAI;
  model?: string;
  document: Document;
  schema: any; // This should be replaced with a proper Zod schema type when available
  prompt?: string;
  systemPrompt?: string;
  temperature?: number;
  mode: "markdown" | "raw-html";
}): Promise<Document> {
  const preparedDoc = prepareOpenAIDoc(document, mode);
  if (preparedDoc === null) {
    return {
      ...document,
      warning:
        "LLM extraction was not performed since the document's content is empty or missing.",
    };
  }
  const [content, numTokens] = preparedDoc;

  // Raw JSON text produced by the model, from whichever branch ran.
  let rawJson: string;

  if (prompt && !schema) {
    const jsonCompletion = await client.chat.completions.create({
      model,
      messages: [
        {
          role: "system",
          content: systemPrompt,
        },
        { role: "user", content },
        {
          role: "user",
          content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
        },
      ],
      response_format: { type: "json_object" },
      temperature,
    });
    // An empty `choices` array or null content becomes "", which fails parsing
    // below and surfaces as "Invalid JSON" rather than a TypeError.
    rawJson = (jsonCompletion.choices[0]?.message?.content ?? "").trim();
  } else {
    const completion = await client.chat.completions.create({
      model,
      messages: [
        {
          role: "system",
          content: systemPrompt,
        },
        { role: "user", content },
      ],
      tools: [
        {
          type: "function",
          function: {
            name: "extract_content",
            description: "Extracts the content from the given webpage(s)",
            parameters: schema,
          },
        },
      ],
      tool_choice: { type: "function", function: { name: "extract_content" } },
      temperature,
    });
    // `tool_calls` is optional on the response message (e.g. refusal or
    // truncated output); fail with a clear error instead of a property-access
    // TypeError.
    const toolArguments =
      completion.choices[0]?.message?.tool_calls?.[0]?.function?.arguments;
    if (typeof toolArguments !== "string") {
      throw new Error(
        "LLM extraction failed: the model response did not contain an extract_content tool call."
      );
    }
    rawJson = toolArguments;
  }

  // Extract the LLM extraction content from the completion response
  let llmExtraction;
  try {
    llmExtraction = JSON.parse(rawJson);
  } catch (e) {
    throw new Error("Invalid JSON");
  }

  // Return the document with the LLM extraction content added
  return {
    ...document,
    llm_extraction: llmExtraction,
    // Keep any warning already on the document unless the page was trimmed;
    // previously a non-trimmed run overwrote it with `undefined`.
    warning:
      numTokens > maxTokens
        ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
        : document.warning,
  };
}