Files
firecrawl/apps/api/src/lib/LLM-extraction/index.ts
T

79 lines
2.7 KiB
TypeScript
Raw Normal View History

2024-04-30 12:19:43 -07:00
import OpenAI from "openai";
import Ajv from "ajv";
2024-04-29 12:12:55 -07:00
// Single module-level AJV instance; generateCompletions uses it to compile the
// caller-supplied extraction schema and validate the LLM output against it.
const ajv = new Ajv(); // Initialize AJV for JSON schema validation
2024-04-28 17:38:20 -07:00
2024-04-30 12:19:43 -07:00
import { generateOpenAICompletions } from "./models";
import { Document, ExtractorOptions } from "../entities";
2024-11-07 20:57:33 +01:00
import { logger } from "../logger";
2024-04-28 15:52:09 -07:00
2024-04-30 12:19:43 -07:00
// Generate completion using OpenAI
2024-04-28 17:38:20 -07:00
/**
 * Runs LLM extraction over every document concurrently.
 *
 * Each document is handed to `generateOpenAICompletions`. When an extraction
 * schema is supplied, the returned `llm_extraction` is validated against it
 * with AJV and a mismatch is reported as an error.
 *
 * The OpenAI client and the compiled schema validator are created once and
 * shared by all documents; an invalid schema is therefore reported before any
 * LLM request is issued.
 *
 * @param documents - Documents to run extraction on.
 * @param extractionOptions - Optional extraction schema, system prompt
 *   (`extractionPrompt`) and user prompt (`userPrompt`).
 * @param mode - Forwarded unchanged to `generateOpenAICompletions`.
 * @returns The results of `generateOpenAICompletions`, in the same order as
 *   `documents`.
 * @throws Error when the extraction does not satisfy the schema; any error
 *   raised while compiling the schema or generating a completion is logged
 *   and rethrown.
 */
export async function generateCompletions(
  documents: Document[],
  extractionOptions: ExtractorOptions | undefined,
  mode: "markdown" | "raw-html",
): Promise<Document[]> {
  // Nothing to extract: skip client construction and schema compilation.
  if (documents.length === 0) {
    return [];
  }

  const schema = extractionOptions?.extractionSchema;
  const systemPrompt = extractionOptions?.extractionPrompt;
  const prompt = extractionOptions?.userPrompt;
  const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider

  switch (switchVariable) {
    case "openAI": {
      // One client for the whole batch instead of one per document.
      const llm = new OpenAI();

      // Compile the schema once up front; failures are logged the same way as
      // per-document failures below.
      const compileValidator = () => {
        if (!schema) {
          return undefined;
        }
        try {
          return ajv.compile(schema);
        } catch (error) {
          logger.error(`Error generating completions: ${error}`);
          throw error;
        }
      };
      const validate = compileValidator();

      return await Promise.all(
        documents.map(async (document: Document) => {
          try {
            const completionResult = await generateOpenAICompletions({
              client: llm,
              document: document,
              schema: schema,
              prompt: prompt,
              systemPrompt: systemPrompt,
              mode: mode,
            });

            // Validate the JSON output against the schema using AJV
            if (validate && !validate(completionResult.llm_extraction)) {
              //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
              throw new Error(
                `JSON parsing error(s): ${validate.errors
                  ?.map((err) => err.message)
                  .join(
                    ", ",
                  )}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`,
              );
            }

            return completionResult;
          } catch (error) {
            logger.error(`Error generating completions: ${error}`);
            throw error;
          }
        }),
      );
    }
    default:
      throw new Error("Invalid client");
  }
}
2024-12-17 16:58:35 -03:00
/**
 * Sends a single user prompt to the OpenAI chat completions API and returns
 * the text of the first choice.
 *
 * The request uses the `gpt-4o` model with `temperature: 0`.
 *
 * @param prompt - Text sent as the only `user` message.
 * @returns The first choice's message content, or `null` when the response
 *   contains no choices or the message has no content.
 */
export async function generateBasicCompletion(
  prompt: string,
): Promise<string | null> {
  const openai = new OpenAI();
  const model = "gpt-4o";
  const completion = await openai.chat.completions.create({
    temperature: 0,
    model,
    messages: [{ role: "user", content: prompt }],
  });
  // Guard against an empty `choices` array so callers get `null` rather than
  // an opaque TypeError from indexing into `undefined`.
  return completion.choices[0]?.message?.content ?? null;
}