Files
firecrawl/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
T

315 lines
8.8 KiB
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { Document, ExtractOptions } from "../../../controllers/v1/types";
import { Logger } from "winston";
import { EngineResultsTracker, Meta } from "..";
import { logger } from "../../../lib/logger";
2024-11-07 20:57:33 +01:00
/** Upper bound on the token count of the markdown sent to the LLM. */
const maxTokens = 32_000;

/**
 * Multiplier that converts the token budget into a character budget when the
 * input is trimmed with `slice(0, maxTokens * modifier)`.
 * NOTE(review): presumably an approximate characters-per-token ratio -- confirm.
 */
const modifier = 4;
/**
 * Error thrown when the LLM declines to extract, or when its response cannot
 * be turned into an extraction result.
 */
export class LLMRefusalError extends Error {
  /** Refusal text reported by the model (or a description of the failure). */
  public refusal: string;
  /** Not set by the constructor; left for callers to attach tracking data. */
  public results: EngineResultsTracker | undefined;

  /**
   * @param refusal Refusal text to expose on the error instance.
   */
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    this.refusal = refusal;
  }
}
/**
 * Recursively rewrites a JSON schema so that every `type: "object"` node lists
 * all of its properties as `required` and sets `additionalProperties: false`.
 *
 * Recurses into `$defs`, `anyOf`, `oneOf`, `allOf`, `not`, object `properties`
 * and array `items`. The input is never mutated; a new schema is returned.
 *
 * @param x Schema node (or any value). Non-objects, `null` and arrays are
 *          returned unchanged.
 * @returns The normalized copy of the schema node.
 */
function normalizeSchema(x: any): any {
  if (typeof x !== "object" || x === null || Array.isArray(x)) return x;

  // Work on a shallow copy so the caller's schema object is left untouched.
  const node: any = { ...x };

  if (node["$defs"] !== null && typeof node["$defs"] === "object") {
    node["$defs"] = Object.fromEntries(
      Object.entries(node["$defs"]).map(([name, schema]) => [
        name,
        normalizeSchema(schema),
      ]),
    );
  }

  // Combinators hold lists of sub-schemas; anything that is not a list is
  // left as-is instead of crashing on `.map`.
  for (const combinator of ["anyOf", "oneOf", "allOf"]) {
    if (Array.isArray(node[combinator])) {
      node[combinator] = node[combinator].map((sub: any) =>
        normalizeSchema(sub),
      );
    }
  }

  if (node.not) {
    node.not = normalizeSchema(node.not);
  }

  if (node.type === "object") {
    // An object schema may omit `properties`; treat that as "no properties"
    // rather than throwing from Object.entries(undefined).
    const properties: Record<string, any> =
      typeof node.properties === "object" && node.properties !== null
        ? node.properties
        : {};
    return {
      ...node,
      properties: Object.fromEntries(
        Object.entries(properties).map(([k, v]) => [k, normalizeSchema(v)]),
      ),
      required: Object.keys(properties),
      additionalProperties: false,
    };
  }

  if (node.type === "array") {
    return {
      ...node,
      items: normalizeSchema(node.items),
    };
  }

  return node;
}
2024-12-11 19:46:11 -03:00
/**
 * Sends page markdown to an OpenAI chat model and returns the structured
 * extraction result.
 *
 * Steps:
 * 1. Count the markdown's tokens; if counting fails or the count exceeds
 *    `maxTokens`, trim the markdown to `maxTokens * modifier` characters and
 *    produce a warning (with `previousWarning` appended when given).
 * 2. Strip `default` keys from `options.schema`, wrap array / type-less schemas
 *    in an object schema, and normalize it via `normalizeSchema`.
 * 3. Request a completion (JSON-schema response format when a schema was
 *    supplied, plain JSON object otherwise) and read the parsed result.
 *
 * @param logger Logger used for token-count and JSON-parse failures.
 * @param options Extract options (`schema`, `systemPrompt`, `prompt`).
 * @param markdown Content to extract from; must not be `undefined`.
 * @param previousWarning Existing warning appended to any warning produced here.
 * @param isExtractEndpoint Selects how the raw-content fallback is unwrapped.
 * @returns The extracted value, the counted tokens (0 if counting failed) and
 *          a warning, which is `undefined` when no trimming happened.
 * @throws Error if `markdown` is `undefined`.
 * @throws LLMRefusalError if the model refuses or the fallback JSON parse fails.
 */
export async function generateOpenAICompletions(
  logger: Logger,
  options: ExtractOptions,
  markdown?: string,
  previousWarning?: string,
  isExtractEndpoint?: boolean,
): Promise<{ extract: any; numTokens: number; warning: string | undefined }> {
  let extract: any;
  let warning: string | undefined;

  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  if (markdown === undefined) {
    throw new Error("document.markdown is undefined -- this is unexpected");
  }

  // Count the number of tokens. The encoder is created inside the `try` so a
  // model name the tokenizer does not know falls back to trimming instead of
  // aborting the whole extraction.
  let numTokens = 0;
  let encoder: ReturnType<typeof encoding_for_model> | undefined;
  try {
    encoder = encoding_for_model(model);
    numTokens = encoder.encode(markdown).length;
  } catch (error) {
    logger.warn("Calculating num tokens of string failed", { error, markdown });

    markdown = markdown.slice(0, maxTokens * modifier);

    const w =
      "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" +
      maxTokens +
      ") we support.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  } finally {
    // Free the encoder resources after use (if one was created).
    encoder?.free();
  }

  if (numTokens > maxTokens) {
    // Trim the document to the maximum number of tokens; tokens != characters,
    // so the cut is made at maxTokens * modifier characters.
    markdown = markdown.slice(0, maxTokens * modifier);

    const w =
      "The extraction content would have used more tokens (" +
      numTokens +
      ") than the maximum we allow (" +
      maxTokens +
      "). -- the input has been automatically trimmed.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  }

  let schema = options.schema;
  if (schema) {
    schema = removeDefaultProperty(schema);
  }

  if (schema && schema.type === "array") {
    // Wrap a top-level array schema in an object with a single `items` field.
    // The wrapped schema is the one that already had its `default` keys
    // stripped, not the raw `options.schema`.
    schema = {
      type: "object",
      properties: {
        items: schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  } else if (schema && typeof schema === "object" && !schema.type) {
    // A type-less object is treated as a bare map of property schemas. Its
    // values already had `default` keys stripped above.
    schema = {
      type: "object",
      properties: { ...schema },
      required: Object.keys(schema),
      additionalProperties: false,
    };
  }

  schema = normalizeSchema(schema);

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    temperature: 0,
    messages: [
      {
        role: "system",
        content: options.systemPrompt,
      },
      {
        role: "user",
        content: [{ type: "text", text: markdown }],
      },
      {
        role: "user",
        content:
          options.prompt !== undefined
            ? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}. If schema is provided, strictly follow it.`
            : "Transform the above content into structured JSON output based on the provided schema if any.",
      },
    ],
    response_format: options.schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "schema",
            schema: schema,
            strict: true,
          },
        }
      : { type: "json_object" },
  });

  const message = jsonCompletion.choices[0].message;

  // `!= null` so that a response omitting the `refusal` field altogether is
  // not mistaken for a refusal.
  if (message.refusal != null) {
    throw new LLMRefusalError(message.refusal);
  }

  extract = message.parsed;

  // No parsed payload: fall back to parsing the raw message content.
  if (extract === null && message.content !== null) {
    try {
      if (!isExtractEndpoint) {
        extract = JSON.parse(message.content);
      } else {
        const extractData = JSON.parse(message.content);
        extract = options.schema ? extractData.data.extract : extractData;
      }
    } catch (e) {
      logger.error("Failed to parse returned JSON, no schema specified.", {
        error: e,
      });
      throw new LLMRefusalError(
        "Failed to parse returned JSON. Please specify a schema in the extract object.",
      );
    }
  }

  // If the users actually wants the items object, they can specify it as 'required' in the schema
  // otherwise, we just return the items array
  // NOTE(review): for array schemas the wrapper built above always has
  // `required: ["items"]`, so this condition appears to never hold and the
  // `{ items: [...] }` object is returned as-is. Left unchanged because fixing
  // it would change the shape callers currently receive -- confirm intent.
  if (
    options.schema &&
    options.schema.type === "array" &&
    !schema?.required?.includes("items")
  ) {
    extract = extract?.items;
  }

  return { extract, warning, numTokens };
}
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
/**
 * Runs LLM extraction for a scraped document when the "extract" format was
 * requested, storing the result on `document.extract`.
 *
 * @param meta Scrape metadata; `meta.options.formats` decides whether
 *             extraction runs and `meta.options.extract` configures it.
 * @param document Document whose `markdown` is fed to the LLM. Mutated in place.
 * @returns The same `document` instance.
 */
export async function performLLMExtract(
  meta: Meta,
  document: Document,
): Promise<Document> {
  if (!meta.options.formats.includes("extract")) {
    return document;
  }

  const { extract, warning } = await generateOpenAICompletions(
    meta.logger.child({
      method: "performLLMExtract/generateOpenAICompletions",
    }),
    meta.options.extract!,
    document.markdown,
    document.warning,
  );

  document.extract = extract;
  // The returned warning is undefined when extraction added nothing new; in
  // that case keep whatever warning the document already carried instead of
  // erasing it.
  if (warning !== undefined) {
    document.warning = warning;
  }

  return document;
}
2024-12-16 11:41:59 -03:00
2024-12-16 09:30:40 -03:00
/**
 * Returns a deep copy of `schema` with every `default` key removed, at any
 * nesting depth. The input is not mutated.
 *
 * Arrays stay arrays (each element is processed recursively); primitives and
 * `null` are returned unchanged.
 *
 * NOTE(review): the `default` key is dropped from every object, so a schema
 * property that is itself named "default" would also be removed -- confirm
 * whether that case needs to be supported.
 *
 * @param schema Schema node or arbitrary value.
 * @returns The copy without `default` keys.
 */
export function removeDefaultProperty(schema: any): any {
  if (typeof schema !== "object" || schema === null) return schema;

  // Handle arrays before destructuring: object rest on an array would turn it
  // into a plain object keyed by index.
  if (Array.isArray(schema)) {
    return schema.map((item: any) => removeDefaultProperty(item));
  }

  const { default: _default, ...rest } = schema;
  return Object.fromEntries(
    Object.entries(rest).map(([key, value]) => [
      key,
      removeDefaultProperty(value),
    ]),
  );
}
/**
 * Asks the LLM to produce a JSON schema describing the data requested in
 * `prompt`.
 *
 * The request is attempted once per temperature in `[0, 0.1, 0.3]`, in order;
 * the first response that parses as JSON is returned. Each failed attempt is
 * logged as a warning.
 *
 * @param prompt Natural-language description of the information to extract.
 * @returns The parsed JSON value from the model's response.
 * @throws Error when every attempt fails; the message includes the last
 *         failure's message.
 */
export async function generateSchemaFromPrompt(prompt: string): Promise<any> {
  const openai = new OpenAI();
  const temperatures = [0, 0.1, 0.3]; // Different temperatures to try
  let lastError: Error | null = null;

  for (const temp of temperatures) {
    try {
      const result = await openai.beta.chat.completions.parse({
        model: "gpt-4o",
        temperature: temp,
        messages: [
          {
            role: "system",
            content: `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt.
Consider:
1. The type of data being requested
2. Required fields vs optional fields
3. Appropriate data types for each field
4. Nested objects and arrays where appropriate
Return a valid JSON schema object with properties that would capture the information requested in the prompt.`,
          },
          {
            role: "user",
            content: `Generate a JSON schema for extracting the following information: ${prompt}`,
          },
        ],
        response_format: {
          type: "json_object",
        },
      });

      const message = result.choices[0].message;

      // `!= null` so a response that omits `refusal` is not treated as a refusal.
      if (message.refusal != null) {
        throw new Error("LLM refused to generate schema");
      }

      try {
        return JSON.parse(message.content ?? "");
      } catch {
        throw new Error("Failed to parse schema JSON from LLM response");
      }
    } catch (error) {
      // The caught value is not guaranteed to be an Error; normalize it before
      // reading `.message`.
      lastError = error instanceof Error ? error : new Error(String(error));
      logger.warn(
        `Failed attempt with temperature ${temp}: ${lastError.message}`,
      );
    }
  }

  // If we get here, all attempts failed
  throw new Error(
    `Failed to generate schema after all attempts. Last error: ${lastError?.message}`,
  );
}