Files
firecrawl/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
T

400 lines
12 KiB
TypeScript
Raw Normal View History

2024-11-07 20:57:33 +01:00
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
2025-01-22 18:47:44 -03:00
import {
Document,
ExtractOptions,
TokenUsage,
} from "../../../controllers/v1/types";
2024-11-07 20:57:33 +01:00
import { Logger } from "winston";
import { EngineResultsTracker, Meta } from "..";
import { logger } from "../../../lib/logger";
import { modelPrices } from "../../../lib/extract/usage/model-prices";
2025-02-20 18:48:58 -03:00
import { generateObject, generateText, LanguageModel } from 'ai';
import { jsonSchema } from 'ai';
import { getModel } from "../../../lib/generic-ai";
import { z } from "zod";
2024-11-07 20:57:33 +01:00
/**
 * Resolves the token limits for a model from the `modelPrices` table.
 *
 * When the model has no entry, a fixed default is returned. Otherwise the
 * input/output limits come from the entry, each falling back to the entry's
 * `max_tokens` when the specific limit is missing (or otherwise falsy).
 *
 * @param model - Key used to look the model up in `modelPrices`.
 * @returns The `maxInputTokens`, `maxOutputTokens` and `maxTokens` limits.
 */
const getModelLimits = (model: string) => {
  const entry = modelPrices[model];

  if (entry) {
    const total = entry.max_tokens;
    return {
      maxInputTokens: entry.max_input_tokens || total,
      maxOutputTokens: entry.max_output_tokens || total,
      maxTokens: total,
    };
  }

  // Unknown model: default fallback values.
  return {
    maxInputTokens: 8192,
    maxOutputTokens: 4096,
    maxTokens: 12288,
  };
};
2024-11-07 20:57:33 +01:00
/**
 * Error thrown when the LLM refuses to perform an extraction.
 *
 * The message is always the same fixed sentence; the model-specific detail is
 * carried in {@link LLMRefusalError.refusal}.
 */
export class LLMRefusalError extends Error {
  /** The refusal text passed to the constructor. */
  public refusal: string;
  /** Not set by this class; left for callers to attach tracker data. */
  public results: EngineResultsTracker | undefined;

  /**
   * @param refusal - Text describing the refusal.
   */
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    // Make the error identifiable in logs and stack traces.
    this.name = "LLMRefusalError";
    this.refusal = refusal;
  }
}
/**
 * Recursively rewrites a user-supplied JSON schema into a strict form.
 *
 * - Every `type: "object"` node gets all of its properties listed in
 *   `required` and `additionalProperties: false`.
 * - `$defs`, `anyOf`, `oneOf`, `allOf`, `not`, object `properties` and array
 *   `items` are normalized recursively.
 * - Non-objects, `null` and arrays are returned unchanged.
 *
 * The input is never mutated; changed nodes are returned as shallow copies.
 * An object node without a usable `properties` map is treated as having no
 * properties instead of throwing.
 *
 * @param x - A JSON schema node (or any other value, which is passed through).
 * @returns The normalized schema node.
 */
function normalizeSchema(x: any): any {
  if (typeof x !== "object" || x === null) return x;
  // Arrays carry no schema keywords of their own; leave them as they are.
  if (Array.isArray(x)) return x;

  // Work on a shallow copy so the caller's schema object is left intact.
  const node: any = { ...x };

  if (node["$defs"] !== null && typeof node["$defs"] === "object") {
    node["$defs"] = Object.fromEntries(
      Object.entries(node["$defs"]).map(([name, definition]) => [
        name,
        normalizeSchema(definition),
      ]),
    );
  }

  for (const keyword of ["anyOf", "oneOf", "allOf"]) {
    // Only arrays can be mapped; anything else is passed through untouched.
    if (Array.isArray(node[keyword])) {
      node[keyword] = node[keyword].map((sub: any) => normalizeSchema(sub));
    }
  }

  if (node.not) {
    node.not = normalizeSchema(node.not);
  }

  if (node.type === "object") {
    // Tolerate `{ type: "object" }` with a missing or invalid `properties`.
    const properties: Record<string, any> =
      node.properties !== null && typeof node.properties === "object"
        ? node.properties
        : {};
    return {
      ...node,
      properties: Object.fromEntries(
        Object.entries(properties).map(([key, value]) => [
          key,
          normalizeSchema(value),
        ]),
      ),
      required: Object.keys(properties),
      additionalProperties: false,
    };
  }

  if (node.type === "array") {
    return {
      ...node,
      items: normalizeSchema(node.items),
    };
  }

  return node;
}
/**
 * Truncates `text` from the end until it encodes to at most `maxTokens`
 * tokens with the "gpt-4o" encoding.
 *
 * Each pass removes at most 20% of the remaining text (or less when the token
 * ratio says less is enough), and always removes at least one character so
 * the loop is guaranteed to terminate. If the tokenizer cannot be created or
 * fails, a character-based estimate of 3 characters per token is used.
 *
 * @param text - The text to shorten.
 * @param maxTokens - Maximum number of tokens the result may encode to.
 * @returns `text` itself when it already fits, otherwise a prefix of it
 *   (possibly the empty string).
 */
export function truncateText(text: string, maxTokens: number): string {
  const modifier = 3; // Estimate: 1 token ≈ 3-4 characters for safety
  let remaining = text;
  let encoder: ReturnType<typeof encoding_for_model> | undefined;

  try {
    encoder = encoding_for_model("gpt-4o");

    // Continuously trim the text until its token count is within the limit.
    while (true) {
      const tokens = encoder.encode(remaining);
      if (tokens.length <= maxTokens) {
        return remaining;
      }

      // Scale by the token ratio, but never remove more than 20% at once.
      const ratio = maxTokens / tokens.length;
      const proposed = Math.max(
        Math.ceil(remaining.length * ratio),
        Math.floor(remaining.length * 0.8),
      );
      // Rounding up can propose the current length again (e.g. when one
      // character encodes to several tokens); always drop at least one
      // character so every iteration makes progress.
      const newLength = Math.min(proposed, remaining.length - 1);

      // `!(… > 0)` also catches NaN, which arises for an empty text with a
      // negative limit.
      if (!(newLength > 0)) {
        return "";
      }
      remaining = remaining.slice(0, newLength);
    }
  } catch (error) {
    // Fallback using character-based estimation. Clamp at zero so a negative
    // limit yields an empty string rather than slicing from the end.
    const maxChars = Math.max(0, maxTokens * modifier);
    if (remaining.length <= maxChars) {
      return remaining;
    }
    return remaining.slice(0, maxChars);
  } finally {
    // The tiktoken encoder holds WASM memory that must be released manually.
    encoder?.free();
  }
}
2025-02-20 18:48:58 -03:00
/**
 * Runs an LLM extraction over `markdown` and returns the structured result.
 *
 * Steps:
 * 1. Count the tokens of `markdown` with tiktoken; if the content exceeds 80%
 *    of the model's input limit (or counting fails) the content is cut to a
 *    character budget of 3.5 characters per allowed token and a warning is
 *    produced.
 * 2. If a non-Zod schema is supplied, strip `default` keys, wrap array
 *    schemas and bare property maps into an object schema, and normalize it.
 * 3. Call `generateObject`, with a JSON-repair hook that asks the same model
 *    to fix malformed output.
 *
 * @param logger - Logger used to report token-counting failures.
 * @param options - Extraction options (`schema`, `prompt`, `systemPrompt`,
 *   `temperature` are read here).
 * @param markdown - The content to extract from; must be defined.
 * @param previousWarning - Existing warning appended to any warning produced
 *   here.
 * @param isExtractEndpoint - Accepted for interface compatibility; not read
 *   by this function.
 * @param model - Model to use; defaults to `getModel("gpt-4o-mini")`.
 * @returns The extracted object, the token count measured before trimming,
 *   an optional warning, token usage and the model id.
 * @throws Error when `markdown` is undefined.
 * @throws LLMRefusalError when the underlying error message contains
 *   "refused"; any other error is rethrown unchanged.
 */
export async function generateCompletions({
  logger,
  options,
  markdown,
  previousWarning,
  isExtractEndpoint,
  model = getModel("gpt-4o-mini"),
}: {
  model?: LanguageModel;
  logger: Logger;
  options: ExtractOptions;
  markdown?: string;
  previousWarning?: string;
  isExtractEndpoint?: boolean;
}): Promise<{
  extract: any;
  numTokens: number;
  warning: string | undefined;
  totalUsage: TokenUsage;
  model: string;
}> {
  if (markdown === undefined) {
    throw new Error("document.markdown is undefined -- this is unexpected");
  }

  let extract: any;
  let warning: string | undefined;
  // Local copy so the parameter itself is never reassigned.
  let content = markdown;

  const { maxInputTokens } = getModelLimits(model.modelId);

  // Ratio of 4 was way too high, now 3.5.
  const modifier = 3.5; // tokens to characters ratio
  // Calculate 80% of max input tokens (for content)
  const maxTokensSafe = Math.floor(maxInputTokens * 0.8);
  const maxChars = maxTokensSafe * modifier;

  // Appends the caller's earlier warning (if any) to a new one.
  const withPreviousWarning = (w: string): string =>
    previousWarning === undefined ? w : w + " " + previousWarning;

  // count number of tokens
  let numTokens = 0;
  try {
    // Encode the message into tokens
    const encoder = encoding_for_model(model.modelId as TiktokenModel);
    try {
      numTokens = encoder.encode(content).length;
    } finally {
      // Free the encoder resources after use
      encoder.free();
    }
  } catch (error) {
    logger.warn("Calculating num tokens of string failed", { error });

    // Without a token count, fall back to the character budget.
    content = content.slice(0, maxChars);

    warning = withPreviousWarning(
      "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" +
        maxTokensSafe +
        ") we support.",
    );
  }

  if (numTokens > maxTokensSafe) {
    // trim the document to the maximum number of tokens, tokens != characters
    content = content.slice(0, maxChars);

    warning = withPreviousWarning(
      "The extraction content would have used more tokens (" +
        numTokens +
        ") than the maximum we allow (" +
        maxTokensSafe +
        "). -- the input has been automatically trimmed.",
    );
  }

  let schema = options.schema;

  // Normalize the bad json schema users write (mogery)
  if (schema && !(schema instanceof z.ZodType)) {
    schema = removeDefaultProperty(schema);

    if (schema && schema.type === "array") {
      // Structured output needs an object at the root, so wrap the array.
      // The cleaned schema is wrapped (not `options.schema`) so the removal
      // of `default` keys above is not thrown away.
      schema = {
        type: "object",
        properties: {
          items: schema,
        },
        required: ["items"],
        additionalProperties: false,
      };
    } else if (schema && typeof schema === "object" && !schema.type) {
      // A bare `{ field: subschema }` map: treat it as an object's properties.
      schema = {
        type: "object",
        properties: Object.fromEntries(
          Object.entries(schema).map(([key, value]) => {
            return [key, removeDefaultProperty(value)];
          }),
        ),
        required: Object.keys(schema),
        additionalProperties: false,
      };
    }

    schema = normalizeSchema(schema);
  }

  try {
    const prompt =
      options.prompt !== undefined
        ? `Transform the following content into structured JSON output based on the provided schema and this user request: ${options.prompt}. If schema is provided, strictly follow it.\n\n${content}`
        : `Transform the following content into structured JSON output based on the provided schema if any.\n\n${content}`;

    // When the model returns malformed JSON, ask the same model to repair it.
    const repairConfig = {
      experimental_repairText: async ({ text, error }) => {
        const { text: fixedText } = await generateText({
          model: model,
          prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`,
          system: "You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON."
        });
        return fixedText;
      },
    };

    const generateObjectConfig = {
      model: model,
      prompt: prompt,
      temperature: options.temperature ?? 0,
      system: options.systemPrompt,
      ...(schema && {
        schema: schema instanceof z.ZodType ? schema : jsonSchema(schema),
      }),
      ...(!schema && { output: "no-schema" as const }),
      ...repairConfig,
      ...(!schema && {
        onError: (error: Error) => {
          console.error(error);
        },
      }),
    } satisfies Parameters<typeof generateObject>[0];

    const result = await generateObject(generateObjectConfig);
    extract = result.object;

    // If the users actually wants the items object, they can specify it as 'required' in the schema
    // otherwise, we just return the items array
    // NOTE(review): for array schemas the wrapper built above always lists
    // "items" in `required`, so this condition looks like it can never be
    // true and the `{ items: [...] }` wrapper is always returned. Left as-is
    // because changing it would alter the response shape -- confirm intent.
    if (
      options.schema &&
      options.schema.type === "array" &&
      !schema?.required?.includes("items")
    ) {
      extract = extract?.items;
    }

    // Prompt tokens are the locally measured count of the content (0 when
    // counting failed); completion tokens come from the generateObject usage.
    const promptTokens = numTokens;
    const completionTokens = result?.usage?.completionTokens ?? 0;

    return {
      extract,
      warning,
      numTokens,
      totalUsage: {
        promptTokens,
        completionTokens,
        totalTokens: promptTokens + completionTokens,
      },
      model: model.modelId,
    };
  } catch (error) {
    // The caught value is not guaranteed to be an Error; read `message`
    // defensively before matching on it.
    const message = (error as { message?: unknown } | null | undefined)
      ?.message;
    if (typeof message === "string" && message.includes("refused")) {
      throw new LLMRefusalError(message);
    }
    throw error;
  }
}
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
/**
 * Runs LLM extraction on a scraped document when the "extract" format was
 * requested, and stores the result on the document.
 *
 * The result is written to `document.json` when the requested formats also
 * include "json", otherwise to `document.extract`. A warning returned by the
 * extraction replaces `document.warning`; when the extraction returns no
 * warning, the document's existing warning is kept instead of being erased.
 *
 * @param meta - Scrape context; `options.formats`, `options.extract`,
 *   `internalOptions.abort` and `logger` are read.
 * @param document - The document to extract from; mutated in place.
 * @returns The same `document` instance.
 */
export async function performLLMExtract(
  meta: Meta,
  document: Document,
): Promise<Document> {
  if (!meta.options.formats.includes("extract")) {
    return document;
  }

  // Bail out before the LLM call if the scrape was aborted.
  meta.internalOptions.abort?.throwIfAborted();

  const { extract, warning } = await generateCompletions({
    logger: meta.logger.child({
      method: "performLLMExtract/generateCompletions",
    }),
    options: meta.options.extract!,
    markdown: document.markdown,
    previousWarning: document.warning,
  });

  if (meta.options.formats.includes("json")) {
    document.json = extract;
  } else {
    document.extract = extract;
  }

  // generateCompletions only returns a warning when it adds one (already
  // combined with the previous warning); keep the existing one otherwise.
  document.warning = warning ?? document.warning;

  return document;
}
2024-12-16 11:41:59 -03:00
2024-12-16 09:30:40 -03:00
/**
 * Returns a deep copy of `schema` with every `default` key removed.
 *
 * Objects are copied without their `default` key and their values are
 * processed recursively; arrays are mapped element by element (including
 * when the top-level value is itself an array). Non-object values and `null`
 * are returned as they are. The input is not mutated.
 *
 * NOTE(review): the key is removed wherever it appears, so a schema property
 * that is literally named "default" (inside a `properties` map) is dropped
 * too -- confirm whether that is acceptable.
 *
 * @param schema - A JSON-schema-like value.
 * @returns The value without any `default` keys.
 */
export function removeDefaultProperty(schema: any): any {
  if (typeof schema !== "object" || schema === null) return schema;

  // Keep arrays as arrays instead of spreading them into a plain object.
  if (Array.isArray(schema)) {
    return schema.map((item: any) => removeDefaultProperty(item));
  }

  const { default: _, ...rest } = schema;

  // Object.keys: own enumerable keys only, unlike `for...in`.
  for (const key of Object.keys(rest)) {
    rest[key] = removeDefaultProperty(rest[key]);
  }

  return rest;
}
/**
 * Asks the "gpt-4o" model to produce a JSON schema describing the data
 * requested in `prompt`.
 *
 * The request is attempted at temperatures 0, 0.1 and 0.3 in that order; the
 * first successful extraction is returned. Each failed attempt is logged as a
 * warning.
 *
 * @param prompt - The user's description of the information to extract.
 * @returns The schema object produced by the model.
 * @throws Error when all attempts fail; the message includes the last error's
 *   message.
 */
export async function generateSchemaFromPrompt(prompt: string): Promise<any> {
  const model = getModel("gpt-4o");
  const temperatures = [0, 0.1, 0.3]; // Different temperatures to try
  let lastError: Error | null = null;

  for (const temp of temperatures) {
    try {
      const { extract } = await generateCompletions({
        logger: logger.child({
          method: "generateSchemaFromPrompt/generateCompletions",
        }),
        model: model,
        options: {
          mode: "llm",
          systemPrompt: `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt.
Consider:
1. The type of data being requested
2. Required fields vs optional fields
3. Appropriate data types for each field
4. Nested objects and arrays where appropriate
Valid JSON schema, has to be simple. No crazy properties. OpenAI has to support it.
Supported types
The following types are supported for Structured Outputs:
String
Number
Boolean
Integer
Object
Array
Enum
anyOf
Formats are not supported. Min/max are not supported. Anything beyond the above is not supported. Keep it simple with types and descriptions.
Optionals are not supported.
DO NOT USE FORMATS.
Keep it simple. Don't create too many properties, just the ones that are needed. Don't invent properties.
Return a valid JSON schema object with properties that would capture the information requested in the prompt.`,
          prompt: `Generate a JSON schema for extracting the following information: ${prompt}`,
          temperature: temp,
        },
        markdown: prompt,
      });

      return extract;
    } catch (error) {
      // A thrown value is not necessarily an Error; wrap it so `.message`
      // is always available for the log line and the final error.
      lastError = error instanceof Error ? error : new Error(String(error));
      logger.warn(
        `Failed attempt with temperature ${temp}: ${lastError.message}`,
      );
    }
  }

  // If we get here, all attempts failed
  throw new Error(
    `Failed to generate schema after all attempts. Last error: ${lastError?.message}`,
  );
}