2024-11-07 20:57:33 +01:00
import OpenAI from "openai" ;
import { encoding_for_model } from "@dqbd/tiktoken" ;
import { TiktokenModel } from "@dqbd/tiktoken" ;
2025-01-22 18:47:44 -03:00
import {
Document ,
ExtractOptions ,
TokenUsage ,
} from "../../../controllers/v1/types" ;
2024-11-07 20:57:33 +01:00
import { Logger } from "winston" ;
import { EngineResultsTracker , Meta } from ".." ;
2025-01-14 11:37:00 -03:00
import { logger } from "../../../lib/logger" ;
2024-11-07 20:57:33 +01:00
// Token budget for the markdown sent to the LLM: inputs whose token count
// exceeds this value are trimmed before the completion request is made.
const maxTokens = 32000 ;
// Multiplier applied to maxTokens to obtain the character count used when
// slicing the markdown (the trim is character-based, not token-based).
// NOTE(review): presumably an approximate characters-per-token ratio -- confirm.
const modifier = 4 ;
/**
 * Error thrown when the LLM declines to produce an extraction, or when its
 * output cannot be used as one.
 *
 * The error message is fixed; the model-provided (or locally generated)
 * explanation is available on `refusal`.
 */
export class LLMRefusalError extends Error {
  /** Refusal text supplied when the error was constructed. */
  public refusal: string;
  /**
   * Optional engine results attached after construction.
   * NOTE(review): never assigned in this file -- presumably set by callers; confirm.
   */
  public results: EngineResultsTracker | undefined;

  /**
   * @param refusal - Explanation of why the extraction was refused.
   */
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    // Give the error a recognisable name so logs and serialisers do not
    // report it as a generic "Error".
    this.name = "LLMRefusalError";
    // Keep `instanceof LLMRefusalError` working even when the class is
    // down-levelled to a target where extending Error loses the prototype.
    Object.setPrototypeOf(this, new.target.prototype);
    this.refusal = refusal;
  }
}
/**
 * Recursively rewrites a JSON schema so that every `type: "object"` node
 * lists all of its properties as `required` and sets
 * `additionalProperties: false`.
 *
 * Sub-schemas under `$defs`, `anyOf`, `oneOf`, `allOf`, `not`, `properties`
 * and `items` are normalised as well. The input is never mutated; a new
 * object is returned for every object node.
 *
 * @param x - Schema node (or any other value, which is returned unchanged).
 * @returns The normalised schema node.
 */
function normalizeSchema(x: any): any {
  // Primitives, null and arrays are not schema nodes: hand them back as-is.
  if (typeof x !== "object" || x === null || Array.isArray(x)) return x;

  // Work on a shallow copy so the caller's schema object is left untouched.
  const node: any = { ...x };

  if (node["$defs"] !== null && typeof node["$defs"] === "object") {
    node["$defs"] = Object.fromEntries(
      Object.entries(node["$defs"]).map(([name, schema]) => [
        name,
        normalizeSchema(schema),
      ]),
    );
  }

  // Combinator keywords hold lists of sub-schemas; anything that is not an
  // array is left alone instead of crashing on `.map`.
  for (const keyword of ["anyOf", "oneOf", "allOf"]) {
    if (Array.isArray(node[keyword])) {
      node[keyword] = node[keyword].map((sub: any) => normalizeSchema(sub));
    }
  }

  if (node.not) {
    node.not = normalizeSchema(node.not);
  }

  if (node.type === "object") {
    // An object schema may legitimately omit `properties`; treat that as an
    // empty property map rather than throwing on Object.entries(undefined).
    const properties: Record<string, any> =
      typeof node.properties === "object" && node.properties !== null
        ? node.properties
        : {};
    return {
      ...node,
      properties: Object.fromEntries(
        Object.entries(properties).map(([k, v]) => [k, normalizeSchema(v)]),
      ),
      required: Object.keys(properties),
      additionalProperties: false,
    };
  }

  if (node.type === "array") {
    return {
      ...node,
      items: normalizeSchema(node.items),
    };
  }

  return node;
}
2024-12-11 19:46:11 -03:00
/**
 * Asks an OpenAI chat model to turn a markdown document into structured JSON.
 *
 * Steps:
 *  1. Count the tokens of `markdown`; if the count exceeds `maxTokens` (or
 *     cannot be computed) the markdown is cut to `maxTokens * modifier`
 *     characters and a warning is produced.
 *  2. Prepare the schema: strip `default` keywords, wrap array schemas and
 *     bare property maps into an object schema, then normalise it.
 *  3. Request a completion and read the parsed result, falling back to
 *     `JSON.parse` on the raw content when no parsed value is available.
 *
 * @param logger - Logger used for warnings and parse failures.
 * @param options - Extraction options (`schema`, `systemPrompt`, `prompt`).
 * @param markdown - Document content; must be defined.
 * @param previousWarning - Existing warning appended to any new warning.
 * @param isExtractEndpoint - Selects how raw JSON content is unwrapped when
 *   the fallback parse is used.
 * @param model - Model name; defaults to `MODEL_NAME` or "gpt-4o-mini".
 * @returns The extracted data, the token count of the (untrimmed) markdown,
 *   an optional warning, the usage reported by the API and the model name.
 * @throws Error when `markdown` is undefined.
 * @throws LLMRefusalError when the model refuses or returns unparseable JSON.
 */
export async function generateOpenAICompletions(
  logger: Logger,
  options: ExtractOptions,
  markdown?: string,
  previousWarning?: string,
  isExtractEndpoint?: boolean,
  model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ??
    "gpt-4o-mini",
): Promise<{
  extract: any;
  numTokens: number;
  warning: string | undefined;
  totalUsage: TokenUsage;
  model: string;
}> {
  let extract: any;
  let warning: string | undefined;
  const openai = new OpenAI();

  if (markdown === undefined) {
    throw new Error("document.markdown is undefined -- this is unexpected");
  }

  // count number of tokens
  let numTokens = 0;
  let encoder: ReturnType<typeof encoding_for_model> | undefined;
  try {
    // Creating the encoder is inside the try block: an unknown model name
    // makes it throw, and that should take the same best-effort trimming
    // path as an encoding failure instead of aborting the extraction.
    encoder = encoding_for_model(model as TiktokenModel);
    // Encode the message into tokens
    const tokens = encoder.encode(markdown);
    // Return the number of tokens
    numTokens = tokens.length;
  } catch (error) {
    logger.warn("Calculating num tokens of string failed", { error, markdown });
    markdown = markdown.slice(0, maxTokens * modifier);
    const w =
      "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" +
      maxTokens +
      ") we support.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  } finally {
    // Free the encoder resources after use (it may never have been created).
    encoder?.free();
  }

  if (numTokens > maxTokens) {
    // trim the document to the maximum number of tokens, tokens != characters
    markdown = markdown.slice(0, maxTokens * modifier);
    const w =
      "The extraction content would have used more tokens (" +
      numTokens +
      ") than the maximum we allow (" +
      maxTokens +
      "). -- the input has been automatically trimmed.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  }

  let schema = options.schema;
  if (schema) {
    schema = removeDefaultProperty(schema);
  }

  if (schema && schema.type === "array") {
    // The response format needs an object at the root, so the array schema
    // is wrapped under `items`. The default-stripped schema is used here
    // (not options.schema) so `default` keywords do not leak back in.
    schema = {
      type: "object",
      properties: {
        items: schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  } else if (schema && typeof schema === "object" && !schema.type) {
    // A schema without `type` is treated as a bare map of property schemas.
    schema = {
      type: "object",
      properties: Object.fromEntries(
        Object.entries(schema).map(([key, value]) => {
          return [key, removeDefaultProperty(value)];
        }),
      ),
      required: Object.keys(schema),
      additionalProperties: false,
    };
  }

  schema = normalizeSchema(schema);

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    temperature: 0,
    messages: [
      {
        role: "system",
        content: options.systemPrompt,
      },
      {
        role: "user",
        content: [{ type: "text", text: markdown }],
      },
      {
        role: "user",
        content:
          options.prompt !== undefined
            ? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}. If schema is provided, strictly follow it.`
            : "Transform the above content into structured JSON output based on the provided schema if any.",
      },
    ],
    response_format: options.schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "schema",
            schema: schema,
            strict: true,
          },
        }
      : { type: "json_object" },
  });

  const message = jsonCompletion.choices[0].message;

  // A missing `refusal` field (undefined) is not a refusal; only an actual
  // value is.
  if (message.refusal !== null && message.refusal !== undefined) {
    throw new LLMRefusalError(message.refusal);
  }

  extract = message.parsed;

  // No parsed value available: fall back to parsing the raw content.
  if (
    (extract === null || extract === undefined) &&
    message.content !== null &&
    message.content !== undefined
  ) {
    try {
      if (!isExtractEndpoint) {
        extract = JSON.parse(message.content);
      } else {
        const extractData = JSON.parse(message.content);
        extract = options.schema ? extractData.data.extract : extractData;
      }
    } catch (e) {
      logger.error("Failed to parse returned JSON, no schema specified.", {
        error: e,
      });
      throw new LLMRefusalError(
        "Failed to parse returned JSON. Please specify a schema in the extract object.",
      );
    }
  }

  const promptTokens = jsonCompletion.usage?.prompt_tokens ?? 0;
  const completionTokens = jsonCompletion.usage?.completion_tokens ?? 0;

  // If the users actually wants the items object, they can specify it as 'required' in the schema
  // otherwise, we just return the items array
  // NOTE(review): for array schemas the wrapper built above always carries
  // required: ["items"], so this condition looks like it can never be true
  // and callers currently receive `{ items: [...] }`. Left unchanged because
  // fixing it alters the shape returned to API callers -- confirm intent.
  if (
    options.schema &&
    options.schema.type === "array" &&
    !schema?.required?.includes("items")
  ) {
    extract = extract?.items;
  }

  // num tokens (just user prompt tokenized) | deprecated
  // totalTokens = promptTokens + completionTokens
  return {
    extract,
    warning,
    numTokens,
    totalUsage: {
      promptTokens,
      completionTokens,
      totalTokens: promptTokens + completionTokens,
    },
    model,
  };
}
2024-11-07 20:57:33 +01:00
2024-12-11 19:46:11 -03:00
/**
 * Runs LLM extraction for a scraped document when the "extract" format was
 * requested, storing the result on the document.
 *
 * The result is written to `document.json` when the "json" format is also
 * requested, otherwise to `document.extract`.
 *
 * @param meta - Scrape context providing the requested formats, the extract
 *   options and a logger.
 * @param document - Document to enrich; it is mutated and returned.
 * @returns The same `document` instance.
 */
export async function performLLMExtract(
  meta: Meta,
  document: Document,
): Promise<Document> {
  if (!meta.options.formats.includes("extract")) {
    return document;
  }

  const { extract, warning } = await generateOpenAICompletions(
    meta.logger.child({
      method: "performLLMExtract/generateOpenAICompletions",
    }),
    meta.options.extract!,
    document.markdown,
    document.warning,
  );

  if (meta.options.formats.includes("json")) {
    document.json = extract;
  } else {
    document.extract = extract;
  }

  // The completion helper only returns a warning when it produced a new one
  // (which then already includes the previous warning). Assigning
  // unconditionally would wipe an existing document warning with undefined.
  if (warning !== undefined) {
    document.warning = warning;
  }

  return document;
}
2024-12-16 11:41:59 -03:00
2024-12-16 09:30:40 -03:00
/**
 * Returns a deep copy of a JSON schema with every `default` keyword removed.
 *
 * - Non-object values and `null` are returned unchanged.
 * - Arrays are mapped element-wise and stay arrays.
 * - The entries of a `properties` map are field names, not keywords, so a
 *   field that happens to be called "default" is kept (its own schema is
 *   still cleaned).
 *
 * The input is not mutated.
 *
 * @param schema - Schema node or arbitrary value.
 * @returns The cleaned copy.
 */
export function removeDefaultProperty(schema: any): any {
  if (typeof schema !== "object" || schema === null) return schema;

  // Arrays must remain arrays; destructuring one would yield an
  // index-keyed plain object.
  if (Array.isArray(schema)) {
    return schema.map((item: any) => removeDefaultProperty(item));
  }

  const cleaned: Record<string, any> = {};
  for (const [key, value] of Object.entries(schema)) {
    // Drop the `default` keyword of this schema node.
    if (key === "default") continue;

    if (
      key === "properties" &&
      typeof value === "object" &&
      value !== null &&
      !Array.isArray(value)
    ) {
      // Keys here are user-defined field names: keep all of them and only
      // clean the schema attached to each one.
      cleaned[key] = Object.fromEntries(
        Object.entries(value as Record<string, any>).map(([name, sub]) => [
          name,
          removeDefaultProperty(sub),
        ]),
      );
    } else {
      cleaned[key] = removeDefaultProperty(value);
    }
  }
  return cleaned;
}
2025-01-14 11:37:00 -03:00
/**
 * Asks the LLM to produce a JSON schema describing the data requested in a
 * natural-language prompt.
 *
 * The request is attempted once per temperature in `[0, 0.1, 0.3]`; the
 * first response that parses as JSON is returned. Failed attempts are logged
 * as warnings.
 *
 * @param prompt - Description of the information to extract.
 * @returns The parsed JSON schema object produced by the model.
 * @throws Error when every attempt fails; the message includes the last
 *   failure's message.
 */
export async function generateSchemaFromPrompt(prompt: string): Promise<any> {
  const openai = new OpenAI();
  const temperatures = [0, 0.1, 0.3]; // Different temperatures to try
  let lastError: Error | null = null;

  for (const temp of temperatures) {
    try {
      const result = await openai.beta.chat.completions.parse({
        model: "gpt-4o",
        temperature: temp,
        messages: [
          {
            role: "system",
            content: `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt.
Consider:
1. The type of data being requested
2. Required fields vs optional fields
3. Appropriate data types for each field
4. Nested objects and arrays where appropriate
Valid JSON schema, has to be simple. No crazy properties. OpenAI has to support it.
Supported types
The following types are supported for Structured Outputs:
String
Number
Boolean
Integer
Object
Array
Enum
anyOf
Formats are not supported. Min/max are not supported. Anything beyond the above is not supported. Keep it simple with types and descriptions.
Optionals are not supported.
DO NOT USE FORMATS.
Keep it simple. Don't create too many properties, just the ones that are needed. Don't invent properties.
Return a valid JSON schema object with properties that would capture the information requested in the prompt.`,
          },
          {
            role: "user",
            content: `Generate a JSON schema for extracting the following information: ${prompt}`,
          },
        ],
        response_format: {
          type: "json_object",
        },
      });

      const message = result.choices[0].message;

      // A missing `refusal` field (undefined) is not a refusal.
      if (message.refusal !== null && message.refusal !== undefined) {
        throw new Error("LLM refused to generate schema");
      }

      try {
        return JSON.parse(message.content ?? "");
      } catch (e) {
        throw new Error("Failed to parse schema JSON from LLM response");
      }
    } catch (error) {
      // Thrown values are not guaranteed to be Error instances; normalise so
      // the log line and the final message never read "undefined".
      lastError = error instanceof Error ? error : new Error(String(error));
      logger.warn(
        `Failed attempt with temperature ${temp}: ${lastError.message}`,
      );
    }
  }

  // If we get here, all attempts failed
  throw new Error(
    `Failed to generate schema after all attempts. Last error: ${lastError?.message}`,
  );
}