2024-11-07 20:57:33 +01:00
import OpenAI from "openai" ;
import { encoding_for_model } from "@dqbd/tiktoken" ;
import { TiktokenModel } from "@dqbd/tiktoken" ;
import { Document , ExtractOptions } from "../../../controllers/v1/types" ;
import { Logger } from "winston" ;
import { EngineResultsTracker , Meta } from ".." ;
// Largest token count accepted for the markdown input; anything longer is trimmed before the LLM call.
const maxTokens = 32000 ;
// Multiplier that turns the token limit into a character limit when trimming
// (the markdown is cut to `maxTokens * modifier` characters, since tokens != characters).
const modifier = 4 ;
/**
 * Error thrown when the LLM does not produce a usable extraction.
 *
 * The fixed `message` is generic; the specific reason is carried in `refusal`.
 */
export class LLMRefusalError extends Error {
  /** Reason text supplied when the error was created. */
  public refusal: string;
  /** Optional engine results; not set by this class, left for callers to attach. */
  public results: EngineResultsTracker | undefined;

  /**
   * @param refusal - Reason text to expose on the `refusal` property.
   */
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    // Without this the error reports itself as a plain "Error" in logs and stack traces.
    this.name = "LLMRefusalError";
    this.refusal = refusal;
  }
}
/**
 * Recursively rewrites a JSON-schema-like value so that every `type: "object"`
 * node lists all of its properties as `required` and sets
 * `additionalProperties: false`.
 *
 * Recurses through `$defs`, `anyOf`, `oneOf`, `allOf`, `not`, object
 * `properties` and array `items`. The input is never modified: every schema
 * object visited is shallow-copied before it is changed.
 *
 * @param x - Schema node to normalize. Non-objects, `null` and arrays are
 *            returned unchanged.
 * @returns The normalized copy of `x` (or `x` itself when it is not a schema object).
 */
function normalizeSchema(x: any): any {
  if (typeof x !== "object" || x === null || Array.isArray(x)) return x;

  // Work on a shallow copy so the caller's schema object is left untouched.
  const schema: any = { ...x };

  if (schema["$defs"] !== null && typeof schema["$defs"] === "object") {
    schema["$defs"] = Object.fromEntries(
      Object.entries(schema["$defs"]).map(([name, def]) => [name, normalizeSchema(def)]),
    );
  }

  for (const keyword of ["anyOf", "oneOf", "allOf"]) {
    if (Array.isArray(schema[keyword])) {
      schema[keyword] = schema[keyword].map((sub: any) => normalizeSchema(sub));
    }
  }

  if (schema.not) {
    schema.not = normalizeSchema(schema.not);
  }

  if (schema.type === "object") {
    // An object schema may omit `properties`; treat that as "no properties"
    // instead of crashing in Object.entries(undefined).
    const properties = schema.properties ?? {};
    return {
      ...schema,
      properties: Object.fromEntries(
        Object.entries(properties).map(([key, value]) => [key, normalizeSchema(value)]),
      ),
      // Every declared property becomes required, replacing any `required` list on the input.
      required: Object.keys(properties),
      additionalProperties: false,
    };
  }

  if (schema.type === "array") {
    return {
      ...schema,
      items: normalizeSchema(schema.items),
    };
  }

  return schema;
}
2024-11-20 13:09:46 -08:00
/**
 * Asks an OpenAI chat model to turn `markdown` into structured JSON.
 *
 * Steps: count the tokens of `markdown` (trimming it to
 * `maxTokens * modifier` characters when it is too long or cannot be counted),
 * shape `options.schema` into a strict object schema, call the chat completion
 * endpoint, then read the parsed result.
 *
 * @param logger - Receives a warning when token counting fails and an error
 *                 when the returned content is not valid JSON.
 * @param options - Extraction options; `schema`, `systemPrompt` and `prompt` are read.
 * @param markdown - Content to extract from. Must be defined.
 * @param previousWarning - Warning already attached to the document. It is
 *                          returned unchanged when no new warning arises, and
 *                          appended after the new warning otherwise.
 * @returns `extract` (the parsed JSON), `numTokens` (token count of the
 *          untrimmed markdown, or 0 when counting failed) and `warning`.
 * @throws Error when `markdown` is undefined.
 * @throws LLMRefusalError when the model refuses, or when the response has no
 *         parsed result and its content is not valid JSON.
 */
export async function generateOpenAICompletions(
  logger: Logger,
  options: ExtractOptions,
  markdown?: string,
  previousWarning?: string,
): Promise<{ extract: any, numTokens: number, warning: string | undefined }> {
  let extract: any;
  // Start from the caller's warning so it is not lost when this call adds none.
  let warning: string | undefined = previousWarning;

  const openai = new OpenAI();
  const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  if (markdown === undefined) {
    throw new Error("document.markdown is undefined -- this is unexpected");
  }

  // --- Count tokens ---------------------------------------------------------
  let numTokens = 0;
  let encoder: ReturnType<typeof encoding_for_model> | undefined;
  try {
    // Created inside the try so that a failure to build the encoder takes the
    // same trim-and-warn fallback as a failure to encode.
    encoder = encoding_for_model(model);
    numTokens = encoder.encode(markdown).length;
  } catch (error) {
    logger.warn("Calculating num tokens of string failed", { error, markdown });

    markdown = markdown.slice(0, maxTokens * modifier);

    const w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  } finally {
    // Free the encoder resources after use (if one was created).
    encoder?.free();
  }

  if (numTokens > maxTokens) {
    // Trim the document to the maximum number of tokens; tokens != characters,
    // so the cut is made at an approximate character count.
    markdown = markdown.slice(0, maxTokens * modifier);

    const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  }

  // --- Shape the schema -----------------------------------------------------
  let schema = options.schema;
  if (schema && schema.type === "array") {
    // Wrap a top-level array in an object under the key `items`.
    schema = {
      type: "object",
      properties: {
        items: options.schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  } else if (schema && typeof schema === "object" && !schema.type) {
    // A schema without `type` is treated as a { fieldName: typeName } map.
    schema = {
      type: "object",
      properties: Object.fromEntries(
        Object.entries(schema).map(([key, value]) => [key, { type: value }]),
      ),
      required: Object.keys(schema),
      additionalProperties: false,
    };
  }

  schema = normalizeSchema(schema);

  // --- Call the model -------------------------------------------------------
  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    temperature: 0,
    messages: [
      {
        role: "system",
        content: options.systemPrompt,
      },
      {
        role: "user",
        content: [{ type: "text", text: markdown }],
      },
      {
        role: "user",
        content: options.prompt !== undefined
          ? ` Transform the above content into structured JSON output based on the following user request: ${options.prompt} `
          : "Transform the above content into structured JSON output.",
      },
    ],
    response_format: options.schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "websiteContent",
            schema: schema,
            strict: true,
          },
        }
      : { type: "json_object" },
  });

  const message = jsonCompletion.choices[0].message;

  // `!= null` on purpose: a missing `refusal` field is treated like `null`,
  // so only an actual refusal value raises.
  if (message.refusal != null) {
    throw new LLMRefusalError(message.refusal);
  }

  // --- Read the result ------------------------------------------------------
  extract = message.parsed;

  if (extract === null && message.content !== null) {
    try {
      extract = JSON.parse(message.content);
    } catch (e) {
      logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
      throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
    }
  }

  // If the user actually wants the items object, they can specify it as 'required' in the schema;
  // otherwise, we just return the items array.
  // The check must look at the caller's schema: the wrapper built above always
  // lists "items" as required, so testing it would never unwrap.
  const requestedSchema = options.schema;
  if (requestedSchema && requestedSchema.type === "array") {
    const keepItemsWrapper =
      Array.isArray(requestedSchema.required) && requestedSchema.required.includes("items");
    if (!keepItemsWrapper) {
      extract = extract?.items;
    }
  }

  return { extract, warning, numTokens };
}
/**
 * Runs LLM extraction for a document when the "extract" format was requested.
 *
 * When `meta.options.formats` does not include "extract" the document is
 * returned as-is. Otherwise the document's markdown and current warning are
 * passed to `generateOpenAICompletions`, and the resulting `extract` and
 * `warning` are written back onto the same document object.
 *
 * @param meta - Scrape context; `options.formats`, `options.extract` and `logger` are read.
 * @param document - Document to enrich; modified in place.
 * @returns The same `document` instance.
 */
export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
  const extractRequested = meta.options.formats.includes("extract");
  if (!extractRequested) {
    return document;
  }

  const extractionLogger = meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" });
  const completion = await generateOpenAICompletions(
    extractionLogger,
    meta.options.extract!,
    document.markdown,
    document.warning,
  );

  document.extract = completion.extract;
  document.warning = completion.warning;

  return document;
}