2024-04-30 12:19:43 -07:00
import OpenAI from "openai" ;
import { Document } from "../../lib/entities" ;
2024-05-20 17:07:38 -07:00
import { numTokensFromString } from "./helpers" ;
2024-04-28 15:52:09 -07:00
2024-04-30 12:19:43 -07:00
/**
 * Shape of a single scraper completion result: an extracted payload paired
 * with a URL string.
 */
export type ScraperCompletionResult = {
  /** Extracted payload; untyped, and may be `null`. */
  data: any | null;
  /** URL associated with the payload (presumably the scraped page — confirm against callers). */
  url: string;
};
2024-04-28 13:59:35 -07:00
2024-05-20 17:07:38 -07:00
/** Token count above which the extraction input is trimmed before being sent to the model. */
const maxTokens = 32_000;

/**
 * Multiplier applied to `maxTokens` to obtain the character limit used when
 * trimming (tokens are not characters, so the trim is done on a character budget).
 */
const modifier = 4;

/** System prompt used when the caller does not supply one. */
const defaultPrompt =
  "You are a professional web scraper. Extract the contents of the webpage";
2024-04-28 13:59:35 -07:00
2024-04-28 15:52:09 -07:00
/**
 * Builds the user-message content parts for the chat completion request from
 * a scraped document.
 *
 * @param document - Scraped document; `markdown` is read in "markdown" mode
 *   and `rawHtml` in "raw-html" mode.
 * @param mode - Which representation of the page to extract from.
 * @returns `[contentParts, numTokens]`, where `numTokens` is the token count
 *   of the *untrimmed* text as reported by `numTokensFromString`, or `null`
 *   when the selected representation is empty or missing.
 */
function prepareOpenAIDoc(
  document: Document,
  mode: "markdown" | "raw-html",
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
  let extractionTarget =
    mode === "raw-html" ? document.rawHtml : document.markdown;

  // Nothing to extract from: let the caller decide how to report it.
  if (!extractionTarget) {
    return null;
    // throw new Error(
    //   `${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
    // );
  }

  // Count tokens on the full text; the caller uses this number to warn about trimming.
  const numTokens = numTokensFromString(extractionTarget, "gpt-4");

  if (numTokens > maxTokens) {
    // Trim on a character budget, since tokens != characters.
    let cut = maxTokens * modifier;

    // Do not cut between the two halves of a UTF-16 surrogate pair: that would
    // leave a lone high surrogate at the end of the text sent to the model.
    const lastUnit = extractionTarget.charCodeAt(cut - 1);
    if (
      extractionTarget.length > cut &&
      lastUnit >= 0xd800 &&
      lastUnit <= 0xdbff
    ) {
      cut -= 1;
    }

    extractionTarget = extractionTarget.slice(0, cut);
  }

  return [[{ type: "text", text: extractionTarget }], numTokens];
}
2024-04-29 12:12:55 -07:00
/**
 * Runs LLM-based structured extraction over a scraped document and returns
 * the document with the result attached as `llm_extraction`.
 *
 * Two request shapes are used:
 * - `prompt` given and no `schema`: a JSON-mode completion driven by the
 *   user's free-form request.
 * - otherwise: a forced `extract_content` tool call whose parameters are
 *   `schema`; the tool-call arguments are the extraction.
 *   NOTE(review): `prompt` is not sent in this branch even when provided —
 *   confirm that is intended.
 *
 * @param client - OpenAI client used to issue the chat completion.
 * @param model - Model name; defaults to `process.env.MODEL_NAME`, then "gpt-4o-mini".
 * @param document - Scraped document to extract from.
 * @param schema - JSON schema passed as the tool's `parameters`.
 * @param systemPrompt - System message; defaults to `defaultPrompt`.
 * @param prompt - Free-form user request (used only when `schema` is falsy).
 * @param temperature - Sampling temperature forwarded to the API.
 * @param mode - Which document representation to extract from.
 * @returns The input document plus `llm_extraction`. A `warning` is set when
 *   the content was empty/missing (no extraction is performed) or when the
 *   page had to be trimmed; otherwise any warning already on the document is
 *   left as is.
 * @throws Error "Invalid JSON" when the model output cannot be parsed.
 * @throws Error when the model response contains no `extract_content` tool call.
 */
export async function generateOpenAICompletions({
  client,
  model = process.env.MODEL_NAME || "gpt-4o-mini",
  document,
  schema, //TODO - add zod dynamic type checking
  systemPrompt = defaultPrompt,
  prompt,
  temperature,
  mode,
}: {
  client: OpenAI;
  model?: string;
  document: Document;
  schema: any; // This should be replaced with a proper Zod schema type when available
  prompt?: string;
  systemPrompt?: string;
  temperature?: number;
  mode: "markdown" | "raw-html";
}): Promise<Document> {
  const preparedDoc = prepareOpenAIDoc(document, mode);
  if (preparedDoc === null) {
    return {
      ...document,
      warning:
        "LLM extraction was not performed since the document's content is empty or missing.",
    };
  }
  const [content, numTokens] = preparedDoc;

  let rawJson: string;
  if (prompt && !schema) {
    // Prompt-only extraction: ask for a JSON object shaped by the user's request.
    const jsonCompletion = await client.chat.completions.create({
      model,
      messages: [
        {
          role: "system",
          content: systemPrompt,
        },
        { role: "user", content },
        {
          role: "user",
          content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
        },
      ],
      response_format: { type: "json_object" },
      temperature,
    });

    // A missing choice or empty content becomes "", which fails parsing below
    // and surfaces as "Invalid JSON" instead of a TypeError.
    rawJson = (jsonCompletion.choices[0]?.message?.content ?? "").trim();
  } else {
    // Schema-driven extraction: force a call to `extract_content` so the
    // arguments conform to the supplied schema.
    const completion = await client.chat.completions.create({
      model,
      messages: [
        {
          role: "system",
          content: systemPrompt,
        },
        { role: "user", content },
      ],
      tools: [
        {
          type: "function",
          function: {
            name: "extract_content",
            description: "Extracts the content from the given webpage(s)",
            parameters: schema,
          },
        },
      ],
      tool_choice: { type: "function", function: { name: "extract_content" } },
      temperature,
    });

    // The response may come back without choices or tool calls (e.g. refusal
    // or truncation); fail with a clear message rather than a TypeError.
    const toolArguments =
      completion.choices[0]?.message?.tool_calls?.[0]?.function?.arguments;
    if (typeof toolArguments !== "string") {
      throw new Error(
        "LLM response did not include the expected extract_content tool call",
      );
    }
    rawJson = toolArguments;
  }

  // Extract the LLM extraction content from the completion response
  let llmExtraction;
  try {
    llmExtraction = JSON.parse(rawJson);
  } catch {
    throw new Error("Invalid JSON");
  }

  // Return the document with the LLM extraction content added. The warning key
  // is only written when trimming happened, so an existing warning on the
  // document is not overwritten with `undefined`.
  return {
    ...document,
    llm_extraction: llmExtraction,
    ...(numTokens > maxTokens
      ? {
          warning: `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`,
        }
      : {}),
  };
}