2024-04-30 12:19:43 -07:00
import OpenAI from "openai" ;
import { Document } from "../../lib/entities" ;
2024-05-20 17:07:38 -07:00
import { numTokensFromString } from "./helpers" ;
2024-04-28 15:52:09 -07:00
2024-04-30 12:19:43 -07:00
/**
 * Pairs a completion payload with the URL it belongs to.
 *
 * `data` is intentionally untyped (and may be `null`); `url` is always a string.
 */
export type ScraperCompletionResult = {
  /** Arbitrary payload, or `null` when there is none. */
  data: any | null;
  /** URL associated with the payload. */
  url: string;
};
2024-04-28 13:59:35 -07:00
2024-05-20 17:07:38 -07:00
/** Token count above which the extraction input is trimmed before being sent. */
const maxTokens = 32000;

/**
 * Multiplier applied to `maxTokens` to obtain the character length kept when
 * trimming (the trim is done on characters, not tokens).
 */
const modifier = 4;

/** System prompt used when the caller does not supply one. */
const defaultPrompt =
  "You are a professional web scraper. Extract the contents of the webpage";
2024-04-28 13:59:35 -07:00
2024-04-28 15:52:09 -07:00
/**
 * Builds the user-message content for an extraction request.
 *
 * Reads `document.rawHtml` when `mode` is "raw-html" and `document.markdown`
 * otherwise, counts its tokens, and trims it when the count exceeds `maxTokens`.
 *
 * @param document - Document whose content is sent to the model.
 * @param mode - Which representation of the document to use.
 * @returns A tuple of `[content parts, token count]`. The token count is the
 *   one measured BEFORE trimming, so callers can compare it against
 *   `maxTokens` to tell whether the content was cut.
 * @throws Error when the selected content is missing or empty.
 */
function prepareOpenAIDoc(
  document: Document,
  mode: "markdown" | "raw-html"
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
  const selectedContent =
    mode === "raw-html" ? document.rawHtml : document.markdown;

  // Fail early if the representation requested by `mode` is absent or empty.
  if (!selectedContent) {
    throw new Error(
      ` ${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai `
    );
  }

  // Tokens are always counted with the "gpt-4" encoding name, independent of
  // whichever model the caller later uses for the completion.
  const numTokens = numTokensFromString(selectedContent, "gpt-4");

  // Tokens != characters: the cut keeps the first `maxTokens * modifier`
  // characters as an approximation of the token budget.
  const text =
    numTokens > maxTokens
      ? selectedContent.slice(0, maxTokens * modifier)
      : selectedContent;

  return [[{ type: "text", text }], numTokens];
}
2024-04-29 12:12:55 -07:00
/**
 * Runs an LLM extraction over a document and returns the document with the
 * extraction attached.
 *
 * The request forces the model to call a single `extract_content` function
 * tool whose parameters are `schema`; the tool-call arguments are parsed as
 * JSON and stored under `llm_extraction`.
 *
 * @param client - OpenAI client used to issue the chat completion.
 * @param model - Model name; defaults to `process.env.MODEL_NAME`, then "gpt-4o".
 * @param document - Document to extract from; its fields are copied into the result.
 * @param schema - Passed through unchanged as the tool's `parameters`.
 * @param prompt - System prompt; defaults to `defaultPrompt`.
 * @param temperature - Forwarded to the completion request as-is.
 * @param mode - Which document representation to send ("markdown" or "raw-html").
 * @returns The input document plus `llm_extraction` and a `warning` that is
 *   set only when the content exceeded `maxTokens` (otherwise `undefined`).
 * @throws Error when the response carries no tool-call arguments or when those
 *   arguments are not valid JSON. Errors from `prepareOpenAIDoc` and from the
 *   OpenAI client propagate unchanged.
 */
export async function generateOpenAICompletions({
  client,
  model = process.env.MODEL_NAME || "gpt-4o",
  document,
  schema, //TODO - add zod dynamic type checking
  prompt = defaultPrompt,
  temperature,
  mode,
}: {
  client: OpenAI;
  model?: string;
  document: Document;
  schema: any; // This should be replaced with a proper Zod schema type when available
  prompt?: string;
  temperature?: number;
  mode: "markdown" | "raw-html";
}): Promise<Document> {
  const [content, numTokens] = prepareOpenAIDoc(document, mode);

  const completion = await client.chat.completions.create({
    model,
    messages: [
      { role: "system", content: prompt },
      { role: "user", content },
    ],
    tools: [
      {
        type: "function",
        function: {
          name: "extract_content",
          description: "Extracts the content from the given webpage(s)",
          parameters: schema,
        },
      },
    ],
    // Force the model to answer through the extraction tool.
    tool_choice: { type: "function", function: { name: "extract_content" } },
    temperature,
  });

  // Walk the response defensively: a missing choice or tool call should yield
  // a descriptive error rather than a "cannot read properties of undefined".
  const rawArguments =
    completion.choices[0]?.message?.tool_calls?.[0]?.function?.arguments;
  if (typeof rawArguments !== "string") {
    throw new Error(
      "LLM extraction failed: the completion did not contain an extract_content tool call."
    );
  }

  // Extract the LLM extraction content from the completion response.
  // Left untyped on purpose: the shape is dictated by the caller's `schema`.
  let llmExtraction: any;
  try {
    llmExtraction = JSON.parse(rawArguments);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(
      `LLM extraction failed: tool call arguments are not valid JSON (${reason}).`
    );
  }

  // Return the document with the LLM extraction content added.
  return {
    ...document,
    llm_extraction: llmExtraction,
    warning:
      numTokens > maxTokens
        ? ` Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you. `
        : undefined,
  };
}
2024-04-30 18:36:21 -07:00