apps/api/src/lib/LLM-extraction/models.ts

import OpenAI from "openai";
import { Document } from "../../lib/entities";
import { numTokensFromString } from "./helpers";

/**
 * Pairs an untyped `data` payload with a `url` string.
 *
 * NOTE(review): this type is not referenced by the other definitions visible
 * in this file; presumably it is consumed by other modules — confirm before
 * changing its shape.
 */
export type ScraperCompletionResult = {
  // Untyped payload; may be null. Presumably the extracted content — confirm against callers.
  data: any | null;
  // Presumably the URL of the page the data was extracted from — confirm against callers.
  url: string;
};

// Token budget for the text sent to the model: content whose token count
// exceeds this value is trimmed before the request is made.
const maxTokens = 32_000;

// Rough characters-per-token factor. Trimming is done on characters, so the
// character cut-off is computed as `maxTokens * modifier`.
const modifier = 4;

// System prompt used when the caller does not provide one.
const defaultPrompt = "You are a professional web scraper. Extract the contents of the webpage";

/**
 * Builds the user-message content for an extraction request from a scraped
 * document.
 *
 * Selects `document.rawHtml` when `mode` is "raw-html" and `document.markdown`
 * otherwise, counts its tokens, and trims the text when the count exceeds
 * `maxTokens`.
 *
 * @param document - Scraped document providing the text to extract from.
 * @param mode - Which representation of the page to send to the model.
 * @returns A tuple of (1) a single text content part holding the possibly
 *   trimmed text and (2) the token count measured BEFORE trimming, so callers
 *   can tell whether trimming happened by comparing it with `maxTokens`.
 * @throws Error when the selected content is missing or empty.
 */
function prepareOpenAIDoc(
  document: Document,
  mode: "markdown" | "raw-html"
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
  let extractionTarget =
    mode === "raw-html" ? document.rawHtml : document.markdown;

  // The content for the requested mode must exist (and be non-empty).
  if (!extractionTarget) {
    throw new Error(
      `${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
    );
  }

  // NOTE(review): the token counter is always asked about "gpt-4", regardless
  // of which model later runs the completion — presumably this makes the count
  // an estimate for other models; confirm against the helper.
  const numTokens = numTokensFromString(extractionTarget, "gpt-4");

  if (numTokens > maxTokens) {
    // Tokens != characters: the cut is made on characters, using `modifier`
    // characters per token as an approximation of the token budget.
    let charBudget = maxTokens * modifier;

    // Avoid cutting a UTF-16 surrogate pair in half: if the last kept code
    // unit is a high surrogate, drop it so the text does not end in a lone
    // surrogate. (charCodeAt yields NaN past the end, which fails the check.)
    const lastKeptUnit = extractionTarget.charCodeAt(charBudget - 1);
    if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) {
      charBudget -= 1;
    }

    extractionTarget = extractionTarget.slice(0, charBudget);
  }

  return [[{ type: "text", text: extractionTarget }], numTokens];
}

/**
 * Runs a structured extraction over a scraped document using OpenAI tool
 * calling and returns the document with the result attached.
 *
 * The model is forced (via `tool_choice`) to call a single function,
 * `extract_content`, whose parameters are described by `schema`; the JSON
 * arguments of that call are parsed and stored as `llm_extraction`.
 *
 * @param client - OpenAI client used to issue the chat completion request.
 * @param model - Model name; defaults to `process.env.MODEL_NAME`, falling
 *   back to "gpt-4o" when that is unset or empty.
 * @param document - Scraped document to extract from.
 * @param schema - JSON schema passed as the tool's `parameters`.
 * @param prompt - System prompt; defaults to the module's default prompt.
 * @param temperature - Sampling temperature forwarded to the API as-is.
 * @param mode - Whether to extract from the document's markdown or raw HTML.
 * @returns A copy of `document` with `llm_extraction` set, and `warning` set
 *   to a trim notice when the content exceeded `maxTokens` (otherwise
 *   `undefined`).
 * @throws Error when the selected document content is missing, or when the
 *   response contains no tool call to read arguments from.
 * @throws SyntaxError when the tool-call arguments are not valid JSON.
 */
export async function generateOpenAICompletions({
  client,
  model = process.env.MODEL_NAME || "gpt-4o",
  document,
  schema, //TODO - add zod dynamic type checking
  prompt = defaultPrompt,
  temperature,
  mode
}: {
  client: OpenAI;
  model?: string;
  document: Document;
  schema: any; // This should be replaced with a proper Zod schema type when available
  prompt?: string;
  temperature?: number;
  mode: "markdown" | "raw-html";
}): Promise<Document> {
  const [content, numTokens] = prepareOpenAIDoc(document, mode);

  const completion = await client.chat.completions.create({
    model,
    messages: [
      {
        role: "system",
        content: prompt,
      },
      { role: "user", content },
    ],
    tools: [
      {
        type: "function",
        function: {
          name: "extract_content",
          description: "Extracts the content from the given webpage(s)",
          parameters: schema,
        },
      },
    ],
    tool_choice: { type: "function", function: { name: "extract_content" } },
    temperature,
  });

  // `choices` may be empty and `tool_calls` is optional on the response
  // message, so guard the whole chain instead of crashing with an opaque
  // TypeError on a missing property.
  const toolCallArguments =
    completion.choices[0]?.message?.tool_calls?.[0]?.function?.arguments;

  if (typeof toolCallArguments !== "string") {
    throw new Error(
      "LLM extraction failed: the model response did not contain an extract_content tool call."
    );
  }

  // The arguments are model-generated text and are not guaranteed to be valid
  // JSON; keep the SyntaxError type but say where the bad JSON came from.
  let llmExtraction;
  try {
    llmExtraction = JSON.parse(toolCallArguments);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new SyntaxError(
      `LLM extraction failed: the tool call arguments are not valid JSON (${reason})`
    );
  }

  // NOTE(review): `warning` is listed after the spread, so it replaces any
  // `warning` already present on `document` — with `undefined` when nothing
  // was trimmed. Confirm that dropping an upstream warning is intended.
  return {
    ...document,
    llm_extraction: llmExtraction,
    warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
  };
}
Nick: cleanup 2024-04-30 12:19:43 -07:00			`import OpenAI from "openai";`
			`import { Document } from "../../lib/entities";`
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			`import { numTokensFromString } from "./helpers";`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00
Nick: cleanup 2024-04-30 12:19:43 -07:00			`export type ScraperCompletionResult = {`
			`data: any \| null;`
			`url: string;`
			`};`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			`const maxTokens = 32000;`
			`const modifier = 4;`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`const defaultPrompt =`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`"You are a professional web scraper. Extract the contents of the webpage";`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`function prepareOpenAIDoc(`
init 2024-06-28 16:39:09 -04:00			`document: Document,`
			`mode: "markdown" \| "raw-html"`
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			`): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {`
init 2024-06-28 16:39:09 -04:00
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			`let markdown = document.markdown;`

init 2024-06-28 16:39:09 -04:00			`let extractionTarget = document.markdown;`

			`if (mode === "raw-html") {`
			`extractionTarget = document.rawHtml;`
			`}`

			`// Check if the markdown content exists in the document`
			`if (!extractionTarget) {`
Nick: improvements 2024-04-30 16:19:32 -07:00			`throw new Error(`
init 2024-06-28 16:39:09 -04:00			`${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
Nick: improvements 2024-04-30 16:19:32 -07:00			`);`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`}`

init 2024-06-28 16:39:09 -04:00


Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			`// count number of tokens`
init 2024-06-28 16:39:09 -04:00			`const numTokens = numTokensFromString(extractionTarget, "gpt-4");`
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00
			`if (numTokens > maxTokens) {`
			`// trim the document to the maximum number of tokens, tokens != characters`
init 2024-06-28 16:39:09 -04:00			`extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));`
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			`}`

init 2024-06-28 16:39:09 -04:00			`return [[{ type: "text", text: extractionTarget }], numTokens];`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`}`

Caleb: added ajv json schema validation. 2024-04-29 12:12:55 -07:00			`export async function generateOpenAICompletions({`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`client,`
Update models.ts 2024-07-15 22:52:17 -04:00			`model = process.env.MODEL_NAME \|\| "gpt-4o",`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`document,`
			`schema, //TODO - add zod dynamic type checking`
			`prompt = defaultPrompt,`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`temperature,`
init 2024-06-28 16:39:09 -04:00			`mode`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`}: {`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`client: OpenAI;`
			`model?: string;`
			`document: Document;`
			`schema: any; // This should be replaced with a proper Zod schema type when available`
			`prompt?: string;`
			`temperature?: number;`
init 2024-06-28 16:39:09 -04:00			`mode: "markdown" \| "raw-html";`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`}): Promise<Document> {`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`const openai = client as OpenAI;`
init 2024-06-28 16:39:09 -04:00			`const [content, numTokens] = prepareOpenAIDoc(document, mode);`
Caleb: trying to get loggin workng 2024-04-30 09:20:15 -07:00
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`const completion = await openai.chat.completions.create({`
			`model,`
			`messages: [`
			`{`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`role: "system",`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`content: prompt,`
			`},`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`{ role: "user", content },`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`],`
			`tools: [`
			`{`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`type: "function",`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`function: {`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`name: "extract_content",`
			`description: "Extracts the content from the given webpage(s)",`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`parameters: schema,`
			`},`
			`},`
			`],`
Update models.ts 2024-04-30 18:36:21 -07:00			`tool_choice: { "type": "function", "function": {"name": "extract_content"}},`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`temperature,`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`});`

			`const c = completion.choices[0].message.tool_calls[0].function.arguments;`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`// Extract the LLM extraction content from the completion response`
Caleb: converted llm response to json 2024-04-28 19:28:28 -07:00			`const llmExtraction = JSON.parse(c);`

Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`// Return the document with the LLM extraction content added`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`return {`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`...document,`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`llm_extraction: llmExtraction,`
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`};`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`}`
Update models.ts 2024-04-30 18:36:21 -07:00