apps/api/src/lib/LLM-extraction/models.ts

import OpenAI from "openai";
import { Document } from "../../lib/entities";
import { numTokensFromString } from "./helpers";

/**
 * Pairs a URL with an untyped payload.
 *
 * NOTE(review): nothing in the visible part of this file references this type;
 * presumably it is consumed by other modules — confirm before changing it.
 */
export type ScraperCompletionResult = {
  // `any | null` collapses to plain `any` for the type checker; the explicit
  // `null` only signals to readers that the value may be absent.
  data: any | null;
  // URL associated with `data` (presumably the scraped page — confirm with callers).
  url: string;
};

/**
 * Upper bound on the token count of the text sent to the model. Longer input
 * is trimmed and the caller attaches a warning to the returned document.
 */
const maxTokens = 32_000;

/**
 * Multiplier applied to `maxTokens` to get the character length used when
 * trimming, since the text is sliced by characters rather than by tokens.
 */
const modifier = 4;

/** System prompt used when the caller does not supply `systemPrompt`. */
const defaultPrompt =
  "You are a professional web scraper. Extract the contents of the webpage";

/**
 * Builds the user-message content parts sent to the model for one document.
 *
 * Reads `document.rawHtml` when `mode` is `"raw-html"` and `document.markdown`
 * otherwise, counts its tokens and, when the count exceeds `maxTokens`,
 * truncates the text to `maxTokens * modifier` characters.
 *
 * @param document - Scraped document to take the text from.
 * @param mode - Which field of the document is used as the extraction target.
 * @returns A `[content parts, token count]` tuple, or `null` when the selected
 *   field is empty or missing. The token count is measured BEFORE trimming, so
 *   callers can compare it against `maxTokens` to detect that trimming happened.
 */
function prepareOpenAIDoc(
  document: Document,
  mode: "markdown" | "raw-html",
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
  let extractionTarget =
    mode === "raw-html" ? document.rawHtml : document.markdown;

  // Nothing to extract from (for either mode): let the caller decide how to
  // report it instead of throwing here.
  if (!extractionTarget) {
    return null;
  }

  // Token counting always uses the "gpt-4" encoding, independent of the model
  // the caller ends up requesting.
  const numTokens = numTokensFromString(extractionTarget, "gpt-4");

  if (numTokens > maxTokens) {
    // Tokens != characters: the cut is made on characters, using
    // `maxTokens * modifier` as the character budget.
    extractionTarget = extractionTarget.slice(0, maxTokens * modifier);
  }

  return [[{ type: "text", text: extractionTarget }], numTokens];
}

/**
 * Parses raw model output as JSON.
 *
 * The return type is intentionally left to inference (`any`, from
 * `JSON.parse`) so the value can be assigned to `llm_extraction` unchanged.
 *
 * @throws Error with message `"Invalid JSON"` when `raw` is not valid JSON.
 */
function parseExtractionJson(raw: string) {
  try {
    return JSON.parse(raw);
  } catch {
    throw new Error("Invalid JSON");
  }
}

/**
 * Runs LLM extraction over a scraped document and returns the document with
 * the parsed result attached as `llm_extraction`.
 *
 * Two request shapes are used:
 * - `prompt` given and no `schema`: a JSON-mode chat completion
 *   (`response_format: json_object`) driven by the user's prompt.
 * - otherwise: a forced `extract_content` function/tool call whose parameters
 *   are `schema`; the tool-call arguments are the extraction.
 *
 * @param client - OpenAI client used to issue the chat completion.
 * @param model - Model name; defaults to `process.env.MODEL_NAME` or `"gpt-4o-mini"`.
 * @param document - Document whose markdown / raw HTML is extracted from.
 * @param schema - JSON schema passed as the tool's `parameters`.
 * @param systemPrompt - System message; defaults to `defaultPrompt`.
 * @param prompt - Free-form user request, used only when no `schema` is given.
 * @param temperature - Sampling temperature forwarded to the API as-is.
 * @param mode - Which document field to extract from.
 * @returns The document plus `llm_extraction`. When the document has no
 *   content for `mode`, the document is returned with only a `warning` and no
 *   API call is made.
 * @throws Error `"Invalid JSON"` when the model output cannot be parsed, or a
 *   descriptive Error when the response lacks the expected tool call.
 */
export async function generateOpenAICompletions({
  client,
  model = process.env.MODEL_NAME || "gpt-4o-mini",
  document,
  schema, //TODO - add zod dynamic type checking
  systemPrompt = defaultPrompt,
  prompt,
  temperature,
  mode,
}: {
  client: OpenAI;
  model?: string;
  document: Document;
  schema: any; // This should be replaced with a proper Zod schema type when available
  prompt?: string;
  systemPrompt?: string;
  temperature?: number;
  mode: "markdown" | "raw-html";
}): Promise<Document> {
  const preparedDoc = prepareOpenAIDoc(document, mode);

  if (preparedDoc === null) {
    return {
      ...document,
      warning:
        "LLM extraction was not performed since the document's content is empty or missing.",
    };
  }
  const [content, numTokens] = preparedDoc;

  let rawExtraction: string;
  if (prompt && !schema) {
    // Prompt-only mode: ask for a JSON object shaped by the user's request.
    const jsonCompletion = await client.chat.completions.create({
      model,
      messages: [
        {
          role: "system",
          content: systemPrompt,
        },
        { role: "user", content },
        {
          role: "user",
          content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
        },
      ],
      response_format: { type: "json_object" },
      temperature,
    });

    // A missing choice or null content becomes "", which fails parsing below
    // and surfaces as "Invalid JSON".
    rawExtraction = (jsonCompletion.choices[0]?.message?.content ?? "").trim();
  } else {
    // Schema mode: force the model to call `extract_content` so its arguments
    // follow `schema`.
    const completion = await client.chat.completions.create({
      model,
      messages: [
        {
          role: "system",
          content: systemPrompt,
        },
        { role: "user", content },
      ],
      tools: [
        {
          type: "function",
          function: {
            name: "extract_content",
            description: "Extracts the content from the given webpage(s)",
            parameters: schema,
          },
        },
      ],
      tool_choice: { type: "function", function: { name: "extract_content" } },
      temperature,
    });

    // `tool_calls` is optional on the response message; guard the whole chain
    // so a response without it fails with a clear error instead of a TypeError.
    const toolArguments =
      completion.choices[0]?.message?.tool_calls?.[0]?.function?.arguments;
    if (typeof toolArguments !== "string") {
      throw new Error(
        "LLM response did not include the expected extract_content tool call",
      );
    }
    rawExtraction = toolArguments;
  }

  const llmExtraction = parseExtractionJson(rawExtraction);

  // NOTE(review): `warning` is always written here, so a warning already
  // present on `document` is replaced (by `undefined` when nothing was
  // trimmed). Kept as-is to preserve existing behavior.
  return {
    ...document,
    llm_extraction: llmExtraction,
    warning:
      numTokens > maxTokens
        ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
        : undefined,
  };
}
Nick: cleanup 2024-04-30 12:19:43 -07:00			`import OpenAI from "openai";`
			`import { Document } from "../../lib/entities";`
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			`import { numTokensFromString } from "./helpers";`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00
Nick: cleanup 2024-04-30 12:19:43 -07:00			`export type ScraperCompletionResult = {`
			`data: any \| null;`
			`url: string;`
			`};`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			`const maxTokens = 32000;`
			`const modifier = 4;`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`const defaultPrompt =`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`"You are a professional web scraper. Extract the contents of the webpage";`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`function prepareOpenAIDoc(`
init 2024-06-28 16:39:09 -04:00			`document: Document,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`mode: "markdown" \| "raw-html",`
fix(llm-extract): handle llm-extract if scrape failed 2024-08-22 14:12:52 +02:00			`): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] \| null {`
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			`let markdown = document.markdown;`

init 2024-06-28 16:39:09 -04:00			`let extractionTarget = document.markdown;`

			`if (mode === "raw-html") {`
			`extractionTarget = document.rawHtml;`
			`}`

			`// Check if the markdown content exists in the document`
			`if (!extractionTarget) {`
fix(llm-extract): handle llm-extract if scrape failed 2024-08-22 14:12:52 +02:00			`return null;`
			`// throw new Error(`
			// `${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
			`// );`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`}`

Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			`// count number of tokens`
init 2024-06-28 16:39:09 -04:00			`const numTokens = numTokensFromString(extractionTarget, "gpt-4");`
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00
			`if (numTokens > maxTokens) {`
			`// trim the document to the maximum number of tokens, tokens != characters`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`extractionTarget = extractionTarget.slice(0, maxTokens * modifier);`
Nick: max num tokens for llm extract (for now) + slice the max 2024-05-20 17:07:38 -07:00			`}`
init 2024-06-28 16:39:09 -04:00			`return [[{ type: "text", text: extractionTarget }], numTokens];`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`}`

Caleb: added ajv json schema validation. 2024-04-29 12:12:55 -07:00			`export async function generateOpenAICompletions({`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`client,`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`model = process.env.MODEL_NAME \|\| "gpt-4o-mini",`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`document,`
			`schema, //TODO - add zod dynamic type checking`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`systemPrompt = defaultPrompt,`
			`prompt,`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`temperature,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`mode,`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`}: {`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`client: OpenAI;`
			`model?: string;`
			`document: Document;`
			`schema: any; // This should be replaced with a proper Zod schema type when available`
			`prompt?: string;`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`systemPrompt?: string;`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`temperature?: number;`
init 2024-06-28 16:39:09 -04:00			`mode: "markdown" \| "raw-html";`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`}): Promise<Document> {`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`const openai = client as OpenAI;`
fix(llm-extract): handle llm-extract if scrape failed 2024-08-22 14:12:52 +02:00			`const preparedDoc = prepareOpenAIDoc(document, mode);`

			`if (preparedDoc === null) {`
			`return {`
			`...document,`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`warning:`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`"LLM extraction was not performed since the document's content is empty or missing.",`
fix(llm-extract): handle llm-extract if scrape failed 2024-08-22 14:12:52 +02:00			`};`
			`}`
			`const [content, numTokens] = preparedDoc;`
Caleb: trying to get loggin workng 2024-04-30 09:20:15 -07:00
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`let completion;`
			`let llmExtraction;`
			`if (prompt && !schema) {`
			`const jsonCompletion = await openai.chat.completions.create({`
			`model,`
			`messages: [`
			`{`
			`role: "system",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`content: systemPrompt,`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`},`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`{ role: "user", content },`
Nick: improvements to llm extract error handling 2024-08-30 11:57:55 -03:00			`{`
			`role: "user",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
			`},`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`],`
			`response_format: { type: "json_object" },`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`temperature,`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`});`

Nick: improvements to llm extract error handling 2024-08-30 11:57:55 -03:00			`try {`
			`llmExtraction = JSON.parse(`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`(jsonCompletion.choices[0].message.content ?? "").trim(),`
Nick: improvements to llm extract error handling 2024-08-30 11:57:55 -03:00			`);`
			`} catch (e) {`
			`throw new Error("Invalid JSON");`
			`}`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`} else {`
			`completion = await openai.chat.completions.create({`
			`model,`
			`messages: [`
			`{`
			`role: "system",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`content: systemPrompt,`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`},`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`{ role: "user", content },`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`],`
			`tools: [`
			`{`
			`type: "function",`
			`function: {`
			`name: "extract_content",`
			`description: "Extracts the content from the given webpage(s)",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`parameters: schema,`
			`},`
			`},`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`],`
			`tool_choice: { type: "function", function: { name: "extract_content" } },`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`temperature,`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`});`
			`const c = completion.choices[0].message.tool_calls[0].function.arguments;`

			`// Extract the LLM extraction content from the completion response`
Nick: improvements to llm extract error handling 2024-08-30 11:57:55 -03:00			`try {`
			`llmExtraction = JSON.parse(c);`
			`} catch (e) {`
			`throw new Error("Invalid JSON");`
			`}`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`}`
Caleb: converted llm response to json 2024-04-28 19:28:28 -07:00
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`// Return the document with the LLM extraction content added`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`return {`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`...document,`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`llm_extraction: llmExtraction,`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`warning:`
			`numTokens > maxTokens`
			? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`: undefined,`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`};`
Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper 2024-04-28 13:59:35 -07:00			`}`