apps/api/src/lib/LLM-extraction/index.ts

import OpenAI from "openai";
import Ajv from "ajv";
// Module-level AJV instance, used below to compile the extraction schema and
// validate LLM output against it.
const ajv = new Ajv();

import { generateOpenAICompletions } from "./models";
import { Document, ExtractorOptions } from "../entities";
import { logger } from "../logger";

/**
 * Runs LLM extraction over every document concurrently.
 *
 * Each document is passed to `generateOpenAICompletions`. When an extraction
 * schema is supplied, the returned `llm_extraction` is validated against it
 * with AJV and a mismatch is reported as an error.
 *
 * @param documents - Documents to run extraction on.
 * @param extractionOptions - Supplies the schema (`extractionSchema`), system
 *   prompt (`extractionPrompt`) and user prompt (`userPrompt`); may be undefined.
 * @param mode - Forwarded unchanged to `generateOpenAICompletions`.
 * @returns The per-document results of `generateOpenAICompletions`, in the
 *   same order as `documents`.
 * @throws Error when the schema cannot be compiled, a completion call fails,
 *   or an extraction does not satisfy the schema. Compilation, completion and
 *   validation errors are logged before being rethrown.
 */
export async function generateCompletions(
  documents: Document[],
  extractionOptions: ExtractorOptions | undefined,
  mode: "markdown" | "raw-html",
): Promise<Document[]> {
  // Nothing to extract: return before constructing a client or compiling a schema.
  if (documents.length === 0) {
    return [];
  }

  const schema = extractionOptions?.extractionSchema;
  const systemPrompt = extractionOptions?.extractionPrompt;
  const prompt = extractionOptions?.userPrompt;

  const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider

  switch (switchVariable) {
    case "openAI": {
      // One client is shared by all documents instead of creating one per document.
      const llm = new OpenAI();

      // Compile the schema once, before any LLM call, so an invalid schema
      // fails fast instead of after the completions have been generated.
      const compileValidator = () => {
        try {
          return ajv.compile(schema);
        } catch (error) {
          logger.error(`Error generating completions: ${error}`);
          throw error;
        }
      };
      const validate = schema ? compileValidator() : undefined;

      return Promise.all(
        documents.map(async (document: Document) => {
          try {
            const completionResult = await generateOpenAICompletions({
              client: llm,
              document: document,
              schema: schema,
              prompt: prompt,
              systemPrompt: systemPrompt,
              mode: mode,
            });

            // Validate the JSON output against the schema using AJV.
            // `validate.errors` is state on the shared validator function; it is
            // read synchronously right after the call (no `await` in between),
            // so concurrent documents cannot overwrite it first.
            if (validate && !validate(completionResult.llm_extraction)) {
              //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
              throw new Error(
                `JSON parsing error(s): ${validate.errors
                  ?.map((err) => err.message)
                  .join(
                    ", ",
                  )}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`,
              );
            }

            return completionResult;
          } catch (error) {
            logger.error(`Error generating completions: ${error}`);
            throw error;
          }
        }),
      );
    }
    default:
      throw new Error("Invalid client");
  }
}

/**
 * Sends a single user message to the OpenAI chat completions API and returns
 * the text of the first choice.
 *
 * The request uses the `gpt-4o` model with `temperature: 0`.
 *
 * @param prompt - Text sent as the content of the sole `user` message.
 * @returns The first choice's message content, or `null` when the response
 *   has no choices or the content is absent.
 */
export async function generateBasicCompletion(
  prompt: string,
): Promise<string | null> {
  const openai = new OpenAI();
  const model = "gpt-4o";

  const completion = await openai.chat.completions.create({
    temperature: 0,
    model,
    messages: [{ role: "user", content: prompt }],
  });

  // Guard against an empty `choices` array instead of throwing a TypeError.
  return completion.choices[0]?.message?.content ?? null;
}
Nick: cleanup 2024-04-30 12:19:43 -07:00			`import OpenAI from "openai";`
			`import Ajv from "ajv";`
Caleb: added ajv json schema validation. 2024-04-29 12:12:55 -07:00			`const ajv = new Ajv(); // Initialize AJV for JSON schema validation`
Caleb: first test passing 2024-04-28 17:38:20 -07:00
Nick: cleanup 2024-04-30 12:19:43 -07:00			`import { generateOpenAICompletions } from "./models";`
			`import { Document, ExtractorOptions } from "../entities";`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`import { logger } from "../logger";`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00
Nick: cleanup 2024-04-30 12:19:43 -07:00			`// Generate completion using OpenAI`
Caleb: first test passing 2024-04-28 17:38:20 -07:00			`export async function generateCompletions(`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`documents: Document[],`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`extractionOptions: ExtractorOptions \| undefined,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`mode: "markdown" \| "raw-html",`
Caleb: first test passing 2024-04-28 17:38:20 -07:00			`): Promise<Document[]> {`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`// const schema = zodToJsonSchema(options.schema)`

`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`const schema = extractionOptions?.extractionSchema;`
			`const systemPrompt = extractionOptions?.extractionPrompt;`
			`const prompt = extractionOptions?.userPrompt;`
Nick: cleanup 2024-04-30 12:19:43 -07:00
			`const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider`

			`const completions = await Promise.all(`
			`documents.map(async (document: Document) => {`
			`switch (switchVariable) {`
			`case "openAI":`
			`const llm = new OpenAI();`
Nick: improvements to llm extract error handling 2024-08-30 11:57:55 -03:00			`try {`
			`const completionResult = await generateOpenAICompletions({`
			`client: llm,`
			`document: document,`
			`schema: schema,`
			`prompt: prompt,`
			`systemPrompt: systemPrompt,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`mode: mode,`
Nick: improvements to llm extract error handling 2024-08-30 11:57:55 -03:00			`});`
			`// Validate the JSON output against the schema using AJV`
			`if (schema) {`
			`const validate = ajv.compile(schema);`
			`if (!validate(completionResult.llm_extraction)) {`
			`//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.`
			`throw new Error(`
			`JSON parsing error(s): ${validate.errors
			`?.map((err) => err.message)`
			`.join(`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`", ",`
			)}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`,
Nick: improvements to llm extract error handling 2024-08-30 11:57:55 -03:00			`);`
			`}`
Nick: prompt option, still need to convert to new structured outputs 2024-08-29 21:00:57 -03:00			`}`
Nick: cleanup 2024-04-30 12:19:43 -07:00
Nick: improvements to llm extract error handling 2024-08-30 11:57:55 -03:00			`return completionResult;`
			`} catch (error) {`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			logger.error(`Error generating completions: ${error}`);
Nick: improvements to llm extract error handling 2024-08-30 11:57:55 -03:00			`throw error;`
			`}`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`default:`
			`throw new Error("Invalid client");`
			`}`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`}),`
Nick: cleanup 2024-04-30 12:19:43 -07:00			`);`

			`return completions;`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`}`
Nick: extract fixes 2024-12-17 16:58:35 -03:00
			`// generate basic completion`

			`export async function generateBasicCompletion(prompt: string) {`
			`const openai = new OpenAI();`
Nick: small improvements 2024-12-18 21:45:06 -03:00			`const model = "gpt-4o";`
Nick: extract fixes 2024-12-17 16:58:35 -03:00
			`const completion = await openai.chat.completions.create({`
Nick: fixes to extract rephrase prompt 2025-01-11 20:22:36 -03:00			`temperature: 0,`
Nick: extract fixes 2024-12-17 16:58:35 -03:00			`model,`
			`messages: [{ role: "user", content: prompt }],`
			`});`
			`return completion.choices[0].message.content;`
			`}`