// apps/api/src/lib/LLM-extraction/index.ts

import Turndown from 'turndown'
import OpenAI from 'openai'
// import { LlamaModel } from 'node-llama-cpp'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
import Ajv from 'ajv';
// Module-level AJV instance used by generateCompletions to compile the
// caller-supplied extraction schema and validate LLM output against it.
const ajv = new Ajv();

import {
    ScraperCompletionResult,
    generateOpenAICompletions,
} from './models'
import { Document, ExtractorOptions } from '../entities'

  // Generate completion using OpenAI
/**
 * Runs LLM-based structured extraction over every document and validates each
 * result against the caller-supplied JSON schema.
 *
 * All documents are passed to `generateOpenAICompletions` concurrently. The
 * `llm_extraction` field of each completion result is then checked with AJV
 * against `extractionOptions.extractionSchema`.
 *
 * @param documents - Documents to run extraction on. An empty array resolves
 *   to an empty array without creating a client or compiling the schema.
 * @param extractionOptions - Provides `extractionSchema` (forwarded to the
 *   completion call and used for validation) and `extractionPrompt`
 *   (forwarded to the completion call).
 * @returns The completion results, in the same order as `documents`.
 * @throws Error when an extraction does not satisfy the schema, or when the
 *   provider selector is not a known client. Errors raised while constructing
 *   the OpenAI client, compiling the schema, or requesting a completion
 *   propagate unchanged.
 */
export async function generateCompletions(
    documents: Document[],
    extractionOptions: ExtractorOptions
): Promise<Document[]> {
    const schema = extractionOptions.extractionSchema;
    const prompt = extractionOptions.extractionPrompt;

    const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider

    // Nothing to do: skip client construction and schema compilation entirely.
    if (documents.length === 0) {
        return [];
    }

    // Loop-invariant setup, done once instead of once per document.
    // The client is created first so that a client construction failure still
    // surfaces ahead of a schema problem, as it did before the hoist.
    const llm = new OpenAI();
    // Compiling up front also means a malformed schema is rejected before any
    // completion request is sent.
    const validate = ajv.compile(schema);

    const completions = await Promise.all(documents.map(async (document: Document) => {
        switch (switchVariable) {
            case "openAI": {
                const completionResult = await generateOpenAICompletions({
                    client: llm,
                    document: document,
                    schema: schema,
                    prompt: prompt
                });

                // Validate the JSON output against the schema using AJV.
                // `validate.errors` is overwritten on every call, so it is read
                // right after the call with no `await` in between; this keeps
                // the shared validator safe across concurrent documents.
                if (!validate(completionResult.llm_extraction)) {
                    //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
                    throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
                }

                return completionResult;
            }
            default:
                throw new Error('Invalid client');
        }
    }));

    return completions;
}
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`import Turndown from 'turndown'`
			`import OpenAI from 'openai'`
			`// import { LlamaModel } from 'node-llama-cpp'`
			`import { z } from 'zod'`
			`import { zodToJsonSchema } from 'zod-to-json-schema'`
Caleb: added ajv json schema validation. 2024-04-29 12:12:55 -07:00			`import Ajv from 'ajv';`
			`const ajv = new Ajv(); // Initialize AJV for JSON schema validation`
Caleb: first test passing 2024-04-28 17:38:20 -07:00
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`import {`
			`ScraperCompletionResult,`
			`generateOpenAICompletions,`
Caleb: first test passing 2024-04-28 17:38:20 -07:00			`} from './models'`
			`import { Document, ExtractorOptions } from '../entities'`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00
			`// Generate completion using OpenAI`
Caleb: first test passing 2024-04-28 17:38:20 -07:00			`export async function generateCompletions(`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`documents: Document[],`
			`extractionOptions: ExtractorOptions`
Caleb: first test passing 2024-04-28 17:38:20 -07:00			`): Promise<Document[]> {`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`// const schema = zodToJsonSchema(options.schema)`

			`const schema = extractionOptions.extractionSchema;`
			`const prompt = extractionOptions.extractionPrompt;`

Caleb: first test passing 2024-04-28 17:38:20 -07:00			`const switchVariable = "openAI" // Placholder, want to think more about how we abstract the model provider`

Caleb: added ajv json schema validation. 2024-04-29 12:12:55 -07:00
Caleb: first test passing 2024-04-28 17:38:20 -07:00			`const completions = await Promise.all(documents.map(async (document: Document) => {`
			`switch (switchVariable) {`
			`case "openAI":`
			`const llm = new OpenAI();`
Caleb: added ajv json schema validation. 2024-04-29 12:12:55 -07:00			`const completionResult = await generateOpenAICompletions({`
Caleb: first test passing 2024-04-28 17:38:20 -07:00			`client: llm,`
			`document: document,`
			`schema: schema,`
			`prompt: prompt`
			`});`
Caleb: added ajv json schema validation. 2024-04-29 12:12:55 -07:00			`// Validate the JSON output against the schema using AJV`
			`const validate = ajv.compile(schema);`
			`if (!validate(completionResult.llm_extraction)) {`
Caleb: trying to get loggin workng 2024-04-30 09:20:15 -07:00			`//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.`
Caleb: added ajv json schema validation. 2024-04-29 12:12:55 -07:00			throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
			`}`

			`return completionResult;`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`default:`
Caleb: first test passing 2024-04-28 17:38:20 -07:00			`throw new Error('Invalid client');`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`}`
Caleb: first test passing 2024-04-28 17:38:20 -07:00			`}));`
Caleb: added ajv json schema validation. 2024-04-29 12:12:55 -07:00
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00
Caleb: first test passing 2024-04-28 17:38:20 -07:00			`return completions;`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`}`