apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts

import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { Document, ExtractOptions } from "../../../controllers/v1/types";
import { Logger } from "winston";
import { EngineResultsTracker, Meta } from "..";

// Token budget for the markdown sent to the LLM: inputs whose token count exceeds
// this value are trimmed before the completion request is made.
const maxTokens = 32000;
// Multiplier applied to maxTokens to get the character count the markdown is sliced to
// when trimming (tokens != characters, so this is a characters-per-token approximation).
const modifier = 4;

/**
 * Error raised when LLM extraction does not yield a usable result: either the
 * model returned a refusal, or its schema-less output could not be parsed as JSON.
 *
 * The error `message` is always the same generic text; the specific reason is
 * carried in {@link LLMRefusalError.refusal}.
 */
export class LLMRefusalError extends Error {
    /** Refusal text reported by the model, or a description of why its output was unusable. */
    public refusal: string;
    /** Never assigned by this class; left for code outside this block to attach tracker data. */
    public results: EngineResultsTracker | undefined;

    /**
     * @param refusal - Reason the extraction was refused or could not be used.
     */
    constructor(refusal: string) {
        super("LLM refused to extract the website's content");
        // Without this, logs and stack traces label the error as a plain "Error".
        this.name = "LLMRefusalError";
        this.refusal = refusal;
    }
}

/**
 * Recursively rewrites a JSON schema so that every `type: "object"` node marks all
 * of its properties as `required` and sets `additionalProperties: false`, which is
 * the shape the schema is later sent in with `strict: true`.
 *
 * Sub-schemas under `$defs`, `anyOf`, `oneOf`, `allOf`, `not`, object `properties`
 * and array `items` are normalized as well. The input is never mutated: every
 * schema object that is visited is shallow-copied before being changed.
 *
 * @param x - A schema node. Non-object values, `null` and arrays are returned unchanged.
 * @returns The normalized copy of the schema node.
 */
function normalizeSchema(x: any): any {
    if (typeof x !== "object" || x === null || Array.isArray(x)) return x;

    // Work on a shallow copy so the caller's schema object is left untouched.
    const node: any = { ...x };

    if (node["$defs"] !== null && typeof node["$defs"] === "object") {
        node["$defs"] = Object.fromEntries(
            Object.entries(node["$defs"]).map(([name, schema]) => [name, normalizeSchema(schema)]),
        );
    }

    // Combinators hold lists of sub-schemas; anything that is not an array is left as-is
    // instead of crashing on `.map`.
    for (const combinator of ["anyOf", "oneOf", "allOf"]) {
        if (Array.isArray(node[combinator])) {
            node[combinator] = node[combinator].map((sub: any) => normalizeSchema(sub));
        }
    }

    if (node.not) {
        node.not = normalizeSchema(node.not);
    }

    if (node.type === "object") {
        // An object schema without `properties` is valid JSON schema; treat it as having none
        // rather than throwing from Object.entries(undefined).
        const properties = typeof node.properties === "object" && node.properties !== null
            ? node.properties
            : {};

        return {
            ...node,
            properties: Object.fromEntries(
                Object.entries(properties).map(([key, value]) => [key, normalizeSchema(value)]),
            ),
            required: Object.keys(properties),
            additionalProperties: false,
        };
    }

    if (node.type === "array") {
        return {
            ...node,
            items: normalizeSchema(node.items),
        };
    }

    return node;
}

/**
 * Asks an OpenAI chat model to turn a page's markdown into structured JSON.
 *
 * Steps:
 *  1. Count the markdown's tokens with tiktoken. If counting fails, or the count exceeds
 *     `maxTokens`, the markdown is cut to `maxTokens * modifier` characters and a warning
 *     is produced (with `previousWarning` appended to it when one was given).
 *  2. Build the response schema: an array schema is wrapped in an object under `items`,
 *     a type-less object is treated as a `{ key: typeName }` shorthand, and the result
 *     is passed through `normalizeSchema`.
 *  3. Send the completion request and read back the parsed result, falling back to
 *     `JSON.parse` on the raw content when no parsed value is available.
 *
 * @param logger - Receives a warning when token counting fails and an error when the
 *   returned JSON cannot be parsed.
 * @param options - Extraction options; `schema`, `systemPrompt` and `prompt` are read.
 * @param markdown - Content to extract from. Must not be `undefined`.
 * @param previousWarning - Existing warning text, appended after any warning created here.
 * @returns `extract` (the model output), `numTokens` (0 when counting failed) and
 *   `warning` (`undefined` unless the input had to be trimmed).
 * @throws Error if `markdown` is `undefined` or the response contains no choices.
 * @throws LLMRefusalError if the model refuses, or its schema-less output is not valid JSON.
 */
export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, numTokens: number, warning: string | undefined }> {
    let extract: any;
    let warning: string | undefined;

    const openai = new OpenAI();
    const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

    if (markdown === undefined) {
        throw new Error("document.markdown is undefined -- this is unexpected");
    }

    // count number of tokens
    let numTokens = 0;
    let encoder: ReturnType<typeof encoding_for_model> | undefined;
    try {
        // Creating the encoder is inside the try on purpose: a MODEL_NAME that tiktoken does
        // not know makes encoding_for_model throw, and that should take the same
        // trim-and-warn fallback as an encoding failure instead of aborting the extraction.
        encoder = encoding_for_model(model);
        numTokens = encoder.encode(markdown).length;
    } catch (error) {
        logger.warn("Calculating num tokens of string failed", { error, markdown });

        markdown = markdown.slice(0, maxTokens * modifier);

        const w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
        warning = previousWarning === undefined ? w : w + " " + previousWarning;
    } finally {
        // Free the encoder resources after use (it may never have been created)
        encoder?.free();
    }

    if (numTokens > maxTokens) {
        // trim the document to the maximum number of tokens, tokens != characters
        markdown = markdown.slice(0, maxTokens * modifier);

        const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
        warning = previousWarning === undefined ? w : w + " " + previousWarning;
    }

    let schema = options.schema;
    if (schema && schema.type === "array") {
        // The json_schema response format is given an object at the top level here,
        // so a top-level array schema is wrapped under an `items` property.
        schema = {
            type: "object",
            properties: {
                items: options.schema,
            },
            required: ["items"],
            additionalProperties: false,
        };
    } else if (schema && typeof schema === 'object' && !schema.type) {
        // Shorthand form: a plain `{ fieldName: "typeName" }` map is expanded into an object schema.
        schema = {
            type: "object",
            properties: Object.fromEntries(
                Object.entries(schema).map(([key, value]) => [key, { type: value }])
            ),
            required: Object.keys(schema),
            additionalProperties: false
        };
    }

    schema = normalizeSchema(schema);

    const jsonCompletion = await openai.beta.chat.completions.parse({
        model,
        temperature: 0,
        messages: [
            {
                role: "system",
                content: options.systemPrompt,
            },
            {
                role: "user",
                content: [{ type: "text", text: markdown }],
            },
            {
                role: "user",
                content: options.prompt !== undefined
                    ? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
                    : "Transform the above content into structured JSON output.",
            },
        ],
        response_format: options.schema ? {
            type: "json_schema",
            json_schema: {
                name: "websiteContent",
                schema: schema,
                strict: true,
            }
        } : { type: "json_object" },
    });

    const message = jsonCompletion.choices[0]?.message;
    if (message === undefined) {
        throw new Error("LLM returned no completion choices");
    }

    // `!= null` rather than `!== null`: a response that omits the refusal field entirely
    // (undefined) is not a refusal.
    if (message.refusal != null) {
        throw new LLMRefusalError(message.refusal);
    }

    extract = message.parsed;

    // No parsed value (null or missing): fall back to parsing the raw content ourselves.
    if (extract == null && message.content != null) {
        try {
            extract = JSON.parse(message.content);
        } catch (e) {
            logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
            throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
        }
    }

    // If the user actually wants the items object, they can specify it as 'required' in the schema
    // otherwise, we just return the items array
    // NOTE(review): for an array schema the wrapper built above always has
    // `required: ["items"]`, so this condition looks like it can never be true and the
    // `{ items: [...] }` object is what gets returned. Checking `options.schema.required`
    // may have been intended -- confirm before changing, as it alters the output shape.
    if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) {
        extract = extract?.items;
    }
    return { extract, warning, numTokens };
}

/**
 * Transformer step that fills `document.extract` with LLM-extracted data when the
 * scrape requested the "extract" format; otherwise the document is returned untouched.
 *
 * @param meta - Scrape context; `options.formats`, `options.extract` and `logger` are read.
 * @param document - Document whose `markdown` is fed to the LLM. It is modified in place.
 * @returns The same `document` instance.
 */
export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
    if (!meta.options.formats.includes("extract")) {
        return document;
    }

    const { extract, warning } = await generateOpenAICompletions(
        meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
        meta.options.extract!,
        document.markdown,
        document.warning,
    );

    document.extract = extract;

    // A returned warning already has the document's previous warning appended to it.
    // When nothing new was reported the result is undefined, and assigning that would
    // silently erase a warning set by an earlier step -- so only overwrite when present.
    if (warning !== undefined) {
        document.warning = warning;
    }

    return document;
}
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`import OpenAI from "openai";`
			`import { encoding_for_model } from "@dqbd/tiktoken";`
			`import { TiktokenModel } from "@dqbd/tiktoken";`
			`import { Document, ExtractOptions } from "../../../controllers/v1/types";`
			`import { Logger } from "winston";`
			`import { EngineResultsTracker, Meta } from "..";`

			`const maxTokens = 32000;`
			`const modifier = 4;`

			`export class LLMRefusalError extends Error {`
			`public refusal: string;`
			`public results: EngineResultsTracker \| undefined;`

			`constructor(refusal: string) {`
			`super("LLM refused to extract the website's content")`
			`this.refusal = refusal;`
			`}`
			`}`

			`function normalizeSchema(x: any): any {`
fix(scrapeURL/llmExtract): better schema normalization 2024-11-11 10:55:45 +01:00			`if (typeof x !== "object" \|\| x === null) return x;`

			`if (x["$defs"] !== null && typeof x["$defs"] === "object") {`
			`x["$defs"] = Object.fromEntries(Object.entries(x["$defs"]).map(([name, schema]) => [name, normalizeSchema(schema)]));`
			`}`

			`if (x && x.anyOf) {`
			`x.anyOf = x.anyOf.map(x => normalizeSchema(x));`
			`}`

			`if (x && x.oneOf) {`
			`x.oneOf = x.oneOf.map(x => normalizeSchema(x));`
			`}`

			`if (x && x.allOf) {`
			`x.allOf = x.allOf.map(x => normalizeSchema(x));`
			`}`

			`if (x && x.not) {`
			`x.not = normalizeSchema(x.not);`
			`}`

`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`if (x && x.type === "object") {`
			`return {`
			`...x,`
			`properties: Object.fromEntries(Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)])),`
fix(scrapeURL/llmExtract): fill in required field as well 2024-11-07 22:48:57 +01:00			`required: Object.keys(x.properties),`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`additionalProperties: false,`
			`}`
fix(scrapeURL/llmExtract): array schema fix 2024-11-07 22:46:59 +01:00			`} else if (x && x.type === "array") {`
			`return {`
			`...x,`
			`items: normalizeSchema(x.items),`
			`}`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`} else {`
			`return x;`
			`}`
			`}`

Loggin num tokens 2024-11-20 13:09:46 -08:00			`export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, numTokens: number, warning: string \| undefined }> {`
wip 2024-11-13 18:06:20 -03:00			`let extract: any;`
			`let warning: string \| undefined;`

`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`const openai = new OpenAI();`
Nick: fixes 2024-11-20 12:48:10 -08:00			`const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
wip 2024-11-13 18:06:20 -03:00			`if (markdown === undefined) {`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`throw new Error("document.markdown is undefined -- this is unexpected");`
			`}`

			`// count number of tokens`
			`let numTokens = 0;`
			`const encoder = encoding_for_model(model as TiktokenModel);`
			`try {`
			`// Encode the message into tokens`
wip 2024-11-13 18:06:20 -03:00			`const tokens = encoder.encode(markdown);`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
			`// Return the number of tokens`
			`numTokens = tokens.length;`
			`} catch (error) {`
wip 2024-11-13 18:06:20 -03:00			`logger.warn("Calculating num tokens of string failed", { error, markdown });`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
wip 2024-11-13 18:06:20 -03:00			`markdown = markdown.slice(0, maxTokens * modifier);`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
wip 2024-11-13 18:06:20 -03:00			`let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";`
			`warning = previousWarning === undefined ? w : w + " " + previousWarning;`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`} finally {`
			`// Free the encoder resources after use`
			`encoder.free();`
			`}`

			`if (numTokens > maxTokens) {`
			`// trim the document to the maximum number of tokens, tokens != characters`
wip 2024-11-13 18:06:20 -03:00			`markdown = markdown.slice(0, maxTokens * modifier);`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
wip 2024-11-13 18:06:20 -03:00			`const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";`
			`warning = previousWarning === undefined ? w : w + " " + previousWarning;`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`

			`let schema = options.schema;`
			`if (schema && schema.type === "array") {`
			`schema = {`
			`type: "object",`
			`properties: {`
			`items: options.schema,`
			`},`
			`required: ["items"],`
			`additionalProperties: false,`
			`};`
fix schema 2024-11-19 10:04:42 -03:00			`} else if (schema && typeof schema === 'object' && !schema.type) {`
			`schema = {`
			`type: "object",`
			`properties: Object.fromEntries(`
			`Object.entries(schema).map(([key, value]) => [key, { type: value }])`
			`),`
			`required: Object.keys(schema),`
			`additionalProperties: false`
			`};`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`

			`schema = normalizeSchema(schema);`

			`const jsonCompletion = await openai.beta.chat.completions.parse({`
			`model,`
Nick: 2024-11-20 10:23:44 -08:00			`temperature: 0,`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`messages: [`
			`{`
			`role: "system",`
			`content: options.systemPrompt,`
			`},`
			`{`
			`role: "user",`
wip 2024-11-13 18:06:20 -03:00			`content: [{ type: "text", text: markdown }],`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`},`
			`{`
			`role: "user",`
			`content: options.prompt !== undefined`
			? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
			`: "Transform the above content into structured JSON output.",`
			`},`
			`],`
			`response_format: options.schema ? {`
			`type: "json_schema",`
			`json_schema: {`
			`name: "websiteContent",`
			`schema: schema,`
			`strict: true,`
			`}`
			`} : { type: "json_object" },`
			`});`

			`if (jsonCompletion.choices[0].message.refusal !== null) {`
			`throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);`
			`}`

wip 2024-11-13 18:06:20 -03:00			`extract = jsonCompletion.choices[0].message.parsed;`
fix(scrapeURL/llmExtract): fix schema-less LLM extract 2024-11-11 21:07:37 +01:00
wip 2024-11-13 18:06:20 -03:00			`if (extract === null && jsonCompletion.choices[0].message.content !== null) {`
fix(scrapeURL/llmExtract): fix schema-less LLM extract 2024-11-11 21:07:37 +01:00			`try {`
wip 2024-11-13 18:06:20 -03:00			`extract = JSON.parse(jsonCompletion.choices[0].message.content);`
fix(scrapeURL/llmExtract): fix schema-less LLM extract 2024-11-11 21:07:37 +01:00			`} catch (e) {`
			`logger.error("Failed to parse returned JSON, no schema specified.", { error: e });`
			`throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");`
			`}`
			`}`

Nick: extract fixes 2024-11-20 11:50:14 -08:00			`// If the users actually wants the items object, they can specify it as 'required' in the schema`
			`// otherwise, we just return the items array`
			`if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) {`
wip 2024-11-13 18:06:20 -03:00			`extract = extract?.items;`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`
Loggin num tokens 2024-11-20 13:09:46 -08:00			`return { extract, warning, numTokens };`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`

			`export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {`
			`if (meta.options.formats.includes("extract")) {`
wip 2024-11-13 18:06:20 -03:00			`const { extract, warning } = await generateOpenAICompletions(`
			`meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),`
			`meta.options.extract!,`
			`document.markdown,`
			`document.warning,`
			`);`
			`document.extract = extract;`
			`document.warning = warning;`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`

			`return document;`
			`}`