apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts

import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { Document, ExtractOptions } from "../../../controllers/v1/types";
import { Logger } from "winston";
import { EngineResultsTracker, Meta } from "..";
import { logger } from "../../../lib/logger";

// Maximum number of input tokens allowed for an extraction; longer inputs are trimmed.
const maxTokens = 32000;
// Rough characters-per-token factor: trimming keeps the first `maxTokens * modifier`
// string positions of the markdown, since tokens != characters.
const modifier = 4;

/**
 * Error raised when the LLM refuses to extract, or when its response cannot be
 * turned into an extraction result.
 *
 * The `message` is always the same generic sentence; the specific reason is
 * carried in `refusal`.
 */
export class LLMRefusalError extends Error {
  /** Reason for the refusal, as passed to the constructor. */
  public refusal: string;
  /**
   * Not assigned by this class; presumably populated by callers that attach
   * engine results to the error -- verify against call sites.
   */
  public results: EngineResultsTracker | undefined;

  /**
   * @param refusal - Refusal text (or failure description) to expose on the error.
   */
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    // Without this the error reports itself as a plain "Error" in logs and
    // stack traces, which makes refusals hard to tell apart from other failures.
    this.name = "LLMRefusalError";
    this.refusal = refusal;
  }
}

/**
 * Recursively rewrites a JSON schema so that every `type: "object"` node lists
 * all of its properties as `required` and sets `additionalProperties: false`.
 * This file sends the result with `strict: true` in the `json_schema`
 * response format.
 *
 * Sub-schemas under `$defs`, `anyOf`, `oneOf`, `allOf`, `not`, `properties`
 * and `items` are normalized as well. The input is never mutated: every
 * object node is returned as a fresh shallow copy.
 *
 * @param x - A schema node, or any other value (non-objects, `null` and
 *   arrays are returned unchanged).
 * @returns The normalized schema node.
 */
function normalizeSchema(x: any): any {
  if (typeof x !== "object" || x === null) return x;
  // A bare array is not a schema node; hand it back untouched.
  if (Array.isArray(x)) return x;

  // Shallow copy so reassigning keywords below does not leak into the caller's object.
  const node: any = { ...x };

  if (node["$defs"] !== null && typeof node["$defs"] === "object") {
    node["$defs"] = Object.fromEntries(
      Object.entries(node["$defs"]).map(([name, schema]) => [
        name,
        normalizeSchema(schema),
      ]),
    );
  }

  // Combinators hold lists of sub-schemas; skip malformed (non-array) values
  // instead of crashing on `.map`.
  for (const keyword of ["anyOf", "oneOf", "allOf"]) {
    if (Array.isArray(node[keyword])) {
      node[keyword] = node[keyword].map((sub: any) => normalizeSchema(sub));
    }
  }

  if (node.not) {
    node.not = normalizeSchema(node.not);
  }

  if (node.type === "object") {
    // An object schema may legitimately omit `properties`; treat that as "no properties"
    // rather than throwing on Object.entries(undefined).
    const properties: Record<string, unknown> =
      node.properties !== null && typeof node.properties === "object"
        ? node.properties
        : {};

    return {
      ...node,
      properties: Object.fromEntries(
        Object.entries(properties).map(([k, v]) => [k, normalizeSchema(v)]),
      ),
      required: Object.keys(properties),
      additionalProperties: false,
    };
  }

  if (node.type === "array") {
    return {
      ...node,
      items: normalizeSchema(node.items),
    };
  }

  return node;
}

/**
 * Asks the OpenAI chat completions API to turn `markdown` into structured JSON.
 *
 * Steps:
 *  1. Count the tokens in `markdown`; if the count exceeds `maxTokens`, or
 *     cannot be computed, trim the input to `maxTokens * modifier` characters
 *     and produce a warning.
 *  2. Prepare the schema: strip `default` keywords, wrap array schemas and
 *     bare property maps into an object schema, then normalize it.
 *  3. Call the model and read back the parsed result, falling back to
 *     `JSON.parse` on the raw content when no parsed value is present.
 *
 * @param logger - Logger used for warnings and parse failures (shadows the module-level logger).
 * @param options - Extract options; `schema`, `systemPrompt` and `prompt` are read.
 * @param markdown - Content to extract from. Must be defined.
 * @param previousWarning - Existing warning to append to any newly generated warning.
 * @param isExtractEndpoint - When true and a schema is set, the fallback JSON
 *   is read from `data.extract` of the parsed content.
 * @returns The extracted value, the token count (0 if counting failed), and a
 *   warning that is only set when the input was trimmed.
 * @throws Error if `markdown` is undefined.
 * @throws LLMRefusalError if the model refuses or the fallback JSON cannot be parsed.
 */
export async function generateOpenAICompletions(
  logger: Logger,
  options: ExtractOptions,
  markdown?: string,
  previousWarning?: string,
  isExtractEndpoint?: boolean,
): Promise<{ extract: any; numTokens: number; warning: string | undefined }> {
  let extract: any;
  let warning: string | undefined;

  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  if (markdown === undefined) {
    throw new Error("document.markdown is undefined -- this is unexpected");
  }

  // count number of tokens
  let numTokens = 0;
  // Created inside the try so that a model name the tokenizer does not know
  // falls back to character-based trimming instead of aborting the extraction.
  let encoder: ReturnType<typeof encoding_for_model> | undefined;
  try {
    encoder = encoding_for_model(model);
    numTokens = encoder.encode(markdown).length;
  } catch (error) {
    logger.warn("Calculating num tokens of string failed", { error, markdown });

    markdown = markdown.slice(0, maxTokens * modifier);

    const w =
      "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" +
      maxTokens +
      ") we support.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  } finally {
    // Free the encoder resources after use (it may not exist if construction failed)
    encoder?.free();
  }

  if (numTokens > maxTokens) {
    // trim the document to the maximum number of tokens, tokens != characters
    markdown = markdown.slice(0, maxTokens * modifier);

    const w =
      "The extraction content would have used more tokens (" +
      numTokens +
      ") than the maximum we allow (" +
      maxTokens +
      "). -- the input has been automatically trimmed.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  }

  let schema = options.schema;
  if (schema) {
    schema = removeDefaultProperty(schema);
  }

  if (schema && schema.type === "array") {
    // The response root is sent as an object schema, so wrap the array under `items`.
    // Use the cleaned `schema` (not `options.schema`) so stripped `default`
    // keywords do not leak back into the request.
    schema = {
      type: "object",
      properties: {
        items: schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  } else if (schema && typeof schema === "object" && !schema.type) {
    // A schema without `type` is treated as a bare map of property name -> schema.
    schema = {
      type: "object",
      properties: Object.fromEntries(
        Object.entries(schema).map(([key, value]) => {
          return [key, removeDefaultProperty(value)];
        }),
      ),
      required: Object.keys(schema),
      additionalProperties: false,
    };
  }

  schema = normalizeSchema(schema);

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    temperature: 0,
    messages: [
      {
        role: "system",
        content: options.systemPrompt,
      },
      {
        role: "user",
        content: [{ type: "text", text: markdown }],
      },
      {
        role: "user",
        content:
          options.prompt !== undefined
            ? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}. If schema is provided, strictly follow it.`
            : "Transform the above content into structured JSON output based on the provided schema if any.",
      },
    ],
    response_format: options.schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "schema",
            schema: schema,
            strict: true,
          },
        }
      : { type: "json_object" },
  });

  const message = jsonCompletion.choices[0].message;

  // `!= null` on purpose: a response that omits the `refusal` field entirely
  // (undefined) is not a refusal.
  if (message.refusal != null) {
    throw new LLMRefusalError(message.refusal);
  }

  extract = message.parsed;

  if (extract === null && message.content !== null) {
    try {
      if (!isExtractEndpoint) {
        extract = JSON.parse(message.content);
      } else {
        const extractData = JSON.parse(message.content);
        extract = options.schema ? extractData.data.extract : extractData;
      }
    } catch (e) {
      logger.error("Failed to parse returned JSON, no schema specified.", {
        error: e,
      });
      throw new LLMRefusalError(
        "Failed to parse returned JSON. Please specify a schema in the extract object.",
      );
    }
  }

  // If the users actually wants the items object, they can specify it as 'required' in the schema
  // otherwise, we just return the items array
  // NOTE(review): for array schemas the wrapper built above always has
  // `required: ["items"]`, so this condition looks like it can never be true
  // and callers receive `{ items: [...] }`. Behaviour kept as-is -- confirm
  // the intended check before changing it.
  if (
    options.schema &&
    options.schema.type === "array" &&
    !schema?.required?.includes("items")
  ) {
    extract = extract?.items;
  }
  return { extract, warning, numTokens };
}

/**
 * Runs LLM extraction for a scraped document when the "extract" format was requested.
 *
 * When `meta.options.formats` does not include "extract" the document is
 * returned untouched. Otherwise `document.extract` is set from the LLM result
 * and `document.warning` is updated if the extraction produced a warning.
 *
 * @param meta - Scrape metadata; `options.formats`, `options.extract` and `logger` are read.
 * @param document - Document to enrich; mutated in place and returned.
 * @returns The same `document` instance.
 */
export async function performLLMExtract(
  meta: Meta,
  document: Document,
): Promise<Document> {
  if (!meta.options.formats.includes("extract")) {
    return document;
  }

  const { extract, warning } = await generateOpenAICompletions(
    meta.logger.child({
      method: "performLLMExtract/generateOpenAICompletions",
    }),
    meta.options.extract!,
    document.markdown,
    document.warning,
  );

  document.extract = extract;

  // generateOpenAICompletions only returns a warning when it had to trim the
  // input (and that warning already has the previous one appended). When it
  // returns undefined, keep whatever warning the document already carried
  // instead of wiping it.
  if (warning !== undefined) {
    document.warning = warning;
  }

  return document;
}

/**
 * Returns a deep copy of `schema` with every `default` keyword removed.
 *
 * - Non-objects and `null` are returned unchanged.
 * - Arrays (at any depth, including the top level) are mapped element by element.
 * - Inside `properties`, `$defs`, `definitions` and `patternProperties` maps,
 *   the keys are user-chosen names rather than schema keywords, so an entry
 *   that happens to be called `default` is kept and only its value is cleaned.
 *
 * The input is not mutated.
 *
 * @param schema - JSON schema (or any fragment of one).
 * @returns The cleaned copy.
 */
export function removeDefaultProperty(schema: any): any {
  if (typeof schema !== "object" || schema === null) return schema;

  // Keep arrays as arrays; destructuring one would turn it into an index-keyed object.
  if (Array.isArray(schema)) {
    return schema.map((item: any) => removeDefaultProperty(item));
  }

  const cleaned: Record<string, any> = {};
  for (const [key, value] of Object.entries(schema)) {
    if (isSchemaNameMap(key, value)) {
      // Keys here are property/definition names: clean the values, keep every name.
      cleaned[key] = Object.fromEntries(
        Object.entries(value as Record<string, unknown>).map(([name, sub]) => [
          name,
          removeDefaultProperty(sub),
        ]),
      );
    } else if (key !== "default") {
      cleaned[key] = removeDefaultProperty(value);
    }
  }

  return cleaned;
}

/**
 * Tells whether `value`, stored under `key`, is a map of names to sub-schemas.
 *
 * The key must be one of the JSON Schema map keywords, and every entry must be
 * an object or boolean (i.e. itself a schema). The second check keeps a user
 * field that is merely *named* e.g. "properties" -- whose value is a schema
 * with string keywords such as `type` -- on the regular code path.
 */
function isSchemaNameMap(key: string, value: unknown): boolean {
  if (
    key !== "properties" &&
    key !== "$defs" &&
    key !== "definitions" &&
    key !== "patternProperties"
  ) {
    return false;
  }
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
    return false;
  }
  return Object.values(value).every(
    (entry) =>
      typeof entry === "boolean" ||
      (typeof entry === "object" && entry !== null && !Array.isArray(entry)),
  );
}

/**
 * Asks the LLM ("gpt-4o") to produce a JSON schema describing the data
 * requested in `prompt`.
 *
 * The request is attempted once per temperature in `[0, 0.1, 0.3]`; any
 * failure (API error, refusal, unparseable JSON) is logged and the next
 * temperature is tried.
 *
 * @param prompt - Natural-language description of the information to extract.
 * @returns The parsed JSON value returned by the model (not validated further).
 * @throws Error if every attempt fails; the message includes the last error's message.
 */
export async function generateSchemaFromPrompt(prompt: string): Promise<any> {
  const openai = new OpenAI();

  const temperatures = [0, 0.1, 0.3]; // Different temperatures to try
  let lastError: Error | null = null;

  for (const temp of temperatures) {
    try {
      const result = await openai.beta.chat.completions.parse({
        model: "gpt-4o",
        temperature: temp,
        messages: [
          {
            role: "system",
            content: `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt.
Consider:
1. The type of data being requested
2. Required fields vs optional fields
3. Appropriate data types for each field
4. Nested objects and arrays where appropriate

Return a valid JSON schema object with properties that would capture the information requested in the prompt.`,
          },
          {
            role: "user",
            content: `Generate a JSON schema for extracting the following information: ${prompt}`,
          },
        ],
        response_format: {
          type: "json_object",
        },
      });

      const message = result.choices[0].message;

      // `!= null` on purpose: a response that omits `refusal` (undefined) is not a refusal.
      if (message.refusal != null) {
        throw new Error("LLM refused to generate schema");
      }

      try {
        // Empty/missing content makes JSON.parse throw, which is reported as a parse failure.
        return JSON.parse(message.content ?? "");
      } catch {
        throw new Error("Failed to parse schema JSON from LLM response");
      }
    } catch (error) {
      // Normalize whatever was thrown so `.message` is always available.
      lastError = error instanceof Error ? error : new Error(String(error));
      logger.warn(
        `Failed attempt with temperature ${temp}: ${lastError.message}`,
      );
    }
  }

  // If we get here, all attempts failed
  throw new Error(
    `Failed to generate schema after all attempts. Last error: ${lastError?.message}`,
  );
}
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`import OpenAI from "openai";`
			`import { encoding_for_model } from "@dqbd/tiktoken";`
			`import { TiktokenModel } from "@dqbd/tiktoken";`
			`import { Document, ExtractOptions } from "../../../controllers/v1/types";`
			`import { Logger } from "winston";`
			`import { EngineResultsTracker, Meta } from "..";`
Nick: extract without a schema should work as expected 2025-01-14 11:37:00 -03:00			`import { logger } from "../../../lib/logger";`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
			`const maxTokens = 32000;`
			`const modifier = 4;`

			`export class LLMRefusalError extends Error {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`public refusal: string;`
			`public results: EngineResultsTracker \| undefined;`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`constructor(refusal: string) {`
			`super("LLM refused to extract the website's content");`
			`this.refusal = refusal;`
			`}`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`

			`function normalizeSchema(x: any): any {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (typeof x !== "object" \|\| x === null) return x;`

			`if (x["$defs"] !== null && typeof x["$defs"] === "object") {`
			`x["$defs"] = Object.fromEntries(`
			`Object.entries(x["$defs"]).map(([name, schema]) => [`
			`name,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`normalizeSchema(schema),`
			`]),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`);`
			`}`

			`if (x && x.anyOf) {`
			`x.anyOf = x.anyOf.map((x) => normalizeSchema(x));`
			`}`

			`if (x && x.oneOf) {`
			`x.oneOf = x.oneOf.map((x) => normalizeSchema(x));`
			`}`

			`if (x && x.allOf) {`
			`x.allOf = x.allOf.map((x) => normalizeSchema(x));`
			`}`

			`if (x && x.not) {`
			`x.not = normalizeSchema(x.not);`
			`}`

			`if (x && x.type === "object") {`
			`return {`
			`...x,`
			`properties: Object.fromEntries(`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)]),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`),`
			`required: Object.keys(x.properties),`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`additionalProperties: false,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
			`} else if (x && x.type === "array") {`
			`return {`
			`...x,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`items: normalizeSchema(x.items),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
			`} else {`
			`return x;`
			`}`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`

Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`export async function generateOpenAICompletions(`
			`logger: Logger,`
			`options: ExtractOptions,`
			`markdown?: string,`
			`previousWarning?: string,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`isExtractEndpoint?: boolean,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`): Promise<{ extract: any; numTokens: number; warning: string \| undefined }> {`
			`let extract: any;`
			`let warning: string \| undefined;`

			`const openai = new OpenAI();`
			`const model: TiktokenModel =`
			`(process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";`

			`if (markdown === undefined) {`
			`throw new Error("document.markdown is undefined -- this is unexpected");`
			`}`

			`// count number of tokens`
			`let numTokens = 0;`
			`const encoder = encoding_for_model(model as TiktokenModel);`
			`try {`
			`// Encode the message into tokens`
			`const tokens = encoder.encode(markdown);`

			`// Return the number of tokens`
			`numTokens = tokens.length;`
			`} catch (error) {`
			`logger.warn("Calculating num tokens of string failed", { error, markdown });`

			`markdown = markdown.slice(0, maxTokens * modifier);`

			`let w =`
			`"Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" +`
			`maxTokens +`
			`") we support.";`
			`warning = previousWarning === undefined ? w : w + " " + previousWarning;`
			`} finally {`
			`// Free the encoder resources after use`
			`encoder.free();`
			`}`

			`if (numTokens > maxTokens) {`
			`// trim the document to the maximum number of tokens, tokens != characters`
			`markdown = markdown.slice(0, maxTokens * modifier);`

			`const w =`
			`"The extraction content would have used more tokens (" +`
			`numTokens +`
			`") than the maximum we allow (" +`
			`maxTokens +`
			`"). -- the input has been automatically trimmed.";`
			`warning = previousWarning === undefined ? w : w + " " + previousWarning;`
			`}`

			`let schema = options.schema;`
merged with main 2024-12-16 11:41:59 -03:00			`if (schema) {`
			`schema = removeDefaultProperty(schema);`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`}`
merged with main 2024-12-16 11:41:59 -03:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (schema && schema.type === "array") {`
			`schema = {`
			`type: "object",`
			`properties: {`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`items: options.schema,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`},`
			`required: ["items"],`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`additionalProperties: false,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
			`} else if (schema && typeof schema === "object" && !schema.type) {`
			`schema = {`
			`type: "object",`
			`properties: Object.fromEntries(`
merged with main 2024-12-16 11:41:59 -03:00			`Object.entries(schema).map(([key, value]) => {`
			`return [key, removeDefaultProperty(value)];`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`}),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`),`
			`required: Object.keys(schema),`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`additionalProperties: false,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
			`}`

			`schema = normalizeSchema(schema);`

			`const jsonCompletion = await openai.beta.chat.completions.parse({`
			`model,`
			`temperature: 0,`
			`messages: [`
			`{`
			`role: "system",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`content: options.systemPrompt,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`},`
			`{`
			`role: "user",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`content: [{ type: "text", text: markdown }],`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`},`
			`{`
			`role: "user",`
			`content:`
			`options.prompt !== undefined`
Update llmExtract.ts 2024-12-18 23:45:43 -03:00			? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}. If schema is provided, strictly follow it.`
Update llmExtract.ts 2024-12-18 16:52:05 -03:00			`: "Transform the above content into structured JSON output based on the provided schema if any.",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`},`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`],`
			`response_format: options.schema`
			`? {`
			`type: "json_schema",`
			`json_schema: {`
(feat/extract) New re-ranker + multi entity extraction (#1061 ) 2025-01-13 22:30:15 -03:00			`name: "schema",`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`schema: schema,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`strict: true,`
			`},`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`}`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`: { type: "json_object" },`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`});`
wip 2024-11-13 18:06:20 -03:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (jsonCompletion.choices[0].message.refusal !== null) {`
			`throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);`
			`}`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`extract = jsonCompletion.choices[0].message.parsed;`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (extract === null && jsonCompletion.choices[0].message.content !== null) {`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`try {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (!isExtractEndpoint) {`
			`extract = JSON.parse(jsonCompletion.choices[0].message.content);`
			`} else {`
			`const extractData = JSON.parse(`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`jsonCompletion.choices[0].message.content,`
wip 2024-11-13 18:06:20 -03:00			`);`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`extract = options.schema ? extractData.data.extract : extractData;`
			`}`
			`} catch (e) {`
			`logger.error("Failed to parse returned JSON, no schema specified.", {`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`error: e,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`});`
			`throw new LLMRefusalError(`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`"Failed to parse returned JSON. Please specify a schema in the extract object.",`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`);`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`}`

			`// If the users actually wants the items object, they can specify it as 'required' in the schema`
			`// otherwise, we just return the items array`
			`if (`
			`options.schema &&`
			`options.schema.type === "array" &&`
			`!schema?.required?.includes("items")`
			`) {`
			`extract = extract?.items;`
			`}`
			`return { extract, warning, numTokens };`
			`}`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`export async function performLLMExtract(`
			`meta: Meta,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`document: Document,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`): Promise<Document> {`
			`if (meta.options.formats.includes("extract")) {`
			`const { extract, warning } = await generateOpenAICompletions(`
			`meta.logger.child({`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`method: "performLLMExtract/generateOpenAICompletions",`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`}),`
			`meta.options.extract!,`
			`document.markdown,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`document.warning,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`);`
			`document.extract = extract;`
			`document.warning = warning;`
			`}`

			`return document;`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`
merged with main 2024-12-16 11:41:59 -03:00
added unit tests 2024-12-16 09:30:40 -03:00			`export function removeDefaultProperty(schema: any): any {`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`if (typeof schema !== "object" \|\| schema === null) return schema;`
fixed optional+default bug on llm schema 2024-12-09 15:34:50 -03:00
			`const { default: _, ...rest } = schema;`

			`for (const key in rest) {`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`if (Array.isArray(rest[key])) {`
			`rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));`
			`} else if (typeof rest[key] === "object" && rest[key] !== null) {`
			`rest[key] = removeDefaultProperty(rest[key]);`
			`}`
fixed optional+default bug on llm schema 2024-12-09 15:34:50 -03:00			`}`

			`return rest;`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`}`
Nick: extract without a schema should work as expected 2025-01-14 11:37:00 -03:00
			`export async function generateSchemaFromPrompt(prompt: string): Promise<any> {`
			`const openai = new OpenAI();`

			`const temperatures = [0, 0.1, 0.3]; // Different temperatures to try`
			`let lastError: Error \| null = null;`

			`for (const temp of temperatures) {`
			`try {`
			`const result = await openai.beta.chat.completions.parse({`
			`model: "gpt-4o",`
			`temperature: temp,`
			`messages: [`
			`{`
			`role: "system",`
			content: `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt.
			`Consider:`
			`1. The type of data being requested`
			`2. Required fields vs optional fields`
			`3. Appropriate data types for each field`
			`4. Nested objects and arrays where appropriate`

			Return a valid JSON schema object with properties that would capture the information requested in the prompt.`,
			`},`
			`{`
			`role: "user",`
			content: `Generate a JSON schema for extracting the following information: ${prompt}`,
			`},`
			`],`
			`response_format: {`
			`type: "json_object",`
			`},`
			`});`

			`if (result.choices[0].message.refusal !== null) {`
			`throw new Error("LLM refused to generate schema");`
			`}`

			`let schema;`
			`try {`
			`schema = JSON.parse(result.choices[0].message.content ?? "");`
			`return schema;`
			`} catch (e) {`
			`throw new Error("Failed to parse schema JSON from LLM response");`
			`}`
			`} catch (error) {`
			`lastError = error as Error;`
			logger.warn(`Failed attempt with temperature ${temp}: ${error.message}`);
			`continue;`
			`}`
			`}`

			`// If we get here, all attempts failed`
			`throw new Error(`
			`Failed to generate schema after all attempts. Last error: ${lastError?.message}`,
			`);`
			`}`