apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts

import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import {
  Document,
  ExtractOptions,
  TokenUsage,
} from "../../../controllers/v1/types";
import { Logger } from "winston";
import { EngineResultsTracker, Meta } from "..";
import { logger } from "../../../lib/logger";

// Upper bound on the tiktoken token count of the markdown sent for extraction;
// inputs that exceed it (or whose token count cannot be computed) are trimmed.
const maxTokens = 32000;
// Multiplier applied to maxTokens to get the character length the markdown is
// sliced to when trimming (presumably a rough characters-per-token estimate).
const modifier = 4;

/**
 * Error raised when the LLM declines to produce an extraction, or when its
 * output cannot be used as one.
 *
 * The `message` is always the same fixed sentence; the model-supplied (or
 * locally generated) explanation is carried separately in `refusal`.
 */
export class LLMRefusalError extends Error {
  /** Optional tracker payload; never set by this class itself. */
  public results: EngineResultsTracker | undefined;

  /**
   * @param refusal Explanation of why no extraction was produced.
   */
  constructor(public refusal: string) {
    super("LLM refused to extract the website's content");
  }
}

/**
 * Returns a normalized copy of a JSON-schema-like value for use as the
 * `json_schema` payload of a `strict: true` structured-output request.
 *
 * For every schema node reachable through `$defs`, `anyOf`, `oneOf`, `allOf`,
 * `not`, `properties` and `items`:
 * - `type: "object"` nodes get every declared property listed in `required`
 *   and `additionalProperties` forced to `false`;
 * - `type: "array"` nodes get their `items` schema normalized.
 *
 * The input is never mutated; a new object is returned for every plain-object
 * node. Primitives, `null` and arrays are returned unchanged.
 *
 * @param x Schema node (or any other value) to normalize.
 * @returns The normalized copy, or `x` itself when it is not a plain object.
 */
function normalizeSchema(x: any): any {
  if (typeof x !== "object" || x === null || Array.isArray(x)) return x;

  // Shallow copy so the caller's schema object is left untouched.
  const out: any = { ...x };

  if (out["$defs"] !== null && typeof out["$defs"] === "object") {
    out["$defs"] = Object.fromEntries(
      Object.entries(out["$defs"]).map(([name, def]) => [
        name,
        normalizeSchema(def),
      ]),
    );
  }

  // Combinators hold lists of sub-schemas; ignore malformed (non-array) values
  // instead of crashing on `.map`.
  for (const combinator of ["anyOf", "oneOf", "allOf"]) {
    if (Array.isArray(out[combinator])) {
      out[combinator] = out[combinator].map((sub: any) =>
        normalizeSchema(sub),
      );
    }
  }

  if (out.not) {
    out.not = normalizeSchema(out.not);
  }

  if (out.type === "object") {
    // An object schema may legally omit `properties`; treat that as "none"
    // rather than throwing from Object.entries(undefined).
    const declared =
      typeof out.properties === "object" && out.properties !== null
        ? out.properties
        : {};

    out.properties = Object.fromEntries(
      Object.entries(declared).map(([key, value]) => [
        key,
        normalizeSchema(value),
      ]),
    );
    out.required = Object.keys(declared);
    out.additionalProperties = false;
  } else if (out.type === "array" && out.items !== undefined) {
    out.items = normalizeSchema(out.items);
  }

  return out;
}

/**
 * Sends `markdown` to an OpenAI chat model and returns the structured JSON
 * the model extracts from it.
 *
 * Steps, in order:
 * 1. Count the tokens of `markdown` with tiktoken; if the count exceeds
 *    `maxTokens` (or cannot be computed at all) the markdown is sliced to
 *    `maxTokens * modifier` characters and a warning is produced.
 * 2. Build the response schema from `options.schema`: strip `default`
 *    keywords, wrap array schemas and type-less property maps in an object
 *    schema, then normalize it.
 * 3. Call the model and read the parsed result, falling back to
 *    `JSON.parse` on the raw message content.
 *
 * @param logger Logger used for warnings and parse failures (shadows the
 *   module-level logger inside this function).
 * @param options Extraction options; `schema`, `systemPrompt` and `prompt`
 *   are read here.
 * @param markdown Content to extract from. Must not be `undefined`.
 * @param previousWarning Existing warning text; appended to any new warning.
 * @param isExtractEndpoint When true and a schema is set, the fallback-parsed
 *   JSON is read from its `data.extract` field instead of being used as-is.
 * @param model Model name used both for tokenization and for the completion.
 * @returns The extracted value, the tiktoken count of the (untrimmed)
 *   markdown (`0` when counting failed), an optional warning, the token usage
 *   reported by the API, and the model name.
 * @throws Error when `markdown` is `undefined`.
 * @throws LLMRefusalError when the model refuses, or when the fallback JSON
 *   parsing of the message content fails.
 */
export async function generateOpenAICompletions(
  logger: Logger,
  options: ExtractOptions,
  markdown?: string,
  previousWarning?: string,
  isExtractEndpoint?: boolean,
  model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ??
    "gpt-4o-mini",
): Promise<{
  extract: any;
  numTokens: number;
  warning: string | undefined;
  totalUsage: TokenUsage;
  model: string;
}> {
  let extract: any;
  let warning: string | undefined;

  const openai = new OpenAI();

  if (markdown === undefined) {
    throw new Error("document.markdown is undefined -- this is unexpected");
  }

  // count number of tokens
  let numTokens = 0;
  // The encoder is created inside the try block so that a model name unknown
  // to tiktoken takes the same trim-and-warn fallback as an encoding failure
  // instead of aborting the whole extraction.
  let encoder: ReturnType<typeof encoding_for_model> | undefined;
  try {
    encoder = encoding_for_model(model);
    numTokens = encoder.encode(markdown).length;
  } catch (error) {
    logger.warn("Calculating num tokens of string failed", { error, markdown });

    markdown = markdown.slice(0, maxTokens * modifier);

    const w =
      "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" +
      maxTokens +
      ") we support.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  } finally {
    // Free the encoder resources after use (it may never have been created)
    encoder?.free();
  }

  if (numTokens > maxTokens) {
    // trim the document to the maximum number of tokens, tokens != characters
    markdown = markdown.slice(0, maxTokens * modifier);

    const w =
      "The extraction content would have used more tokens (" +
      numTokens +
      ") than the maximum we allow (" +
      maxTokens +
      "). -- the input has been automatically trimmed.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  }

  let schema = options.schema;
  if (schema) {
    schema = removeDefaultProperty(schema);
  }

  if (schema && schema.type === "array") {
    // The response root is an object, so an array schema is wrapped under an
    // `items` property. Use the default-stripped copy, not `options.schema`,
    // so `default` keywords do not leak into the request.
    schema = {
      type: "object",
      properties: {
        items: schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  } else if (schema && typeof schema === "object" && !schema.type) {
    // A type-less object is treated as a bare map of property name -> schema.
    schema = {
      type: "object",
      properties: Object.fromEntries(
        Object.entries(schema).map(([key, value]) => {
          return [key, removeDefaultProperty(value)];
        }),
      ),
      required: Object.keys(schema),
      additionalProperties: false,
    };
  }

  schema = normalizeSchema(schema);

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    temperature: 0,
    messages: [
      {
        role: "system",
        content: options.systemPrompt,
      },
      {
        role: "user",
        content: [{ type: "text", text: markdown }],
      },
      {
        role: "user",
        content:
          options.prompt !== undefined
            ? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}. If schema is provided, strictly follow it.`
            : "Transform the above content into structured JSON output based on the provided schema if any.",
      },
    ],
    // Without a schema, only ask for "some JSON object".
    response_format: options.schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "schema",
            schema: schema,
            strict: true,
          },
        }
      : { type: "json_object" },
  });

  const message = jsonCompletion.choices[0].message;

  if (message.refusal !== null) {
    throw new LLMRefusalError(message.refusal);
  }

  extract = message.parsed;

  // Fallback: no parsed value available, so parse the raw content ourselves.
  if (extract === null && message.content !== null) {
    try {
      if (!isExtractEndpoint) {
        extract = JSON.parse(message.content);
      } else {
        const extractData = JSON.parse(message.content);
        extract = options.schema ? extractData.data.extract : extractData;
      }
    } catch (e) {
      logger.error("Failed to parse returned JSON, no schema specified.", {
        error: e,
      });
      throw new LLMRefusalError(
        "Failed to parse returned JSON. Please specify a schema in the extract object.",
      );
    }
  }

  const promptTokens = jsonCompletion.usage?.prompt_tokens ?? 0;
  const completionTokens = jsonCompletion.usage?.completion_tokens ?? 0;

  // If the users actually wants the items object, they can specify it as 'required' in the schema
  // otherwise, we just return the items array
  // NOTE(review): for array schemas the wrapper built above always lists
  // "items" in `required`, so this condition looks like it can never be true
  // and callers receive `{ items: [...] }`. Left unchanged because fixing it
  // would alter the returned shape -- confirm the intended behavior.
  if (
    options.schema &&
    options.schema.type === "array" &&
    !schema?.required?.includes("items")
  ) {
    extract = extract?.items;
  }
  // num tokens (just user prompt tokenized) | deprecated
  // totalTokens = promptTokens + completionTokens
  return {
    extract,
    warning,
    numTokens,
    totalUsage: {
      promptTokens,
      completionTokens,
      totalTokens: promptTokens + completionTokens,
    },
    model,
  };
}

/**
 * Runs LLM extraction on a scraped document when the "extract" format was
 * requested, storing the result on the document.
 *
 * The extraction result is written to `document.json` when "json" is also
 * among the requested formats, otherwise to `document.extract`.
 *
 * @param meta Scrape context; `options.formats`, `options.extract` and
 *   `logger` are read.
 * @param document Document to extract from; mutated in place and returned.
 * @returns The same `document` instance.
 */
export async function performLLMExtract(
  meta: Meta,
  document: Document,
): Promise<Document> {
  if (!meta.options.formats.includes("extract")) {
    return document;
  }

  const { extract, warning } = await generateOpenAICompletions(
    meta.logger.child({
      method: "performLLMExtract/generateOpenAICompletions",
    }),
    meta.options.extract!,
    document.markdown,
    document.warning,
  );

  if (meta.options.formats.includes("json")) {
    document.json = extract;
  } else {
    document.extract = extract;
  }

  // A returned warning already has the previous document warning appended.
  // When no new warning was produced the result is `undefined`, and assigning
  // it would silently erase a warning set earlier -- so keep the old one.
  if (warning !== undefined) {
    document.warning = warning;
  }

  return document;
}

/**
 * Returns a deep copy of `schema` with every key named `default` removed
 * from every object, at any nesting depth.
 *
 * Arrays are preserved as arrays (including a top-level array and arrays
 * nested directly inside arrays); primitives and `null` are returned as-is.
 * The input is not mutated.
 *
 * Note: the removal is purely key-based, so a property that is literally
 * named `default` inside a `properties` map is removed as well.
 *
 * @param schema Any JSON-like value, typically a JSON schema.
 * @returns The copy without `default` keys.
 */
export function removeDefaultProperty(schema: any): any {
  if (typeof schema !== "object" || schema === null) return schema;

  // Handle arrays before destructuring: object-rest on an array would turn it
  // into a plain object with index keys.
  if (Array.isArray(schema)) {
    return schema.map((item: any) => removeDefaultProperty(item));
  }

  const { default: _, ...rest } = schema;

  for (const key of Object.keys(rest)) {
    rest[key] = removeDefaultProperty(rest[key]);
  }

  return rest;
}

/**
 * Asks the model ("gpt-4o") to produce a JSON schema describing the data a
 * natural-language extraction prompt is asking for.
 *
 * The request is attempted once per temperature in `[0, 0.1, 0.3]`, in that
 * order; the first response that parses as JSON is returned. A refusal, a
 * request failure or unparseable content moves on to the next temperature.
 *
 * @param prompt Description of the information to extract.
 * @returns The parsed JSON value returned by the model (not validated).
 * @throws Error when every attempt fails; the message includes the last
 *   failure's message.
 */
export async function generateSchemaFromPrompt(prompt: string): Promise<any> {
  const openai = new OpenAI();

  const temperatures = [0, 0.1, 0.3]; // Different temperatures to try
  let lastError: Error | null = null;

  // Identical for every attempt, so build it once outside the loop.
  const systemPrompt = `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt.
Consider:
1. The type of data being requested
2. Required fields vs optional fields
3. Appropriate data types for each field
4. Nested objects and arrays where appropriate

Valid JSON schema, has to be simple. No crazy properties. OpenAI has to support it.
Supported types
The following types are supported for Structured Outputs:

String
Number
Boolean
Integer
Object
Array
Enum
anyOf

Formats are not supported. Min/max are not supported. Anything beyond the above is not supported. Keep it simple with types and descriptions.
Optionals are not supported.
DO NOT USE FORMATS.
Keep it simple. Don't create too many properties, just the ones that are needed. Don't invent properties.
Return a valid JSON schema object with properties that would capture the information requested in the prompt.`;

  for (const temp of temperatures) {
    try {
      const result = await openai.beta.chat.completions.parse({
        model: "gpt-4o",
        temperature: temp,
        messages: [
          {
            role: "system",
            content: systemPrompt,
          },
          {
            role: "user",
            content: `Generate a JSON schema for extracting the following information: ${prompt}`,
          },
        ],
        response_format: {
          type: "json_object",
        },
      });

      const message = result.choices[0].message;

      if (message.refusal !== null) {
        throw new Error("LLM refused to generate schema");
      }

      try {
        // Empty/missing content is not valid JSON and fails here on purpose.
        return JSON.parse(message.content ?? "");
      } catch {
        throw new Error("Failed to parse schema JSON from LLM response");
      }
    } catch (error) {
      // A thrown value is not guaranteed to be an Error; normalize it so the
      // log line and the final message never read "undefined".
      const failure = error instanceof Error ? error : new Error(String(error));
      lastError = failure;
      logger.warn(
        `Failed attempt with temperature ${temp}: ${failure.message}`,
      );
    }
  }

  // If we get here, all attempts failed
  throw new Error(
    `Failed to generate schema after all attempts. Last error: ${lastError?.message}`,
  );
}
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`import OpenAI from "openai";`
			`import { encoding_for_model } from "@dqbd/tiktoken";`
			`import { TiktokenModel } from "@dqbd/tiktoken";`
Nick: formatting done 2025-01-22 18:47:44 -03:00			`import {`
			`Document,`
			`ExtractOptions,`
			`TokenUsage,`
			`} from "../../../controllers/v1/types";`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`import { Logger } from "winston";`
			`import { EngineResultsTracker, Meta } from "..";`
Nick: extract without a schema should work as expected 2025-01-14 11:37:00 -03:00			`import { logger } from "../../../lib/logger";`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
			`const maxTokens = 32000;`
			`const modifier = 4;`

			`export class LLMRefusalError extends Error {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`public refusal: string;`
			`public results: EngineResultsTracker \| undefined;`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`constructor(refusal: string) {`
			`super("LLM refused to extract the website's content");`
			`this.refusal = refusal;`
			`}`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`

			`function normalizeSchema(x: any): any {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (typeof x !== "object" \|\| x === null) return x;`

			`if (x["$defs"] !== null && typeof x["$defs"] === "object") {`
			`x["$defs"] = Object.fromEntries(`
			`Object.entries(x["$defs"]).map(([name, schema]) => [`
			`name,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`normalizeSchema(schema),`
			`]),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`);`
			`}`

			`if (x && x.anyOf) {`
			`x.anyOf = x.anyOf.map((x) => normalizeSchema(x));`
			`}`

			`if (x && x.oneOf) {`
			`x.oneOf = x.oneOf.map((x) => normalizeSchema(x));`
			`}`

			`if (x && x.allOf) {`
			`x.allOf = x.allOf.map((x) => normalizeSchema(x));`
			`}`

			`if (x && x.not) {`
			`x.not = normalizeSchema(x.not);`
			`}`

			`if (x && x.type === "object") {`
			`return {`
			`...x,`
			`properties: Object.fromEntries(`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)]),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`),`
			`required: Object.keys(x.properties),`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`additionalProperties: false,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
			`} else if (x && x.type === "array") {`
			`return {`
			`...x,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`items: normalizeSchema(x.items),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
			`} else {`
			`return x;`
			`}`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`

Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`export async function generateOpenAICompletions(`
			`logger: Logger,`
			`options: ExtractOptions,`
			`markdown?: string,`
			`previousWarning?: string,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`isExtractEndpoint?: boolean,`
Nick: formatting done 2025-01-22 18:47:44 -03:00			`model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ??`
			`"gpt-4o-mini",`
			`): Promise<{`
			`extract: any;`
			`numTokens: number;`
			`warning: string \| undefined;`
			`totalUsage: TokenUsage;`
			`model: string;`
			`}> {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`let extract: any;`
			`let warning: string \| undefined;`

			`const openai = new OpenAI();`
Reapply "Nick:" 2025-01-22 17:26:32 -03:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (markdown === undefined) {`
			`throw new Error("document.markdown is undefined -- this is unexpected");`
			`}`

			`// count number of tokens`
			`let numTokens = 0;`
			`const encoder = encoding_for_model(model as TiktokenModel);`
			`try {`
			`// Encode the message into tokens`
			`const tokens = encoder.encode(markdown);`

			`// Return the number of tokens`
			`numTokens = tokens.length;`
			`} catch (error) {`
			`logger.warn("Calculating num tokens of string failed", { error, markdown });`

			`markdown = markdown.slice(0, maxTokens * modifier);`

			`let w =`
			`"Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" +`
			`maxTokens +`
			`") we support.";`
			`warning = previousWarning === undefined ? w : w + " " + previousWarning;`
			`} finally {`
			`// Free the encoder resources after use`
			`encoder.free();`
			`}`

			`if (numTokens > maxTokens) {`
			`// trim the document to the maximum number of tokens, tokens != characters`
			`markdown = markdown.slice(0, maxTokens * modifier);`

			`const w =`
			`"The extraction content would have used more tokens (" +`
			`numTokens +`
			`") than the maximum we allow (" +`
			`maxTokens +`
			`"). -- the input has been automatically trimmed.";`
			`warning = previousWarning === undefined ? w : w + " " + previousWarning;`
			`}`

			`let schema = options.schema;`
merged with main 2024-12-16 11:41:59 -03:00			`if (schema) {`
			`schema = removeDefaultProperty(schema);`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`}`
merged with main 2024-12-16 11:41:59 -03:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (schema && schema.type === "array") {`
			`schema = {`
			`type: "object",`
			`properties: {`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`items: options.schema,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`},`
			`required: ["items"],`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`additionalProperties: false,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
			`} else if (schema && typeof schema === "object" && !schema.type) {`
			`schema = {`
			`type: "object",`
			`properties: Object.fromEntries(`
merged with main 2024-12-16 11:41:59 -03:00			`Object.entries(schema).map(([key, value]) => {`
			`return [key, removeDefaultProperty(value)];`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`}),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`),`
			`required: Object.keys(schema),`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`additionalProperties: false,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
			`}`

			`schema = normalizeSchema(schema);`

			`const jsonCompletion = await openai.beta.chat.completions.parse({`
			`model,`
			`temperature: 0,`
			`messages: [`
			`{`
			`role: "system",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`content: options.systemPrompt,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`},`
			`{`
			`role: "user",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`content: [{ type: "text", text: markdown }],`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`},`
			`{`
			`role: "user",`
			`content:`
			`options.prompt !== undefined`
Update llmExtract.ts 2024-12-18 23:45:43 -03:00			? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}. If schema is provided, strictly follow it.`
Update llmExtract.ts 2024-12-18 16:52:05 -03:00			`: "Transform the above content into structured JSON output based on the provided schema if any.",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`},`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`],`
			`response_format: options.schema`
			`? {`
			`type: "json_schema",`
			`json_schema: {`
(feat/extract) New re-ranker + multi entity extraction (#1061 ) 2025-01-13 22:30:15 -03:00			`name: "schema",`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`schema: schema,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`strict: true,`
			`},`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`}`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`: { type: "json_object" },`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`});`
wip 2024-11-13 18:06:20 -03:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (jsonCompletion.choices[0].message.refusal !== null) {`
			`throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);`
			`}`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`extract = jsonCompletion.choices[0].message.parsed;`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (extract === null && jsonCompletion.choices[0].message.content !== null) {`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`try {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (!isExtractEndpoint) {`
			`extract = JSON.parse(jsonCompletion.choices[0].message.content);`
			`} else {`
			`const extractData = JSON.parse(`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`jsonCompletion.choices[0].message.content,`
wip 2024-11-13 18:06:20 -03:00			`);`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`extract = options.schema ? extractData.data.extract : extractData;`
			`}`
			`} catch (e) {`
			`logger.error("Failed to parse returned JSON, no schema specified.", {`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`error: e,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`});`
			`throw new LLMRefusalError(`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`"Failed to parse returned JSON. Please specify a schema in the extract object.",`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`);`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`}`

Nick: formatting done 2025-01-22 18:47:44 -03:00			`const promptTokens = jsonCompletion.usage?.prompt_tokens ?? 0;`
			`const completionTokens = jsonCompletion.usage?.completion_tokens ?? 0;`
Reapply "Merge pull request #1068 from mendableai/nsc/llm-usage-extract" 2025-01-19 22:04:12 -03:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`// If the users actually wants the items object, they can specify it as 'required' in the schema`
			`// otherwise, we just return the items array`
			`if (`
			`options.schema &&`
			`options.schema.type === "array" &&`
			`!schema?.required?.includes("items")`
			`) {`
			`extract = extract?.items;`
			`}`
Reapply "Merge pull request #1068 from mendableai/nsc/llm-usage-extract" 2025-01-19 22:04:12 -03:00			`// num tokens (just user prompt tokenized) \| deprecated`
			`// totalTokens = promptTokens + completionTokens`
Nick: formatting done 2025-01-22 18:47:44 -03:00			`return {`
			`extract,`
			`warning,`
			`numTokens,`
			`totalUsage: {`
			`promptTokens,`
			`completionTokens,`
			`totalTokens: promptTokens + completionTokens,`
			`},`
			`model,`
			`};`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`}`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`export async function performLLMExtract(`
			`meta: Meta,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`document: Document,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`): Promise<Document> {`
			`if (meta.options.formats.includes("extract")) {`
			`const { extract, warning } = await generateOpenAICompletions(`
			`meta.logger.child({`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`method: "performLLMExtract/generateOpenAICompletions",`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`}),`
			`meta.options.extract!,`
			`document.markdown,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`document.warning,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`);`
Nick: formatting done 2025-01-22 18:47:44 -03:00
Nick: 2025-01-18 17:17:42 -03:00			`if (meta.options.formats.includes("json")) {`
			`document.json = extract;`
			`} else {`
			`document.extract = extract;`
			`}`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`document.warning = warning;`
			`}`

			`return document;`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`
merged with main 2024-12-16 11:41:59 -03:00
added unit tests 2024-12-16 09:30:40 -03:00			`export function removeDefaultProperty(schema: any): any {`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`if (typeof schema !== "object" \|\| schema === null) return schema;`
fixed optional+default bug on llm schema 2024-12-09 15:34:50 -03:00
			`const { default: _, ...rest } = schema;`

			`for (const key in rest) {`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`if (Array.isArray(rest[key])) {`
			`rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));`
			`} else if (typeof rest[key] === "object" && rest[key] !== null) {`
			`rest[key] = removeDefaultProperty(rest[key]);`
			`}`
fixed optional+default bug on llm schema 2024-12-09 15:34:50 -03:00			`}`

			`return rest;`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`}`
Nick: extract without a schema should work as expected 2025-01-14 11:37:00 -03:00
			`export async function generateSchemaFromPrompt(prompt: string): Promise<any> {`
			`const openai = new OpenAI();`

			`const temperatures = [0, 0.1, 0.3]; // Different temperatures to try`
			`let lastError: Error \| null = null;`

			`for (const temp of temperatures) {`
			`try {`
			`const result = await openai.beta.chat.completions.parse({`
			`model: "gpt-4o",`
			`temperature: temp,`
			`messages: [`
			`{`
			`role: "system",`
			content: `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt.
			`Consider:`
			`1. The type of data being requested`
			`2. Required fields vs optional fields`
			`3. Appropriate data types for each field`
			`4. Nested objects and arrays where appropriate`

Reapply "Merge pull request #1068 from mendableai/nsc/llm-usage-extract" 2025-01-19 22:04:12 -03:00			`Valid JSON schema, has to be simple. No crazy properties. OpenAI has to support it.`
			`Supported types`
			`The following types are supported for Structured Outputs:`

			`String`
			`Number`
			`Boolean`
			`Integer`
			`Object`
			`Array`
			`Enum`
			`anyOf`

			`Formats are not supported. Min/max are not supported. Anything beyond the above is not supported. Keep it simple with types and descriptions.`
			`Optionals are not supported.`
Update llmExtract.ts 2025-01-19 22:18:51 -03:00			`DO NOT USE FORMATS.`
Reapply "Merge pull request #1068 from mendableai/nsc/llm-usage-extract" 2025-01-19 22:04:12 -03:00			`Keep it simple. Don't create too many properties, just the ones that are needed. Don't invent properties.`
Nick: extract without a schema should work as expected 2025-01-14 11:37:00 -03:00			Return a valid JSON schema object with properties that would capture the information requested in the prompt.`,
			`},`
			`{`
			`role: "user",`
			content: `Generate a JSON schema for extracting the following information: ${prompt}`,
			`},`
			`],`
			`response_format: {`
			`type: "json_object",`
			`},`
			`});`

			`if (result.choices[0].message.refusal !== null) {`
			`throw new Error("LLM refused to generate schema");`
			`}`

			`let schema;`
			`try {`
			`schema = JSON.parse(result.choices[0].message.content ?? "");`
			`return schema;`
			`} catch (e) {`
			`throw new Error("Failed to parse schema JSON from LLM response");`
			`}`
			`} catch (error) {`
			`lastError = error as Error;`
			logger.warn(`Failed attempt with temperature ${temp}: ${error.message}`);
			`continue;`
			`}`
			`}`

			`// If we get here, all attempts failed`
			`throw new Error(`
			`Failed to generate schema after all attempts. Last error: ${lastError?.message}`,
			`);`
			`}`