apps/api/src/lib/extract/completions/batchExtract.ts

import {
  generateCompletions,
  GenerateCompletionsOptions,
} from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document";
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
import { Document } from "../../../controllers/v1/types";
import {
  buildBatchExtractPrompt,
  buildBatchExtractSystemPrompt,
} from "../build-prompts";
import { getModel } from "../../generic-ai";

import fs from "fs/promises";
import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape";
import type { Logger } from "winston";

/**
 * Inputs accepted by `batchExtractPromise`.
 */
interface BatchExtractOptions {
  /** Schema handed to the completion options and to the system-prompt builder. */
  multiEntitySchema: any;
  /** Links passed to `buildBatchExtractSystemPrompt`. */
  links: string[];
  /** User prompt, passed through `buildBatchExtractPrompt`. */
  prompt: string;
  /** Base system prompt, passed through `buildBatchExtractSystemPrompt`. */
  systemPrompt: string;
  /** Document whose content (via `buildDocument`) and metadata URL are used. */
  doc: Document;
  /** Forwarded unchanged to `extractData`. */
  useAgent: boolean;
  /** Optional identifier forwarded unchanged to `extractData`. */
  extractId?: string;
}

/**
 * Runs a multi-entity extraction for a single document.
 *
 * The document is rendered with `buildDocument`, the prompts are assembled with
 * the batch-extract prompt builders, and the work is delegated to `extractData`
 * using the "gemini-2.0-flash" model obtained from `getModel`.
 *
 * If `extractData` throws, the error is logged and the function still resolves:
 * `extract` is an empty array, `warning` is undefined and all cost / call
 * counters are zero.
 *
 * @param options - Schema, links, prompts, source document, agent flag and
 *   optional extract id (see `BatchExtractOptions`).
 * @param logger - Logger used for error reporting; a child logger derived from
 *   it is passed along in the completion options.
 * @returns The extracted data, an optional warning, the document URL as the
 *   single source, and the cost / call counters reported by `extractData`.
 *   `numTokens` and `totalUsage` token counts are currently always zero.
 */
export async function batchExtractPromise(options: BatchExtractOptions, logger: Logger): Promise<{
  extract: any; // array of extracted data
  numTokens: number;
  totalUsage: TokenUsage;
  warning?: string;
  sources: string[];
  smartScrapeCost: number;
  otherCost: number;
  smartScrapeCallCount: number;
  otherCallCount: number;
}> {
  // Single definition of the model name used both for the request and for the
  // usage record in the result.
  const modelName = "gemini-2.0-flash";

  const {
    multiEntitySchema: schema,
    links,
    prompt: userPrompt,
    systemPrompt: baseSystemPrompt,
    doc,
    useAgent,
    extractId,
  } = options;

  const completionLogger = logger.child({
    method: "extractService/generateCompletions",
  });

  const generationOptions: GenerateCompletionsOptions = {
    logger: completionLogger,
    options: {
      mode: "llm",
      systemPrompt: buildBatchExtractSystemPrompt(baseSystemPrompt, schema, links),
      prompt: buildBatchExtractPrompt(userPrompt),
      schema,
    },
    markdown: buildDocument(doc),
    isExtractEndpoint: true,
    model: getModel(modelName, "google"),
  };

  // Values returned as-is when extractData throws.
  let outcome: {
    data: any[];
    warning: string | undefined;
    smartScrapeCost: number;
    otherCost: number;
    smartScrapeCallCount: number;
    otherCallCount: number;
  } = {
    data: [],
    warning: undefined,
    smartScrapeCost: 0,
    otherCost: 0,
    smartScrapeCallCount: 0,
    otherCallCount: 0,
  };

  try {
    const {
      extractedDataArray,
      warning,
      smartScrapeCost,
      otherCost,
      smartScrapeCallCount,
      otherCallCount,
    } = await extractData({
      extractOptions: generationOptions,
      // NOTE(review): prefers sourceURL here, whereas `sources` below prefers
      // url — confirm whether the different precedence is intentional.
      urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
      useAgent,
      extractId,
    });

    outcome = {
      data: extractedDataArray,
      warning,
      smartScrapeCost,
      otherCost,
      smartScrapeCallCount,
      otherCallCount,
    };
  } catch (error) {
    // Best effort: log and fall through with the default (empty) outcome.
    logger.error("extractData failed", { error });
  }

  // Debug aid (disabled):
  // await fs.writeFile(
  //   `logs/extractedDataArray-${crypto.randomUUID()}.json`,
  //   JSON.stringify(extractedDataArray, null, 2),
  // );

  // TODO: fix this — presumably refers to the token counts below, which are
  // hard-coded to zero rather than taken from the extraction.
  const totalUsage: TokenUsage = {
    promptTokens: 0,
    completionTokens: 0,
    totalTokens: 0,
    model: modelName,
  };

  return {
    extract: outcome.data,
    numTokens: 0,
    totalUsage,
    warning: outcome.warning,
    sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
    smartScrapeCost: outcome.smartScrapeCost,
    otherCost: outcome.otherCost,
    smartScrapeCallCount: outcome.smartScrapeCallCount,
    otherCallCount: outcome.otherCallCount,
  };
}
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00			`import {`
			`generateCompletions,`
			`GenerateCompletionsOptions,`
			`} from "../../../scraper/scrapeURL/transformers/llmExtract";`
(feat/extract) Refactor and Reranker improvements (#1100 ) 2025-01-27 20:07:01 -03:00			`import { buildDocument } from "../build-document";`
			`import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";`
			`import { Document } from "../../../controllers/v1/types";`
			`import {`
			`buildBatchExtractPrompt,`
			`buildBatchExtractSystemPrompt,`
			`} from "../build-prompts";`
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00			`import { getModel } from "../../generic-ai";`

			`import fs from "fs/promises";`
			`import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape";`
feat(batchExtract): thingymajig 2025-04-15 20:28:53 -07:00			`import type { Logger } from "winston";`
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00
			`type BatchExtractOptions = {`
			`multiEntitySchema: any;`
			`links: string[];`
			`prompt: string;`
			`systemPrompt: string;`
			`doc: Document;`
			`useAgent: boolean;`
correlate with eid 2025-04-15 23:06:13 -07:00			`extractId?: string;`
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00			`};`
(feat/extract) Refactor and Reranker improvements (#1100 ) 2025-01-27 20:07:01 -03:00
			`/**`
			`* Batch extract information from a list of URLs using a multi-entity schema.`
			`* @param multiEntitySchema - The schema for the multi-entity extraction`
			`* @param links - The URLs to extract information from`
			`* @param prompt - The prompt for the extraction`
			`* @param systemPrompt - The system prompt for the extraction`
			`* @param doc - The document to extract information from`
			`* @returns The completion promise`
			`*/`
feat(batchExtract): thingymajig 2025-04-15 20:28:53 -07:00			`export async function batchExtractPromise(options: BatchExtractOptions, logger: Logger): Promise<{`
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00			`extract: any; // array of extracted data`
(feat/extract) Refactor and Reranker improvements (#1100 ) 2025-01-27 20:07:01 -03:00			`numTokens: number;`
			`totalUsage: TokenUsage;`
			`warning?: string;`
			`sources: string[];`
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00			`smartScrapeCost: number;`
			`otherCost: number;`
			`smartScrapeCallCount: number;`
			`otherCallCount: number;`
(feat/extract) Refactor and Reranker improvements (#1100 ) 2025-01-27 20:07:01 -03:00			`}> {`
correlate with eid 2025-04-15 23:06:13 -07:00			`const { multiEntitySchema, links, prompt, systemPrompt, doc, useAgent, extractId } = options;`
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00

			`const generationOptions: GenerateCompletionsOptions = {`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`logger: logger.child({`
			`method: "extractService/generateCompletions",`
(feat/extract) Refactor and Reranker improvements (#1100 ) 2025-01-27 20:07:01 -03:00			`}),`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`options: {`
(feat/extract) Refactor and Reranker improvements (#1100 ) 2025-01-27 20:07:01 -03:00			`mode: "llm",`
			`systemPrompt: buildBatchExtractSystemPrompt(`
			`systemPrompt,`
			`multiEntitySchema,`
			`links,`
			`),`
			`prompt: buildBatchExtractPrompt(prompt),`
			`schema: multiEntitySchema,`
			`},`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`markdown: buildDocument(doc),`
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00			`isExtractEndpoint: true,`
			`model: getModel("gemini-2.0-flash", "google"),`
			`};`

			`let extractedDataArray: any[] = [];`
			`let warning: string \| undefined;`
			`let smCost = 0, oCost = 0, smCallCount = 0, oCallCount = 0;`
			`try {`
			`const { extractedDataArray: e, warning: w, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({`
			`extractOptions: generationOptions,`
			`urls: [doc.metadata.sourceURL \|\| doc.metadata.url \|\| ""],`
			`useAgent,`
correlate with eid 2025-04-15 23:06:13 -07:00			`extractId,`
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00			`});`
			`extractedDataArray = e;`
			`warning = w;`
			`smCost = smartScrapeCost;`
			`oCost = otherCost;`
			`smCallCount = smartScrapeCallCount;`
			`oCallCount = otherCallCount;`
			`} catch (error) {`
feat(batchExtract): thingymajig 2025-04-15 20:28:53 -07:00			`logger.error("extractData failed", { error });`
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00			`}`
(feat/extract) Refactor and Reranker improvements (#1100 ) 2025-01-27 20:07:01 -03:00
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00			`// await fs.writeFile(`
			// `logs/extractedDataArray-${crypto.randomUUID()}.json`,
			`// JSON.stringify(extractedDataArray, null, 2),`
			`// );`

			`// TODO: fix this`
(feat/extract) Refactor and Reranker improvements (#1100 ) 2025-01-27 20:07:01 -03:00			`return {`
(feat/fire-1) FIRE-1 (#1462 ) 2025-04-15 00:19:45 -07:00			`extract: extractedDataArray,`
			`numTokens: 0,`
			`totalUsage: {`
			`promptTokens: 0,`
			`completionTokens: 0,`
			`totalTokens: 0,`
			`model: "gemini-2.0-flash",`
			`},`
			`warning: warning,`
			`sources: [doc.metadata.url \|\| doc.metadata.sourceURL \|\| ""],`
			`smartScrapeCost: smCost,`
			`otherCost: oCost,`
			`smartScrapeCallCount: smCallCount,`
			`otherCallCount: oCallCount,`
(feat/extract) Refactor and Reranker improvements (#1100 ) 2025-01-27 20:07:01 -03:00			`};`
			`}`