Files
firecrawl/apps/api/src/lib/extract/completions/batchExtract.ts
T

113 lines
3.3 KiB
TypeScript
Raw Normal View History

2025-04-15 00:19:45 -07:00
import {
generateCompletions,
GenerateCompletionsOptions,
} from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document";
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
import { Document } from "../../../controllers/v1/types";
import {
buildBatchExtractPrompt,
buildBatchExtractSystemPrompt,
} from "../build-prompts";
2025-04-15 00:19:45 -07:00
import { getModel } from "../../generic-ai";
import fs from "fs/promises";
import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape";
2025-04-15 20:28:53 -07:00
import type { Logger } from "winston";
2025-04-15 00:19:45 -07:00
/**
 * Inputs for {@link batchExtractPromise}.
 */
type BatchExtractOptions = {
  /**
   * Schema for the multi-entity extraction. It is embedded in the system
   * prompt and also passed as the completion's output schema. The shape is
   * not constrained at this layer.
   */
  multiEntitySchema: any;
  /** URLs that are handed to the system-prompt builder. */
  links: string[];
  /** User prompt; wrapped by `buildBatchExtractPrompt` before use. */
  prompt: string;
  /** Base system prompt; wrapped by `buildBatchExtractSystemPrompt` before use. */
  systemPrompt: string;
  /**
   * Document to extract from. It is rendered via `buildDocument`, and its
   * `metadata.sourceURL` / `metadata.url` identify the page.
   */
  doc: Document;
  /** Forwarded unchanged to `extractData`. */
  useAgent: boolean;
  /** Optional extract job identifier, forwarded unchanged to `extractData`. */
  extractId?: string;
};
/**
 * Runs a multi-entity extraction over a single document.
 *
 * Builds the completion options from the supplied prompts and schema, then
 * delegates to `extractData` for the document's URL. If `extractData` throws,
 * the error is logged and the function still resolves, with an empty
 * `extract` array, no warning, and all cost/call counters at zero.
 *
 * @param options - Extraction inputs:
 *   - `multiEntitySchema`: schema used in the system prompt and as the output schema
 *   - `links`: URLs passed to the system-prompt builder
 *   - `prompt`: user prompt for the extraction
 *   - `systemPrompt`: base system prompt for the extraction
 *   - `doc`: document to extract information from
 *   - `useAgent`: forwarded to `extractData`
 *   - `extractId`: optional identifier forwarded to `extractData`
 * @param logger - Logger used to report failures; a child logger is attached
 *   to the completion options.
 * @returns The extracted data, any warning from `extractData`, the source URL
 *   of the document, and the cost/call counters. Token counts are currently
 *   hard-coded to zero.
 */
export async function batchExtractPromise(
  options: BatchExtractOptions,
  logger: Logger,
): Promise<{
  extract: any; // array of extracted data
  numTokens: number;
  totalUsage: TokenUsage;
  warning?: string;
  sources: string[];
  smartScrapeCost: number;
  otherCost: number;
  smartScrapeCallCount: number;
  otherCallCount: number;
}> {
  const {
    multiEntitySchema,
    links,
    prompt,
    systemPrompt,
    doc,
    useAgent,
    extractId,
  } = options;

  // Single source of truth for the model name used both for the completion
  // and for the reported usage record.
  const modelName = "gemini-2.0-flash";

  const generationOptions: GenerateCompletionsOptions = {
    logger: logger.child({
      method: "extractService/generateCompletions",
    }),
    options: {
      mode: "llm",
      systemPrompt: buildBatchExtractSystemPrompt(
        systemPrompt,
        multiEntitySchema,
        links,
      ),
      prompt: buildBatchExtractPrompt(prompt),
      schema: multiEntitySchema,
    },
    markdown: buildDocument(doc),
    isExtractEndpoint: true,
    model: getModel(modelName, "google"),
  };

  // Defaults returned when extractData fails.
  let extractedDataArray: any[] = [];
  let warning: string | undefined;
  let smartScrapeCost = 0;
  let otherCost = 0;
  let smartScrapeCallCount = 0;
  let otherCallCount = 0;

  try {
    const result = await extractData({
      extractOptions: generationOptions,
      urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
      useAgent,
      extractId,
    });
    extractedDataArray = result.extractedDataArray;
    warning = result.warning;
    smartScrapeCost = result.smartScrapeCost;
    otherCost = result.otherCost;
    smartScrapeCallCount = result.smartScrapeCallCount;
    otherCallCount = result.otherCallCount;
  } catch (error) {
    // Best effort: log and fall through with the empty defaults above
    // instead of rejecting.
    logger.error("extractData failed", { error });
  }

  // await fs.writeFile(
  //   `logs/extractedDataArray-${crypto.randomUUID()}.json`,
  //   JSON.stringify(extractedDataArray, null, 2),
  // );

  // TODO: fix this - numTokens and totalUsage below are placeholder zeros;
  // no token usage is read from the extractData result.
  return {
    extract: extractedDataArray,
    numTokens: 0,
    totalUsage: {
      promptTokens: 0,
      completionTokens: 0,
      totalTokens: 0,
      model: modelName,
    },
    warning,
    // NOTE(review): this prefers `url` over `sourceURL`, the reverse of the
    // precedence used for the `urls` passed to extractData - confirm intent.
    sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
    smartScrapeCost,
    otherCost,
    smartScrapeCallCount,
    otherCallCount,
  };
}