2025-01-27 20:07:01 -03:00
|
|
|
import { logger } from "../../../lib/logger";
|
2025-02-20 18:48:58 -03:00
|
|
|
import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
|
2025-01-27 20:07:01 -03:00
|
|
|
import { buildDocument } from "../build-document";
|
|
|
|
|
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
|
|
|
|
|
import { Document } from "../../../controllers/v1/types";
|
|
|
|
|
import {
|
|
|
|
|
buildBatchExtractPrompt,
|
|
|
|
|
buildBatchExtractSystemPrompt,
|
|
|
|
|
} from "../build-prompts";
|
|
|
|
|
|
|
|
|
|
/**
 * Runs a single LLM extraction over one document using a multi-entity schema.
 *
 * The document is rendered with `buildDocument` and supplied as the markdown
 * input of `generateCompletions`; the system and user prompts are produced by
 * `buildBatchExtractSystemPrompt` and `buildBatchExtractPrompt`.
 *
 * @param multiEntitySchema - Schema given both to the system-prompt builder and to the completion options
 * @param links - URLs forwarded to the system-prompt builder
 * @param prompt - User prompt, wrapped by `buildBatchExtractPrompt`
 * @param systemPrompt - Base system prompt, combined with the schema and links
 * @param doc - The document to extract information from
 * @returns The extracted data, token accounting, an optional warning, and a
 *   one-element `sources` array holding the document URL (`metadata.url`,
 *   falling back to `metadata.sourceURL`, then to an empty string)
 */
export async function batchExtractPromise(
  multiEntitySchema: any,
  links: string[],
  prompt: string,
  systemPrompt: string,
  doc: Document,
): Promise<{
  extract: any;
  numTokens: number;
  totalUsage: TokenUsage;
  warning?: string;
  sources: string[];
}> {
  const completion = await generateCompletions({
    logger: logger.child({
      method: "extractService/generateCompletions",
    }),
    options: {
      mode: "llm",
      systemPrompt: buildBatchExtractSystemPrompt(
        systemPrompt,
        multiEntitySchema,
        links,
      ),
      prompt: buildBatchExtractPrompt(prompt),
      schema: multiEntitySchema,
    },
    markdown: buildDocument(doc),
    isExtractEndpoint: true,
  });

  // NOTE(review): assumes the completion result exposes an optional `warning`
  // string — confirm against generateCompletions' return type.
  const { extract, numTokens, totalUsage, warning } = completion;

  // `||` (not `??`) is intentional: an empty-string url must fall through to sourceURL.
  const sourceUrl = doc.metadata.url || doc.metadata.sourceURL || "";

  return {
    extract,
    numTokens,
    totalUsage,
    // Forward the warning only when there is one, so the result shape is
    // unchanged for callers in the no-warning case.
    ...(warning ? { warning } : {}),
    sources: [sourceUrl],
  };
}
|