Files
firecrawl/apps/api/src/lib/extract/extraction-service.ts
T

243 lines
6.7 KiB
TypeScript
Raw Normal View History

2024-12-26 12:41:37 -03:00
import { Document, ExtractRequest, URLTrace } from "../../controllers/v1/types";
import { PlanType } from "../../types";
import { logger } from "../logger";
import { processUrl } from "./url-processor";
import { scrapeDocument } from "./document-scraper";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "./build-document";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
2024-12-30 21:42:01 -03:00
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
import { saveCrawl, StoredCrawl } from "../crawl-redis";
2025-01-07 16:16:01 -03:00
import { updateExtract } from "./extract-redis";
2025-01-08 15:13:33 -03:00
import { CUSTOM_U_TEAMS } from "./config";
2024-12-26 12:41:37 -03:00
/**
 * Inputs required to run a single extraction.
 */
interface ExtractServiceOptions {
  /** The extract request as received from the caller (URLs, prompt, schema, ...). */
  request: ExtractRequest;

  /** Identifier of the team the extraction runs for and is billed to. */
  teamId: string;

  /** Plan of the team; forwarded to URL processing and document scraping. */
  plan: PlanType;

  /** Optional subscription id, forwarded to billing when present. */
  subId?: string;
}
/**
 * Outcome of an extraction run, for both the success and the failure case.
 */
interface ExtractResult {
  /** `true` when the extraction completed, `false` when it was aborted. */
  success: boolean;

  /** Extracted payload; only set on success. */
  data?: any;

  /** Id of the extract job this result belongs to. */
  extractId: string;

  /** Non-fatal warning reported while generating the completion, if any. */
  warning?: string;

  /** Per-URL trace records collected while processing and scraping. */
  urlTrace?: URLTrace[];

  /** Human-readable failure reason; only set when `success` is `false`. */
  error?: string;
}
2024-12-30 21:42:01 -03:00
/**
 * Reduces a URL to `protocol//hostname`.
 *
 * A trailing `/*` wildcard suffix is removed before parsing. The port, path,
 * query string and fragment are all discarded.
 *
 * @param url - Absolute URL, optionally ending in `/*`.
 * @returns The `protocol//hostname` part, or the input (minus any trailing
 *   `/*`) when it cannot be parsed as a URL.
 */
function getRootDomain(url: string): string {
  const candidate = url.endsWith("/*") ? url.slice(0, -2) : url;

  let parsed: URL;
  try {
    parsed = new URL(candidate);
  } catch {
    // Not a parseable absolute URL: hand back what we have.
    return candidate;
  }

  return [parsed.protocol, parsed.hostname].join("//");
}
2025-01-10 18:35:10 -03:00
/**
 * Runs an extraction end to end: resolves the requested URLs into links,
 * scrapes them, asks the LLM for a completion over the scraped content,
 * then bills the team and logs the job.
 *
 * Billing, job logging and the extract status update are fire-and-forget:
 * their failures are logged but never fail the extraction itself.
 *
 * @param extractId - Id of the extract job; echoed back in the result and
 *   used as the job id for logging and the status update.
 * @param options - Request, team, plan and optional subscription id.
 * @returns On success, the extracted data (plus the URL traces only when
 *   `request.urlTrace` is set). On failure (no usable links, or a scrape
 *   error) `success: false` with an `error` message and the URL traces.
 */
export async function performExtraction(
  extractId: string,
  options: ExtractServiceOptions,
): Promise<ExtractResult> {
  // Captured up front so the logged `time_taken` reflects the whole run.
  const startedAt = Date.now();

  const { request, teamId, plan, subId } = options;
  const urlTraces: URLTrace[] = [];
  const docs: Document[] = [];

  // Process URLs
  const urlPromises = request.urls.map((url) =>
    processUrl(
      {
        url,
        prompt: request.prompt,
        teamId,
        plan,
        allowExternalLinks: request.allowExternalLinks,
        origin: request.origin,
        limit: request.limit,
        includeSubdomains: request.includeSubdomains,
      },
      urlTraces,
    ),
  );

  const processedUrls = await Promise.all(urlPromises);
  const links = processedUrls.flat().filter((url) => url);

  if (links.length === 0) {
    return {
      success: false,
      error:
        "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
      extractId,
      urlTrace: urlTraces,
    };
  }

  // Scrape documents. Scraping gets 70% of the request timeout (default
  // 40000), falling back to 30000 if that computes to 0/NaN.
  const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
  const scrapePromises = links.map((url) =>
    scrapeDocument(
      {
        url,
        teamId,
        plan,
        origin: request.origin || "api",
        timeout,
      },
      urlTraces,
    ),
  );

  try {
    const results = await Promise.all(scrapePromises);
    docs.push(...results.filter((doc): doc is Document => doc !== null));
  } catch (error) {
    // The thrown value is not guaranteed to be an Error instance.
    const message = error instanceof Error ? error.message : String(error);
    return {
      success: false,
      error: message,
      extractId,
      urlTrace: urlTraces,
    };
  }

  // Generate completions
  const completions = await generateOpenAICompletions(
    logger.child({ method: "extractService/generateOpenAICompletions" }),
    {
      mode: "llm",
      systemPrompt:
        (request.systemPrompt ? `${request.systemPrompt}\n` : "") +
        "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
        links.join(", "),
      prompt: request.prompt,
      schema: request.schema,
    },
    docs.map((x) => buildDocument(x)).join("\n"),
    undefined,
    true,
  );

  // Update token usage in traces: each document is attributed a share of the
  // total tokens proportional to its markdown length.
  if (completions.numTokens) {
    const numTokens = completions.numTokens;
    const totalLength = docs.reduce(
      (sum, doc) => sum + (doc.markdown?.length || 0),
      0,
    );

    docs.forEach((doc) => {
      const sourceURL = doc.metadata?.sourceURL;
      if (!sourceURL) {
        return;
      }

      const trace = urlTraces.find((t) => t.url === sourceURL);
      if (trace && trace.contentStats) {
        // Guard against 0/0 (NaN) when no document carries any markdown.
        const share =
          totalLength > 0 ? (doc.markdown?.length || 0) / totalLength : 0;
        trace.contentStats.tokensUsed = Math.floor(share * numTokens);
      }
    });
  }

  // Kickoff background crawl for indexing root domains
  // const rootDomains = new Set(request.urls.map(getRootDomain));
  // rootDomains.forEach(async url => {
  //   const crawlId = crypto.randomUUID();

  //   // Create and save crawl configuration first
  //   const sc: StoredCrawl = {
  //     originUrl: url,
  //     crawlerOptions: {
  //       maxDepth: 15,
  //       limit: 5000,
  //       includePaths: [],
  //       excludePaths: [],
  //       ignoreSitemap: false,
  //       includeSubdomains: true,
  //       allowExternalLinks: false,
  //       allowBackwardLinks: true
  //     },
  //     scrapeOptions: {
  //       formats: ["markdown"],
  //       onlyMainContent: true,
  //       waitFor: 0,
  //       mobile: false,
  //       removeBase64Images: true,
  //       fastMode: false,
  //       parsePDF: true,
  //       skipTlsVerification: false,
  //     },
  //     internalOptions: {
  //       disableSmartWaitCache: true,
  //       isBackgroundIndex: true
  //     },
  //     team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
  //     createdAt: Date.now(),
  //     plan: "hobby", // make it a low concurrency
  //   };

  //   // Save the crawl configuration
  //   await saveCrawl(crawlId, sc);

  //   // Then kick off the job
  //   await _addScrapeJobToBullMQ({
  //     url,
  //     mode: "kickoff" as const,
  //     team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
  //     plan: "hobby", // make it a low concurrency
  //     crawlerOptions: sc.crawlerOptions,
  //     scrapeOptions: sc.scrapeOptions,
  //     internalOptions: sc.internalOptions,
  //     origin: "index",
  //     crawl_id: crawlId,
  //     webhook: null,
  //     v1: true,
  //   }, {}, crypto.randomUUID(), 50);
  // });

  // 5 credits per scraped link, except teams in CUSTOM_U_TEAMS which are
  // billed a flat 1 credit.
  let linksBilled = links.length * 5;
  if (CUSTOM_U_TEAMS.includes(teamId)) {
    linksBilled = 1;
  }

  // Bill team for usage (fire-and-forget; failures are only logged)
  billTeam(teamId, subId, linksBilled).catch((error) => {
    logger.error(
      `Failed to bill team ${teamId} for ${linksBilled} credits: ${error}`,
    );
  });

  // Log job, then mark the extract as completed (fire-and-forget)
  logJob({
    job_id: extractId,
    success: true,
    message: "Extract completed",
    num_docs: 1,
    docs: completions.extract ?? {},
    time_taken: (Date.now() - startedAt) / 1000,
    team_id: teamId,
    mode: "extract",
    url: request.urls.join(", "),
    scrapeOptions: request,
    origin: request.origin ?? "api",
    num_tokens: completions.numTokens ?? 0,
  })
    .then(() => {
      updateExtract(extractId, {
        status: "completed",
      }).catch((error) => {
        logger.error(
          `Failed to update extract ${extractId} status to completed: ${error}`,
        );
      });
    })
    .catch((error) => {
      // Without this handler a logging failure surfaces as an unhandled
      // promise rejection.
      logger.error(`Failed to log extract job ${extractId}: ${error}`);
    });

  return {
    success: true,
    data: completions.extract ?? {},
    extractId,
    warning: completions.warning,
    urlTrace: request.urlTrace ? urlTraces : undefined,
  };
}