// firecrawl/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
2025-02-19 12:42:33 -05:00
import { logger as _logger } from "../logger";
import { updateGeneratedLlmsTxt } from "./generate-llmstxt-redis";
import { getMapResults } from "../../controllers/v1/map";
import { MapResponse, ScrapeResponse, Document } from "../../controllers/v1/types";
import { Response } from "express";
import OpenAI from "openai";
import { zodResponseFormat } from "openai/helpers/zod";
import { z } from "zod";
import { scrapeDocument } from "../extract/document-scraper";
import { PlanType } from "../../types";
import { getLlmsTextFromCache, saveLlmsTextToCache } from "./generate-llmstxt-supabase";
2025-02-19 15:21:52 -03:00
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
2025-02-19 12:42:33 -05:00
/**
 * Options accepted by `performGenerateLlmsTxt`.
 */
interface GenerateLLMsTextServiceOptions {
  /** Id of the generation; used to update the stored generation state and as the logged job id. */
  generationId: string;
  /** Team the mapping, scraping, job log and billing are attributed to. */
  teamId: string;
  /** Plan forwarded to the map and scrape calls. */
  plan: PlanType;
  /** Root URL to map; also used (with `maxUrls`) to look up and store the cached result. */
  url: string;
  /** Passed as the `limit` of the map request and as part of the cache lookup. */
  maxUrls: number;
  /** Passed through unchanged to the stored result and the returned data. */
  showFullText: boolean;
  /** Optional id forwarded to `billTeam`. */
  subId?: string;
}
// Shape of the structured response requested from the chat completion for
// each scraped page: a short description and a short title, both strings.
// It is handed to `zodResponseFormat` when the completion is requested.
const DescriptionSchema = z.object({
description: z.string(),
title: z.string(),
});
/**
 * Builds the llms.txt and llms-full.txt text for a site and records the result.
 *
 * Flow:
 *  1. Return the cached text for (`url`, `maxUrls`) when one exists, after
 *     marking the generation as completed.
 *  2. Otherwise map the site, scrape the mapped URLs in batches, and ask the
 *     model for a title and description of each scraped page.
 *  3. Append one link line per page to llms.txt and the page markdown to
 *     llms-full.txt, publishing progress after every batch.
 *  4. Save the result to the cache, mark the generation completed, log the job
 *     and bill the team.
 *
 * Pages that fail to scrape or describe are logged and skipped; they do not
 * fail the whole generation.
 *
 * @param options - See `GenerateLLMsTextServiceOptions`.
 * @returns `{ success: true, data: { generatedText, fullText, showFullText } }`.
 * @throws Rethrows any error raised outside the per-page handling, after the
 *         generation has been marked as "failed".
 */
export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
  const { generationId, teamId, plan, url, maxUrls, showFullText, subId } = options;
  const startTime = Date.now();
  const logger = _logger.child({
    module: "generate-llmstxt",
    method: "performGenerateLlmsTxt",
    generationId,
    teamId,
  });

  // Number of URLs scraped and described concurrently.
  const batchSize = 10;

  try {
    // Check cache first
    const cachedResult = await getLlmsTextFromCache(url, maxUrls);
    if (cachedResult) {
      logger.info("Found cached LLMs text", { url });

      // Update final result with cached text
      await updateGeneratedLlmsTxt(generationId, {
        status: "completed",
        generatedText: cachedResult.llmstxt,
        fullText: cachedResult.llmstxt_full,
        showFullText: showFullText,
      });

      return {
        success: true,
        data: {
          generatedText: cachedResult.llmstxt,
          fullText: cachedResult.llmstxt_full,
          showFullText: showFullText,
        },
      };
    }

    // The client is only needed on a cache miss. Creating it inside the try
    // block means a construction failure is recorded as a failed generation
    // instead of escaping before any status update.
    const openai = new OpenAI();

    // If not in cache, proceed with generation.
    // First, get all URLs from the map controller.
    const mapResult = await getMapResults({
      url,
      teamId,
      plan,
      limit: maxUrls,
      includeSubdomains: false,
      ignoreSitemap: false,
      includeMetadata: true,
    });

    if (!mapResult || !mapResult.links) {
      throw new Error(`Failed to map URLs`);
    }

    logger.debug("Mapping URLs", mapResult.links);

    const urls = mapResult.links;
    let llmstxt = `# ${url} llms.txt\n\n`;
    let llmsFulltxt = `# ${url} llms-full.txt\n\n`;

    // Process URLs in batches
    for (let i = 0; i < urls.length; i += batchSize) {
      const batch = urls.slice(i, i + batchSize);

      const batchResults = await Promise.all(
        batch.map(async (pageUrl) => {
          logger.debug(`Scraping URL: ${pageUrl}`);
          try {
            const document = await scrapeDocument(
              {
                url: pageUrl,
                teamId,
                plan,
                // The page URL (not the root URL) is what has always been
                // passed as origin here; kept as-is.
                origin: pageUrl,
                timeout: 30000,
                isSingleUrl: true,
              },
              [],
              logger,
              { onlyMainContent: true },
            );

            if (!document || !document.markdown) {
              logger.error(`Failed to scrape URL ${pageUrl}`);
              return null;
            }

            // Fall back to the requested URL so the generated link is never
            // rendered as "(undefined)" when the metadata carries no URL.
            const resolvedUrl = document.metadata?.url ?? pageUrl;

            logger.debug(`Generating description for ${resolvedUrl}`);
            const completion = await openai.beta.chat.completions.parse({
              model: "gpt-4o-mini",
              messages: [
                {
                  role: "user",
                  content: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${resolvedUrl}. This will help in a user finding the page for its intended purpose. Here is the content: ${document.markdown}`,
                },
              ],
              response_format: zodResponseFormat(DescriptionSchema, "description"),
            });

            // The parsed payload can be absent; skip the page explicitly
            // rather than relying on a TypeError from a non-null assertion.
            const parsedResponse = completion.choices[0]?.message.parsed;
            if (!parsedResponse) {
              logger.error(`Failed to generate description for ${resolvedUrl}`);
              return null;
            }

            return {
              title: parsedResponse.title,
              description: parsedResponse.description,
              url: resolvedUrl,
              markdown: document.markdown,
            };
          } catch (error) {
            logger.error(`Failed to process URL ${pageUrl}`, { error });
            return null;
          }
        }),
      );

      // Process successful results from batch
      for (const result of batchResults) {
        if (!result) continue;

        llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`;
        llmsFulltxt += `## ${result.title}\n${result.markdown}\n\n`;
      }

      // Update progress after each batch
      await updateGeneratedLlmsTxt(generationId, {
        status: "processing",
        generatedText: llmstxt,
        fullText: llmsFulltxt,
      });
    }

    // After successful generation, save to cache
    await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);

    // Update final result with both generated text and full text
    await updateGeneratedLlmsTxt(generationId, {
      status: "completed",
      generatedText: llmstxt,
      fullText: llmsFulltxt,
      showFullText: showFullText,
    });

    // Log job with token usage and sources
    await logJob({
      job_id: generationId,
      success: true,
      message: "LLMs text generation completed",
      num_docs: urls.length,
      docs: [{ llmstxt: llmstxt, llmsfulltxt: llmsFulltxt }],
      time_taken: (Date.now() - startTime) / 1000,
      team_id: teamId,
      mode: "llmstxt",
      url: url,
      scrapeOptions: options,
      origin: "api",
      num_tokens: 0,
      tokens_billed: 0,
      sources: {},
    });

    // Bill team for usage. Not awaited: a billing failure is logged but does
    // not fail the generation.
    // NOTE(review): the billed count is the number of mapped URLs, including
    // pages that were skipped above — confirm this is intended.
    billTeam(teamId, subId, urls.length, logger).catch((error) => {
      logger.error(
        `Failed to bill team ${teamId} for ${urls.length} urls`,
        { teamId, count: urls.length, error },
      );
    });

    return {
      success: true,
      data: {
        generatedText: llmstxt,
        fullText: llmsFulltxt,
        showFullText: showFullText,
      },
    };
  } catch (error: any) {
    logger.error("Generate LLMs text error", { error });

    await updateGeneratedLlmsTxt(generationId, {
      status: "failed",
      // Optional chaining guards against a thrown null/undefined value.
      error: error?.message || "Unknown error occurred",
    });

    throw error;
  }
}