apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts

import { logger as _logger } from "../logger";
import { updateGeneratedLlmsTxt } from "./generate-llmstxt-redis";
import { getMapResults } from "../../controllers/v1/map";
import { z } from "zod";
import { scrapeDocument } from "../extract/document-scraper";
import {
  getLlmsTextFromCache,
  saveLlmsTextToCache,
} from "./generate-llmstxt-supabase";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { getModel } from "../generic-ai";
import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";

/**
 * Input for {@link performGenerateLlmsTxt}.
 */
interface GenerateLLMsTextServiceOptions {
  /** Key under which progress/final status is written, and the job id used when logging the job. */
  generationId: string;
  /** Team on whose behalf URLs are mapped and scraped; also the team that is billed. */
  teamId: string;
  /** Site URL to map; also used as the cache key and in the generated headers. */
  url: string;
  /** Upper bound on URLs to process. The service falls back to 100 when absent and caps the value at 5000. */
  maxUrls: number;
  /** Passed through unchanged into the stored and returned result. */
  showFullText: boolean;
  /** Forwarded to `billTeam` as its second argument (presumably a subscription id — confirm against billing). */
  subId?: string;
}

// Structured output requested from the model for each scraped page.
// The prompt asks for a 9-10 word description and a 3-4 word title, but the
// schema itself only enforces that both fields are strings.
const descriptionSchema = z.object({
  description: z.string(),
  title: z.string(),
});

/**
 * Strips every `<|firecrawl-page-N-lllmstxt|>` marker (together with the
 * newline that follows it) from the given text.
 *
 * Markers that are not immediately followed by a newline are left in place.
 *
 * @param text - Full text that may contain page markers.
 * @returns The text with all marker lines removed.
 */
function removePageSeparators(text: string): string {
  // Cutting the text at each marker and gluing the pieces back together is
  // equivalent to replacing every marker with the empty string.
  const pieces = text.split(/<\|firecrawl-page-\d+-lllmstxt\|>\n/g);
  return pieces.join("");
}

/**
 * Truncates a marker-delimited full text to its header plus the first
 * `maxPages` pages.
 *
 * The text is cut at every `<|firecrawl-page-N-lllmstxt|>` marker line, so the
 * returned string no longer contains the markers of the pages that were kept.
 *
 * @param fullText - Header followed by marker-prefixed pages.
 * @param maxPages - Number of pages to keep after the header.
 * @returns The header and the retained pages concatenated in order.
 */
function limitPages(fullText: string, maxPages: number): string {
  const segments = fullText.split(/<\|firecrawl-page-\d+-lllmstxt\|>\n/);
  // segments[0] is whatever precedes the first marker (the header), hence the
  // extra slot on top of the requested page count.
  const keepCount = maxPages + 1;
  return segments
    .slice(0, keepCount)
    .reduce((combined, segment) => combined + segment, "");
}

/**
 * Rebuilds an llms.txt document so that it contains its header and at most
 * `maxEntries` link entries.
 *
 * The header is the first line beginning with `#`; entries are the lines
 * beginning with `- [`. Every other line is discarded. When no header line
 * exists the input is returned untouched.
 *
 * @param llmstxt - The llms.txt text to trim.
 * @param maxEntries - Maximum number of entry lines to keep.
 * @returns Header, a blank line, then the retained entries joined by newlines
 *          (no trailing newline).
 */
function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {
  const rows = llmstxt.split('\n');

  const heading = rows.find((row) => row.startsWith('#'));
  if (heading === undefined) {
    return llmstxt;
  }

  // Gather every entry line first; the cut-off is applied afterwards.
  const entryRows: string[] = [];
  for (const row of rows) {
    if (row.startsWith('- [')) {
      entryRows.push(row);
    }
  }

  const body = entryRows.slice(0, maxEntries).join('\n');
  return heading + '\n\n' + body;
}

/**
 * Generates the `llms.txt` and `llms-full.txt` texts for a site and records
 * progress and the final outcome under `generationId`.
 *
 * Flow:
 * 1. Cap `maxUrls` (default 100) at 5000.
 * 2. Look the site up in the cache. On a hit, trim the cached texts to the
 *    effective limit, store them as the completed result and return early —
 *    before any of the mapping, scraping, job-logging or billing calls below.
 * 3. Otherwise map the site's URLs, then scrape them in parallel batches of
 *    10. Each scraped page is summarised (title + description) by the model.
 *    Pages that fail to scrape or summarise are logged and skipped.
 * 4. After every batch, write the texts accumulated so far as "processing".
 * 5. Save the finished texts to the cache, write the "completed" result, log
 *    the job, and bill the team for the number of mapped URLs (billing is
 *    fire-and-forget; a billing failure is only logged).
 *
 * @param options - See {@link GenerateLLMsTextServiceOptions}.
 * @returns `{ success: true, data: { generatedText, fullText, showFullText } }`.
 * @throws Re-throws any error raised during generation after marking the
 *         generation as "failed".
 */
export async function performGenerateLlmsTxt(
  options: GenerateLLMsTextServiceOptions,
) {
  const { generationId, teamId, url, maxUrls = 100, showFullText, subId } =
    options;
  const startTime = Date.now();
  const logger = _logger.child({
    module: "generate-llmstxt",
    method: "performGenerateLlmsTxt",
    generationId,
    teamId,
  });

  try {
    // Enforce max URL limit
    const effectiveMaxUrls = Math.min(maxUrls, 5000);

    // Check cache first
    const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);
    if (cachedResult) {
      logger.info("Found cached LLMs text", { url });

      // Limit pages and remove separators before returning
      const limitedFullText = limitPages(
        cachedResult.llmstxt_full,
        effectiveMaxUrls,
      );
      const cleanFullText = removePageSeparators(limitedFullText);

      // Limit llmstxt entries to match maxUrls
      const limitedLlmsTxt = limitLlmsTxtEntries(
        cachedResult.llmstxt,
        effectiveMaxUrls,
      );

      // Update final result with cached text
      await updateGeneratedLlmsTxt(generationId, {
        status: "completed",
        generatedText: limitedLlmsTxt,
        fullText: cleanFullText,
        showFullText: showFullText,
      });

      return {
        success: true,
        data: {
          generatedText: limitedLlmsTxt,
          fullText: cleanFullText,
          showFullText: showFullText,
        },
      };
    }

    // If not in cache, proceed with generation
    // First, get all URLs from the map controller
    const mapResult = await getMapResults({
      url,
      teamId,
      limit: effectiveMaxUrls,
      includeSubdomains: false,
      ignoreSitemap: false,
      includeMetadata: true,
    });

    if (!mapResult || !mapResult.links) {
      throw new Error(`Failed to map URLs`);
    }

    // Use the child logger so the entry carries generationId/teamId, and wrap
    // the list in an object like the other structured log calls in this file.
    logger.debug("Mapping URLs", { links: mapResult.links });

    const urls = mapResult.links;
    let llmstxt = `# ${url} llms.txt\n\n`;
    let llmsFulltxt = `# ${url} llms-full.txt\n\n`;

    // Process URLs in batches of 10
    const batchSize = 10;
    for (let i = 0; i < urls.length; i += batchSize) {
      const batch = urls.slice(i, i + batchSize);

      const batchResults = await Promise.all(
        // `pageUrl` (not `url`) so the site URL from `options` is not shadowed.
        batch.map(async (pageUrl) => {
          logger.debug(`Scraping URL: ${pageUrl}`);
          try {
            const document = await scrapeDocument(
              {
                url: pageUrl,
                teamId,
                // NOTE(review): origin has always been the page URL here;
                // confirm whether a request origin such as "api" was intended.
                origin: pageUrl,
                timeout: 30000,
                isSingleUrl: true,
              },
              [],
              logger,
              { onlyMainContent: true },
            );

            if (!document || !document.markdown) {
              logger.error(`Failed to scrape URL ${pageUrl}`);
              return null;
            }

            // The scraper's metadata URL is preferred; fall back to the URL
            // we requested so the prompt and the emitted link never contain
            // the literal text "undefined".
            const pageLink = document.metadata?.url ?? pageUrl;

            logger.debug(`Generating description for ${pageLink}`);

            const { extract } = await generateCompletions({
              logger,
              model: getModel("gpt-4o-mini"),
              options: {
                systemPrompt: "",
                mode: "llm",
                schema: descriptionSchema,
                prompt: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${pageLink}. This will help in a user finding the page for its intended purpose.`,
              },
              markdown: document.markdown,
            });

            return {
              title: extract.title,
              description: extract.description,
              url: pageLink,
              markdown: document.markdown,
            };
          } catch (error) {
            // One bad page must not abort the whole generation.
            logger.error(`Failed to process URL ${pageUrl}`, { error });
            return null;
          }
        }),
      );

      // Process successful results from batch. The page number in the
      // separator is the 1-based position of the URL in the mapped list.
      for (const [offset, result] of batchResults.entries()) {
        if (!result) continue;

        llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`;
        llmsFulltxt += `<|firecrawl-page-${i + offset + 1}-lllmstxt|>\n## ${result.title}\n${result.markdown}\n\n`;
      }

      // Update progress after each batch
      await updateGeneratedLlmsTxt(generationId, {
        status: "processing",
        generatedText: llmstxt,
        fullText: removePageSeparators(llmsFulltxt),
      });
    }

    // After successful generation, save to cache (separators are kept in the
    // cached full text so it can be re-limited per page later).
    await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, effectiveMaxUrls);

    // Limit pages and remove separators before final update
    const limitedFullText = limitPages(llmsFulltxt, effectiveMaxUrls);
    const cleanFullText = removePageSeparators(limitedFullText);

    // Update final result with both generated text and full text
    await updateGeneratedLlmsTxt(generationId, {
      status: "completed",
      generatedText: llmstxt,
      fullText: cleanFullText,
      showFullText: showFullText,
    });

    // Log job with token usage and sources
    await logJob({
      job_id: generationId,
      success: true,
      message: "LLMs text generation completed",
      num_docs: urls.length,
      docs: [{ llmstxt: llmstxt, llmsfulltxt: llmsFulltxt }],
      time_taken: (Date.now() - startTime) / 1000,
      team_id: teamId,
      mode: "llmstxt",
      url: url,
      scrapeOptions: options,
      origin: "api",
      num_tokens: 0,
      tokens_billed: 0,
      sources: {},
    });

    // Bill team for usage. Deliberately not awaited: a billing failure is
    // logged but does not fail the generation.
    billTeam(teamId, subId, urls.length, logger).catch((error) => {
      logger.error(`Failed to bill team ${teamId} for ${urls.length} urls`, {
        teamId,
        count: urls.length,
        error,
      });
    });

    return {
      success: true,
      data: {
        generatedText: llmstxt,
        fullText: cleanFullText,
        showFullText: showFullText,
      },
    };
  } catch (error: unknown) {
    logger.error("Generate LLMs text error", { error });

    // Read `.message` defensively: the thrown value may be null/undefined or
    // a non-Error, and dereferencing it blindly would throw inside this
    // handler and hide the original failure.
    const rawMessage = (error as { message?: unknown } | null | undefined)
      ?.message;
    const message =
      typeof rawMessage === "string" && rawMessage
        ? rawMessage
        : "Unknown error occurred";

    await updateGeneratedLlmsTxt(generationId, {
      status: "failed",
      error: message,
    });

    throw error;
  }
}
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`import { logger as _logger } from "../logger";`
			`import { updateGeneratedLlmsTxt } from "./generate-llmstxt-redis";`
			`import { getMapResults } from "../../controllers/v1/map";`
			`import { z } from "zod";`
			`import { scrapeDocument } from "../extract/document-scraper";`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`import {`
			`getLlmsTextFromCache,`
			`saveLlmsTextToCache,`
			`} from "./generate-llmstxt-supabase";`
Nick: fixes 2025-02-19 15:21:52 -03:00			`import { billTeam } from "../../services/billing/credit_billing";`
			`import { logJob } from "../../services/logging/log_job";`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`import { getModel } from "../generic-ai";`
			`import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00
			`interface GenerateLLMsTextServiceOptions {`
			`generationId: string;`
			`teamId: string;`
			`url: string;`
			`maxUrls: number;`
			`showFullText: boolean;`
Nick: fixes 2025-02-19 15:21:52 -03:00			`subId?: string;`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`}`

(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`const descriptionSchema = z.object({`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`description: z.string(),`
			`title: z.string(),`
			`});`

Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`// Helper function to remove page separators`
			`function removePageSeparators(text: string): string {`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`return text.replace(/<\\|firecrawl-page-\d+-lllmstxt\\|>\n/g, "");`
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`}`

			`// Helper function to limit pages in full text`
			`function limitPages(fullText: string, maxPages: number): string {`
			`const pages = fullText.split(/<\\|firecrawl-page-\d+-lllmstxt\\|>\n/);`
			`// First element is the header, so we start from index 1`
			`const limitedPages = pages.slice(0, maxPages + 1);`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`return limitedPages.join("");`
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`}`

Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285 ) 2025-03-03 16:37:33 -05:00			`// Helper function to limit llmstxt entries`
			`function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {`
			`// Split by newlines`
			`const lines = llmstxt.split('\n');`

			`// Find the header line (starts with #)`
			`const headerIndex = lines.findIndex(line => line.startsWith('#'));`
			`if (headerIndex === -1) return llmstxt;`

			`// Get the header and the entries`
			`const header = lines[headerIndex];`
			`const entries = lines.filter(line => line.startsWith('- ['));`

			`// Take only the requested number of entries`
			`const limitedEntries = entries.slice(0, maxEntries);`

			`// Reconstruct the text`
			return `${header}\n\n${limitedEntries.join('\n')}`;
			`}`

(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`export async function performGenerateLlmsTxt(`
			`options: GenerateLLMsTextServiceOptions,`
			`) {`
ACUC: Dynamic Limits (FIR-1641) (#1434 ) 2025-04-10 18:49:23 +02:00			`const { generationId, teamId, url, maxUrls = 100, showFullText, subId } =`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`options;`
Nick: fixes 2025-02-19 15:21:52 -03:00			`const startTime = Date.now();`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`const logger = _logger.child({`
			`module: "generate-llmstxt",`
			`method: "performGenerateLlmsTxt",`
			`generationId,`
			`teamId,`
			`});`

			`try {`
Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285 ) 2025-03-03 16:37:33 -05:00			`// Enforce max URL limit`
			`const effectiveMaxUrls = Math.min(maxUrls, 5000);`

Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`// Check cache first`
Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285 ) 2025-03-03 16:37:33 -05:00			`const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`if (cachedResult) {`
			`logger.info("Found cached LLMs text", { url });`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`// Limit pages and remove separators before returning`
Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285 ) 2025-03-03 16:37:33 -05:00			`const limitedFullText = limitPages(cachedResult.llmstxt_full, effectiveMaxUrls);`
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`const cleanFullText = removePageSeparators(limitedFullText);`
Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285 ) 2025-03-03 16:37:33 -05:00
			`// Limit llmstxt entries to match maxUrls`
			`const limitedLlmsTxt = limitLlmsTxtEntries(cachedResult.llmstxt, effectiveMaxUrls);`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`// Update final result with cached text`
			`await updateGeneratedLlmsTxt(generationId, {`
			`status: "completed",`
Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285 ) 2025-03-03 16:37:33 -05:00			`generatedText: limitedLlmsTxt,`
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`fullText: cleanFullText,`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`showFullText: showFullText,`
			`});`

			`return {`
			`success: true,`
			`data: {`
Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285 ) 2025-03-03 16:37:33 -05:00			`generatedText: limitedLlmsTxt,`
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`fullText: cleanFullText,`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`showFullText: showFullText,`
			`},`
			`};`
			`}`

			`// If not in cache, proceed with generation`
			`// First, get all URLs from the map controller`
			`const mapResult = await getMapResults({`
			`url,`
			`teamId,`
Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285 ) 2025-03-03 16:37:33 -05:00			`limit: effectiveMaxUrls,`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`includeSubdomains: false,`
			`ignoreSitemap: false,`
			`includeMetadata: true,`
			`});`

			`if (!mapResult \|\| !mapResult.links) {`
			throw new Error(`Failed to map URLs`);
			`}`

			`_logger.debug("Mapping URLs", mapResult.links);`

			`const urls = mapResult.links;`
			let llmstxt = `# ${url} llms.txt\n\n`;
			let llmsFulltxt = `# ${url} llms-full.txt\n\n`;

Update generate-llmstxt-service.ts 2025-02-19 15:50:59 -03:00			`// Process URLs in batches of 10`
			`for (let i = 0; i < urls.length; i += 10) {`
			`const batch = urls.slice(i, i + 10);`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`const batchResults = await Promise.all(`
			`batch.map(async (url) => {`
			_logger.debug(`Scraping URL: ${url}`);
			`try {`
			`const document = await scrapeDocument(`
Update generate-llmstxt-service.ts 2025-02-19 15:50:59 -03:00			`{`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`url,`
			`teamId,`
			`origin: url,`
			`timeout: 30000,`
			`isSingleUrl: true,`
			`},`
			`[],`
			`logger,`
			`{ onlyMainContent: true },`
			`);`

			`if (!document \|\| !document.markdown) {`
			logger.error(`Failed to scrape URL ${url}`);
			`return null;`
			`}`

			`_logger.debug(`
			`Generating description for ${document.metadata?.url}`,
			`);`

			`const { extract } = await generateCompletions({`
			`logger,`
			`model: getModel("gpt-4o-mini"),`
			`options: {`
			`systemPrompt: "",`
			`mode: "llm",`
			`schema: descriptionSchema,`
			prompt: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose.`,
			`},`
			`markdown: document.markdown,`
			`});`

			`return {`
			`title: extract.title,`
			`description: extract.description,`
			`url: document.metadata?.url,`
			`markdown: document.markdown,`
			`};`
			`} catch (error) {`
			logger.error(`Failed to process URL ${url}`, { error });
			`return null;`
			`}`
			`}),`
			`);`
Update generate-llmstxt-service.ts 2025-02-19 15:50:59 -03:00
			`// Process successful results from batch`
			`for (const result of batchResults) {`
			`if (!result) continue;`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00
Update generate-llmstxt-service.ts 2025-02-19 15:50:59 -03:00			llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`;
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			llmsFulltxt += `<\|firecrawl-page-${i + batchResults.indexOf(result) + 1}-lllmstxt\|>\n## ${result.title}\n${result.markdown}\n\n`;
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`}`
Update generate-llmstxt-service.ts 2025-02-19 15:50:59 -03:00
			`// Update progress after each batch`
			`await updateGeneratedLlmsTxt(generationId, {`
			`status: "processing",`
			`generatedText: llmstxt,`
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`fullText: removePageSeparators(llmsFulltxt),`
Update generate-llmstxt-service.ts 2025-02-19 15:50:59 -03:00			`});`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`}`

			`// After successful generation, save to cache`
Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285 ) 2025-03-03 16:37:33 -05:00			`await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, effectiveMaxUrls);`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`// Limit pages and remove separators before final update`
Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285 ) 2025-03-03 16:37:33 -05:00			`const limitedFullText = limitPages(llmsFulltxt, effectiveMaxUrls);`
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`const cleanFullText = removePageSeparators(limitedFullText);`

Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`// Update final result with both generated text and full text`
			`await updateGeneratedLlmsTxt(generationId, {`
			`status: "completed",`
			`generatedText: llmstxt,`
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`fullText: cleanFullText,`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`showFullText: showFullText,`
			`});`

Nick: fixes 2025-02-19 15:21:52 -03:00			`// Log job with token usage and sources`
			`await logJob({`
			`job_id: generationId,`
			`success: true,`
			`message: "LLMs text generation completed",`
			`num_docs: urls.length,`
			`docs: [{ llmstxt: llmstxt, llmsfulltxt: llmsFulltxt }],`
			`time_taken: (Date.now() - startTime) / 1000,`
			`team_id: teamId,`
			`mode: "llmstxt",`
			`url: url,`
			`scrapeOptions: options,`
			`origin: "api",`
			`num_tokens: 0,`
			`tokens_billed: 0,`
			`sources: {},`
			`});`

			`// Bill team for usage`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`billTeam(teamId, subId, urls.length, logger).catch((error) => {`
			logger.error(`Failed to bill team ${teamId} for ${urls.length} urls`, {
			`teamId,`
			`count: urls.length,`
			`error,`
			`});`
			`});`
Nick: fixes 2025-02-19 15:21:52 -03:00
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`return {`
			`success: true,`
			`data: {`
			`generatedText: llmstxt,`
Nick: llmstxt improvements 2025-02-19 16:09:46 -03:00			`fullText: cleanFullText,`
Add llmstxt generator endpoint (#1201 ) 2025-02-19 12:42:33 -05:00			`showFullText: showFullText,`
			`},`
			`};`
			`} catch (error: any) {`
			`logger.error("Generate LLMs text error", { error });`

			`await updateGeneratedLlmsTxt(generationId, {`
			`status: "failed",`
			`error: error.message \|\| "Unknown error occurred",`
			`});`

			`throw error;`
			`}`
(feat/ai-sdk) Migrate to AI-SDK (#1220 ) 2025-02-20 18:48:58 -03:00			`}`