2025-02-19 12:42:33 -05:00
import { logger as _logger } from "../logger" ;
import { updateGeneratedLlmsTxt } from "./generate-llmstxt-redis" ;
import { getMapResults } from "../../controllers/v1/map" ;
import { z } from "zod" ;
import { scrapeDocument } from "../extract/document-scraper" ;
2025-02-20 18:48:58 -03:00
import {
getLlmsTextFromCache ,
saveLlmsTextToCache ,
} from "./generate-llmstxt-supabase" ;
2025-02-19 15:21:52 -03:00
import { billTeam } from "../../services/billing/credit_billing" ;
import { logJob } from "../../services/logging/log_job" ;
2025-02-20 18:48:58 -03:00
import { getModel } from "../generic-ai" ;
import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract" ;
2025-02-19 12:42:33 -05:00
/**
 * Input accepted by `performGenerateLlmsTxt` for one llms.txt generation run.
 */
interface GenerateLLMsTextServiceOptions {
  /** Id of the generation; status updates are written under it and it is logged as the job id. */
  generationId: string;
  /** Team the run belongs to; forwarded to URL mapping, scraping, job logging and billing. */
  teamId: string;
  /** URL that is looked up in the cache and, on a miss, handed to the map step. */
  url: string;
  /** Requested maximum number of URLs; the generator caps it at 5000 before use. */
  maxUrls: number;
  /** Copied unchanged into the "completed" status update and the returned data. */
  showFullText: boolean;
  /** Optional value passed to billing together with the team id. */
  subId?: string;
}
2025-02-20 18:48:58 -03:00
// Fields the model must return for each scraped page: a short description
// and a short title, both plain strings.
const pageSummaryFields = {
  description: z.string(),
  title: z.string(),
};

// Schema passed to the completion call so the extracted result is validated
// against the two fields above.
const descriptionSchema = z.object(pageSummaryFields);
2025-02-19 16:09:46 -03:00
/**
 * Strips every page-separator marker from the given text.
 *
 * A marker looks like `<|firecrawl-page-N-lllmstxt|>` (N being one or more
 * digits) and is only removed when it is immediately followed by a newline;
 * that newline is removed along with it.
 *
 * @param text Text that may contain page-separator markers.
 * @returns The text with all markers (and their trailing newline) removed.
 */
function removePageSeparators(text: string): string {
  const separatorPattern = /<\|firecrawl-page-\d+-lllmstxt\|>\n/g;
  // Cutting the text at each marker and gluing the pieces back together is
  // the same as replacing each marker with the empty string.
  const pieces = text.split(separatorPattern);
  return pieces.join("");
}
/**
 * Keeps the header plus at most `maxPages` pages of a full-text document.
 *
 * The text is split on page-separator markers of the form
 * `<|firecrawl-page-N-lllmstxt|>` followed by a newline. The first piece is
 * the header that precedes the first marker; every later piece is one page.
 * The kept pieces are joined back without the markers, so the result no
 * longer contains any separator.
 *
 * @param fullText Full text containing page-separator markers.
 * @param maxPages Maximum number of pages to keep. Values that are not
 *   greater than zero (including negative numbers and NaN) keep only the
 *   header.
 * @returns The header followed by the first `maxPages` pages.
 */
function limitPages(fullText: string, maxPages: number): string {
  const pages = fullText.split(/<\|firecrawl-page-\d+-lllmstxt\|>\n/);

  // Clamp the limit: a negative value would otherwise reach `slice` as a
  // negative end index (counted from the end of the array) and NaN would
  // drop the header as well.
  const pageCount = maxPages > 0 ? maxPages : 0;

  // Index 0 is the header, so keep one more element than the page count.
  return pages.slice(0, pageCount + 1).join("");
}
2025-03-03 16:37:33 -05:00
/**
 * Rebuilds an llms.txt document with at most `maxEntries` link entries.
 *
 * The header is the first line starting with `#`; entries are all lines
 * starting with `- [`. Every other line is dropped from the result. When no
 * header line exists the input is returned unchanged.
 *
 * @param llmstxt The llms.txt text to trim.
 * @param maxEntries Maximum number of entries to keep. Values that are not
 *   greater than zero (including negative numbers and NaN) keep no entries.
 * @returns The header, a blank line, and the kept entries joined by
 *   newlines (without a trailing newline).
 */
function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {
  const lines = llmstxt.split("\n");

  const header = lines.find((line) => line.startsWith("#"));
  if (header === undefined) return llmstxt;

  // Clamp the limit: a negative value would otherwise be treated by `slice`
  // as an offset from the end and keep all but the last entries.
  const entryCount = maxEntries > 0 ? maxEntries : 0;

  const limitedEntries = lines
    .filter((line) => line.startsWith("- ["))
    .slice(0, entryCount);

  return `${header}\n\n${limitedEntries.join("\n")}`;
}
2025-02-20 18:48:58 -03:00
/**
 * Generates the llms.txt and llms-full.txt texts for a site and records the
 * progress and outcome under `generationId`.
 *
 * Flow:
 * 1. Cap `maxUrls` (default 100) at 5000.
 * 2. On a cache hit, trim the cached texts to the cap, store them as the
 *    "completed" result and return. This path does not log a job or bill.
 * 3. Otherwise map the site, then scrape the mapped URLs in batches of 10
 *    and ask the model for a title and description of each page. Pages that
 *    fail to scrape or summarize are logged and skipped.
 * 4. Save the texts to the cache, store the "completed" result, log the job
 *    and bill the team for the number of mapped URLs. Billing is not
 *    awaited; a billing failure is only logged.
 *
 * @param options Generation parameters; see `GenerateLLMsTextServiceOptions`.
 * @returns `{ success: true, data: { generatedText, fullText, showFullText } }`.
 * @throws Rethrows any error from the steps above after attempting to store
 *   a "failed" status for the generation.
 */
export async function performGenerateLlmsTxt(
  options: GenerateLLMsTextServiceOptions,
) {
  const { generationId, teamId, url, maxUrls = 100, showFullText, subId } =
    options;
  const startTime = Date.now();
  const logger = _logger.child({
    module: "generate-llmstxt",
    method: "performGenerateLlmsTxt",
    generationId,
    teamId,
  });

  try {
    // Enforce max URL limit
    const effectiveMaxUrls = Math.min(maxUrls, 5000);

    // Check cache first
    const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);
    if (cachedResult) {
      logger.info("Found cached LLMs text", { url });

      // Limit pages and remove separators before returning
      const limitedFullText = limitPages(
        cachedResult.llmstxt_full,
        effectiveMaxUrls,
      );
      const cleanFullText = removePageSeparators(limitedFullText);

      // Limit llmstxt entries to match maxUrls
      const limitedLlmsTxt = limitLlmsTxtEntries(
        cachedResult.llmstxt,
        effectiveMaxUrls,
      );

      // Update final result with cached text
      await updateGeneratedLlmsTxt(generationId, {
        status: "completed",
        generatedText: limitedLlmsTxt,
        fullText: cleanFullText,
        showFullText: showFullText,
      });

      return {
        success: true,
        data: {
          generatedText: limitedLlmsTxt,
          fullText: cleanFullText,
          showFullText: showFullText,
        },
      };
    }

    // If not in cache, proceed with generation
    // First, get all URLs from the map controller
    const mapResult = await getMapResults({
      url,
      teamId,
      limit: effectiveMaxUrls,
      includeSubdomains: false,
      ignoreSitemap: false,
      includeMetadata: true,
    });

    if (!mapResult || !mapResult.links) {
      throw new Error(`Failed to map URLs`);
    }

    // Use the child logger so debug lines carry generationId and teamId too.
    logger.debug("Mapping URLs", mapResult.links);

    const urls = mapResult.links;
    let llmstxt = `# ${url} llms.txt\n\n`;
    let llmsFulltxt = `# ${url} llms-full.txt\n\n`;

    // Process URLs in batches of 10
    for (let i = 0; i < urls.length; i += 10) {
      const batch = urls.slice(i, i + 10);

      const batchResults = await Promise.all(
        batch.map(async (pageUrl) => {
          logger.debug(`Scraping URL: ${pageUrl}`);
          try {
            const document = await scrapeDocument(
              {
                url: pageUrl,
                teamId,
                origin: pageUrl,
                timeout: 30000,
                isSingleUrl: true,
              },
              [],
              logger,
              { onlyMainContent: true },
            );

            if (!document || !document.markdown) {
              logger.error(`Failed to scrape URL ${pageUrl}`);
              return null;
            }

            logger.debug(
              `Generating description for ${document.metadata?.url}`,
            );

            const { extract } = await generateCompletions({
              logger,
              model: getModel("gpt-4o-mini"),
              options: {
                systemPrompt: "",
                mode: "llm",
                schema: descriptionSchema,
                prompt: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose.`,
              },
              markdown: document.markdown,
            });

            return {
              title: extract.title,
              description: extract.description,
              url: document.metadata?.url,
              markdown: document.markdown,
            };
          } catch (error) {
            // A single failing page must not abort the whole batch.
            logger.error(`Failed to process URL ${pageUrl}`, { error });
            return null;
          }
        }),
      );

      // Process successful results from batch
      for (const [offset, result] of batchResults.entries()) {
        if (!result) continue;

        llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`;
        // The page number is the 1-based position of the URL in the mapped
        // list, so skipped pages leave gaps in the numbering.
        llmsFulltxt += `<|firecrawl-page-${i + offset + 1}-lllmstxt|>\n## ${result.title}\n${result.markdown}\n\n`;
      }

      // Update progress after each batch
      await updateGeneratedLlmsTxt(generationId, {
        status: "processing",
        generatedText: llmstxt,
        fullText: removePageSeparators(llmsFulltxt),
      });
    }

    // After successful generation, save to cache. The full text is stored
    // with its page separators still in place.
    await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, effectiveMaxUrls);

    // Limit pages and remove separators before final update
    const limitedFullText = limitPages(llmsFulltxt, effectiveMaxUrls);
    const cleanFullText = removePageSeparators(limitedFullText);

    // Update final result with both generated text and full text
    await updateGeneratedLlmsTxt(generationId, {
      status: "completed",
      generatedText: llmstxt,
      fullText: cleanFullText,
      showFullText: showFullText,
    });

    // Log job with token usage and sources
    await logJob({
      job_id: generationId,
      success: true,
      message: "LLMs text generation completed",
      num_docs: urls.length,
      docs: [{ llmstxt: llmstxt, llmsfulltxt: llmsFulltxt }],
      time_taken: (Date.now() - startTime) / 1000,
      team_id: teamId,
      mode: "llmstxt",
      url: url,
      scrapeOptions: options,
      origin: "api",
      num_tokens: 0,
      tokens_billed: 0,
      sources: {},
    });

    // Bill team for usage (not awaited; failures are only logged)
    billTeam(teamId, subId, urls.length, logger).catch((error) => {
      logger.error(`Failed to bill team ${teamId} for ${urls.length} urls`, {
        teamId,
        count: urls.length,
        error,
      });
    });

    return {
      success: true,
      data: {
        generatedText: llmstxt,
        fullText: cleanFullText,
        showFullText: showFullText,
      },
    };
  } catch (error: unknown) {
    logger.error("Generate LLMs text error", { error });

    // Anything can be thrown, so read `message` defensively instead of
    // assuming an Error instance.
    const rawMessage =
      typeof error === "object" && error !== null
        ? (error as { message?: unknown }).message
        : undefined;
    const message =
      typeof rawMessage === "string" && rawMessage !== ""
        ? rawMessage
        : "Unknown error occurred";

    try {
      await updateGeneratedLlmsTxt(generationId, {
        status: "failed",
        error: message,
      });
    } catch (updateError) {
      // Keep the original error as the one that propagates to the caller.
      logger.error("Failed to record LLMs text generation failure", {
        error: updateError,
      });
    }

    throw error;
  }
}