2024-12-26 12:41:37 -03:00
import { MapDocument , URLTrace } from "../../controllers/v1/types" ;
import { getMapResults } from "../../controllers/v1/map" ;
import { PlanType } from "../../types" ;
import { removeDuplicateUrls } from "../validateUrl" ;
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist" ;
import { generateBasicCompletion } from "../LLM-extraction" ;
import { buildRefrasedPrompt } from "./build-prompts" ;
2025-01-13 22:30:15 -03:00
import { rerankLinksWithLLM } from "./reranker" ;
2024-12-31 18:06:07 -03:00
import { extractConfig } from "./config" ;
2025-01-14 01:45:50 -03:00
import { updateExtract } from "./extract-redis" ;
import { ExtractStep } from "./extract-redis" ;
2024-12-26 12:41:37 -03:00
/**
 * Input accepted by `processUrl`.
 */
interface ProcessUrlOptions {
  /**
   * URL to process. A "/*" segment marks it as a pattern to be mapped;
   * `processUrl` strips that segment before calling the mapper.
   */
  url: string;
  /** Optional user prompt; rephrased for the map search and used to build the rerank query. */
  prompt?: string;
  /**
   * Optional schema. When no prompt is given, it is serialized with
   * `JSON.stringify` and used to generate the rerank query.
   */
  schema?: any;
  /** Forwarded unchanged to `getMapResults`. */
  teamId: string;
  /** Forwarded unchanged to `getMapResults`. */
  plan: PlanType;
  /**
   * Forwarded to `getMapResults`. When falsy and `url` has no "/*" segment,
   * `processUrl` skips mapping and returns the URL itself (unless blocked).
   */
  allowExternalLinks?: boolean;
  /** Forwarded unchanged to `getMapResults`. */
  origin?: string;
  /** Forwarded unchanged to `getMapResults`. */
  limit?: number;
  /** Forwarded unchanged to `getMapResults`. */
  includeSubdomains?: boolean;
}
2025-01-10 18:35:10 -03:00
/**
 * Appends a "mapped" trace for every URL in `urls` that `urlTraces` does not
 * already contain.
 *
 * @param urlTraces - Trace list to extend in place.
 * @param urls - Newly discovered URLs.
 * @param warning - Optional warning text attached to each trace that is added.
 */
function recordDiscoveredUrls(
  urlTraces: URLTrace[],
  urls: string[],
  warning?: string,
): void {
  // A Set lookup keeps this linear instead of rescanning urlTraces per URL.
  const tracedUrls = new Set(urlTraces.map((t) => t.url));
  for (const discoveredUrl of urls) {
    if (tracedUrls.has(discoveredUrl)) {
      continue;
    }
    tracedUrls.add(discoveredUrl);
    urlTraces.push({
      url: discoveredUrl,
      status: "mapped",
      ...(warning === undefined ? {} : { warning }),
      timing: {
        discoveredAt: new Date().toISOString(),
      },
      usedInCompletion: false,
    });
  }
}

/**
 * Resolves a single input URL into the list of URLs to use downstream.
 *
 * - A URL without "/*" and without `allowExternalLinks` is returned as-is
 *   (or rejected when `isUrlBlocked` reports it as blocked).
 * - Otherwise the site is mapped via `getMapResults` (retrying once without
 *   the search term when at most one URL comes back), the links are capped at
 *   `extractConfig.RERANKING.MAX_INITIAL_RANKING_LIMIT`, reported through
 *   `updateExtractCallback`, and reranked with `rerankLinksWithLLM`.
 *
 * @param options - URL plus prompt/schema and mapping parameters.
 * @param urlTraces - Trace list; mutated in place with one entry for the input
 *   URL and one per newly discovered URL.
 * @param updateExtractCallback - Called with the capped link list before reranking.
 * @returns The selected URLs, or an empty array when the URL is blocked or
 *   mapping/reranking throws (the error is recorded on the input URL's trace).
 */
export async function processUrl(
  options: ProcessUrlOptions,
  urlTraces: URLTrace[],
  updateExtractCallback: (links: string[]) => void,
): Promise<string[]> {
  const trace: URLTrace = {
    url: options.url,
    status: "mapped",
    timing: {
      discoveredAt: new Date().toISOString(),
    },
  };
  urlTraces.push(trace);

  // Plain single-page URL: no mapping needed.
  if (!options.url.includes("/*") && !options.allowExternalLinks) {
    if (!isUrlBlocked(options.url)) {
      trace.usedInCompletion = true;
      return [options.url];
    }
    trace.status = "error";
    trace.error = "URL is blocked";
    trace.usedInCompletion = false;
    return [];
  }

  const baseUrl = options.url.replace("/*", "");
  const urlWithoutWww = baseUrl.replace("www.", "");

  let rephrasedPrompt = options.prompt;
  if (options.prompt) {
    const completion = await generateBasicCompletion(
      buildRefrasedPrompt(options.prompt, baseUrl),
    );
    // Strip every double quote and slash; a string pattern passed to
    // replace() would only remove the first occurrence of each.
    rephrasedPrompt = completion?.replace(/["\/]/g, "") ?? options.prompt;
  }

  // Appends the target site to a query; restricted with `site:` unless
  // external links are allowed.
  const scopeToSite = (query: string): string =>
    options.allowExternalLinks
      ? `${query} ${urlWithoutWww}`
      : `${query} site:${urlWithoutWww}`;

  try {
    const mapRequest = {
      url: baseUrl,
      teamId: options.teamId,
      plan: options.plan,
      allowExternalLinks: options.allowExternalLinks,
      origin: options.origin,
      limit: options.limit,
      ignoreSitemap: false,
      includeMetadata: true,
      includeSubdomains: options.includeSubdomains,
    };

    const mapResults = await getMapResults({
      ...mapRequest,
      search: rephrasedPrompt,
    });
    let mappedLinks = mapResults.mapResults as MapDocument[];
    let uniqueUrls = removeDuplicateUrls([
      ...mappedLinks.map((m) => m.url),
      ...mapResults.links,
    ]);
    recordDiscoveredUrls(urlTraces, uniqueUrls);

    // Retry without the search term if at most one URL was returned.
    if (uniqueUrls.length <= 1) {
      const retryMapResults = await getMapResults(mapRequest);
      mappedLinks = retryMapResults.mapResults as MapDocument[];
      // Use the retry's own links, not the ones from the first attempt.
      uniqueUrls = removeDuplicateUrls([
        ...mappedLinks.map((m) => m.url),
        ...retryMapResults.links,
      ]);
      recordDiscoveredUrls(
        urlTraces,
        uniqueUrls,
        "Broader search. Not limiting map results to prompt.",
      );
    }

    // Add URLs that came only from `links` as documents without metadata.
    const existingUrls = new Set(mappedLinks.map((m) => m.url));
    const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));
    mappedLinks = [
      ...mappedLinks,
      ...newUrls.map((url) => ({ url, title: "", description: "" })),
    ];

    if (mappedLinks.length === 0) {
      mappedLinks = [{ url: baseUrl, title: "", description: "" }];
    }

    // Limit the initial set of links handed to the reranker.
    mappedLinks = mappedLinks.slice(
      0,
      extractConfig.RERANKING.MAX_INITIAL_RANKING_LIMIT,
    );

    updateExtractCallback(mappedLinks.map((x) => x.url));

    // Build the rerank query from the prompt, else from the schema, else
    // fall back to just the domain.
    let searchQuery = urlWithoutWww;
    if (options.prompt) {
      searchQuery = scopeToSite(options.prompt);
    } else if (options.schema) {
      try {
        const schemaString = JSON.stringify(options.schema, null, 2);
        const prompt = `Given this JSON schema, generate a natural language search query that would help find relevant pages containing this type of data. Focus on the key properties and their descriptions and keep it very concise. Schema: ${schemaString}`;
        const generatedQuery =
          (await generateBasicCompletion(prompt)) ??
          "Extract the data according to the schema: " + schemaString;
        searchQuery = scopeToSite(generatedQuery);
      } catch (error) {
        console.error("Error generating search query from schema:", error);
        searchQuery = urlWithoutWww; // Fallback to just the domain
      }
    }

    const firstPass = await rerankLinksWithLLM(
      mappedLinks,
      searchQuery,
      urlTraces,
    );
    mappedLinks = firstPass.mapDocument;

    // 2nd pass, useful for when the first pass returns too many links.
    if (mappedLinks.length > 100) {
      const secondPass = await rerankLinksWithLLM(
        mappedLinks,
        searchQuery,
        urlTraces,
      );
      mappedLinks = secondPass.mapDocument;
    }

    return mappedLinks.map((link) => link.url);
  } catch (error: unknown) {
    trace.status = "error";
    // Thrown values are not guaranteed to be Error instances.
    trace.error = error instanceof Error ? error.message : String(error);
    trace.usedInCompletion = false;
    return [];
  }
}