import { MapDocument, URLTrace } from "../../controllers/v1/types"; import { getMapResults } from "../../controllers/v1/map"; import { PlanType } from "../../types"; import { removeDuplicateUrls } from "../validateUrl"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { generateBasicCompletion } from "../LLM-extraction"; import { buildRefrasedPrompt } from "./build-prompts"; import { rerankLinksWithLLM } from "./reranker"; import { extractConfig } from "./config"; import { updateExtract } from "./extract-redis"; import { ExtractStep } from "./extract-redis"; interface ProcessUrlOptions { url: string; prompt?: string; schema?: any; teamId: string; plan: PlanType; allowExternalLinks?: boolean; origin?: string; limit?: number; includeSubdomains?: boolean; } export async function processUrl( options: ProcessUrlOptions, urlTraces: URLTrace[], updateExtractCallback: (links: string[]) => void, ): Promise { const trace: URLTrace = { url: options.url, status: "mapped", timing: { discoveredAt: new Date().toISOString(), }, }; urlTraces.push(trace); if (!options.url.includes("/*") && !options.allowExternalLinks) { if (!isUrlBlocked(options.url)) { trace.usedInCompletion = true; return [options.url]; } trace.status = "error"; trace.error = "URL is blocked"; trace.usedInCompletion = false; return []; } const baseUrl = options.url.replace("/*", ""); let urlWithoutWww = baseUrl.replace("www.", ""); let rephrasedPrompt = options.prompt; if (options.prompt) { rephrasedPrompt = ( await generateBasicCompletion( buildRefrasedPrompt(options.prompt, baseUrl), ) ) ?.replace('"', "") .replace("/", "") ?? options.prompt; } try { const mapResults = await getMapResults({ url: baseUrl, search: rephrasedPrompt, teamId: options.teamId, plan: options.plan, allowExternalLinks: options.allowExternalLinks, origin: options.origin, limit: options.limit, ignoreSitemap: false, includeMetadata: true, includeSubdomains: options.includeSubdomains, }); let mappedLinks = mapResults.mapResults as MapDocument[]; let allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; let uniqueUrls = removeDuplicateUrls(allUrls); // Track all discovered URLs uniqueUrls.forEach((discoveredUrl) => { if (!urlTraces.some((t) => t.url === discoveredUrl)) { urlTraces.push({ url: discoveredUrl, status: "mapped", timing: { discoveredAt: new Date().toISOString(), }, usedInCompletion: false, }); } }); // retry if only one url is returned if (uniqueUrls.length <= 1) { const retryMapResults = await getMapResults({ url: baseUrl, teamId: options.teamId, plan: options.plan, allowExternalLinks: options.allowExternalLinks, origin: options.origin, limit: options.limit, ignoreSitemap: false, includeMetadata: true, includeSubdomains: options.includeSubdomains, }); mappedLinks = retryMapResults.mapResults as MapDocument[]; allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; uniqueUrls = removeDuplicateUrls(allUrls); // Track all discovered URLs uniqueUrls.forEach((discoveredUrl) => { if (!urlTraces.some((t) => t.url === discoveredUrl)) { urlTraces.push({ url: discoveredUrl, status: "mapped", warning: "Broader search. Not limiting map results to prompt.", timing: { discoveredAt: new Date().toISOString(), }, usedInCompletion: false, }); } }); } // Track all discovered URLs uniqueUrls.forEach((discoveredUrl) => { if (!urlTraces.some((t) => t.url === discoveredUrl)) { urlTraces.push({ url: discoveredUrl, status: "mapped", timing: { discoveredAt: new Date().toISOString(), }, usedInCompletion: false, }); } }); const existingUrls = new Set(mappedLinks.map((m) => m.url)); const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url)); mappedLinks = [ ...mappedLinks, ...newUrls.map((url) => ({ url, title: "", description: "" })), ]; if (mappedLinks.length === 0) { mappedLinks = [{ url: baseUrl, title: "", description: "" }]; } // Limit initial set of links (1000) mappedLinks = mappedLinks.slice( 0, extractConfig.RERANKING.MAX_INITIAL_RANKING_LIMIT, ); updateExtractCallback(mappedLinks.map((x) => x.url)); // Perform reranking using either prompt or schema let searchQuery = ""; if (options.prompt) { searchQuery = options.allowExternalLinks ? `${options.prompt} ${urlWithoutWww}` : `${options.prompt} site:${urlWithoutWww}`; } else if (options.schema) { // Generate search query from schema using basic completion try { const schemaString = JSON.stringify(options.schema, null, 2); const prompt = `Given this JSON schema, generate a natural language search query that would help find relevant pages containing this type of data. Focus on the key properties and their descriptions and keep it very concise. Schema: ${schemaString}`; searchQuery = (await generateBasicCompletion(prompt)) ?? "Extract the data according to the schema: " + schemaString; if (options.allowExternalLinks) { searchQuery = `${searchQuery} ${urlWithoutWww}`; } else { searchQuery = `${searchQuery} site:${urlWithoutWww}`; } } catch (error) { console.error("Error generating search query from schema:", error); searchQuery = urlWithoutWww; // Fallback to just the domain } } else { searchQuery = urlWithoutWww; } // dumpToFile( // "mapped-links.txt", // mappedLinks, // (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}` // ); const rerankerResult = await rerankLinksWithLLM(mappedLinks, searchQuery, urlTraces); mappedLinks = rerankerResult.mapDocument; let tokensUsed = rerankerResult.tokensUsed; // 2nd Pass, useful for when the first pass returns too many links if (mappedLinks.length > 100) { const rerankerResult = await rerankLinksWithLLM( mappedLinks, searchQuery, urlTraces, ); mappedLinks = rerankerResult.mapDocument; tokensUsed += rerankerResult.tokensUsed; } // dumpToFile( // "llm-links.txt", // mappedLinks, // (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}` // ); // Remove title and description from mappedLinks mappedLinks = mappedLinks.map((link) => ({ url: link.url })); return mappedLinks.map((x) => x.url); } catch (error) { trace.status = "error"; trace.error = error.message; trace.usedInCompletion = false; return []; } }