diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 0ca6a3de..2791df3c 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -101,7 +101,7 @@ export async function performExtraction(options: ExtractServiceOptions): Promise mode: "llm", systemPrompt: (request.systemPrompt ? `${request.systemPrompt}\n` : "") + - "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " + + "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " + links.join(", "), prompt: request.prompt, schema: request.schema, diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index 9f255ad7..af250fcd 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -66,8 +66,56 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace }); let mappedLinks = mapResults.mapResults as MapDocument[]; - const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; - const uniqueUrls = removeDuplicateUrls(allUrls); + let allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; + let uniqueUrls = removeDuplicateUrls(allUrls); + + // Track all discovered URLs + uniqueUrls.forEach(discoveredUrl => { + if (!urlTraces.some(t => t.url === discoveredUrl)) { + urlTraces.push({ + url: discoveredUrl, + status: 'mapped', + timing: { + discoveredAt: new Date().toISOString(), + }, + usedInCompletion: false, + }); + } + }); + + // retry if only one url is returned + if (uniqueUrls.length === 1) { + const retryMapResults = await getMapResults({ + url: baseUrl, + teamId: options.teamId, + plan: options.plan, + allowExternalLinks: options.allowExternalLinks, + origin: options.origin, + limit: options.limit, + ignoreSitemap: false, + includeMetadata: true, + includeSubdomains: options.includeSubdomains, + }); + + mappedLinks = retryMapResults.mapResults as MapDocument[]; + allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; + uniqueUrls = removeDuplicateUrls(allUrls); + + // Track all discovered URLs + uniqueUrls.forEach(discoveredUrl => { + if (!urlTraces.some(t => t.url === discoveredUrl)) { + urlTraces.push({ + url: discoveredUrl, + status: 'mapped', + warning: 'Broader search. Not limiting map results to prompt.', + timing: { + discoveredAt: new Date().toISOString(), + }, + usedInCompletion: false, + }); + } + }); + } // Track all discovered URLs uniqueUrls.forEach(discoveredUrl => {