diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index aeedc792..6b13f762 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -19,8 +19,15 @@ import { billTeam } from "../../services/billing/credit_billing"; import { logJob } from "../../services/logging/log_job"; import { performCosineSimilarity } from "../../lib/map-cosine"; import { Logger } from "../../lib/logger"; +import Redis from "ioredis"; configDotenv(); +const redis = new Redis(process.env.REDIS_URL); + +// Max Links that /map can return +const MAX_MAP_LIMIT = 5000; +// Max Links that "Smart /map" can return +const MAX_FIRE_ENGINE_RESULTS = 1000; export async function mapController( req: RequestWithAuth<{}, MapResponse, MapRequest>, @@ -30,8 +37,7 @@ export async function mapController( req.body = mapRequestSchema.parse(req.body); - - const limit : number = req.body.limit ?? 5000; + const limit: number = req.body.limit ?? MAX_MAP_LIMIT; const id = uuidv4(); let links: string[] = [req.body.url]; @@ -53,35 +59,54 @@ export async function mapController( ? `"${req.body.search}" site:${urlWithoutWww}` : `site:${req.body.url}`; - const maxResults = 5000; const resultsPerPage = 100; - const maxPages = Math.ceil(maxResults / resultsPerPage); + const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage); - const fetchPage = async (page: number) => { - return fireEngineMap(mapUrl, { - numResults: resultsPerPage, - page: page - }); - }; + const cacheKey = `fireEngineMap:${mapUrl}`; + const cachedResult = await redis.get(cacheKey); + + let allResults: any[]; + let pagePromises: Promise[]; + + if (cachedResult) { + allResults = JSON.parse(cachedResult); + } else { + const fetchPage = async (page: number) => { + return fireEngineMap(mapUrl, { + numResults: resultsPerPage, + page: page, + }); + }; + + pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1)); + allResults = await Promise.all(pagePromises); + + await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours + } - const pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1)); - // Parallelize sitemap fetch with serper search - const [sitemap, ...allResults] = await Promise.all([ + const [sitemap, ...searchResults] = await Promise.all([ req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), - ...pagePromises + ...(cachedResult ? [] : pagePromises), ]); + if (!cachedResult) { + allResults = searchResults; + } + if (sitemap !== null) { sitemap.forEach((x) => { links.push(x.url); }); } - let mapResults = allResults.flat().filter(result => result !== null && result !== undefined); + let mapResults = allResults + .flat() + .filter((result) => result !== null && result !== undefined); - if (mapResults.length > maxResults) { - mapResults = mapResults.slice(0, maxResults); + const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit); + if (mapResults.length > minumumCutoff) { + mapResults = mapResults.slice(0, minumumCutoff); } if (mapResults.length > 0) { @@ -102,17 +127,19 @@ export async function mapController( // Perform cosine similarity between the search query and the list of links if (req.body.search) { const searchQuery = req.body.search.toLowerCase(); - + links = performCosineSimilarity(links, searchQuery); } - links = links.map((x) => { - try { - return checkAndUpdateURLForMap(x).url.trim() - } catch (_) { - return null; - } - }).filter(x => x !== null); + links = links + .map((x) => { + try { + return checkAndUpdateURLForMap(x).url.trim(); + } catch (_) { + return null; + } + }) + .filter((x) => x !== null); // allows for subdomains to be included links = links.filter((x) => isSameDomain(x, req.body.url)); @@ -125,8 +152,10 @@ export async function mapController( // remove duplicates that could be due to http/https or www links = removeDuplicateUrls(links); - billTeam(req.auth.team_id, 1).catch(error => { - Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`); + billTeam(req.auth.team_id, 1).catch((error) => { + Logger.error( + `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}` + ); // Optionally, you could notify an admin or add to a retry queue here }); @@ -134,7 +163,7 @@ export async function mapController( const timeTakenInSeconds = (endTime - startTime) / 1000; const linksToReturn = links.slice(0, limit); - + logJob({ job_id: id, success: links.length > 0, @@ -158,3 +187,51 @@ export async function mapController( scrape_id: req.body.origin?.includes("website") ? id : undefined, }); } + +// Subdomain sitemap url checking + +// // For each result, check for subdomains, get their sitemaps and add them to the links +// const processedUrls = new Set(); +// const processedSubdomains = new Set(); + +// for (const result of links) { +// let url; +// let hostParts; +// try { +// url = new URL(result); +// hostParts = url.hostname.split('.'); +// } catch (e) { +// continue; +// } + +// console.log("hostParts", hostParts); +// // Check if it's a subdomain (more than 2 parts, and not 'www') +// if (hostParts.length > 2 && hostParts[0] !== 'www') { +// const subdomain = hostParts[0]; +// console.log("subdomain", subdomain); +// const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`; +// console.log("subdomainUrl", subdomainUrl); + +// if (!processedSubdomains.has(subdomainUrl)) { +// processedSubdomains.add(subdomainUrl); + +// const subdomainCrawl = crawlToCrawler(id, { +// originUrl: subdomainUrl, +// crawlerOptions: legacyCrawlerOptions(req.body), +// pageOptions: {}, +// team_id: req.auth.team_id, +// createdAt: Date.now(), +// plan: req.auth.plan, +// }); +// const subdomainSitemap = await subdomainCrawl.tryGetSitemap(); +// if (subdomainSitemap) { +// subdomainSitemap.forEach((x) => { +// if (!processedUrls.has(x.url)) { +// processedUrls.add(x.url); +// links.push(x.url); +// } +// }); +// } +// } +// } +// }