From f5b84e15e1d8fe2e22c5d44bce685c28a5e4d752 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 5 Sep 2024 17:52:27 -0300 Subject: [PATCH 1/3] Update sitemap.ts --- apps/api/src/scraper/WebScraper/sitemap.ts | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index b1a6a6ff..13dfc26e 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -36,17 +36,15 @@ export async function getLinksFromSitemap( const root = parsed.urlset || parsed.sitemapindex; if (root && root.sitemap) { - for (const sitemap of root.sitemap) { - if (sitemap.loc && sitemap.loc.length > 0) { - await getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode }); - } - } + const sitemapPromises = root.sitemap + .filter(sitemap => sitemap.loc && sitemap.loc.length > 0) + .map(sitemap => getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode })); + await Promise.all(sitemapPromises); } else if (root && root.url) { - for (const url of root.url) { - if (url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) { - allUrls.push(url.loc[0]); - } - } + const validUrls = root.url + .filter(url => url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) + .map(url => url.loc[0]); + allUrls.push(...validUrls); } } catch (error) { Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`); From a0f9ab2be74b53fa0f5af8632c344900245a2b2b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 6 Sep 2024 20:14:47 -0300 Subject: [PATCH 2/3] Update map.ts --- apps/api/src/controllers/v1/map.ts | 44 +++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index e6abd9ae..9142f5c7 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -47,24 +47,42 @@ export async function mapController( const crawler = crawlToCrawler(id, sc); - const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap(); - - if (sitemap !== null) { - sitemap.map((x) => { - links.push(x.url); - }); - } - let urlWithoutWww = req.body.url.replace("www.", ""); let mapUrl = req.body.search ? `"${req.body.search}" site:${urlWithoutWww}` : `site:${req.body.url}`; - // www. seems to exclude subdomains in some cases - const mapResults = await fireEngineMap(mapUrl, { - // limit to 100 results (beta) - numResults: Math.min(limit, 100), - }); + + const maxResults = 5000; + const resultsPerPage = 100; + const maxPages = Math.ceil(maxResults / resultsPerPage); + + const fetchPage = async (page: number) => { + return fireEngineMap(mapUrl, { + numResults: resultsPerPage, + page: page + }); + }; + + const pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1)); + + // Parallelize sitemap fetch with serper search + const [sitemap, ...allResults] = await Promise.all([ + req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), + ...pagePromises + ]); + + if (sitemap !== null) { + sitemap.forEach((x) => { + links.push(x.url); + }); + } + + let mapResults = allResults.flat().filter(result => result !== null && result !== undefined); + + if (mapResults.length > maxResults) { + mapResults = mapResults.slice(0, maxResults); + } if (mapResults.length > 0) { if (req.body.search) { From 9da3432596eb78dd55b7b94ff1a67518d1aca6c9 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 16 Sep 2024 12:03:14 -0400 Subject: [PATCH 3/3] Update map.ts --- apps/api/src/controllers/v1/map.ts | 133 +++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 28 deletions(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index aeedc792..6b13f762 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -19,8 +19,15 @@ import { billTeam } from "../../services/billing/credit_billing"; import { logJob } from "../../services/logging/log_job"; import { performCosineSimilarity } from "../../lib/map-cosine"; import { Logger } from "../../lib/logger"; +import Redis from "ioredis"; configDotenv(); +const redis = new Redis(process.env.REDIS_URL); + +// Max Links that /map can return +const MAX_MAP_LIMIT = 5000; +// Max Links that "Smart /map" can return +const MAX_FIRE_ENGINE_RESULTS = 1000; export async function mapController( req: RequestWithAuth<{}, MapResponse, MapRequest>, @@ -30,8 +37,7 @@ export async function mapController( req.body = mapRequestSchema.parse(req.body); - - const limit : number = req.body.limit ?? 5000; + const limit: number = req.body.limit ?? MAX_MAP_LIMIT; const id = uuidv4(); let links: string[] = [req.body.url]; @@ -53,35 +59,54 @@ export async function mapController( ? `"${req.body.search}" site:${urlWithoutWww}` : `site:${req.body.url}`; - const maxResults = 5000; const resultsPerPage = 100; - const maxPages = Math.ceil(maxResults / resultsPerPage); + const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage); - const fetchPage = async (page: number) => { - return fireEngineMap(mapUrl, { - numResults: resultsPerPage, - page: page - }); - }; + const cacheKey = `fireEngineMap:${mapUrl}`; + const cachedResult = await redis.get(cacheKey); + + let allResults: any[]; + let pagePromises: Promise[]; + + if (cachedResult) { + allResults = JSON.parse(cachedResult); + } else { + const fetchPage = async (page: number) => { + return fireEngineMap(mapUrl, { + numResults: resultsPerPage, + page: page, + }); + }; + + pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1)); + allResults = await Promise.all(pagePromises); + + await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours + } - const pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1)); - // Parallelize sitemap fetch with serper search - const [sitemap, ...allResults] = await Promise.all([ + const [sitemap, ...searchResults] = await Promise.all([ req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), - ...pagePromises + ...(cachedResult ? [] : pagePromises), ]); + if (!cachedResult) { + allResults = searchResults; + } + if (sitemap !== null) { sitemap.forEach((x) => { links.push(x.url); }); } - let mapResults = allResults.flat().filter(result => result !== null && result !== undefined); + let mapResults = allResults + .flat() + .filter((result) => result !== null && result !== undefined); - if (mapResults.length > maxResults) { - mapResults = mapResults.slice(0, maxResults); + const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit); + if (mapResults.length > minumumCutoff) { + mapResults = mapResults.slice(0, minumumCutoff); } if (mapResults.length > 0) { @@ -102,17 +127,19 @@ export async function mapController( // Perform cosine similarity between the search query and the list of links if (req.body.search) { const searchQuery = req.body.search.toLowerCase(); - + links = performCosineSimilarity(links, searchQuery); } - links = links.map((x) => { - try { - return checkAndUpdateURLForMap(x).url.trim() - } catch (_) { - return null; - } - }).filter(x => x !== null); + links = links + .map((x) => { + try { + return checkAndUpdateURLForMap(x).url.trim(); + } catch (_) { + return null; + } + }) + .filter((x) => x !== null); // allows for subdomains to be included links = links.filter((x) => isSameDomain(x, req.body.url)); @@ -125,8 +152,10 @@ export async function mapController( // remove duplicates that could be due to http/https or www links = removeDuplicateUrls(links); - billTeam(req.auth.team_id, 1).catch(error => { - Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`); + billTeam(req.auth.team_id, 1).catch((error) => { + Logger.error( + `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}` + ); // Optionally, you could notify an admin or add to a retry queue here }); @@ -134,7 +163,7 @@ export async function mapController( const timeTakenInSeconds = (endTime - startTime) / 1000; const linksToReturn = links.slice(0, limit); - + logJob({ job_id: id, success: links.length > 0, @@ -158,3 +187,51 @@ export async function mapController( scrape_id: req.body.origin?.includes("website") ? id : undefined, }); } + +// Subdomain sitemap url checking + +// // For each result, check for subdomains, get their sitemaps and add them to the links +// const processedUrls = new Set(); +// const processedSubdomains = new Set(); + +// for (const result of links) { +// let url; +// let hostParts; +// try { +// url = new URL(result); +// hostParts = url.hostname.split('.'); +// } catch (e) { +// continue; +// } + +// console.log("hostParts", hostParts); +// // Check if it's a subdomain (more than 2 parts, and not 'www') +// if (hostParts.length > 2 && hostParts[0] !== 'www') { +// const subdomain = hostParts[0]; +// console.log("subdomain", subdomain); +// const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`; +// console.log("subdomainUrl", subdomainUrl); + +// if (!processedSubdomains.has(subdomainUrl)) { +// processedSubdomains.add(subdomainUrl); + +// const subdomainCrawl = crawlToCrawler(id, { +// originUrl: subdomainUrl, +// crawlerOptions: legacyCrawlerOptions(req.body), +// pageOptions: {}, +// team_id: req.auth.team_id, +// createdAt: Date.now(), +// plan: req.auth.plan, +// }); +// const subdomainSitemap = await subdomainCrawl.tryGetSitemap(); +// if (subdomainSitemap) { +// subdomainSitemap.forEach((x) => { +// if (!processedUrls.has(x.url)) { +// processedUrls.add(x.url); +// links.push(x.url); +// } +// }); +// } +// } +// } +// }