Nick: /map almost good

This commit is contained in:
Nicolas
2024-08-16 19:33:57 -04:00
parent 4c1b74dab3
commit ab48353226
8 changed files with 286 additions and 31 deletions
+2
View File
@@ -6,3 +6,5 @@ dump.rdb
/mongo-data /mongo-data
/.next/ /.next/
.rdb
+43 -18
View File
@@ -1,16 +1,29 @@
import { Response } from "express"; import { Response } from "express";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { legacyCrawlerOptions, mapRequestSchema, RequestWithAuth } from "./types"; import {
legacyCrawlerOptions,
mapRequestSchema,
RequestWithAuth,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse , MapRequest } from "./types"; import { MapResponse, MapRequest } from "./types";
import { Logger } from "../../lib/logger"; import { Logger } from "../../lib/logger";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import { search } from "../../search"; import { search } from "../../search";
import { checkAndUpdateURL } from "../../lib/validateUrl"; import {
checkAndUpdateURL,
checkAndUpdateURLForMap,
isSameDomain,
isSameSubdomain,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
configDotenv(); configDotenv();
export async function mapController(req: RequestWithAuth<{}, MapResponse, MapRequest>, res: Response<MapResponse>) { export async function mapController(
req: RequestWithAuth<{}, MapResponse, MapRequest>,
res: Response<MapResponse>
) {
req.body = mapRequestSchema.parse(req.body); req.body = mapRequestSchema.parse(req.body);
const id = uuidv4(); const id = uuidv4();
@@ -28,30 +41,42 @@ export async function mapController(req: RequestWithAuth<{}, MapResponse, MapReq
const crawler = crawlToCrawler(id, sc); const crawler = crawlToCrawler(id, sc);
const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap(); const sitemap = sc.crawlerOptions.ignoreSitemap
? null
: await crawler.tryGetSitemap();
if (sitemap !== null) { if (sitemap !== null) {
sitemap.map(x => { links.push(x.url); }); sitemap.map((x) => {
links.push(x.url);
});
} }
const searchResults = await search({ const mapResults = await fireEngineMap(req.body.url, {
query: `site:${req.body.url}`, numResults: 50,
advanced: false, });
num_results: 50,
lang: "en",
country: "us",
location: "United States",
})
if (searchResults.length > 0) { if (mapResults.length > 0) {
searchResults.map(x => { links.push(x.url); }); mapResults.map((x) => {
links.push(x.url);
});
} }
links = links.map(x => checkAndUpdateURL(x).url); links = links.map((x) => checkAndUpdateURLForMap(x).url);
// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, req.body.url));
// if includeSubdomains is false, filter out subdomains
if (!req.body.includeSubdomains) {
links = links.filter((x) => isSameSubdomain(x, req.body.url));
}
// remove duplicates that could be due to http/https or www
links = [...new Set(links)]; links = [...new Set(links)];
return res.status(200).json({ return res.status(200).json({
success: true, success: true,
links links,
}); });
} }
+4 -2
View File
@@ -34,7 +34,7 @@ export const scrapeOptions = z.object({
export type ScrapeOptions = z.infer<typeof scrapeOptions>; export type ScrapeOptions = z.infer<typeof scrapeOptions>;
export const scrapeRequestSchema = scrapeOptions.extend({ export const scrapeRequestSchema = scrapeOptions.extend({
url, url: z.string().url(),
origin: z.string().optional().default("api"), origin: z.string().optional().default("api"),
}); });
@@ -91,7 +91,9 @@ export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
export const mapRequestSchema = crawlerOptions.extend({ export const mapRequestSchema = crawlerOptions.extend({
url, url,
origin: z.string().optional().default("api") origin: z.string().optional().default("api"),
includeSubdomains: z.boolean().default(false),
searchEngine: z.string().optional(),
}); });
// export type MapRequest = { // export type MapRequest = {
+88
View File
@@ -0,0 +1,88 @@
import { isSameDomain } from "./validateUrl";
import { isSameSubdomain } from "./validateUrl";
// Unit tests for isSameDomain: two URLs count as the same domain when the last
// two labels of their hostnames match after a leading "www." is dropped, so
// subdomains of the base domain are accepted.
describe("isSameDomain", () => {
  it("should return true for a subdomain", () => {
    const result = isSameDomain("http://sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return true for the same domain", () => {
    const result = isSameDomain("http://example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return false for different domains", () => {
    const result = isSameDomain("http://example.com", "http://another.com");
    expect(result).toBe(false);
  });

  it("should return true for a subdomain with different protocols", () => {
    const result = isSameDomain("https://sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return false for invalid URLs", () => {
    // Either argument being unparsable must yield false.
    const result = isSameDomain("invalid-url", "http://example.com");
    expect(result).toBe(false);
    const result2 = isSameDomain("http://example.com", "invalid-url");
    expect(result2).toBe(false);
  });

  it("should return true for a subdomain with www prefix", () => {
    const result = isSameDomain("http://www.sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  // Title corrected: this case exercises several nested subdomain levels,
  // not a "www" prefix.
  it("should return true for a deeply nested subdomain", () => {
    const result = isSameDomain("http://docs.s.s.example.com", "http://example.com");
    expect(result).toBe(true);
  });
});
// Unit tests for isSameSubdomain, expressed as a table: every [url, baseUrl]
// pair listed under a title must produce that entry's expected verdict.
describe("isSameSubdomain", () => {
  const scenarios: Array<{
    title: string;
    expected: boolean;
    pairs: Array<[string, string]>;
  }> = [
    {
      title: "should return false for a subdomain",
      expected: false,
      pairs: [["http://example.com", "http://docs.example.com"]],
    },
    {
      title: "should return true for the same subdomain",
      expected: true,
      pairs: [["http://docs.example.com", "http://docs.example.com"]],
    },
    {
      title: "should return false for different subdomains",
      expected: false,
      pairs: [["http://docs.example.com", "http://blog.example.com"]],
    },
    {
      title: "should return false for different domains",
      expected: false,
      pairs: [["http://example.com", "http://another.com"]],
    },
    {
      // Checked in both argument positions.
      title: "should return false for invalid URLs",
      expected: false,
      pairs: [
        ["invalid-url", "http://example.com"],
        ["http://example.com", "invalid-url"],
      ],
    },
    {
      title: "should return true for the same subdomain with different protocols",
      expected: true,
      pairs: [["https://docs.example.com", "http://docs.example.com"]],
    },
    {
      title: "should return true for the same subdomain with www prefix",
      expected: true,
      pairs: [["http://www.docs.example.com", "http://docs.example.com"]],
    },
    {
      title: "should return false for a subdomain with www prefix and different subdomain",
      expected: false,
      pairs: [["http://www.docs.example.com", "http://blog.example.com"]],
    },
  ];

  for (const { title, expected, pairs } of scenarios) {
    it(title, () => {
      for (const [candidate, base] of pairs) {
        expect(isSameSubdomain(candidate, base)).toBe(expected);
      }
    });
  }
});
+89 -5
View File
@@ -1,9 +1,8 @@
const protocolIncluded = (url: string) => { const protocolIncluded = (url: string) => {
// if :// not in the start of the url assume http (maybe https?) // if :// not in the start of the url assume http (maybe https?)
// regex checks if :// appears before any . // regex checks if :// appears before any .
return(/^([^.:]+:\/\/)/.test(url)); return /^([^.:]+:\/\/)/.test(url);
} };
const getURLobj = (s: string) => { const getURLobj = (s: string) => {
// URL fails if we dont include the protocol ie google.com // URL fails if we dont include the protocol ie google.com
@@ -18,7 +17,6 @@ const getURLobj = (s: string) => {
}; };
export const checkAndUpdateURL = (url: string) => { export const checkAndUpdateURL = (url: string) => {
if (!protocolIncluded(url)) { if (!protocolIncluded(url)) {
url = `http://${url}`; url = `http://${url}`;
} }
@@ -30,9 +28,95 @@ export const checkAndUpdateURL = (url: string) => {
const typedUrlObj = urlObj as URL; const typedUrlObj = urlObj as URL;
if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") { if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
throw new Error("Invalid URL"); throw new Error("Invalid URL");
} }
return { urlObj: typedUrlObj, url: url }; return { urlObj: typedUrlObj, url: url };
};
/**
 * Same domain check.
 *
 * Reports whether `url` and `baseUrl` belong to the same domain, where the
 * "domain" is taken to be the last two dot-separated labels of the hostname
 * after a leading `www.` has been removed. Only hostnames are compared, so the
 * protocol, path and query do not affect the result, and any subdomain
 * (including `www.`-prefixed ones) of the base domain counts as a match.
 *
 * NOTE(review): because exactly two labels are kept, hosts under a multi-label
 * suffix compare equal (e.g. `a.co.uk` and `b.co.uk` both reduce to `co.uk`).
 *
 * @param url - The URL to test.
 * @param baseUrl - The URL whose domain `url` is compared against.
 * @returns `true` when both reduced domains are identical; `false` otherwise,
 * including when `getURLobj` reports an error for either input.
 */
export function isSameDomain(url: string, baseUrl: string) {
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
// Either input failing to parse means the URLs cannot be compared.
if (error1 || error2) {
return false;
}
const typedUrlObj1 = urlObj1 as URL;
const typedUrlObj2 = urlObj2 as URL;
// Drops a leading "www." so "www.example.com" and "example.com" compare equal.
const cleanHostname = (hostname: string) => {
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
};
// Keep only the last two labels of each hostname (e.g. "example.com").
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
return domain1 === domain2;
} }
/**
 * Same subdomain check.
 *
 * Reports whether `url` and `baseUrl` share both the same domain (last two
 * hostname labels) and the same subdomain (all labels before those two), once
 * a leading `www.` has been removed from each hostname. Only hostnames are
 * inspected, so protocol, path and query are ignored.
 *
 * @param url - The URL to test.
 * @param baseUrl - The URL it is compared against.
 * @returns `true` when domain and subdomain both match; `false` otherwise,
 * including when `getURLobj` reports an error for either input.
 */
export function isSameSubdomain(url: string, baseUrl: string) {
  const candidate = getURLobj(url);
  const reference = getURLobj(baseUrl);

  if (candidate.error || reference.error) {
    return false;
  }

  // Breaks a parsed URL's hostname (minus any leading "www.") into its
  // two-label domain and whatever labels precede it.
  const splitHost = (parsed: URL) => {
    const host = parsed.hostname;
    const labels = (host.startsWith('www.') ? host.slice(4) : host).split('.');
    return {
      domain: labels.slice(-2).join('.'),
      subdomain: labels.slice(0, -2).join('.'),
    };
  };

  const left = splitHost(candidate.urlObj as URL);
  const right = splitHost(reference.urlObj as URL);

  // Both the domain and the subdomain have to agree.
  if (left.domain !== right.domain) {
    return false;
  }
  return left.subdomain === right.subdomain;
}
/**
 * Validates a URL and normalizes it for the map endpoint so that equivalent
 * links end up as identical strings.
 *
 * Steps: prefix `http://` when no protocol is present, drop one trailing
 * slash, validate the result, then drop the query string and any trailing
 * slash that removing the query exposes (e.g. `http://a.com/?x=1` becomes
 * `http://a.com`, matching what `http://a.com/` normalizes to).
 *
 * @param url - The raw URL, with or without a protocol.
 * @returns `url`: the normalized string; `urlObj`: the `URL` parsed before the
 * query string was removed, so it may still carry the original query.
 * @throws Error("Invalid URL") when `getURLobj` reports an error or the
 * protocol is neither `http:` nor `https:`.
 */
export const checkAndUpdateURLForMap = (url: string) => {
  if (!protocolIncluded(url)) {
    url = `http://${url}`;
  }

  // remove last slash if present
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }

  const { error, urlObj } = getURLobj(url);
  if (error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = urlObj as URL;
  if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
    throw new Error("Invalid URL");
  }

  // remove any query params
  const queryStart = url.indexOf("?");
  if (queryStart !== -1) {
    url = url.slice(0, queryStart);

    // Dropping the query can leave a slash at the end ("/path/?a=1" ->
    // "/path/"); strip it so the result matches the slash-less form.
    if (url.endsWith("/")) {
      url = url.slice(0, -1);
    }
  }

  return { urlObj: typedUrlObj, url: url };
};
+1 -1
View File
@@ -4,7 +4,7 @@ import { SearchResult } from "../../src/lib/entities";
dotenv.config(); dotenv.config();
export async function fireEngineSearch(q: string, options: { export async function fireEngineMap(q: string, options: {
tbs?: string; tbs?: string;
filter?: string; filter?: string;
lang?: string; lang?: string;
+14 -5
View File
@@ -1,7 +1,8 @@
import { Logger } from "../../src/lib/logger"; import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities"; import { SearchResult } from "../../src/lib/entities";
import { googleSearch } from "./googlesearch"; import { googleSearch } from "./googlesearch";
import { fireEngineSearch } from "./fireEngine"; import { fireEngineMap } from "./fireEngine";
import { serper_search } from "./serper";
export async function search({ export async function search({
query, query,
@@ -27,10 +28,18 @@ export async function search({
proxy?: string; proxy?: string;
sleep_interval?: number; sleep_interval?: number;
timeout?: number; timeout?: number;
}) : Promise<SearchResult[]> { }): Promise<SearchResult[]> {
try { try {
if (process.env.FIRE_ENGINE_BETA_URL) {
return await fireEngineSearch(query, {numResults: num_results, tbs, filter, lang, country, location}); if (process.env.SERPER_API_KEY) {
return await serper_search(query, {
num_results,
tbs,
filter,
lang,
country,
location,
});
} }
return await googleSearch( return await googleSearch(
query, query,
@@ -46,6 +55,6 @@ export async function search({
); );
} catch (error) { } catch (error) {
Logger.error(`Error in search function: ${error}`); Logger.error(`Error in search function: ${error}`);
return [] return [];
} }
} }
+45
View File
@@ -0,0 +1,45 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";
dotenv.config();
/**
 * Runs a web search through the Serper API and converts the organic results
 * into search results.
 *
 * Sends a JSON POST to `https://google.serper.dev/search`, authenticated with
 * the `SERPER_API_KEY` environment variable. There is no error handling here,
 * so a failed request rejects and the caller has to deal with it.
 *
 * @param q - The search query.
 * @param options - Search tuning: `lang` is sent as `hl`, `country` as `gl`,
 * `num_results` as `num`; `location` and `tbs` are passed through; `page`
 * defaults to 1. `filter` is accepted but not sent.
 * @returns One entry per item of the response's `organic` array (`link`,
 * `title`, `snippet` mapped to `url`, `title`, `description`), or an empty
 * array when the response has no `organic` array.
 */
export async function serper_search(
  q: string,
  options: {
    tbs?: string;
    filter?: string;
    lang?: string;
    country?: string;
    location?: string;
    num_results: number;
    page?: number;
  }
): Promise<SearchResult[]> {
  const data = JSON.stringify({
    q: q,
    hl: options.lang,
    gl: options.country,
    location: options.location,
    tbs: options.tbs,
    num: options.num_results,
    page: options.page ?? 1,
  });

  const config = {
    method: "POST",
    url: "https://google.serper.dev/search",
    headers: {
      "X-API-KEY": process.env.SERPER_API_KEY,
      "Content-Type": "application/json",
    },
    data: data,
  };

  const response = await axios(config);

  if (response && response.data && Array.isArray(response.data.organic)) {
    return response.data.organic.map(
      (a: { link: string; title: string; snippet: string }) => ({
        url: a.link,
        title: a.title,
        description: a.snippet,
      })
    );
  }
  return [];
}