Nick: /map almost good
This commit is contained in:
@@ -6,3 +6,5 @@ dump.rdb
|
|||||||
/mongo-data
|
/mongo-data
|
||||||
|
|
||||||
/.next/
|
/.next/
|
||||||
|
|
||||||
|
.rdb
|
||||||
@@ -1,16 +1,29 @@
|
|||||||
import { Response } from "express";
|
import { Response } from "express";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { legacyCrawlerOptions, mapRequestSchema, RequestWithAuth } from "./types";
|
import {
|
||||||
|
legacyCrawlerOptions,
|
||||||
|
mapRequestSchema,
|
||||||
|
RequestWithAuth,
|
||||||
|
} from "./types";
|
||||||
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
||||||
import { MapResponse , MapRequest } from "./types";
|
import { MapResponse, MapRequest } from "./types";
|
||||||
import { Logger } from "../../lib/logger";
|
import { Logger } from "../../lib/logger";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
import { search } from "../../search";
|
import { search } from "../../search";
|
||||||
import { checkAndUpdateURL } from "../../lib/validateUrl";
|
import {
|
||||||
|
checkAndUpdateURL,
|
||||||
|
checkAndUpdateURLForMap,
|
||||||
|
isSameDomain,
|
||||||
|
isSameSubdomain,
|
||||||
|
} from "../../lib/validateUrl";
|
||||||
|
import { fireEngineMap } from "../../search/fireEngine";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export async function mapController(req: RequestWithAuth<{}, MapResponse, MapRequest>, res: Response<MapResponse>) {
|
export async function mapController(
|
||||||
|
req: RequestWithAuth<{}, MapResponse, MapRequest>,
|
||||||
|
res: Response<MapResponse>
|
||||||
|
) {
|
||||||
req.body = mapRequestSchema.parse(req.body);
|
req.body = mapRequestSchema.parse(req.body);
|
||||||
|
|
||||||
const id = uuidv4();
|
const id = uuidv4();
|
||||||
@@ -28,30 +41,42 @@ export async function mapController(req: RequestWithAuth<{}, MapResponse, MapReq
|
|||||||
|
|
||||||
const crawler = crawlToCrawler(id, sc);
|
const crawler = crawlToCrawler(id, sc);
|
||||||
|
|
||||||
const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
|
const sitemap = sc.crawlerOptions.ignoreSitemap
|
||||||
|
? null
|
||||||
|
: await crawler.tryGetSitemap();
|
||||||
|
|
||||||
if (sitemap !== null) {
|
if (sitemap !== null) {
|
||||||
sitemap.map(x => { links.push(x.url); });
|
sitemap.map((x) => {
|
||||||
|
links.push(x.url);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const searchResults = await search({
|
const mapResults = await fireEngineMap(req.body.url, {
|
||||||
query: `site:${req.body.url}`,
|
numResults: 50,
|
||||||
advanced: false,
|
});
|
||||||
num_results: 50,
|
|
||||||
lang: "en",
|
|
||||||
country: "us",
|
|
||||||
location: "United States",
|
|
||||||
})
|
|
||||||
|
|
||||||
if (searchResults.length > 0) {
|
if (mapResults.length > 0) {
|
||||||
searchResults.map(x => { links.push(x.url); });
|
mapResults.map((x) => {
|
||||||
|
links.push(x.url);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
links = links.map(x => checkAndUpdateURL(x).url);
|
links = links.map((x) => checkAndUpdateURLForMap(x).url);
|
||||||
|
|
||||||
|
// allows for subdomains to be included
|
||||||
|
links = links.filter((x) => isSameDomain(x, req.body.url));
|
||||||
|
|
||||||
|
// if includeSubdomains is false, filter out subdomains
|
||||||
|
if (!req.body.includeSubdomains) {
|
||||||
|
links = links.filter((x) => isSameSubdomain(x, req.body.url));
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove duplicates that could be due to http/https or www
|
||||||
|
|
||||||
links = [...new Set(links)];
|
links = [...new Set(links)];
|
||||||
|
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
links
|
links,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ export const scrapeOptions = z.object({
|
|||||||
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
|
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
|
||||||
|
|
||||||
export const scrapeRequestSchema = scrapeOptions.extend({
|
export const scrapeRequestSchema = scrapeOptions.extend({
|
||||||
url,
|
url: z.string().url(),
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -91,7 +91,9 @@ export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
|
|||||||
|
|
||||||
export const mapRequestSchema = crawlerOptions.extend({
|
export const mapRequestSchema = crawlerOptions.extend({
|
||||||
url,
|
url,
|
||||||
origin: z.string().optional().default("api")
|
origin: z.string().optional().default("api"),
|
||||||
|
includeSubdomains: z.boolean().default(false),
|
||||||
|
searchEngine: z.string().optional(),
|
||||||
});
|
});
|
||||||
|
|
||||||
// export type MapRequest = {
|
// export type MapRequest = {
|
||||||
|
|||||||
@@ -0,0 +1,88 @@
|
|||||||
|
import { isSameDomain } from "./validateUrl";
|
||||||
|
import { isSameSubdomain } from "./validateUrl";
|
||||||
|
|
||||||
|
// Unit tests for isSameDomain: subdomains and a leading "www." are expected to
// match the base domain; unparsable input is expected to yield false.
describe("isSameDomain", () => {
  it("should return true for a subdomain", () => {
    const result = isSameDomain("http://sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return true for the same domain", () => {
    const result = isSameDomain("http://example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return false for different domains", () => {
    const result = isSameDomain("http://example.com", "http://another.com");
    expect(result).toBe(false);
  });

  it("should return true for a subdomain with different protocols", () => {
    const result = isSameDomain("https://sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return false for invalid URLs", () => {
    // Either argument being unparsable must short-circuit to false.
    const result = isSameDomain("invalid-url", "http://example.com");
    expect(result).toBe(false);
    const result2 = isSameDomain("http://example.com", "invalid-url");
    expect(result2).toBe(false);
  });

  it("should return true for a subdomain with www prefix", () => {
    const result = isSameDomain("http://www.sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return true for the same domain with www prefix", () => {
    const result = isSameDomain("http://www.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return true for deeply nested subdomains", () => {
    const result = isSameDomain("http://docs.s.s.example.com", "http://example.com");
    expect(result).toBe(true);
  });
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Unit tests for isSameSubdomain: hosts must match exactly once a leading
// "www." is ignored; the protocol is not part of the comparison.
describe("isSameSubdomain", () => {
  it("should return false when only the base URL has a subdomain", () => {
    const result = isSameSubdomain("http://example.com", "http://docs.example.com");
    expect(result).toBe(false);
  });

  it("should return true for the same subdomain", () => {
    const result = isSameSubdomain("http://docs.example.com", "http://docs.example.com");
    expect(result).toBe(true);
  });

  it("should return false for different subdomains", () => {
    const result = isSameSubdomain("http://docs.example.com", "http://blog.example.com");
    expect(result).toBe(false);
  });

  it("should return false for different domains", () => {
    const result = isSameSubdomain("http://example.com", "http://another.com");
    expect(result).toBe(false);
  });

  it("should return false for invalid URLs", () => {
    // Either argument being unparsable must short-circuit to false.
    const result = isSameSubdomain("invalid-url", "http://example.com");
    expect(result).toBe(false);
    const result2 = isSameSubdomain("http://example.com", "invalid-url");
    expect(result2).toBe(false);
  });

  it("should return true for the same subdomain with different protocols", () => {
    const result = isSameSubdomain("https://docs.example.com", "http://docs.example.com");
    expect(result).toBe(true);
  });

  it("should return true for the same subdomain with www prefix", () => {
    const result = isSameSubdomain("http://www.docs.example.com", "http://docs.example.com");
    expect(result).toBe(true);
  });

  it("should return true for the root domain with and without www prefix", () => {
    const result = isSameSubdomain("http://www.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return false for a subdomain with www prefix and different subdomain", () => {
    const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com");
    expect(result).toBe(false);
  });
});
|
||||||
@@ -1,9 +1,8 @@
|
|||||||
|
|
||||||
const protocolIncluded = (url: string) => {
|
const protocolIncluded = (url: string) => {
|
||||||
// if :// not in the start of the url assume http (maybe https?)
|
// if :// not in the start of the url assume http (maybe https?)
|
||||||
// regex checks if :// appears before any .
|
// regex checks if :// appears before any .
|
||||||
return(/^([^.:]+:\/\/)/.test(url));
|
return /^([^.:]+:\/\/)/.test(url);
|
||||||
}
|
};
|
||||||
|
|
||||||
const getURLobj = (s: string) => {
|
const getURLobj = (s: string) => {
|
||||||
// URL fails if we dont include the protocol ie google.com
|
// URL fails if we dont include the protocol ie google.com
|
||||||
@@ -18,7 +17,6 @@ const getURLobj = (s: string) => {
|
|||||||
};
|
};
|
||||||
|
|
||||||
export const checkAndUpdateURL = (url: string) => {
|
export const checkAndUpdateURL = (url: string) => {
|
||||||
|
|
||||||
if (!protocolIncluded(url)) {
|
if (!protocolIncluded(url)) {
|
||||||
url = `http://${url}`;
|
url = `http://${url}`;
|
||||||
}
|
}
|
||||||
@@ -30,9 +28,95 @@ export const checkAndUpdateURL = (url: string) => {
|
|||||||
|
|
||||||
const typedUrlObj = urlObj as URL;
|
const typedUrlObj = urlObj as URL;
|
||||||
|
|
||||||
if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||||
throw new Error("Invalid URL");
|
throw new Error("Invalid URL");
|
||||||
}
|
}
|
||||||
|
|
||||||
return { urlObj: typedUrlObj, url: url };
|
return { urlObj: typedUrlObj, url: url };
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Same-domain check.
 *
 * Reports whether `url` lives on the same domain as `baseUrl`. A leading
 * `www.` is ignored and only the last two hostname labels are compared, so
 * subdomains (`docs.example.com`, `www.docs.example.com`) match `example.com`.
 * The protocol is not part of the comparison.
 *
 * IPv4 literal hosts are compared in full: their last two octets are not a
 * domain, so `192.168.1.1` and `10.0.1.1` must not be treated as related.
 *
 * Limitation: multi-label public suffixes (e.g. `co.uk`) are not recognised,
 * so two unrelated `*.co.uk` hosts still compare as the same domain.
 *
 * @param url - URL to test.
 * @param baseUrl - URL whose domain is the reference.
 * @returns `true` when both hosts share a domain; `false` otherwise, including
 *   when `getURLobj` reports an error for either input.
 */
export function isSameDomain(url: string, baseUrl: string): boolean {
  const { urlObj: urlObj1, error: error1 } = getURLobj(url);
  const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);

  if (error1 || error2) {
    return false;
  }

  const stripWww = (hostname: string): string =>
    hostname.startsWith("www.") ? hostname.slice(4) : hostname;

  const host1 = stripWww((urlObj1 as URL).hostname);
  const host2 = stripWww((urlObj2 as URL).hostname);

  // An IPv4 address has no registrable domain; only an exact match counts.
  const isIPv4 = (hostname: string): boolean =>
    /^\d{1,3}(\.\d{1,3}){3}$/.test(hostname);
  if (isIPv4(host1) || isIPv4(host2)) {
    return host1 === host2;
  }

  // Keep the last two labels ("example.com") and drop any subdomain labels.
  const lastTwoLabels = (hostname: string): string =>
    hostname.split(".").slice(-2).join(".");

  return lastTwoLabels(host1) === lastTwoLabels(host2);
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Same-subdomain check.
 *
 * Reports whether `url` and `baseUrl` point at exactly the same host once a
 * leading `www.` is ignored. The protocol is not part of the comparison.
 *
 * @param url - URL to test.
 * @param baseUrl - URL whose host is the reference.
 * @returns `true` when both hosts are identical after `www.` stripping;
 *   `false` otherwise, including when `getURLobj` reports an error for either
 *   input.
 */
export function isSameSubdomain(url: string, baseUrl: string): boolean {
  const { urlObj: urlObj1, error: error1 } = getURLobj(url);
  const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);

  if (error1 || error2) {
    return false;
  }

  const stripWww = (hostname: string): string =>
    hostname.startsWith("www.") ? hostname.slice(4) : hostname;

  // "Same domain (last two labels) AND same subdomain (all labels before
  // them)" holds exactly when the complete label sequences are equal, i.e.
  // when the www-stripped hostnames are equal.
  return (
    stripWww((urlObj1 as URL).hostname) === stripWww((urlObj2 as URL).hostname)
  );
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Validates a link and reduces it to a canonical string form: a protocol is
 * added when missing, the query string is dropped, and one trailing slash is
 * removed.
 *
 * @param url - Link to normalise; `http://` is prepended when
 *   `protocolIncluded` reports no protocol.
 * @returns `url`: the normalised string; `urlObj`: the parsed URL of the input
 *   after protocol and trailing-slash handling (it still carries any query
 *   string the input had).
 * @throws Error("Invalid URL") when `getURLobj` reports an error or the
 *   protocol is neither `http:` nor `https:`.
 */
export const checkAndUpdateURLForMap = (url: string) => {
  let normalized = protocolIncluded(url) ? url : `http://${url}`;

  // remove last slash if present
  if (normalized.endsWith("/")) {
    normalized = normalized.slice(0, -1);
  }

  const { error, urlObj } = getURLobj(normalized);
  if (error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = urlObj as URL;

  if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
    throw new Error("Invalid URL");
  }

  // remove any query params
  const queryStart = normalized.indexOf("?");
  if (queryStart !== -1) {
    normalized = normalized.slice(0, queryStart);
  }

  // Dropping the query can expose a trailing slash ("http://a.com/?x=1" ->
  // "http://a.com/"); strip it so the result equals the slash-less form.
  if (normalized.endsWith("/")) {
    normalized = normalized.slice(0, -1);
  }

  return { urlObj: typedUrlObj, url: normalized };
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import { SearchResult } from "../../src/lib/entities";
|
|||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
|
||||||
export async function fireEngineSearch(q: string, options: {
|
export async function fireEngineMap(q: string, options: {
|
||||||
tbs?: string;
|
tbs?: string;
|
||||||
filter?: string;
|
filter?: string;
|
||||||
lang?: string;
|
lang?: string;
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
import { Logger } from "../../src/lib/logger";
|
import { Logger } from "../../src/lib/logger";
|
||||||
import { SearchResult } from "../../src/lib/entities";
|
import { SearchResult } from "../../src/lib/entities";
|
||||||
import { googleSearch } from "./googlesearch";
|
import { googleSearch } from "./googlesearch";
|
||||||
import { fireEngineSearch } from "./fireEngine";
|
import { fireEngineMap } from "./fireEngine";
|
||||||
|
import { serper_search } from "./serper";
|
||||||
|
|
||||||
export async function search({
|
export async function search({
|
||||||
query,
|
query,
|
||||||
@@ -27,10 +28,18 @@ export async function search({
|
|||||||
proxy?: string;
|
proxy?: string;
|
||||||
sleep_interval?: number;
|
sleep_interval?: number;
|
||||||
timeout?: number;
|
timeout?: number;
|
||||||
}) : Promise<SearchResult[]> {
|
}): Promise<SearchResult[]> {
|
||||||
try {
|
try {
|
||||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
|
||||||
return await fireEngineSearch(query, {numResults: num_results, tbs, filter, lang, country, location});
|
if (process.env.SERPER_API_KEY) {
|
||||||
|
return await serper_search(query, {
|
||||||
|
num_results,
|
||||||
|
tbs,
|
||||||
|
filter,
|
||||||
|
lang,
|
||||||
|
country,
|
||||||
|
location,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
return await googleSearch(
|
return await googleSearch(
|
||||||
query,
|
query,
|
||||||
@@ -46,6 +55,6 @@ export async function search({
|
|||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error in search function: ${error}`);
|
Logger.error(`Error in search function: ${error}`);
|
||||||
return []
|
return [];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,45 @@
|
|||||||
|
import axios from "axios";
|
||||||
|
import dotenv from "dotenv";
|
||||||
|
import { SearchResult } from "../../src/lib/entities";
|
||||||
|
|
||||||
|
dotenv.config();
|
||||||
|
|
||||||
|
/**
 * Runs a web search by POSTing to the Serper endpoint
 * (`https://google.serper.dev/search`) and maps the `organic` entries of the
 * response to search results.
 *
 * The API key is read from `process.env.SERPER_API_KEY`.
 *
 * @param q - Search query string.
 * @param options - Search options. `lang`, `country`, `location`, `tbs`,
 *   `num_results` and `page` (defaults to 1) are sent as `hl`, `gl`,
 *   `location`, `tbs`, `num` and `page`. `filter` is accepted but is not
 *   forwarded in the request.
 * @returns One result per `organic` entry (`link` -> `url`, `title`,
 *   `snippet` -> `description`), or an empty array when the response has no
 *   `organic` array.
 */
export async function serper_search(
  q: string,
  options: {
    tbs?: string;
    filter?: string;
    lang?: string;
    country?: string;
    location?: string;
    num_results: number;
    page?: number;
  }
): Promise<SearchResult[]> {
  const data = JSON.stringify({
    q: q,
    hl: options.lang,
    gl: options.country,
    location: options.location,
    tbs: options.tbs,
    num: options.num_results,
    page: options.page ?? 1,
  });

  const config = {
    method: "POST",
    url: "https://google.serper.dev/search",
    headers: {
      "X-API-KEY": process.env.SERPER_API_KEY,
      "Content-Type": "application/json",
    },
    data: data,
  };

  // Request errors are not caught here; they propagate to the caller.
  const response = await axios(config);

  if (response && response.data && Array.isArray(response.data.organic)) {
    return response.data.organic.map(
      (a: { link: string; title: string; snippet: string }) => ({
        url: a.link,
        title: a.title,
        description: a.snippet,
      })
    );
  }

  return [];
}
|
||||||
Reference in New Issue
Block a user