Merge branch 'main' into feat/queue-scrapes

This commit is contained in:
Nicolas
2024-07-30 12:44:07 -04:00
81 changed files with 1034 additions and 505 deletions
+11
View File
@@ -57,3 +57,14 @@ SELF_HOSTED_WEBHOOK_URL=
# Resend API Key for transactional emails # Resend API Key for transactional emails
RESEND_API_KEY= RESEND_API_KEY=
# LOGGING_LEVEL determines the verbosity of logs that the system will output.
# Available levels are:
# NONE - No logs will be output.
# ERROR - For logging error messages that indicate a failure in a specific operation.
# WARN - For logging potentially harmful situations that are not necessarily errors.
# INFO - For logging informational messages that highlight the progress of the application.
# DEBUG - For logging detailed information on the flow through the system, primarily used for debugging.
# TRACE - For logging more detailed information than the DEBUG level.
# Set LOGGING_LEVEL to one of the above options to control logging output.
LOGGING_LEVEL=INFO
+3 -1
View File
@@ -3,4 +3,6 @@
.env .env
*.csv *.csv
dump.rdb dump.rdb
/mongo-data /mongo-data
/.next/
+5 -5
View File
@@ -4,15 +4,15 @@
# #
app = 'firecrawl-scraper-js' app = 'firecrawl-scraper-js'
primary_region = 'mia' primary_region = 'iad'
kill_signal = 'SIGINT' kill_signal = 'SIGINT'
kill_timeout = '30s' kill_timeout = '30s'
[build] [build]
[processes] [processes]
app = 'node --max-old-space-size=4096 dist/src/index.js' app = 'node --max-old-space-size=8192 dist/src/index.js'
worker = 'node --max-old-space-size=4096 dist/src/services/queue-worker.js' worker = 'node --max-old-space-size=8192 dist/src/services/queue-worker.js'
[http_service] [http_service]
internal_port = 8080 internal_port = 8080
@@ -24,8 +24,8 @@ kill_timeout = '30s'
[http_service.concurrency] [http_service.concurrency]
type = "requests" type = "requests"
hard_limit = 100 hard_limit = 200
soft_limit = 50 soft_limit = 75
[[http_service.checks]] [[http_service.checks]]
grace_period = "20s" grace_period = "20s"
+2
View File
@@ -26,6 +26,7 @@
"license": "ISC", "license": "ISC",
"devDependencies": { "devDependencies": {
"@flydotio/dockerfile": "^0.4.10", "@flydotio/dockerfile": "^0.4.10",
"@jest/globals": "^29.7.0",
"@tsconfig/recommended": "^1.0.3", "@tsconfig/recommended": "^1.0.3",
"@types/body-parser": "^1.19.2", "@types/body-parser": "^1.19.2",
"@types/bull": "^4.10.0", "@types/bull": "^4.10.0",
@@ -63,6 +64,7 @@
"axios": "^1.3.4", "axios": "^1.3.4",
"bottleneck": "^2.19.5", "bottleneck": "^2.19.5",
"bull": "^4.15.0", "bull": "^4.15.0",
"cacheable-lookup": "^6.1.0",
"cheerio": "^1.0.0-rc.12", "cheerio": "^1.0.0-rc.12",
"cohere": "^1.1.1", "cohere": "^1.1.1",
"cors": "^2.8.5", "cors": "^2.8.5",
+16 -4
View File
@@ -59,6 +59,9 @@ importers:
bull: bull:
specifier: ^4.15.0 specifier: ^4.15.0
version: 4.15.0 version: 4.15.0
cacheable-lookup:
specifier: ^6.1.0
version: 6.1.0
cheerio: cheerio:
specifier: ^1.0.0-rc.12 specifier: ^1.0.0-rc.12
version: 1.0.0-rc.12 version: 1.0.0-rc.12
@@ -189,6 +192,9 @@ importers:
'@flydotio/dockerfile': '@flydotio/dockerfile':
specifier: ^0.4.10 specifier: ^0.4.10
version: 0.4.11 version: 0.4.11
'@jest/globals':
specifier: ^29.7.0
version: 29.7.0
'@tsconfig/recommended': '@tsconfig/recommended':
specifier: ^1.0.3 specifier: ^1.0.3
version: 1.0.6 version: 1.0.6
@@ -1937,6 +1943,10 @@ packages:
resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==} resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==}
engines: {node: '>= 0.8'} engines: {node: '>= 0.8'}
cacheable-lookup@6.1.0:
resolution: {integrity: sha512-KJ/Dmo1lDDhmW2XDPMo+9oiy/CeqosPguPCrgcVzKyZrL6pM1gU2GmPY/xo6OQPTUaA/c0kwHuywB4E6nmT9ww==}
engines: {node: '>=10.6.0'}
call-bind@1.0.7: call-bind@1.0.7:
resolution: {integrity: sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==} resolution: {integrity: sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==}
engines: {node: '>= 0.4'} engines: {node: '>= 0.4'}
@@ -4369,8 +4379,8 @@ packages:
engines: {node: '>=14.17'} engines: {node: '>=14.17'}
hasBin: true hasBin: true
typescript@5.5.3: typescript@5.5.4:
resolution: {integrity: sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==} resolution: {integrity: sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==}
engines: {node: '>=14.17'} engines: {node: '>=14.17'}
hasBin: true hasBin: true
@@ -6917,6 +6927,8 @@ snapshots:
bytes@3.1.2: {} bytes@3.1.2: {}
cacheable-lookup@6.1.0: {}
call-bind@1.0.7: call-bind@1.0.7:
dependencies: dependencies:
es-define-property: 1.0.0 es-define-property: 1.0.0
@@ -8927,7 +8939,7 @@ snapshots:
csv-parse: 5.5.6 csv-parse: 5.5.6
gpt3-tokenizer: 1.1.5 gpt3-tokenizer: 1.1.5
openai: 3.3.0 openai: 3.3.0
typescript: 5.5.3 typescript: 5.5.4
uuid: 9.0.1 uuid: 9.0.1
zod: 3.23.8 zod: 3.23.8
transitivePeerDependencies: transitivePeerDependencies:
@@ -9519,7 +9531,7 @@ snapshots:
typescript@5.4.5: {} typescript@5.4.5: {}
typescript@5.5.3: {} typescript@5.5.4: {}
typesense@1.8.2(@babel/runtime@7.24.6): typesense@1.8.2(@babel/runtime@7.24.6):
dependencies: dependencies:
@@ -858,7 +858,6 @@ describe("E2E Tests for API Routes", () => {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
} }
} }
console.log(crawlData)
expect(crawlData.length).toBeGreaterThan(0); expect(crawlData.length).toBeGreaterThan(0);
expect(crawlData).toEqual(expect.arrayContaining([ expect(crawlData).toEqual(expect.arrayContaining([
expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }), expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),
+87
View File
@@ -0,0 +1,87 @@
import { Request, Response } from "express";
import { Job } from "bull";
import { Logger } from "../../lib/logger";
import { getWebScraperQueue } from "../../services/queue-service";
import { checkAlerts } from "../../services/alerts";
export async function cleanBefore24hCompleteJobsController(
req: Request,
res: Response
) {
Logger.info("🐂 Cleaning jobs older than 24h");
try {
const webScraperQueue = getWebScraperQueue();
const batchSize = 10;
const numberOfBatches = 9; // Adjust based on your needs
const completedJobsPromises: Promise<Job[]>[] = [];
for (let i = 0; i < numberOfBatches; i++) {
completedJobsPromises.push(
webScraperQueue.getJobs(
["completed"],
i * batchSize,
i * batchSize + batchSize,
true
)
);
}
const completedJobs: Job[] = (
await Promise.all(completedJobsPromises)
).flat();
const before24hJobs =
completedJobs.filter(
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
) || [];
let count = 0;
if (!before24hJobs) {
return res.status(200).send(`No jobs to remove.`);
}
for (const job of before24hJobs) {
try {
await job.remove();
count++;
} catch (jobError) {
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
}
}
return res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) {
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
return res.status(500).send("Failed to clean jobs");
}
}
export async function checkQueuesController(req: Request, res: Response) {
try {
await checkAlerts();
return res.status(200).send("Alerts initialized");
} catch (error) {
Logger.debug(`Failed to initialize alerts: ${error}`);
return res.status(500).send("Failed to initialize alerts");
}
}
// Use this as a "health check" that way we dont destroy the server
export async function queuesController(req: Request, res: Response) {
try {
const webScraperQueue = getWebScraperQueue();
const [webScraperActive] = await Promise.all([
webScraperQueue.getActiveCount(),
]);
const noActiveJobs = webScraperActive === 0;
// 200 if no active jobs, 503 if there are active jobs
return res.status(noActiveJobs ? 200 : 500).json({
webScraperActive,
noActiveJobs,
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
@@ -0,0 +1,86 @@
import { Request, Response } from "express";
import Redis from "ioredis";
import { Logger } from "../../lib/logger";
import { sendSlackWebhook } from "../../services/alerts/slack";
import { redisRateLimitClient } from "../../services/rate-limiter";
export async function redisHealthController(req: Request, res: Response) {
const retryOperation = async (operation, retries = 3) => {
for (let attempt = 1; attempt <= retries; attempt++) {
try {
return await operation();
} catch (error) {
if (attempt === retries) throw error;
Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
}
}
};
try {
const queueRedis = new Redis(process.env.REDIS_URL);
const testKey = "test";
const testValue = "test";
// Test queueRedis
let queueRedisHealth;
try {
await retryOperation(() => queueRedis.set(testKey, testValue));
queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
await retryOperation(() => queueRedis.del(testKey));
} catch (error) {
Logger.error(`queueRedis health check failed: ${error}`);
queueRedisHealth = null;
}
// Test redisRateLimitClient
let redisRateLimitHealth;
try {
await retryOperation(() => redisRateLimitClient.set(testKey, testValue));
redisRateLimitHealth = await retryOperation(() =>
redisRateLimitClient.get(testKey)
);
await retryOperation(() => redisRateLimitClient.del(testKey));
} catch (error) {
Logger.error(`redisRateLimitClient health check failed: ${error}`);
redisRateLimitHealth = null;
}
const healthStatus = {
queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
redisRateLimitClient:
redisRateLimitHealth === testValue ? "healthy" : "unhealthy",
};
if (
healthStatus.queueRedis === "healthy" &&
healthStatus.redisRateLimitClient === "healthy"
) {
Logger.info("Both Redis instances are healthy");
return res.status(200).json({ status: "healthy", details: healthStatus });
} else {
Logger.info(
`Redis instances health check: ${JSON.stringify(healthStatus)}`
);
await sendSlackWebhook(
`[REDIS DOWN] Redis instances health check: ${JSON.stringify(
healthStatus
)}`,
true
);
return res
.status(500)
.json({ status: "unhealthy", details: healthStatus });
}
} catch (error) {
Logger.error(`Redis health check failed: ${error}`);
await sendSlackWebhook(
`[REDIS DOWN] Redis instances health check: ${error.message}`,
true
);
return res
.status(500)
.json({ status: "unhealthy", message: error.message });
}
}
+16 -7
View File
@@ -6,6 +6,7 @@ import { withAuth } from "../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible"; import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry'; import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
import { sendNotification } from "../services/notification/email_notification"; import { sendNotification } from "../services/notification/email_notification";
import { Logger } from "../lib/logger";
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> { export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
return withAuth(supaAuthenticateUser)(req, res, mode); return withAuth(supaAuthenticateUser)(req, res, mode);
@@ -17,7 +18,7 @@ function setTrace(team_id: string, api_key: string) {
api_key api_key
}); });
} catch (error) { } catch (error) {
console.error('Error setting trace attributes:', error); Logger.error(`Error setting trace attributes: ${error.message}`);
} }
} }
@@ -82,12 +83,15 @@ export async function supaAuthenticateUser(
// $$ language plpgsql; // $$ language plpgsql;
if (error) { if (error) {
console.error('Error fetching key and price_id:', error); Logger.warn(`Error fetching key and price_id: ${error.message}`);
} else { } else {
// console.log('Key and Price ID:', data); // console.log('Key and Price ID:', data);
} }
if (error || !data || data.length === 0) { if (error || !data || data.length === 0) {
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
return { return {
success: false, success: false,
error: "Unauthorized: Invalid token", error: "Unauthorized: Invalid token",
@@ -135,7 +139,7 @@ export async function supaAuthenticateUser(
try { try {
await rateLimiter.consume(team_endpoint_token); await rateLimiter.consume(team_endpoint_token);
} catch (rateLimiterRes) { } catch (rateLimiterRes) {
console.error(rateLimiterRes); Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1; const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext); const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
@@ -177,7 +181,10 @@ export async function supaAuthenticateUser(
.select("*") .select("*")
.eq("key", normalizedApi); .eq("key", normalizedApi);
if (error || !data || data.length === 0) { if (error || !data || data.length === 0) {
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
return { return {
success: false, success: false,
error: "Unauthorized: Invalid token", error: "Unauthorized: Invalid token",
@@ -190,7 +197,6 @@ export async function supaAuthenticateUser(
return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""}; return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
} }
function getPlanByPriceId(price_id: string) { function getPlanByPriceId(price_id: string) {
switch (price_id) { switch (price_id) {
case process.env.STRIPE_PRICE_ID_STARTER: case process.env.STRIPE_PRICE_ID_STARTER:
@@ -199,11 +205,14 @@ function getPlanByPriceId(price_id: string) {
return 'standard'; return 'standard';
case process.env.STRIPE_PRICE_ID_SCALE: case process.env.STRIPE_PRICE_ID_SCALE:
return 'scale'; return 'scale';
case process.env.STRIPE_PRICE_ID_HOBBY || process.env.STRIPE_PRICE_ID_HOBBY_YEARLY: case process.env.STRIPE_PRICE_ID_HOBBY:
case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
return 'hobby'; return 'hobby';
case process.env.STRIPE_PRICE_ID_STANDARD_NEW || process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY: case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
return 'standardnew'; return 'standardnew';
case process.env.STRIPE_PRICE_ID_GROWTH || process.env.STRIPE_PRICE_ID_GROWTH_YEARLY: case process.env.STRIPE_PRICE_ID_GROWTH:
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
return 'growth'; return 'growth';
default: default:
return 'free'; return 'free';
+4 -3
View File
@@ -5,6 +5,7 @@ import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service"; import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabase_service } from "../../src/services/supabase"; import { supabase_service } from "../../src/services/supabase";
import { billTeam } from "../../src/services/billing/credit_billing"; import { billTeam } from "../../src/services/billing/credit_billing";
import { Logger } from "../../src/lib/logger";
export async function crawlCancelController(req: Request, res: Response) { export async function crawlCancelController(req: Request, res: Response) {
try { try {
@@ -43,7 +44,7 @@ export async function crawlCancelController(req: Request, res: Response) {
const { partialDocs } = await job.progress(); const { partialDocs } = await job.progress();
if (partialDocs && partialDocs.length > 0 && jobState === "active") { if (partialDocs && partialDocs.length > 0 && jobState === "active") {
console.log("Billing team for partial docs..."); Logger.info("Billing team for partial docs...");
// Note: the credits that we will bill them here might be lower than the actual // Note: the credits that we will bill them here might be lower than the actual
// due to promises that are not yet resolved // due to promises that are not yet resolved
await billTeam(team_id, partialDocs.length); await billTeam(team_id, partialDocs.length);
@@ -55,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) {
await job.discard(); await job.discard();
await job.moveToFailed(Error("Job cancelled by user"), true); await job.moveToFailed(Error("Job cancelled by user"), true);
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
} }
const newJobState = await job.getState(); const newJobState = await job.getState();
@@ -64,7 +65,7 @@ export async function crawlCancelController(req: Request, res: Response) {
status: "cancelled" status: "cancelled"
}); });
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }
+2 -1
View File
@@ -4,6 +4,7 @@ import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs"; import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service"; import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs"; import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { Logger } from "../../src/lib/logger";
export async function crawlStatusController(req: Request, res: Response) { export async function crawlStatusController(req: Request, res: Response) {
try { try {
@@ -44,7 +45,7 @@ export async function crawlStatusController(req: Request, res: Response) {
partial_data: jobStatus == 'completed' ? [] : partialDocs, partial_data: jobStatus == 'completed' ? [] : partialDocs,
}); });
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }
+7 -4
View File
@@ -10,6 +10,8 @@ import { logCrawl } from "../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../src/services/idempotency/validate"; import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../src/services/idempotency/create"; import { createIdempotencyKey } from "../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values"; import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../src/lib/logger";
export async function crawlController(req: Request, res: Response) { export async function crawlController(req: Request, res: Response) {
try { try {
@@ -30,7 +32,7 @@ export async function crawlController(req: Request, res: Response) {
try { try {
createIdempotencyKey(req); createIdempotencyKey(req);
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }
@@ -60,10 +62,11 @@ export async function crawlController(req: Request, res: Response) {
const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions }; const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions }; const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
if (mode === "single_urls" && !url.includes(",")) { if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
try { try {
const a = new WebScraperDataProvider(); const a = new WebScraperDataProvider();
await a.setOptions({ await a.setOptions({
jobId: uuidv4(),
mode: "single_urls", mode: "single_urls",
urls: [url], urls: [url],
crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true }, crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
@@ -83,7 +86,7 @@ export async function crawlController(req: Request, res: Response) {
documents: docs, documents: docs,
}); });
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }
@@ -101,7 +104,7 @@ export async function crawlController(req: Request, res: Response) {
res.json({ jobId: job.id }); res.json({ jobId: job.id });
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }
+2 -1
View File
@@ -3,6 +3,7 @@ import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types"; import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs"; import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { Logger } from "../../src/lib/logger";
export async function crawlPreviewController(req: Request, res: Response) { export async function crawlPreviewController(req: Request, res: Response) {
try { try {
@@ -39,7 +40,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
res.json({ jobId: job.id }); res.json({ jobId: job.id });
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }
+6
View File
@@ -0,0 +1,6 @@
import { Request, Response } from "express";
export async function livenessController(req: Request, res: Response) {
//TODO: add checks if the application is live and healthy like checking the redis connection
res.status(200).json({ status: "ok" });
}
+6
View File
@@ -0,0 +1,6 @@
import { Request, Response } from "express";
export async function readinessController(req: Request, res: Response) {
// TODO: add checks when the application is ready to serve traffic
res.status(200).json({ status: "ok" });
}
+9 -2
View File
@@ -12,8 +12,11 @@ import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOri
import { addWebScraperJob } from '../services/queue-jobs'; import { addWebScraperJob } from '../services/queue-jobs';
import { getWebScraperQueue } from '../services/queue-service'; import { getWebScraperQueue } from '../services/queue-service';
import { supabase_service } from '../services/supabase'; import { supabase_service } from '../services/supabase';
import { v4 as uuidv4 } from "uuid";
import { Logger } from '../lib/logger';
export async function scrapeHelper( export async function scrapeHelper(
jobId: string,
req: Request, req: Request,
team_id: string, team_id: string,
crawlerOptions: any, crawlerOptions: any,
@@ -148,7 +151,7 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits" }); return res.status(402).json({ error: "Insufficient credits" });
} }
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
earlyReturn = true; earlyReturn = true;
return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." }); return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
} }
@@ -163,8 +166,11 @@ export async function scrapeController(req: Request, res: Response) {
checkCredits(); checkCredits();
} }
const jobId = uuidv4();
const startTime = new Date().getTime(); const startTime = new Date().getTime();
const result = await scrapeHelper( const result = await scrapeHelper(
jobId,
req, req,
team_id, team_id,
crawlerOptions, crawlerOptions,
@@ -205,6 +211,7 @@ export async function scrapeController(req: Request, res: Response) {
} }
logJob({ logJob({
job_id: jobId,
success: result.success, success: result.success,
message: result.error, message: result.error,
num_docs: 1, num_docs: 1,
@@ -224,7 +231,7 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(result.returnCode).json(result); return res.status(result.returnCode).json(result);
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }
+10 -2
View File
@@ -7,8 +7,11 @@ import { logJob } from "../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities"; import { PageOptions, SearchOptions } from "../lib/entities";
import { search } from "../search"; import { search } from "../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../lib/logger";
export async function searchHelper( export async function searchHelper(
jobId: string,
req: Request, req: Request,
team_id: string, team_id: string,
crawlerOptions: any, crawlerOptions: any,
@@ -75,6 +78,7 @@ export async function searchHelper(
const a = new WebScraperDataProvider(); const a = new WebScraperDataProvider();
await a.setOptions({ await a.setOptions({
jobId,
mode: "single_urls", mode: "single_urls",
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7), urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
crawlerOptions: { crawlerOptions: {
@@ -148,6 +152,8 @@ export async function searchController(req: Request, res: Response) {
const searchOptions = req.body.searchOptions ?? { limit: 7 }; const searchOptions = req.body.searchOptions ?? { limit: 7 };
const jobId = uuidv4();
try { try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } = const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1); await checkTeamCredits(team_id, 1);
@@ -155,11 +161,12 @@ export async function searchController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits" }); return res.status(402).json({ error: "Insufficient credits" });
} }
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
return res.status(500).json({ error: "Internal server error" }); return res.status(500).json({ error: "Internal server error" });
} }
const startTime = new Date().getTime(); const startTime = new Date().getTime();
const result = await searchHelper( const result = await searchHelper(
jobId,
req, req,
team_id, team_id,
crawlerOptions, crawlerOptions,
@@ -169,6 +176,7 @@ export async function searchController(req: Request, res: Response) {
const endTime = new Date().getTime(); const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000; const timeTakenInSeconds = (endTime - startTime) / 1000;
logJob({ logJob({
job_id: jobId,
success: result.success, success: result.success,
message: result.error, message: result.error,
num_docs: result.data ? result.data.length : 0, num_docs: result.data ? result.data.length : 0,
@@ -183,7 +191,7 @@ export async function searchController(req: Request, res: Response) {
}); });
return res.status(result.returnCode).json(result); return res.status(result.returnCode).json(result);
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }
+2 -1
View File
@@ -1,6 +1,7 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { getWebScraperQueue } from "../../src/services/queue-service"; import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs"; import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { Logger } from "../../src/lib/logger";
export async function crawlJobStatusPreviewController(req: Request, res: Response) { export async function crawlJobStatusPreviewController(req: Request, res: Response) {
try { try {
@@ -35,7 +36,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
partial_data: jobStatus == 'completed' ? [] : partialDocs, partial_data: jobStatus == 'completed' ? [] : partialDocs,
}); });
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }
+1
View File
@@ -4,6 +4,7 @@ async function example() {
const example = new WebScraperDataProvider(); const example = new WebScraperDataProvider();
await example.setOptions({ await example.setOptions({
jobId: "TEST",
mode: "crawl", mode: "crawl",
urls: ["https://mendable.ai"], urls: ["https://mendable.ai"],
crawlerOptions: {}, crawlerOptions: {},
+35 -174
View File
@@ -7,21 +7,30 @@ import { v0Router } from "./routes/v0";
import { initSDK } from "@hyperdx/node-opentelemetry"; import { initSDK } from "@hyperdx/node-opentelemetry";
import cluster from "cluster"; import cluster from "cluster";
import os from "os"; import os from "os";
import { Job } from "bull"; import { Logger } from "./lib/logger";
import { sendSlackWebhook } from "./services/alerts/slack"; import { adminRouter } from "./routes/admin";
import { checkAlerts } from "./services/alerts"; import { ScrapeEvents } from "./lib/scrape-events";
import Redis from "ioredis"; import http from 'node:http';
import { redisRateLimitClient } from "./services/rate-limiter"; import https from 'node:https';
import CacheableLookup from 'cacheable-lookup';
const { createBullBoard } = require("@bull-board/api"); const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { BullAdapter } = require("@bull-board/api/bullAdapter");
const { ExpressAdapter } = require("@bull-board/express"); const { ExpressAdapter } = require("@bull-board/express");
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
console.log(`Number of CPUs: ${numCPUs} available`); Logger.info(`Number of CPUs: ${numCPUs} available`);
const cacheable = new CacheableLookup({
// this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme
lookup:false
});
cacheable.install(http.globalAgent);
cacheable.install(https.globalAgent)
if (cluster.isMaster) { if (cluster.isMaster) {
console.log(`Master ${process.pid} is running`); Logger.info(`Master ${process.pid} is running`);
// Fork workers. // Fork workers.
for (let i = 0; i < numCPUs; i++) { for (let i = 0; i < numCPUs; i++) {
@@ -30,8 +39,8 @@ if (cluster.isMaster) {
cluster.on("exit", (worker, code, signal) => { cluster.on("exit", (worker, code, signal) => {
if (code !== null) { if (code !== null) {
console.log(`Worker ${worker.process.pid} exited`); Logger.info(`Worker ${worker.process.pid} exited`);
console.log("Starting a new worker"); Logger.info("Starting a new worker");
cluster.fork(); cluster.fork();
} }
}); });
@@ -45,7 +54,6 @@ if (cluster.isMaster) {
app.use(cors()); // Add this line to enable CORS app.use(cors()); // Add this line to enable CORS
const serverAdapter = new ExpressAdapter(); const serverAdapter = new ExpressAdapter();
serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
@@ -70,6 +78,7 @@ if (cluster.isMaster) {
// register router // register router
app.use(v0Router); app.use(v0Router);
app.use(adminRouter);
const DEFAULT_PORT = process.env.PORT ?? 3002; const DEFAULT_PORT = process.env.PORT ?? 3002;
const HOST = process.env.HOST ?? "localhost"; const HOST = process.env.HOST ?? "localhost";
@@ -81,14 +90,9 @@ if (cluster.isMaster) {
function startServer(port = DEFAULT_PORT) { function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => { const server = app.listen(Number(port), HOST, () => {
console.log(`Worker ${process.pid} listening on port ${port}`); Logger.info(`Worker ${process.pid} listening on port ${port}`);
console.log( Logger.info(
`For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
);
console.log("");
console.log("1. Make sure Redis is running on port 6379 by default");
console.log(
"2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 "
); );
}); });
return server; return server;
@@ -98,27 +102,6 @@ if (cluster.isMaster) {
startServer(); startServer();
} }
// Use this as a "health check" that way we dont destroy the server
app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => {
try {
const webScraperQueue = getWebScraperQueue();
const [webScraperActive] = await Promise.all([
webScraperQueue.getActiveCount(),
]);
const noActiveJobs = webScraperActive === 0;
// 200 if no active jobs, 503 if there are active jobs
return res.status(noActiveJobs ? 200 : 500).json({
webScraperActive,
noActiveJobs,
});
} catch (error) {
console.error(error);
return res.status(500).json({ error: error.message });
}
});
app.get(`/serverHealthCheck`, async (req, res) => { app.get(`/serverHealthCheck`, async (req, res) => {
try { try {
const webScraperQueue = getWebScraperQueue(); const webScraperQueue = getWebScraperQueue();
@@ -132,7 +115,7 @@ if (cluster.isMaster) {
waitingJobs, waitingJobs,
}); });
} catch (error) { } catch (error) {
console.error(error); Logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
}); });
@@ -177,13 +160,13 @@ if (cluster.isMaster) {
}); });
if (!response.ok) { if (!response.ok) {
console.error("Failed to send Slack notification"); Logger.error("Failed to send Slack notification");
} }
} }
}, timeout); }, timeout);
} }
} catch (error) { } catch (error) {
console.error(error); Logger.debug(error);
} }
}; };
@@ -191,140 +174,18 @@ if (cluster.isMaster) {
} }
}); });
app.get(
`/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
async (req, res) => {
try {
await checkAlerts();
return res.status(200).send("Alerts initialized");
} catch (error) {
console.error("Failed to initialize alerts:", error);
return res.status(500).send("Failed to initialize alerts");
}
}
);
app.get(
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
async (req, res) => {
try {
const webScraperQueue = getWebScraperQueue();
const batchSize = 10;
const numberOfBatches = 9; // Adjust based on your needs
const completedJobsPromises: Promise<Job[]>[] = [];
for (let i = 0; i < numberOfBatches; i++) {
completedJobsPromises.push(
webScraperQueue.getJobs(
["completed"],
i * batchSize,
i * batchSize + batchSize,
true
)
);
}
const completedJobs: Job[] = (
await Promise.all(completedJobsPromises)
).flat();
const before24hJobs =
completedJobs.filter(
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
) || [];
let count = 0;
if (!before24hJobs) {
return res.status(200).send(`No jobs to remove.`);
}
for (const job of before24hJobs) {
try {
await job.remove();
count++;
} catch (jobError) {
console.error(`Failed to remove job with ID ${job.id}:`, jobError);
}
}
return res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) {
console.error("Failed to clean last 24h complete jobs:", error);
return res.status(500).send("Failed to clean jobs");
}
}
);
app.get("/is-production", (req, res) => { app.get("/is-production", (req, res) => {
res.send({ isProduction: global.isProduction }); res.send({ isProduction: global.isProduction });
}); });
app.get( Logger.info(`Worker ${process.pid} started`);
`/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
async (req, res) => {
try {
const queueRedis = new Redis(process.env.REDIS_URL);
const testKey = "test";
const testValue = "test";
// Test queueRedis
let queueRedisHealth;
try {
await queueRedis.set(testKey, testValue);
queueRedisHealth = await queueRedis.get(testKey);
await queueRedis.del(testKey);
} catch (error) {
console.error("queueRedis health check failed:", error);
queueRedisHealth = null;
}
// Test redisRateLimitClient
let redisRateLimitHealth;
try {
await redisRateLimitClient.set(testKey, testValue);
redisRateLimitHealth = await redisRateLimitClient.get(testKey);
await redisRateLimitClient.del(testKey);
} catch (error) {
console.error("redisRateLimitClient health check failed:", error);
redisRateLimitHealth = null;
}
const healthStatus = {
queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
redisRateLimitClient:
redisRateLimitHealth === testValue ? "healthy" : "unhealthy",
};
if (
healthStatus.queueRedis === "healthy" &&
healthStatus.redisRateLimitClient === "healthy"
) {
console.log("Both Redis instances are healthy");
return res
.status(200)
.json({ status: "healthy", details: healthStatus });
} else {
console.log("Redis instances health check:", healthStatus);
await sendSlackWebhook(
`[REDIS DOWN] Redis instances health check: ${JSON.stringify(
healthStatus
)}`,
true
);
return res
.status(500)
.json({ status: "unhealthy", details: healthStatus });
}
} catch (error) {
console.error("Redis health check failed:", error);
await sendSlackWebhook(
`[REDIS DOWN] Redis instances health check: ${error.message}`,
true
);
return res
.status(500)
.json({ status: "unhealthy", message: error.message });
}
}
);
console.log(`Worker ${process.pid} started`);
} }
const wsq = getWebScraperQueue();
wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
+2 -1
View File
@@ -4,6 +4,7 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation
import { generateOpenAICompletions } from "./models"; import { generateOpenAICompletions } from "./models";
import { Document, ExtractorOptions } from "../entities"; import { Document, ExtractorOptions } from "../entities";
import { Logger } from "../logger";
// Generate completion using OpenAI // Generate completion using OpenAI
export async function generateCompletions( export async function generateCompletions(
@@ -44,7 +45,7 @@ export async function generateCompletions(
return completionResult; return completionResult;
} catch (error) { } catch (error) {
console.error(`Error generating completions: ${error}`); Logger.error(`Error generating completions: ${error}`);
throw new Error(`Error generating completions: ${error.message}`); throw new Error(`Error generating completions: ${error.message}`);
} }
default: default:
+1 -1
View File
@@ -48,7 +48,7 @@ function prepareOpenAIDoc(
export async function generateOpenAICompletions({ export async function generateOpenAICompletions({
client, client,
model = "gpt-4o", model = process.env.MODEL_NAME || "gpt-4o",
document, document,
schema, //TODO - add zod dynamic type checking schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt, prompt = defaultPrompt,
+1 -1
View File
@@ -1,6 +1,6 @@
export const defaultOrigin = "api"; export const defaultOrigin = "api";
export const defaultTimeout = 30000; // 30 seconds export const defaultTimeout = 45000; // 45 seconds
export const defaultPageOptions = { export const defaultPageOptions = {
onlyMainContent: false, onlyMainContent: false,
+2
View File
@@ -56,6 +56,7 @@ export type CrawlerOptions = {
} }
export type WebScraperOptions = { export type WebScraperOptions = {
jobId: string;
urls: string[]; urls: string[];
mode: "single_urls" | "sitemap" | "crawl"; mode: "single_urls" | "sitemap" | "crawl";
crawlerOptions?: CrawlerOptions; crawlerOptions?: CrawlerOptions;
@@ -138,4 +139,5 @@ export interface FireEngineOptions{
engine?: string; engine?: string;
blockMedia?: boolean; blockMedia?: boolean;
blockAds?: boolean; blockAds?: boolean;
disableJsDom?: boolean;
} }
+53
View File
@@ -0,0 +1,53 @@
enum LogLevel {
NONE = 'NONE', // No logs will be output.
ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors.
INFO = 'INFO', // For logging informational messages that highlight the progress of the application.
DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging.
TRACE = 'TRACE' // For logging more detailed information than the DEBUG level.
}
export class Logger {
static colors = {
ERROR: '\x1b[31m%s\x1b[0m', // Red
WARN: '\x1b[33m%s\x1b[0m', // Yellow
INFO: '\x1b[34m%s\x1b[0m', // Blue
DEBUG: '\x1b[36m%s\x1b[0m', // Cyan
TRACE: '\x1b[35m%s\x1b[0m' // Magenta
};
static log (message: string, level: LogLevel) {
const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.INFO;
const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
const currentLevelIndex = levels.indexOf(logLevel);
const messageLevelIndex = levels.indexOf(level);
if (currentLevelIndex >= messageLevelIndex) {
const color = Logger.colors[level];
console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
// if (process.env.USE_DB_AUTH) {
// save to supabase? another place?
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
// }
}
}
static error(message: string | any) {
Logger.log(message, LogLevel.ERROR);
}
static warn(message: string) {
Logger.log(message, LogLevel.WARN);
}
static info(message: string) {
Logger.log(message, LogLevel.INFO);
}
static debug(message: string) {
Logger.log(message, LogLevel.DEBUG);
}
static trace(message: string) {
Logger.log(message, LogLevel.TRACE);
}
}
+84
View File
@@ -0,0 +1,84 @@
import { Job, JobId } from "bull";
import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase";
import { Logger } from "./logger";
export type ScrapeErrorEvent = {
type: "error",
message: string,
stack?: string,
}
export type ScrapeScrapeEvent = {
type: "scrape",
url: string,
worker?: string,
method: (typeof baseScrapers)[number],
result: null | {
success: boolean,
response_code?: number,
response_size?: number,
error?: string | object,
// proxy?: string,
time_taken: number,
},
}
export type ScrapeQueueEvent = {
type: "queue",
event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed" | "failed",
worker?: string,
}
export type ScrapeEvent = ScrapeErrorEvent | ScrapeScrapeEvent | ScrapeQueueEvent;
export class ScrapeEvents {
static async insert(jobId: string, content: ScrapeEvent) {
if (jobId === "TEST") return null;
if (process.env.USE_DB_AUTHENTICATION) {
try {
const result = await supabase.from("scrape_events").insert({
job_id: jobId,
type: content.type,
content: content,
// created_at
}).select().single();
return (result.data as any).id;
} catch (error) {
Logger.error(`Error inserting scrape event: ${error}`);
return null;
}
}
return null;
}
static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) {
if (logId === null) return;
try {
const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any;
await supabase.from("scrape_events").update({
content: {
...previousLog.content,
result,
}
}).eq("id", logId);
} catch (error) {
Logger.error(`Error updating scrape result: ${error}`);
}
}
static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) {
try {
await this.insert(((job as any).id ? (job as any).id : job) as string, {
type: "queue",
event,
worker: process.env.FLY_MACHINE_ID,
});
} catch (error) {
Logger.error(`Error logging job event: ${error}`);
}
}
}
+3 -2
View File
@@ -1,4 +1,5 @@
import { AuthResponse } from "../../src/types"; import { AuthResponse } from "../../src/types";
import { Logger } from "./logger";
let warningCount = 0; let warningCount = 0;
@@ -8,7 +9,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
return async function (...args: U): Promise<T> { return async function (...args: U): Promise<T> {
if (process.env.USE_DB_AUTHENTICATION === "false") { if (process.env.USE_DB_AUTHENTICATION === "false") {
if (warningCount < 5) { if (warningCount < 5) {
console.warn("WARNING - You're bypassing authentication"); Logger.warn("You're bypassing authentication");
warningCount++; warningCount++;
} }
return { success: true } as T; return { success: true } as T;
@@ -16,7 +17,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
try { try {
return await originalFunction(...args); return await originalFunction(...args);
} catch (error) { } catch (error) {
console.error("Error in withAuth function: ", error); Logger.error(`Error in withAuth function: ${error}`);
return { success: false, error: error.message } as T; return { success: false, error: error.message } as T;
} }
} }
+10 -2
View File
@@ -10,6 +10,8 @@ import { DocumentUrl, Progress } from "../lib/entities";
import { billTeam } from "../services/billing/credit_billing"; import { billTeam } from "../services/billing/credit_billing";
import { Document } from "../lib/entities"; import { Document } from "../lib/entities";
import { supabase_service } from "../services/supabase"; import { supabase_service } from "../services/supabase";
import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
export async function startWebScraperPipeline({ export async function startWebScraperPipeline({
job, job,
@@ -23,6 +25,7 @@ export async function startWebScraperPipeline({
crawlerOptions: job.data.crawlerOptions, crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions, pageOptions: job.data.pageOptions,
inProgress: (progress) => { inProgress: (progress) => {
Logger.debug(`🐂 Job in progress ${job.id}`);
if (progress.currentDocument) { if (progress.currentDocument) {
partialDocs.push(progress.currentDocument); partialDocs.push(progress.currentDocument);
if (partialDocs.length > 50) { if (partialDocs.length > 50) {
@@ -32,9 +35,12 @@ export async function startWebScraperPipeline({
} }
}, },
onSuccess: (result) => { onSuccess: (result) => {
Logger.debug(`🐂 Job completed ${job.id}`);
saveJob(job, result); saveJob(job, result);
}, },
onError: (error) => { onError: (error) => {
Logger.error(`🐂 Job failed ${job.id}`);
ScrapeEvents.logJobEvent(job, "failed");
job.moveToFailed(error); job.moveToFailed(error);
}, },
team_id: job.data.team_id, team_id: job.data.team_id,
@@ -56,6 +62,7 @@ export async function runWebScraper({
const provider = new WebScraperDataProvider(); const provider = new WebScraperDataProvider();
if (mode === "crawl") { if (mode === "crawl") {
await provider.setOptions({ await provider.setOptions({
jobId: bull_job_id,
mode: mode, mode: mode,
urls: [url], urls: [url],
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
@@ -64,6 +71,7 @@ export async function runWebScraper({
}); });
} else { } else {
await provider.setOptions({ await provider.setOptions({
jobId: bull_job_id,
mode: mode, mode: mode,
urls: url.split(","), urls: url.split(","),
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
@@ -108,7 +116,6 @@ export async function runWebScraper({
// this return doesn't matter too much for the job completion result // this return doesn't matter too much for the job completion result
return { success: true, message: "", docs: filteredDocs }; return { success: true, message: "", docs: filteredDocs };
} catch (error) { } catch (error) {
console.error("Error running web scraper", error);
onError(error); onError(error);
return { success: false, message: error.message, docs: [] }; return { success: false, message: error.message, docs: [] };
} }
@@ -135,7 +142,8 @@ const saveJob = async (job: Job, result: any) => {
// I think the job won't exist here anymore // I think the job won't exist here anymore
} }
} }
ScrapeEvents.logJobEvent(job, "completed");
} catch (error) { } catch (error) {
console.error("Failed to update job status:", error); Logger.error(`🐂 Failed to update job status: ${error}`);
} }
}; };
+29
View File
@@ -0,0 +1,29 @@
import express from "express";
import { redisHealthController } from "../controllers/admin/redis-health";
import {
checkQueuesController,
cleanBefore24hCompleteJobsController,
queuesController,
} from "../controllers/admin/queue";
export const adminRouter = express.Router();
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
redisHealthController
);
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
cleanBefore24hCompleteJobsController
);
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
checkQueuesController
);
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
queuesController
);
+5
View File
@@ -7,6 +7,8 @@ import { crawlJobStatusPreviewController } from "../../src/controllers/status";
import { searchController } from "../../src/controllers/search"; import { searchController } from "../../src/controllers/search";
import { crawlCancelController } from "../../src/controllers/crawl-cancel"; import { crawlCancelController } from "../../src/controllers/crawl-cancel";
import { keyAuthController } from "../../src/controllers/keyAuth"; import { keyAuthController } from "../../src/controllers/keyAuth";
import { livenessController } from "../controllers/liveness";
import { readinessController } from "../controllers/readiness";
export const v0Router = express.Router(); export const v0Router = express.Router();
@@ -23,3 +25,6 @@ v0Router.get("/v0/keyAuth", keyAuthController);
// Search routes // Search routes
v0Router.post("/v0/search", searchController); v0Router.post("/v0/search", searchController);
// Health/Probe routes
v0Router.get("/v0/health/liveness", livenessController);
v0Router.get("/v0/health/readiness", readinessController);
@@ -42,6 +42,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({ crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -76,6 +77,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({ crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -104,6 +106,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({ crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -133,6 +136,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({ crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -161,6 +165,7 @@ describe('WebCrawler', () => {
// Setup the crawler with the specific test case options // Setup the crawler with the specific test case options
const crawler = new WebCrawler({ const crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -194,6 +199,7 @@ describe('WebCrawler', () => {
const limit = 2; // Set a limit for the number of links const limit = 2; // Set a limit for the number of links
crawler = new WebCrawler({ crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl, initialUrl: initialUrl,
includes: [], includes: [],
excludes: [], excludes: [],
@@ -0,0 +1,15 @@
import CacheableLookup from 'cacheable-lookup';
import https from 'node:https';
import axios from "axios";
describe("DNS", () => {
it("cached dns", async () => {
const cachedDns = new CacheableLookup();
cachedDns.install(https.globalAgent);
jest.spyOn(cachedDns, "lookupAsync");
const res = await axios.get("https://example.com");
expect(res.status).toBe(200);
expect(cachedDns.lookupAsync).toHaveBeenCalled();
});
});
@@ -15,8 +15,8 @@ describe('scrapSingleUrl', () => {
const pageOptionsWithHtml: PageOptions = { includeHtml: true }; const pageOptionsWithHtml: PageOptions = { includeHtml: true };
const pageOptionsWithoutHtml: PageOptions = { includeHtml: false }; const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml); const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml); const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
expect(resultWithHtml.html).toBeDefined(); expect(resultWithHtml.html).toBeDefined();
expect(resultWithoutHtml.html).toBeUndefined(); expect(resultWithoutHtml.html).toBeUndefined();
@@ -27,7 +27,7 @@ it('should return a list of links on the mendable.ai page', async () => {
const url = 'https://mendable.ai'; const url = 'https://mendable.ai';
const pageOptions: PageOptions = { includeHtml: true }; const pageOptions: PageOptions = { includeHtml: true };
const result = await scrapSingleUrl(url, pageOptions); const result = await scrapSingleUrl("TEST", url, pageOptions);
// Check if the result contains a list of links // Check if the result contains a list of links
expect(result.linksOnPage).toBeDefined(); expect(result.linksOnPage).toBeDefined();
+20 -9
View File
@@ -8,8 +8,10 @@ import { scrapSingleUrl } from "./single_url";
import robotsParser from "robots-parser"; import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils"; import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout"; import { axiosTimeout } from "../../../src/lib/timeout";
import { Logger } from "../../../src/lib/logger";
export class WebCrawler { export class WebCrawler {
private jobId: string;
private initialUrl: string; private initialUrl: string;
private baseUrl: string; private baseUrl: string;
private includes: string[]; private includes: string[];
@@ -26,6 +28,7 @@ export class WebCrawler {
private allowExternalContentLinks: boolean; private allowExternalContentLinks: boolean;
constructor({ constructor({
jobId,
initialUrl, initialUrl,
includes, includes,
excludes, excludes,
@@ -36,6 +39,7 @@ export class WebCrawler {
allowBackwardCrawling = false, allowBackwardCrawling = false,
allowExternalContentLinks = false allowExternalContentLinks = false
}: { }: {
jobId: string;
initialUrl: string; initialUrl: string;
includes?: string[]; includes?: string[];
excludes?: string[]; excludes?: string[];
@@ -46,6 +50,7 @@ export class WebCrawler {
allowBackwardCrawling?: boolean; allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean; allowExternalContentLinks?: boolean;
}) { }) {
this.jobId = jobId;
this.initialUrl = initialUrl; this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin; this.baseUrl = new URL(initialUrl).origin;
this.includes = includes ?? []; this.includes = includes ?? [];
@@ -64,7 +69,7 @@ export class WebCrawler {
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
return sitemapLinks return sitemapLinks
.filter((link) => { .filter((link) => {
const url = new URL(link); const url = new URL(link.trim(), this.baseUrl);
const path = url.pathname; const path = url.pathname;
const depth = getURLDepth(url.toString()); const depth = getURLDepth(url.toString());
@@ -116,7 +121,7 @@ export class WebCrawler {
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
// Check if the link is disallowed by robots.txt // Check if the link is disallowed by robots.txt
if (!isAllowed) { if (!isAllowed) {
console.log(`Link disallowed by robots.txt: ${link}`); Logger.debug(`Link disallowed by robots.txt: ${link}`);
return false; return false;
} }
@@ -133,15 +138,19 @@ export class WebCrawler {
limit: number = 10000, limit: number = 10000,
maxDepth: number = 10 maxDepth: number = 10
): Promise<{ url: string, html: string }[]> { ): Promise<{ url: string, html: string }[]> {
Logger.debug(`Crawler starting with ${this.initialUrl}`);
// Fetch and parse robots.txt // Fetch and parse robots.txt
try { try {
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout }); const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
this.robots = robotsParser(this.robotsTxtUrl, response.data); this.robots = robotsParser(this.robotsTxtUrl, response.data);
Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
} catch (error) { } catch (error) {
console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
} }
if(!crawlerOptions?.ignoreSitemap){ if (!crawlerOptions?.ignoreSitemap){
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) { if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
@@ -175,6 +184,7 @@ export class WebCrawler {
inProgress?: (progress: Progress) => void, inProgress?: (progress: Progress) => void,
): Promise<{ url: string, html: string }[]> { ): Promise<{ url: string, html: string }[]> {
const queue = async.queue(async (task: string, callback) => { const queue = async.queue(async (task: string, callback) => {
Logger.debug(`Crawling ${task}`);
if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) { if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
if (callback && typeof callback === "function") { if (callback && typeof callback === "function") {
callback(); callback();
@@ -216,16 +226,18 @@ export class WebCrawler {
} }
}, concurrencyLimit); }, concurrencyLimit);
Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`);
queue.push( queue.push(
urls.filter( urls.filter(
(url) => (url) =>
!this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent") !this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
), ),
(err) => { (err) => {
if (err) console.error(err); if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`);
} }
); );
await queue.drain(); await queue.drain();
Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`);
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
} }
@@ -253,7 +265,7 @@ export class WebCrawler {
// If it is the first link, fetch with single url // If it is the first link, fetch with single url
if (this.visited.size === 1) { if (this.visited.size === 1) {
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
content = page.html ?? ""; content = page.html ?? "";
pageStatusCode = page.metadata?.pageStatusCode; pageStatusCode = page.metadata?.pageStatusCode;
pageError = page.metadata?.pageError || undefined; pageError = page.metadata?.pageError || undefined;
@@ -282,7 +294,6 @@ export class WebCrawler {
const urlObj = new URL(fullUrl); const urlObj = new URL(fullUrl);
const path = urlObj.pathname; const path = urlObj.pathname;
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
if (this.isInternalLink(fullUrl) && if (this.isInternalLink(fullUrl) &&
this.noSections(fullUrl) && this.noSections(fullUrl) &&
@@ -452,7 +463,7 @@ export class WebCrawler {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl }); sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
} }
} catch (error) { } catch (error) {
console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' }); const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
if (response) { if (response) {
sitemapLinks = response; sitemapLinks = response;
@@ -467,7 +478,7 @@ export class WebCrawler {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap }); sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap });
} }
} catch (error) { } catch (error) {
console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
} }
} }
@@ -1,10 +1,12 @@
import { Logger } from "../../../lib/logger";
export async function handleCustomScraping( export async function handleCustomScraping(
text: string, text: string,
url: string url: string
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> { ): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
// Check for Readme Docs special case // Check for Readme Docs special case
if (text.includes('<meta name="readme-deploy"')) { if (text.includes('<meta name="readme-deploy"')) {
console.log( Logger.debug(
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms` `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
); );
return { return {
@@ -19,7 +21,7 @@ export async function handleCustomScraping(
// Check for Vanta security portals // Check for Vanta security portals
if (text.includes('<link href="https://static.vanta.com')) { if (text.includes('<link href="https://static.vanta.com')) {
console.log( Logger.debug(
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms` `Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
); );
return { return {
@@ -34,7 +36,7 @@ export async function handleCustomScraping(
const googleDriveMetaMatch = text.match(googleDriveMetaPattern); const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
if (googleDriveMetaMatch) { if (googleDriveMetaMatch) {
const url = googleDriveMetaMatch[1]; const url = googleDriveMetaMatch[1];
console.log(`Google Drive PDF link detected: ${url}`); Logger.debug(`Google Drive PDF link detected: ${url}`);
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/); const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
if (fileIdMatch) { if (fileIdMatch) {
+9 -4
View File
@@ -19,8 +19,10 @@ import { generateCompletions } from "../../lib/LLM-extraction";
import { getWebScraperQueue } from "../../../src/services/queue-service"; import { getWebScraperQueue } from "../../../src/services/queue-service";
import { fetchAndProcessDocx } from "./utils/docxProcessor"; import { fetchAndProcessDocx } from "./utils/docxProcessor";
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils"; import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
import { Logger } from "../../lib/logger";
export class WebScraperDataProvider { export class WebScraperDataProvider {
private jobId: string;
private bullJobId: string; private bullJobId: string;
private urls: string[] = [""]; private urls: string[] = [""];
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls"; private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
@@ -65,6 +67,7 @@ export class WebScraperDataProvider {
batchUrls.map(async (url, index) => { batchUrls.map(async (url, index) => {
const existingHTML = allHtmls ? allHtmls[i + index] : ""; const existingHTML = allHtmls ? allHtmls[i + index] : "";
const result = await scrapSingleUrl( const result = await scrapSingleUrl(
this.jobId,
url, url,
this.pageOptions, this.pageOptions,
this.extractorOptions, this.extractorOptions,
@@ -89,14 +92,14 @@ export class WebScraperDataProvider {
const job = await getWebScraperQueue().getJob(this.bullJobId); const job = await getWebScraperQueue().getJob(this.bullJobId);
const jobStatus = await job.getState(); const jobStatus = await job.getState();
if (jobStatus === "failed") { if (jobStatus === "failed") {
console.error( Logger.info(
"Job has failed or has been cancelled by the user. Stopping the job..." "Job has failed or has been cancelled by the user. Stopping the job..."
); );
return [] as Document[]; return [] as Document[];
} }
} }
} catch (error) { } catch (error) {
console.error(error); Logger.error(error.message);
return [] as Document[]; return [] as Document[];
} }
} }
@@ -165,6 +168,7 @@ export class WebScraperDataProvider {
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<Document[]> { ): Promise<Document[]> {
const crawler = new WebCrawler({ const crawler = new WebCrawler({
jobId: this.jobId,
initialUrl: this.urls[0], initialUrl: this.urls[0],
includes: this.includes, includes: this.includes,
excludes: this.excludes, excludes: this.excludes,
@@ -270,7 +274,7 @@ export class WebScraperDataProvider {
this.mode === "single_urls" && links.length > 0 this.mode === "single_urls" && links.length > 0
? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch( ? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
(error) => { (error) => {
console.error("Failed to fetch sitemap data:", error); Logger.debug(`Failed to fetch sitemap data: ${error}`);
return null; return null;
} }
) )
@@ -460,7 +464,7 @@ export class WebScraperDataProvider {
let documents: Document[] = []; let documents: Document[] = [];
for (const url of urls) { for (const url of urls) {
const normalizedUrl = this.normalizeUrl(url); const normalizedUrl = this.normalizeUrl(url);
console.log( Logger.debug(
"Getting cached document for web-scraper-cache:" + normalizedUrl "Getting cached document for web-scraper-cache:" + normalizedUrl
); );
const cachedDocumentString = await getValue( const cachedDocumentString = await getValue(
@@ -499,6 +503,7 @@ export class WebScraperDataProvider {
throw new Error("Urls are required"); throw new Error("Urls are required");
} }
this.jobId = options.jobId;
this.bullJobId = options.bullJobId; this.bullJobId = options.bullJobId;
this.urls = options.urls; this.urls = options.urls;
this.mode = options.mode; this.mode = options.mode;
@@ -2,6 +2,7 @@ import axios from "axios";
import { logScrape } from "../../../services/logging/scrape_log"; import { logScrape } from "../../../services/logging/scrape_log";
import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global"; import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";
/** /**
* Scrapes a URL with Axios * Scrapes a URL with Axios
@@ -34,9 +35,7 @@ export async function scrapWithFetch(
}); });
if (response.status !== 200) { if (response.status !== 200) {
console.error( Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}`);
`[Axios] Error fetching url: ${url} with status: ${response.status}`
);
logParams.error_message = response.statusText; logParams.error_message = response.statusText;
logParams.response_code = response.status; logParams.response_code = response.status;
return { return {
@@ -63,10 +62,10 @@ export async function scrapWithFetch(
} catch (error) { } catch (error) {
if (error.code === "ECONNABORTED") { if (error.code === "ECONNABORTED") {
logParams.error_message = "Request timed out"; logParams.error_message = "Request timed out";
console.log(`[Axios] Request timed out for ${url}`); Logger.debug(`⛏️ Axios: Request timed out for ${url}`);
} else { } else {
logParams.error_message = error.message || error; logParams.error_message = error.message || error;
console.error(`[Axios] Error fetching url: ${url} -> ${error}`); Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`);
} }
return { content: "", pageStatusCode: null, pageError: logParams.error_message }; return { content: "", pageStatusCode: null, pageError: logParams.error_message };
} finally { } finally {
@@ -4,6 +4,7 @@ import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url"; import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global"; import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";
/** /**
* Scrapes a URL with Fire-Engine * Scrapes a URL with Fire-Engine
@@ -59,12 +60,10 @@ export async function scrapWithFireEngine({
let engine = engineParam; // do we want fireEngineOptions as first choice? let engine = engineParam; // do we want fireEngineOptions as first choice?
console.log( Logger.info(
`[Fire-Engine][${engine}] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
); );
// console.log(fireEngineOptionsParam)
const response = await axios.post( const response = await axios.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint, process.env.FIRE_ENGINE_BETA_URL + endpoint,
{ {
@@ -84,15 +83,15 @@ export async function scrapWithFireEngine({
); );
if (response.status !== 200) { if (response.status !== 200) {
console.error( Logger.debug(
`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}` `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`
); );
logParams.error_message = response.data?.pageError; logParams.error_message = response.data?.pageError;
logParams.response_code = response.data?.pageStatusCode; logParams.response_code = response.data?.pageStatusCode;
if(response.data && response.data?.pageStatusCode !== 200) { if(response.data && response.data?.pageStatusCode !== 200) {
console.error(`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`); Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`);
} }
return { return {
@@ -130,10 +129,10 @@ export async function scrapWithFireEngine({
} }
} catch (error) { } catch (error) {
if (error.code === "ECONNABORTED") { if (error.code === "ECONNABORTED") {
console.log(`[Fire-Engine] Request timed out for ${url}`); Logger.debug(`⛏️ Fire-Engine: Request timed out for ${url}`);
logParams.error_message = "Request timed out"; logParams.error_message = "Request timed out";
} else { } else {
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
logParams.error_message = error.message || error; logParams.error_message = error.message || error;
} }
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message }; return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
@@ -3,6 +3,7 @@ import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url"; import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global"; import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";
/** /**
* Scrapes a URL with Playwright * Scrapes a URL with Playwright
@@ -51,8 +52,8 @@ export async function scrapWithPlaywright(
); );
if (response.status !== 200) { if (response.status !== 200) {
console.error( Logger.debug(
`[Playwright] Error fetching url: ${url} with status: ${response.status}` `⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}`
); );
logParams.error_message = response.data?.pageError; logParams.error_message = response.data?.pageError;
logParams.response_code = response.data?.pageStatusCode; logParams.response_code = response.data?.pageStatusCode;
@@ -86,8 +87,8 @@ export async function scrapWithPlaywright(
}; };
} catch (jsonError) { } catch (jsonError) {
logParams.error_message = jsonError.message || jsonError; logParams.error_message = jsonError.message || jsonError;
console.error( Logger.debug(
`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` `⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}`
); );
return { content: "", pageStatusCode: null, pageError: logParams.error_message }; return { content: "", pageStatusCode: null, pageError: logParams.error_message };
} }
@@ -95,10 +96,10 @@ export async function scrapWithPlaywright(
} catch (error) { } catch (error) {
if (error.code === "ECONNABORTED") { if (error.code === "ECONNABORTED") {
logParams.error_message = "Request timed out"; logParams.error_message = "Request timed out";
console.log(`[Playwright] Request timed out for ${url}`); Logger.debug(`⛏️ Playwright: Request timed out for ${url}`);
} else { } else {
logParams.error_message = error.message || error; logParams.error_message = error.message || error;
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`);
} }
return { content: "", pageStatusCode: null, pageError: logParams.error_message }; return { content: "", pageStatusCode: null, pageError: logParams.error_message };
} finally { } finally {
@@ -3,6 +3,7 @@ import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global"; import { universalTimeout } from "../global";
import { ScrapingBeeClient } from "scrapingbee"; import { ScrapingBeeClient } from "scrapingbee";
import { Logger } from "../../../lib/logger";
/** /**
* Scrapes a URL with ScrapingBee * Scrapes a URL with ScrapingBee
@@ -56,8 +57,8 @@ export async function scrapWithScrapingBee(
text = decoder.decode(response.data); text = decoder.decode(response.data);
logParams.success = true; logParams.success = true;
} catch (decodeError) { } catch (decodeError) {
console.error( Logger.debug(
`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` `⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}`
); );
logParams.error_message = decodeError.message || decodeError; logParams.error_message = decodeError.message || decodeError;
} }
@@ -72,7 +73,7 @@ export async function scrapWithScrapingBee(
}; };
} }
} catch (error) { } catch (error) {
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`);
logParams.error_message = error.message || error; logParams.error_message = error.message || error;
logParams.response_code = error.response?.status; logParams.response_code = error.response?.status;
return { return {
+54 -20
View File
@@ -17,17 +17,20 @@ import { scrapWithFireEngine } from "./scrapers/fireEngine";
import { scrapWithPlaywright } from "./scrapers/playwright"; import { scrapWithPlaywright } from "./scrapers/playwright";
import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
import { extractLinks } from "./utils/utils"; import { extractLinks } from "./utils/utils";
import { Logger } from "../../lib/logger";
import { ScrapeEvents } from "../../lib/scrape-events";
import { clientSideError } from "../../strings";
dotenv.config(); dotenv.config();
const baseScrapers = [ export const baseScrapers = [
"fire-engine", "fire-engine",
"fire-engine;chrome-cdp", "fire-engine;chrome-cdp",
"scrapingBee", "scrapingBee",
"playwright", process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
"scrapingBeeLoad", "scrapingBeeLoad",
"fetch", "fetch",
] as const; ].filter(Boolean);
export async function generateRequestParams( export async function generateRequestParams(
url: string, url: string,
@@ -48,7 +51,7 @@ export async function generateRequestParams(
return defaultParams; return defaultParams;
} }
} catch (error) { } catch (error) {
console.error(`Error generating URL key: ${error}`); Logger.error(`Error generating URL key: ${error}`);
return defaultParams; return defaultParams;
} }
} }
@@ -82,22 +85,22 @@ function getScrapingFallbackOrder(
}); });
let defaultOrder = [ let defaultOrder = [
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
"scrapingBee", "scrapingBee",
"fire-engine", process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
"fire-engine;chrome-cdp",
"playwright",
"scrapingBeeLoad", "scrapingBeeLoad",
"fetch", "fetch",
]; ].filter(Boolean);
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
defaultOrder = [ defaultOrder = [
"fire-engine", "fire-engine",
"playwright", process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
...defaultOrder.filter( ...defaultOrder.filter(
(scraper) => scraper !== "fire-engine" && scraper !== "playwright" (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
), ),
]; ].filter(Boolean);
} }
const filteredDefaultOrder = defaultOrder.filter( const filteredDefaultOrder = defaultOrder.filter(
@@ -117,6 +120,7 @@ function getScrapingFallbackOrder(
export async function scrapSingleUrl( export async function scrapSingleUrl(
jobId: string,
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { pageOptions: PageOptions = {
onlyMainContent: true, onlyMainContent: true,
@@ -144,6 +148,15 @@ export async function scrapSingleUrl(
} = { text: "", screenshot: "", metadata: {} }; } = { text: "", screenshot: "", metadata: {} };
let screenshot = ""; let screenshot = "";
const timer = Date.now();
const logInsertPromise = ScrapeEvents.insert(jobId, {
type: "scrape",
url,
worker: process.env.FLY_MACHINE_ID,
method,
result: null,
});
switch (method) { switch (method) {
case "fire-engine": case "fire-engine":
case "fire-engine;chrome-cdp": case "fire-engine;chrome-cdp":
@@ -154,7 +167,6 @@ export async function scrapSingleUrl(
} }
if (process.env.FIRE_ENGINE_BETA_URL) { if (process.env.FIRE_ENGINE_BETA_URL) {
console.log(`Scraping ${url} with Fire Engine`);
const response = await scrapWithFireEngine({ const response = await scrapWithFireEngine({
url, url,
waitFor: pageOptions.waitFor, waitFor: pageOptions.waitFor,
@@ -254,8 +266,19 @@ export async function scrapSingleUrl(
} }
//* TODO: add an optional to return markdown or structured/extracted content //* TODO: add an optional to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
const text = await parseMarkdown(cleanedHtml);
const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, {
response_size: scraperResponse.text.length,
success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
error: scraperResponse.metadata.pageError,
response_code: scraperResponse.metadata.pageStatusCode,
time_taken: Date.now() - timer,
});
return { return {
text: await parseMarkdown(cleanedHtml), text,
html: cleanedHtml, html: cleanedHtml,
rawHtml: scraperResponse.text, rawHtml: scraperResponse.text,
screenshot: scraperResponse.screenshot, screenshot: scraperResponse.screenshot,
@@ -277,7 +300,7 @@ export async function scrapSingleUrl(
try { try {
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
} catch (error) { } catch (error) {
console.error(`Invalid URL key, trying: ${urlToScrap}`); Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
} }
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? ""; const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
const scrapersInOrder = getScrapingFallbackOrder( const scrapersInOrder = getScrapingFallbackOrder(
@@ -289,7 +312,7 @@ export async function scrapSingleUrl(
for (const scraper of scrapersInOrder) { for (const scraper of scrapersInOrder) {
// If exists text coming from crawler, use it // If exists text coming from crawler, use it
if (existingHtml && existingHtml.trim().length >= 100) { if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) {
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions); let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
text = await parseMarkdown(cleanedHtml); text = await parseMarkdown(cleanedHtml);
html = cleanedHtml; html = cleanedHtml;
@@ -311,12 +334,18 @@ export async function scrapSingleUrl(
pageError = undefined; pageError = undefined;
} }
if (text && text.trim().length >= 100) break; if (text && text.trim().length >= 100) {
if (pageStatusCode && pageStatusCode == 404) break; Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`);
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; break;
if (nextScraperIndex < scrapersInOrder.length) {
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
} }
if (pageStatusCode && pageStatusCode == 404) {
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
break;
}
// const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
// if (nextScraperIndex < scrapersInOrder.length) {
// Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
// }
} }
if (!text) { if (!text) {
@@ -372,7 +401,12 @@ export async function scrapSingleUrl(
return document; return document;
} catch (error) { } catch (error) {
console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
ScrapeEvents.insert(jobId, {
type: "error",
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
stack: error.stack,
});
return { return {
content: "", content: "",
markdown: "", markdown: "",
+4 -3
View File
@@ -3,6 +3,7 @@ import { axiosTimeout } from "../../lib/timeout";
import { parseStringPromise } from "xml2js"; import { parseStringPromise } from "xml2js";
import { scrapWithFireEngine } from "./scrapers/fireEngine"; import { scrapWithFireEngine } from "./scrapers/fireEngine";
import { WebCrawler } from "./crawler"; import { WebCrawler } from "./crawler";
import { Logger } from "../../lib/logger";
export async function getLinksFromSitemap( export async function getLinksFromSitemap(
{ {
@@ -22,11 +23,11 @@ export async function getLinksFromSitemap(
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data; content = response.data;
} else if (mode === 'fire-engine') { } else if (mode === 'fire-engine') {
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { method: "get", mobileProxy: true },options:{endpoint:"request"} }); const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"tlsclient", disableJsDom: true, mobileProxy: true } });
content = response.html; content = response.html;
} }
} catch (error) { } catch (error) {
console.error(`Request failed for ${sitemapUrl}: ${error}`); Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
return allUrls; return allUrls;
} }
@@ -48,7 +49,7 @@ export async function getLinksFromSitemap(
} }
} }
} catch (error) { } catch (error) {
console.error(`Error processing ${sitemapUrl}: ${error}`); Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
} }
return allUrls; return allUrls;
@@ -1,3 +1,4 @@
import { Logger } from '../../../../lib/logger';
import { isUrlBlocked } from '../blocklist'; import { isUrlBlocked } from '../blocklist';
describe('isUrlBlocked', () => { describe('isUrlBlocked', () => {
@@ -19,7 +20,7 @@ describe('isUrlBlocked', () => {
blockedUrls.forEach(url => { blockedUrls.forEach(url => {
if (!isUrlBlocked(url)) { if (!isUrlBlocked(url)) {
console.log(`URL not blocked: ${url}`); Logger.debug(`URL not blocked: ${url}`);
} }
expect(isUrlBlocked(url)).toBe(true); expect(isUrlBlocked(url)).toBe(true);
}); });
@@ -1,3 +1,5 @@
import { Logger } from "../../../lib/logger";
const socialMediaBlocklist = [ const socialMediaBlocklist = [
'facebook.com', 'facebook.com',
'x.com', 'x.com',
@@ -59,7 +61,7 @@ export function isUrlBlocked(url: string): boolean {
return isBlocked; return isBlocked;
} catch (e) { } catch (e) {
// If an error occurs (e.g., invalid URL), return false // If an error occurs (e.g., invalid URL), return false
console.error(`Error parsing the following URL: ${url}`); Logger.error(`Error parsing the following URL: ${url}`);
return false; return false;
} }
} }
@@ -212,4 +212,24 @@ export const urlSpecificParams = {
engine: "playwright", engine: "playwright",
} }
}, },
"mendable.ai":{
defaultScraper: "fire-engine",
params:{
fireEngineOptions:{
mobileProxy: true,
method: "get",
engine: "chrome-cdp",
},
},
},
"developer.apple.com":{
defaultScraper: "fire-engine",
params:{
engine: "playwright",
wait: 2000,
fireEngineOptions: {
blockMedia: false,
}
},
},
}; };
@@ -1,5 +1,6 @@
import Anthropic from '@anthropic-ai/sdk'; import Anthropic from '@anthropic-ai/sdk';
import axios from 'axios'; import axios from 'axios';
import { Logger } from '../../../lib/logger';
export async function getImageDescription( export async function getImageDescription(
imageUrl: string, imageUrl: string,
@@ -82,7 +83,7 @@ export async function getImageDescription(
} }
} }
} catch (error) { } catch (error) {
console.error("Error generating image alt text:", error?.message); Logger.error(`Error generating image alt text: ${error}`);
return ""; return "";
} }
} }
@@ -1,4 +1,6 @@
import { CheerioAPI } from "cheerio"; import { CheerioAPI } from "cheerio";
import { Logger } from "../../../lib/logger";
interface Metadata { interface Metadata {
title?: string; title?: string;
description?: string; description?: string;
@@ -105,7 +107,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null; dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null;
} catch (error) { } catch (error) {
console.error("Error extracting metadata:", error); Logger.error(`Error extracting metadata: ${error}`);
} }
return { return {
@@ -7,14 +7,20 @@ import pdf from "pdf-parse";
import path from "path"; import path from "path";
import os from "os"; import os from "os";
import { axiosTimeout } from "../../../lib/timeout"; import { axiosTimeout } from "../../../lib/timeout";
import { Logger } from "../../../lib/logger";
dotenv.config(); dotenv.config();
export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url); try {
const content = await processPdfToText(tempFilePath, parsePDF); const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
fs.unlinkSync(tempFilePath); // Clean up the temporary file const content = await processPdfToText(tempFilePath, parsePDF);
return { content, pageStatusCode, pageError }; fs.unlinkSync(tempFilePath); // Clean up the temporary file
return { content, pageStatusCode, pageError };
} catch (error) {
Logger.error(`Failed to fetch and process PDF: ${error.message}`);
return { content: "", pageStatusCode: 500, pageError: error.message };
}
} }
async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> { async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
@@ -39,6 +45,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
let content = ""; let content = "";
if (process.env.LLAMAPARSE_API_KEY && parsePDF) { if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
Logger.debug("Processing pdf document w/ LlamaIndex");
const apiKey = process.env.LLAMAPARSE_API_KEY; const apiKey = process.env.LLAMAPARSE_API_KEY;
const headers = { const headers = {
Authorization: `Bearer ${apiKey}`, Authorization: `Bearer ${apiKey}`,
@@ -81,7 +88,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
} }
} catch (error) { } catch (error) {
console.error("Error fetching result w/ LlamaIndex"); Logger.debug("Error fetching result w/ LlamaIndex");
attempt++; attempt++;
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
// You may want to handle specific errors differently // You may want to handle specific errors differently
@@ -93,7 +100,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
} }
content = resultResponse.data[resultType]; content = resultResponse.data[resultType];
} catch (error) { } catch (error) {
console.error("Error processing pdf document w/ LlamaIndex(2)"); Logger.debug("Error processing pdf document w/ LlamaIndex(2)");
content = await processPdf(filePath); content = await processPdf(filePath);
} }
} else if (parsePDF) { } else if (parsePDF) {
@@ -1,3 +1,4 @@
import { Logger } from "../../../lib/logger";
import { Document } from "../../../lib/entities"; import { Document } from "../../../lib/entities";
export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => { export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
@@ -39,7 +40,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
return documents; return documents;
} catch (error) { } catch (error) {
console.error("Error replacing paths with absolute paths", error); Logger.debug(`Error replacing paths with absolute paths: ${error}`);
return documents; return documents;
} }
}; };
@@ -78,7 +79,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
return documents; return documents;
} catch (error) { } catch (error) {
console.error("Error replacing img paths with absolute paths", error); Logger.error(`Error replacing img paths with absolute paths: ${error}`);
return documents; return documents;
} }
}; };
@@ -1,5 +1,6 @@
import axios from "axios"; import axios from "axios";
import * as cheerio from "cheerio"; import * as cheerio from "cheerio";
import { Logger } from "../../../lib/logger";
export async function attemptScrapWithRequests( export async function attemptScrapWithRequests(
@@ -9,13 +10,13 @@ export async function attemptScrapWithRequests(
const response = await axios.get(urlToScrap, { timeout: 15000 }); const response = await axios.get(urlToScrap, { timeout: 15000 });
if (!response.data) { if (!response.data) {
console.log("Failed normal requests as well"); Logger.debug("Failed normal requests as well");
return null; return null;
} }
return response.data; return response.data;
} catch (error) { } catch (error) {
console.error(`Error in attemptScrapWithRequests: ${error}`); Logger.debug(`Error in attemptScrapWithRequests: ${error}`);
return null; return null;
} }
} }
+3 -2
View File
@@ -2,6 +2,7 @@ import axios from 'axios';
import * as cheerio from 'cheerio'; import * as cheerio from 'cheerio';
import * as querystring from 'querystring'; import * as querystring from 'querystring';
import { SearchResult } from '../../src/lib/entities'; import { SearchResult } from '../../src/lib/entities';
import { Logger } from '../../src/lib/logger';
const _useragent_list = [ const _useragent_list = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
@@ -96,7 +97,7 @@ export async function google_search(term: string, advanced = false, num_results
await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
} catch (error) { } catch (error) {
if (error.message === 'Too many requests') { if (error.message === 'Too many requests') {
console.warn('Too many requests, breaking the loop'); Logger.warn('Too many requests, breaking the loop');
break; break;
} }
throw error; throw error;
@@ -107,7 +108,7 @@ export async function google_search(term: string, advanced = false, num_results
} }
} }
if (attempts >= maxAttempts) { if (attempts >= maxAttempts) {
console.warn('Max attempts reached, breaking the loop'); Logger.warn('Max attempts reached, breaking the loop');
} }
return results return results
} }
+2 -1
View File
@@ -1,3 +1,4 @@
import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities"; import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch"; import { google_search } from "./googlesearch";
import { serper_search } from "./serper"; import { serper_search } from "./serper";
@@ -47,7 +48,7 @@ export async function search({
timeout timeout
); );
} catch (error) { } catch (error) {
console.error("Error in search function: ", error); Logger.error(`Error in search function: ${error}`);
return [] return []
} }
// if process.env.SERPER_API_KEY is set, use serper // if process.env.SERPER_API_KEY is set, use serper
+7 -6
View File
@@ -1,3 +1,4 @@
import { Logger } from "../../../src/lib/logger";
import { getWebScraperQueue } from "../queue-service"; import { getWebScraperQueue } from "../queue-service";
import { sendSlackWebhook } from "./slack"; import { sendSlackWebhook } from "./slack";
@@ -9,13 +10,13 @@ export async function checkAlerts() {
process.env.ALERT_NUM_ACTIVE_JOBS && process.env.ALERT_NUM_ACTIVE_JOBS &&
process.env.ALERT_NUM_WAITING_JOBS process.env.ALERT_NUM_WAITING_JOBS
) { ) {
console.info("Initializing alerts"); Logger.info("Initializing alerts");
const checkActiveJobs = async () => { const checkActiveJobs = async () => {
try { try {
const webScraperQueue = getWebScraperQueue(); const webScraperQueue = getWebScraperQueue();
const activeJobs = await webScraperQueue.getActiveCount(); const activeJobs = await webScraperQueue.getActiveCount();
if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) { if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
console.warn( Logger.warn(
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.` `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
); );
sendSlackWebhook( sendSlackWebhook(
@@ -23,12 +24,12 @@ export async function checkAlerts() {
true true
); );
} else { } else {
console.info( Logger.info(
`Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}` `Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`
); );
} }
} catch (error) { } catch (error) {
console.error("Failed to check active jobs:", error); Logger.error(`Failed to check active jobs: ${error}`);
} }
}; };
@@ -38,7 +39,7 @@ export async function checkAlerts() {
const paused = await webScraperQueue.getPausedCount(); const paused = await webScraperQueue.getPausedCount();
if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) { if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
console.warn( Logger.warn(
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.` `Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`
); );
sendSlackWebhook( sendSlackWebhook(
@@ -57,6 +58,6 @@ export async function checkAlerts() {
// setInterval(checkAll, 10000); // Run every // setInterval(checkAll, 10000); // Run every
} }
} catch (error) { } catch (error) {
console.error("Failed to initialize alerts:", error); Logger.error(`Failed to initialize alerts: ${error}`);
} }
} }
+3 -2
View File
@@ -1,4 +1,5 @@
import axios from "axios"; import axios from "axios";
import { Logger } from "../../../src/lib/logger";
export async function sendSlackWebhook( export async function sendSlackWebhook(
message: string, message: string,
@@ -16,8 +17,8 @@ export async function sendSlackWebhook(
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
}); });
console.log("Webhook sent successfully:", response.data); Logger.log("Webhook sent successfully:", response.data);
} catch (error) { } catch (error) {
console.error("Error sending webhook:", error); Logger.debug(`Error sending webhook: ${error}`);
} }
} }
@@ -2,6 +2,7 @@ import { NotificationType } from "../../types";
import { withAuth } from "../../lib/withAuth"; import { withAuth } from "../../lib/withAuth";
import { sendNotification } from "../notification/email_notification"; import { sendNotification } from "../notification/email_notification";
import { supabase_service } from "../supabase"; import { supabase_service } from "../supabase";
import { Logger } from "../../lib/logger";
const FREE_CREDITS = 500; const FREE_CREDITS = 500;
@@ -12,7 +13,7 @@ export async function supaBillTeam(team_id: string, credits: number) {
if (team_id === "preview") { if (team_id === "preview") {
return { success: true, message: "Preview team, no credits used" }; return { success: true, message: "Preview team, no credits used" };
} }
console.log(`Billing team ${team_id} for ${credits} credits`); Logger.info(`Billing team ${team_id} for ${credits} credits`);
// When the API is used, you can log the credit usage in the credit_usage table: // When the API is used, you can log the credit usage in the credit_usage table:
// team_id: The ID of the team using the API. // team_id: The ID of the team using the API.
// subscription_id: The ID of the team's active subscription. // subscription_id: The ID of the team's active subscription.
@@ -218,7 +219,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
0 0
); );
console.log("totalCreditsUsed", totalCreditsUsed); Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);
const end = new Date(); const end = new Date();
end.setDate(end.getDate() + 30); end.setDate(end.getDate() + 30);
@@ -262,14 +263,14 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
}); });
if (creditUsageError) { if (creditUsageError) {
console.error("Error calculating credit usage:", creditUsageError); Logger.error(`Error calculating credit usage: ${creditUsageError}`);
} }
if (creditUsages && creditUsages.length > 0) { if (creditUsages && creditUsages.length > 0) {
totalCreditsUsed = creditUsages[0].total_credits_used; totalCreditsUsed = creditUsages[0].total_credits_used;
} }
} catch (error) { } catch (error) {
console.error("Error calculating credit usage:", error); Logger.error(`Error calculating credit usage: ${error}`);
} }
// Adjust total credits used by subtracting coupon value // Adjust total credits used by subtracting coupon value
+2 -1
View File
@@ -1,5 +1,6 @@
import { Request } from "express"; import { Request } from "express";
import { supabase_service } from "../supabase"; import { supabase_service } from "../supabase";
import { Logger } from "../../../src/lib/logger";
export async function createIdempotencyKey( export async function createIdempotencyKey(
req: Request, req: Request,
@@ -14,7 +15,7 @@ export async function createIdempotencyKey(
.insert({ key: idempotencyKey }); .insert({ key: idempotencyKey });
if (error) { if (error) {
console.error("Failed to create idempotency key:", error); Logger.error(`Failed to create idempotency key: ${error}`);
throw error; throw error;
} }
@@ -1,6 +1,7 @@
import { Request } from "express"; import { Request } from "express";
import { supabase_service } from "../supabase"; import { supabase_service } from "../supabase";
import { validate as isUuid } from 'uuid'; import { validate as isUuid } from 'uuid';
import { Logger } from "../../../src/lib/logger";
export async function validateIdempotencyKey( export async function validateIdempotencyKey(
req: Request, req: Request,
@@ -13,7 +14,7 @@ export async function validateIdempotencyKey(
// Ensure idempotencyKey is treated as a string // Ensure idempotencyKey is treated as a string
const key = Array.isArray(idempotencyKey) ? idempotencyKey[0] : idempotencyKey; const key = Array.isArray(idempotencyKey) ? idempotencyKey[0] : idempotencyKey;
if (!isUuid(key)) { if (!isUuid(key)) {
console.error("Invalid idempotency key provided in the request headers."); Logger.debug("Invalid idempotency key provided in the request headers.");
return false; return false;
} }
@@ -23,7 +24,7 @@ export async function validateIdempotencyKey(
.eq("key", idempotencyKey); .eq("key", idempotencyKey);
if (error) { if (error) {
console.error(error); Logger.error(`Error validating idempotency key: ${error}`);
} }
if (!data || data.length === 0) { if (!data || data.length === 0) {
+2 -1
View File
@@ -1,4 +1,5 @@
import { supabase_service } from "../supabase"; import { supabase_service } from "../supabase";
import { Logger } from "../../../src/lib/logger";
import "dotenv/config"; import "dotenv/config";
export async function logCrawl(job_id: string, team_id: string) { export async function logCrawl(job_id: string, team_id: string) {
@@ -13,7 +14,7 @@ export async function logCrawl(job_id: string, team_id: string) {
}, },
]); ]);
} catch (error) { } catch (error) {
console.error("Error logging crawl job:\n", error); Logger.error(`Error logging crawl job to supabase:\n${error}`);
} }
} }
} }
+3 -2
View File
@@ -3,6 +3,7 @@ import { supabase_service } from "../supabase";
import { FirecrawlJob } from "../../types"; import { FirecrawlJob } from "../../types";
import { posthog } from "../posthog"; import { posthog } from "../posthog";
import "dotenv/config"; import "dotenv/config";
import { Logger } from "../../lib/logger";
export async function logJob(job: FirecrawlJob) { export async function logJob(job: FirecrawlJob) {
try { try {
@@ -68,9 +69,9 @@ export async function logJob(job: FirecrawlJob) {
posthog.capture(phLog); posthog.capture(phLog);
} }
if (error) { if (error) {
console.error("Error logging job:\n", error); Logger.error(`Error logging job: ${error.message}`);
} }
} catch (error) { } catch (error) {
console.error("Error logging job:\n", error); Logger.error(`Error logging job: ${error.message}`);
} }
} }
+4 -3
View File
@@ -2,15 +2,16 @@ import "dotenv/config";
import { ScrapeLog } from "../../types"; import { ScrapeLog } from "../../types";
import { supabase_service } from "../supabase"; import { supabase_service } from "../supabase";
import { PageOptions } from "../../lib/entities"; import { PageOptions } from "../../lib/entities";
import { Logger } from "../../lib/logger";
export async function logScrape( export async function logScrape(
scrapeLog: ScrapeLog, scrapeLog: ScrapeLog,
pageOptions?: PageOptions pageOptions?: PageOptions
) { ) {
if (process.env.USE_DB_AUTHENTICATION === "false") { if (process.env.USE_DB_AUTHENTICATION === "false") {
Logger.debug("Skipping logging scrape to Supabase");
return; return;
} }
try { try {
// Only log jobs in production // Only log jobs in production
// if (process.env.ENV !== "production") { // if (process.env.ENV !== "production") {
@@ -43,9 +44,9 @@ export async function logScrape(
]); ]);
if (error) { if (error) {
console.error("Error logging proxy:\n", error); Logger.error(`Error logging proxy:\n${error}`);
} }
} catch (error) { } catch (error) {
console.error("Error logging proxy:\n", error); Logger.error(`Error logging proxy:\n${error}`);
} }
} }
+4 -3
View File
@@ -1,19 +1,20 @@
import { Logtail } from "@logtail/node"; import { Logtail } from "@logtail/node";
import "dotenv/config"; import "dotenv/config";
import { Logger } from "../lib/logger";
// A mock Logtail class to handle cases where LOGTAIL_KEY is not provided // A mock Logtail class to handle cases where LOGTAIL_KEY is not provided
class MockLogtail { class MockLogtail {
info(message: string, context?: Record<string, any>): void { info(message: string, context?: Record<string, any>): void {
console.log(message, context); Logger.debug(`${message} - ${context}`);
} }
error(message: string, context: Record<string, any> = {}): void { error(message: string, context: Record<string, any> = {}): void {
console.error(message, context); Logger.error(`${message} - ${context}`);
} }
} }
// Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class // Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class
// Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided // Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided
export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => { export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => {
console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more."); Logger.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
return new MockLogtail(); return new MockLogtail();
})(); })();
@@ -2,6 +2,7 @@ import { supabase_service } from "../supabase";
import { withAuth } from "../../lib/withAuth"; import { withAuth } from "../../lib/withAuth";
import { Resend } from "resend"; import { Resend } from "resend";
import { NotificationType } from "../../types"; import { NotificationType } from "../../types";
import { Logger } from "../../../src/lib/logger";
const emailTemplates: Record< const emailTemplates: Record<
NotificationType, NotificationType,
@@ -52,11 +53,11 @@ async function sendEmailNotification(
}); });
if (error) { if (error) {
console.error("Error sending email: ", error); Logger.debug(`Error sending email: ${error}`);
return { success: false }; return { success: false };
} }
} catch (error) { } catch (error) {
console.error("Error sending email (2): ", error); Logger.debug(`Error sending email (2): ${error}`);
return { success: false }; return { success: false };
} }
} }
@@ -70,7 +71,28 @@ export async function sendNotificationInternal(
if (team_id === "preview") { if (team_id === "preview") {
return { success: true }; return { success: true };
} }
const fifteenDaysAgo = new Date();
fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15);
const { data, error } = await supabase_service const { data, error } = await supabase_service
.from("user_notifications")
.select("*")
.eq("team_id", team_id)
.eq("notification_type", notificationType)
.gte("sent_date", fifteenDaysAgo.toISOString());
if (error) {
Logger.debug(`Error fetching notifications: ${error}`);
return { success: false };
}
if (data.length !== 0) {
// Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} in the last 15 days`);
return { success: false };
}
const { data: recentData, error: recentError } = await supabase_service
.from("user_notifications") .from("user_notifications")
.select("*") .select("*")
.eq("team_id", team_id) .eq("team_id", team_id)
@@ -78,14 +100,16 @@ export async function sendNotificationInternal(
.gte("sent_date", startDateString) .gte("sent_date", startDateString)
.lte("sent_date", endDateString); .lte("sent_date", endDateString);
if (error) { if (recentError) {
console.error("Error fetching notifications: ", error); Logger.debug(`Error fetching recent notifications: ${recentError}`);
return { success: false }; return { success: false };
} }
if (data.length !== 0) { if (recentData.length !== 0) {
// Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} within the specified date range`);
return { success: false }; return { success: false };
} else { } else {
console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`);
// get the emails from the user with the team_id // get the emails from the user with the team_id
const { data: emails, error: emailsError } = await supabase_service const { data: emails, error: emailsError } = await supabase_service
.from("users") .from("users")
@@ -93,7 +117,7 @@ export async function sendNotificationInternal(
.eq("team_id", team_id); .eq("team_id", team_id);
if (emailsError) { if (emailsError) {
console.error("Error fetching emails: ", emailsError); Logger.debug(`Error fetching emails: ${emailsError}`);
return { success: false }; return { success: false };
} }
@@ -112,7 +136,7 @@ export async function sendNotificationInternal(
]); ]);
if (insertError) { if (insertError) {
console.error("Error inserting notification record: ", insertError); Logger.debug(`Error inserting notification record: ${insertError}`);
return { success: false }; return { success: false };
} }
+2 -1
View File
@@ -1,5 +1,6 @@
import { PostHog } from 'posthog-node'; import { PostHog } from 'posthog-node';
import "dotenv/config"; import "dotenv/config";
import { Logger } from '../../src/lib/logger';
export default function PostHogClient() { export default function PostHogClient() {
const posthogClient = new PostHog(process.env.POSTHOG_API_KEY, { const posthogClient = new PostHog(process.env.POSTHOG_API_KEY, {
@@ -19,7 +20,7 @@ class MockPostHog {
export const posthog = process.env.POSTHOG_API_KEY export const posthog = process.env.POSTHOG_API_KEY
? PostHogClient() ? PostHogClient()
: (() => { : (() => {
console.warn( Logger.warn(
"POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more." "POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more."
); );
return new MockPostHog(); return new MockPostHog();
+2 -1
View File
@@ -1,5 +1,6 @@
import Queue from "bull"; import Queue from "bull";
import { Queue as BullQueue } from "bull"; import { Queue as BullQueue } from "bull";
import { Logger } from "../lib/logger";
let webScraperQueue: BullQueue; let webScraperQueue: BullQueue;
@@ -16,7 +17,7 @@ export function getWebScraperQueue() {
attempts: 5 attempts: 5
} }
}); });
console.log("Web scraper queue created"); Logger.info("Web scraper queue created");
} }
return webScraperQueue; return webScraperQueue;
} }
+17 -7
View File
@@ -7,8 +7,10 @@ import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job"; import { logJob } from "./logging/log_job";
import { initSDK } from '@hyperdx/node-opentelemetry'; import { initSDK } from '@hyperdx/node-opentelemetry';
import { Job } from "bull"; import { Job } from "bull";
import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
if(process.env.ENV === 'production') { if (process.env.ENV === 'production') {
initSDK({ initSDK({
consoleCapture: true, consoleCapture: true,
additionalInstrumentations: [], additionalInstrumentations: [],
@@ -18,7 +20,8 @@ if(process.env.ENV === 'production') {
const wsq = getWebScraperQueue(); const wsq = getWebScraperQueue();
async function processJob(job: Job, done) { async function processJob(job: Job, done) {
console.log("taking job", job.id); Logger.debug(`🐂 Worker taking job ${job.id}`);
try { try {
job.progress({ job.progress({
current: 1, current: 1,
@@ -60,18 +63,18 @@ async function processJob(job: Job, done) {
pageOptions: job.data.pageOptions, pageOptions: job.data.pageOptions,
origin: job.data.origin, origin: job.data.origin,
}); });
console.log("job done", job.id); Logger.debug(`🐂 Job done ${job.id}`);
done(null, data); done(null, data);
} catch (error) { } catch (error) {
console.log("job errored", job.id, error); Logger.error(`🐂 Job errored ${job.id} - ${error}`);
if (await getWebScraperQueue().isPaused(false)) { if (await getWebScraperQueue().isPaused(false)) {
console.log("queue is paused, ignoring"); Logger.debug("🐂Queue is paused, ignoring");
return; return;
} }
if (error instanceof CustomError) { if (error instanceof CustomError) {
// Here we handle the error, then save the failed job // Here we handle the error, then save the failed job
console.error(error.message); // or any other error handling Logger.error(error.message); // or any other error handling
logtail.error("Custom error while ingesting", { logtail.error("Custom error while ingesting", {
job_id: job.id, job_id: job.id,
@@ -79,7 +82,7 @@ async function processJob(job: Job, done) {
dataIngestionJob: error.dataIngestionJob, dataIngestionJob: error.dataIngestionJob,
}); });
} }
console.log(error); Logger.error(error);
logtail.error("Overall error ingesting", { logtail.error("Overall error ingesting", {
job_id: job.id, job_id: job.id,
@@ -117,3 +120,10 @@ wsq.process(
Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)), Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
processJob processJob
); );
wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
+7 -6
View File
@@ -1,14 +1,15 @@
import Redis from "ioredis"; import Redis from "ioredis";
import { redisRateLimitClient } from "./rate-limiter"; import { redisRateLimitClient } from "./rate-limiter";
import { Logger } from "../lib/logger";
// Listen to 'error' events to the Redis connection // Listen to 'error' events to the Redis connection
redisRateLimitClient.on("error", (error) => { redisRateLimitClient.on("error", (error) => {
try { try {
if (error.message === "ECONNRESET") { if (error.message === "ECONNRESET") {
console.log("Connection to Redis Session Store timed out."); Logger.error("Connection to Redis Session Rate Limit Store timed out.");
} else if (error.message === "ECONNREFUSED") { } else if (error.message === "ECONNREFUSED") {
console.log("Connection to Redis Session Store refused!"); Logger.error("Connection to Redis Session Rate Limit Store refused!");
} else console.log(error); } else Logger.error(error);
} catch (error) {} } catch (error) {}
}); });
@@ -16,15 +17,15 @@ redisRateLimitClient.on("error", (error) => {
redisRateLimitClient.on("reconnecting", (err) => { redisRateLimitClient.on("reconnecting", (err) => {
try { try {
if (redisRateLimitClient.status === "reconnecting") if (redisRateLimitClient.status === "reconnecting")
console.log("Reconnecting to Redis Session Store..."); Logger.info("Reconnecting to Redis Session Rate Limit Store...");
else console.log("Error reconnecting to Redis Session Store."); else Logger.error("Error reconnecting to Redis Session Rate Limit Store.");
} catch (error) {} } catch (error) {}
}); });
// Listen to the 'connect' event to Redis // Listen to the 'connect' event to Redis
redisRateLimitClient.on("connect", (err) => { redisRateLimitClient.on("connect", (err) => {
try { try {
if (!err) console.log("Connected to Redis Session Store!"); if (!err) Logger.info("Connected to Redis Session Rate Limit Store!");
} catch (error) {} } catch (error) {}
}); });
+11 -5
View File
@@ -1,4 +1,5 @@
import { createClient, SupabaseClient } from "@supabase/supabase-js"; import { createClient, SupabaseClient } from "@supabase/supabase-js";
import { Logger } from "../lib/logger";
// SupabaseService class initializes the Supabase client conditionally based on environment variables. // SupabaseService class initializes the Supabase client conditionally based on environment variables.
class SupabaseService { class SupabaseService {
@@ -10,13 +11,13 @@ class SupabaseService {
// Only initialize the Supabase client if both URL and Service Token are provided. // Only initialize the Supabase client if both URL and Service Token are provided.
if (process.env.USE_DB_AUTHENTICATION === "false") { if (process.env.USE_DB_AUTHENTICATION === "false") {
// Warn the user that Authentication is disabled by setting the client to null // Warn the user that Authentication is disabled by setting the client to null
console.warn( Logger.warn(
"\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m" "Authentication is disabled. Supabase client will not be initialized."
); );
this.client = null; this.client = null;
} else if (!supabaseUrl || !supabaseServiceToken) { } else if (!supabaseUrl || !supabaseServiceToken) {
console.error( Logger.error(
"\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m" "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable"
); );
} else { } else {
this.client = createClient(supabaseUrl, supabaseServiceToken); this.client = createClient(supabaseUrl, supabaseServiceToken);
@@ -35,10 +36,15 @@ export const supabase_service: SupabaseClient = new Proxy(
new SupabaseService(), new SupabaseService(),
{ {
get: function (target, prop, receiver) { get: function (target, prop, receiver) {
if (process.env.USE_DB_AUTHENTICATION === "false") {
Logger.debug(
"Attempted to access Supabase client when it's not configured."
);
}
const client = target.getClient(); const client = target.getClient();
// If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
if (client === null) { if (client === null) {
console.error( Logger.error(
"Attempted to access Supabase client when it's not configured." "Attempted to access Supabase client when it's not configured."
); );
return () => { return () => {
+3 -8
View File
@@ -1,3 +1,4 @@
import { Logger } from "../../src/lib/logger";
import { supabase_service } from "./supabase"; import { supabase_service } from "./supabase";
export const callWebhook = async (teamId: string, jobId: string,data: any) => { export const callWebhook = async (teamId: string, jobId: string,data: any) => {
@@ -15,10 +16,7 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
.eq("team_id", teamId) .eq("team_id", teamId)
.limit(1); .limit(1);
if (error) { if (error) {
console.error( Logger.error(`Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`);
`Error fetching webhook URL for team ID: ${teamId}`,
error.message
);
return null; return null;
} }
@@ -53,9 +51,6 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
}), }),
}); });
} catch (error) { } catch (error) {
console.error( Logger.debug(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`);
`Error sending webhook for team ID: ${teamId}`,
error.message
);
} }
}; };
+2
View File
@@ -1,2 +1,4 @@
export const errorNoResults = export const errorNoResults =
"No results found, please check the URL or contact us at help@mendable.ai to file a ticket."; "No results found, please check the URL or contact us at help@mendable.ai to file a ticket.";
export const clientSideError = "client-side exception has occurred"
+17 -3
View File
@@ -5,7 +5,7 @@ the HTML content of a specified URL. It supports optional proxy settings and med
from os import environ from os import environ
from fastapi import FastAPI from fastapi import FastAPI, Response
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from playwright.async_api import Browser, async_playwright from playwright.async_api import Browser, async_playwright
from pydantic import BaseModel from pydantic import BaseModel
@@ -39,14 +39,28 @@ async def shutdown_event():
"""Event handler for application shutdown to close the browser.""" """Event handler for application shutdown to close the browser."""
await browser.close() await browser.close()
@app.get("/health/liveness")
def liveness_probe():
"""Endpoint for liveness probe."""
return JSONResponse(content={"status": "ok"}, status_code=200)
@app.get("/health/readiness")
async def readiness_probe():
"""Endpoint for readiness probe. Checks if the browser instance is ready."""
if browser:
return JSONResponse(content={"status": "ok"}, status_code=200)
return JSONResponse(content={"status": "Service Unavailable"}, status_code=503)
@app.post("/html") @app.post("/html")
async def root(body: UrlModel): async def root(body: UrlModel):
""" """
Endpoint to fetch and return HTML content of a given URL. Endpoint to fetch and return HTML content of a given URL.
Args: Args:
body (UrlModel): The URL model containing the target URL, wait time, and timeout. body (UrlModel): The URL model containing the target URL, wait time, and timeout.
Returns: Returns:
JSONResponse: The HTML content of the page. JSONResponse: The HTML content of the page.
""" """
+8 -2
View File
@@ -1,5 +1,6 @@
import { createClient, SupabaseClient } from "@supabase/supabase-js"; import { createClient, SupabaseClient } from "@supabase/supabase-js";
import "dotenv/config"; import "dotenv/config";
// SupabaseService class initializes the Supabase client conditionally based on environment variables. // SupabaseService class initializes the Supabase client conditionally based on environment variables.
class SupabaseService { class SupabaseService {
private client: SupabaseClient | null = null; private client: SupabaseClient | null = null;
@@ -11,12 +12,12 @@ class SupabaseService {
if (process.env.USE_DB_AUTHENTICATION === "false") { if (process.env.USE_DB_AUTHENTICATION === "false") {
// Warn the user that Authentication is disabled by setting the client to null // Warn the user that Authentication is disabled by setting the client to null
console.warn( console.warn(
"\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m" "Authentication is disabled. Supabase client will not be initialized."
); );
this.client = null; this.client = null;
} else if (!supabaseUrl || !supabaseServiceToken) { } else if (!supabaseUrl || !supabaseServiceToken) {
console.error( console.error(
"\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m" "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable"
); );
} else { } else {
this.client = createClient(supabaseUrl, supabaseServiceToken); this.client = createClient(supabaseUrl, supabaseServiceToken);
@@ -35,6 +36,11 @@ export const supabase_service: SupabaseClient = new Proxy(
new SupabaseService(), new SupabaseService(),
{ {
get: function (target, prop, receiver) { get: function (target, prop, receiver) {
if (process.env.USE_DB_AUTHENTICATION === "false") {
console.debug(
"Attempted to access Supabase client when it's not configured."
);
}
const client = target.getClient(); const client = target.getClient();
// If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
if (client === null) { if (client === null) {
-42
View File
@@ -1,42 +0,0 @@
#root {
max-width: 1280px;
margin: 0 auto;
padding: 2rem;
text-align: center;
}
.logo {
height: 6em;
padding: 1.5em;
will-change: filter;
transition: filter 300ms;
}
.logo:hover {
filter: drop-shadow(0 0 2em #646cffaa);
}
.logo.react:hover {
filter: drop-shadow(0 0 2em #61dafbaa);
}
@keyframes logo-spin {
from {
transform: rotate(0deg);
}
to {
transform: rotate(360deg);
}
}
@media (prefers-reduced-motion: no-preference) {
a:nth-of-type(2) .logo {
animation: logo-spin infinite 20s linear;
}
}
.card {
padding: 2em;
}
.read-the-docs {
color: #888;
}
-1
View File
@@ -1,4 +1,3 @@
import "./App.css";
import FirecrawlComponent from "./components/ingestion"; import FirecrawlComponent from "./components/ingestion";
function App() { function App() {
@@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="iconify iconify--logos" width="35.93" height="32" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 228"><path fill="#00D8FF" d="M210.483 73.824a171.49 171.49 0 0 0-8.24-2.597c.465-1.9.893-3.777 1.273-5.621c6.238-30.281 2.16-54.676-11.769-62.708c-13.355-7.7-35.196.329-57.254 19.526a171.23 171.23 0 0 0-6.375 5.848a155.866 155.866 0 0 0-4.241-3.917C100.759 3.829 77.587-4.822 63.673 3.233C50.33 10.957 46.379 33.89 51.995 62.588a170.974 170.974 0 0 0 1.892 8.48c-3.28.932-6.445 1.924-9.474 2.98C17.309 83.498 0 98.307 0 113.668c0 15.865 18.582 31.778 46.812 41.427a145.52 145.52 0 0 0 6.921 2.165a167.467 167.467 0 0 0-2.01 9.138c-5.354 28.2-1.173 50.591 12.134 58.266c13.744 7.926 36.812-.22 59.273-19.855a145.567 145.567 0 0 0 5.342-4.923a168.064 168.064 0 0 0 6.92 6.314c21.758 18.722 43.246 26.282 56.54 18.586c13.731-7.949 18.194-32.003 12.4-61.268a145.016 145.016 0 0 0-1.535-6.842c1.62-.48 3.21-.974 4.76-1.488c29.348-9.723 48.443-25.443 48.443-41.52c0-15.417-17.868-30.326-45.517-39.844Zm-6.365 70.984c-1.4.463-2.836.91-4.3 1.345c-3.24-10.257-7.612-21.163-12.963-32.432c5.106-11 9.31-21.767 12.459-31.957c2.619.758 5.16 1.557 7.61 2.4c23.69 8.156 38.14 20.213 38.14 29.504c0 9.896-15.606 22.743-40.946 31.14Zm-10.514 20.834c2.562 12.94 2.927 24.64 1.23 33.787c-1.524 8.219-4.59 13.698-8.382 15.893c-8.067 4.67-25.32-1.4-43.927-17.412a156.726 156.726 0 0 1-6.437-5.87c7.214-7.889 14.423-17.06 21.459-27.246c12.376-1.098 24.068-2.894 34.671-5.345a134.17 134.17 0 0 1 1.386 6.193ZM87.276 214.515c-7.882 2.783-14.16 2.863-17.955.675c-8.075-4.657-11.432-22.636-6.853-46.752a156.923 156.923 0 0 1 1.869-8.499c10.486 2.32 22.093 3.988 34.498 4.994c7.084 9.967 14.501 19.128 21.976 27.15a134.668 134.668 0 0 1-4.877 4.492c-9.933 8.682-19.886 14.842-28.658 17.94ZM50.35 144.747c-12.483-4.267-22.792-9.812-29.858-15.863c-6.35-5.437-9.555-10.836-9.555-15.216c0-9.322 13.897-21.212 37.076-29.293c2.813-.98 5.757-1.905 8.812-2.773c3.204 10.42 7.406 21.315 12.477 32.332c-5.137 11.18-9.399 22.249-12.634 32.792a134.718 134.718 0 0 1-6.318-1.979Zm12.378-84.26c-4.811-24.587-1.616-43.134 6.425-47.789c8.564-4.958 27.502 2.111 47.463 19.835a144.318 144.318 0 0 1 3.841 3.545c-7.438 7.987-14.787 17.08-21.808 26.988c-12.04 1.116-23.565 2.908-34.161 5.309a160.342 160.342 0 0 1-1.76-7.887Zm110.427 27.268a347.8 347.8 0 0 0-7.785-12.803c8.168 1.033 15.994 2.404 23.343 4.08c-2.206 7.072-4.956 14.465-8.193 22.045a381.151 381.151 0 0 0-7.365-13.322Zm-45.032-43.861c5.044 5.465 10.096 11.566 15.065 18.186a322.04 322.04 0 0 0-30.257-.006c4.974-6.559 10.069-12.652 15.192-18.18ZM82.802 87.83a323.167 323.167 0 0 0-7.227 13.238c-3.184-7.553-5.909-14.98-8.134-22.152c7.304-1.634 15.093-2.97 23.209-3.984a321.524 321.524 0 0 0-7.848 12.897Zm8.081 65.352c-8.385-.936-16.291-2.203-23.593-3.793c2.26-7.3 5.045-14.885 8.298-22.6a321.187 321.187 0 0 0 7.257 13.246c2.594 4.48 5.28 8.868 8.038 13.147Zm37.542 31.03c-5.184-5.592-10.354-11.779-15.403-18.433c4.902.192 9.899.29 14.978.29c5.218 0 10.376-.117 15.453-.343c-4.985 6.774-10.018 12.97-15.028 18.486Zm52.198-57.817c3.422 7.8 6.306 15.345 8.596 22.52c-7.422 1.694-15.436 3.058-23.88 4.071a382.417 382.417 0 0 0 7.859-13.026a347.403 347.403 0 0 0 7.425-13.565Zm-16.898 8.101a358.557 358.557 0 0 1-12.281 19.815a329.4 329.4 0 0 1-23.444.823c-7.967 0-15.716-.248-23.178-.732a310.202 310.202 0 0 1-12.513-19.846h.001a307.41 307.41 0 0 1-10.923-20.627a310.278 310.278 0 0 1 10.89-20.637l-.001.001a307.318 307.318 0 0 1 12.413-19.761c7.613-.576 15.42-.876 23.31-.876H128c7.926 0 15.743.303 23.354.883a329.357 329.357 0 0 1 12.335 19.695a358.489 358.489 0 0 1 11.036 20.54a329.472 329.472 0 0 1-11 20.722Zm22.56-122.124c8.572 4.944 11.906 24.881 6.52 51.026c-.344 1.668-.73 3.367-1.15 5.09c-10.622-2.452-22.155-4.275-34.23-5.408c-7.034-10.017-14.323-19.124-21.64-27.008a160.789 160.789 0 0 1 5.888-5.4c18.9-16.447 36.564-22.941 44.612-18.3ZM128 90.808c12.625 0 22.86 10.235 22.86 22.86s-10.235 22.86-22.86 22.86s-22.86-10.235-22.86-22.86s10.235-22.86 22.86-22.86Z"></path></svg>

Before

Width:  |  Height:  |  Size: 4.0 KiB

+130 -72
View File
@@ -15,11 +15,12 @@ import {
CollapsibleContent, CollapsibleContent,
CollapsibleTrigger, CollapsibleTrigger,
} from "@/components/ui/collapsible"; } from "@/components/ui/collapsible";
import { ChevronDown } from "lucide-react"; import { ChevronDown, ChevronLeft, ChevronRight } from "lucide-react";
// Hardcoded values (not recommended for production) //! Hardcoded values (not recommended for production)
//! Highly recommended to move all Firecrawl API calls to the backend (e.g. Next.js API route)
const FIRECRAWL_API_URL = "https://api.firecrawl.dev"; // Replace with your actual API URL whether it is local or using Firecrawl Cloud const FIRECRAWL_API_URL = "https://api.firecrawl.dev"; // Replace with your actual API URL whether it is local or using Firecrawl Cloud
const FIRECRAWL_API_KEY = ""; // Replace with your actual API key const FIRECRAWL_API_KEY = "fc-YOUR_API_KEY"; // Replace with your actual API key
interface FormData { interface FormData {
url: string; url: string;
@@ -100,7 +101,8 @@ export default function FirecrawlComponent() {
const [elapsedTime, setElapsedTime] = useState<number>(0); const [elapsedTime, setElapsedTime] = useState<number>(0);
const [showCrawlStatus, setShowCrawlStatus] = useState<boolean>(false); const [showCrawlStatus, setShowCrawlStatus] = useState<boolean>(false);
const [isScraping, setIsScraping] = useState<boolean>(false); const [isScraping, setIsScraping] = useState<boolean>(false);
const [showAllUrls, setShowAllUrls] = useState<boolean>(false); const [currentPage, setCurrentPage] = useState<number>(1);
const urlsPerPage = 10;
useEffect(() => { useEffect(() => {
let timer: NodeJS.Timeout; let timer: NodeJS.Timeout;
@@ -289,6 +291,7 @@ export default function FirecrawlComponent() {
const data: ScrapeResult = await response.json(); const data: ScrapeResult = await response.json();
newScrapeResults[url] = data; newScrapeResults[url] = data;
setCrawlStatus((prev) => ({ ...prev, current: index + 1 })); setCrawlStatus((prev) => ({ ...prev, current: index + 1 }));
setScrapeResults({ ...scrapeResults, ...newScrapeResults });
} catch (error) { } catch (error) {
console.error(`Error scraping ${url}:`, error); console.error(`Error scraping ${url}:`, error);
newScrapeResults[url] = { newScrapeResults[url] = {
@@ -312,22 +315,35 @@ export default function FirecrawlComponent() {
} }
} }
setScrapeResults(newScrapeResults);
setLoading(false); setLoading(false);
setIsScraping(false); setIsScraping(false);
}; };
const handlePageChange = (newPage: number) => {
setCurrentPage(newPage);
};
const paginatedUrls = crawledUrls.slice(
(currentPage - 1) * urlsPerPage,
currentPage * urlsPerPage
);
return ( return (
<div className="max-w-2xl mx-auto p-4"> <div className="max-w-2xl mx-auto p-4">
<Card> <Card>
<CardHeader className="flex items-center justify-between"> <CardHeader className="flex items-start justify-between mb-0 pb-4">
<CardTitle className="flex items-center space-x-2"> <CardTitle className="flex items-center justify-between w-full space-x-2">
<span>Extract web content with Firecrawl 🔥</span> <span className="text-base">Extract web content</span>
<a
href="https://www.firecrawl.dev"
className="text-xs text-gray-500 font-normal px-3 py-1 bg-zinc-100 rounded-xl hover:bg-zinc-200 transition-colors"
>
Powered by Firecrawl 🔥
</a>
</CardTitle> </CardTitle>
<div className="text-sm text-gray-500 w-11/12 items-center"> <div className="text-sm text-gray-500 w-11/12 items-center">
Use this component to quickly build your own UI for Firecrawl. Plug Use this component to quickly give your users the ability to connect
in your API key and the component will handle the rest. Learn more their AI apps to web data with Firecrawl. Learn more on the{" "}
on the{" "}
<a <a
href="https://docs.firecrawl.dev/" href="https://docs.firecrawl.dev/"
className="text-sm text-blue-500" className="text-sm text-blue-500"
@@ -347,7 +363,36 @@ export default function FirecrawlComponent() {
onChange={handleChange} onChange={handleChange}
/> />
<Button type="submit" variant="default" disabled={loading}> <Button type="submit" variant="default" disabled={loading}>
{loading ? "Running..." : "Run"} {loading ? (
<div
role="status"
className="flex items-center justify-between space-x-2"
>
<svg
className="animate-spin h-4 w-4 text-white"
xmlns="http://www.w3.org/2000/svg"
fill="none"
viewBox="0 0 24 24"
>
<circle
className="opacity-25"
cx="12"
cy="12"
r="10"
stroke="currentColor"
strokeWidth="4"
></circle>
<path
className="opacity-75"
fill="currentColor"
d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"
></path>
</svg>
<span className="sr-only">Loading...</span>
</div>
) : (
"Run"
)}
</Button> </Button>
</div> </div>
<Collapsible <Collapsible
@@ -361,7 +406,7 @@ export default function FirecrawlComponent() {
<ChevronDown className="h-4 w-4 opacity-50" /> <ChevronDown className="h-4 w-4 opacity-50" />
</Button> </Button>
</CollapsibleTrigger> </CollapsibleTrigger>
<CollapsibleContent className="space-y-4 mt-4"> <CollapsibleContent className="space-y-4 mt-4 px-2">
<div className="flex items-center space-x-2"> <div className="flex items-center space-x-2">
<Checkbox <Checkbox
id="crawlSubPages" id="crawlSubPages"
@@ -484,8 +529,8 @@ export default function FirecrawlComponent() {
className="text-sm cursor-pointer" className="text-sm cursor-pointer"
> >
{selectedUrls.length === crawledUrls.length {selectedUrls.length === crawledUrls.length
? "Unselect All" ? `Unselect All (${selectedUrls.length})`
: "Select All"} : `Select All (${selectedUrls.length})`}
</label> </label>
</> </>
)} )}
@@ -503,41 +548,57 @@ export default function FirecrawlComponent() {
!isScraping && ( !isScraping && (
<> <>
<ul className="pl-2"> <ul className="pl-2">
{(showAllUrls ? crawledUrls : crawledUrls.slice(0, 10)).map( {paginatedUrls.map((url, index) => (
(url, index) => ( <li
<li key={index} className="flex items-center space-x-2"> key={index}
<Checkbox className="flex items-center space-x-2 my-2 text-sm"
checked={selectedUrls.includes(url)}
onCheckedChange={() =>
setSelectedUrls((prev) =>
prev.includes(url)
? prev.filter((u) => u !== url)
: [...prev, url]
)
}
/>
<span>{url}</span>
</li>
)
)}
</ul>
{crawledUrls.length > 10 && (
<div className="flex justify-center mt-2">
<Button
variant="link"
onClick={() => setShowAllUrls(!showAllUrls)}
> >
{showAllUrls ? "Show Less" : "Show All"} <Checkbox
</Button> checked={selectedUrls.includes(url)}
</div> onCheckedChange={() =>
)} setSelectedUrls((prev) =>
prev.includes(url)
? prev.filter((u) => u !== url)
: [...prev, url]
)
}
/>
<span className="flex items-center max-w-lg">
{url.length > 70 ? `${url.slice(0, 70)}...` : url}
</span>
</li>
))}
</ul>
<div className="flex items-center justify-between mt-4">
<Button
variant="outline"
className="px-2"
onClick={() => handlePageChange(currentPage - 1)}
disabled={currentPage === 1}
>
<ChevronLeft className="h-5 w-5" />
</Button>
<span className="text-sm text-gray-500">
Page {currentPage} of{" "}
{Math.ceil(crawledUrls.length / urlsPerPage)}
</span>
<Button
variant="outline"
className="px-2"
onClick={() => handlePageChange(currentPage + 1)}
disabled={currentPage * urlsPerPage >= crawledUrls.length}
>
<ChevronRight className="h-5 w-5 " />
</Button>
</div>
</> </>
)} )}
</CardContent> </CardContent>
<CardFooter className="flex justify-center"> <CardFooter className="w-full flex justify-center">
{crawledUrls.length > 0 && !scrapingSelectedLoading && ( {crawledUrls.length > 0 && !scrapingSelectedLoading && (
<Button <Button
variant="default" variant="default"
className="w-full"
onClick={handleScrapeSelected} onClick={handleScrapeSelected}
disabled={loading || selectedUrls.length === 0} disabled={loading || selectedUrls.length === 0}
> >
@@ -549,45 +610,42 @@ export default function FirecrawlComponent() {
{Object.keys(scrapeResults).length > 0 && ( {Object.keys(scrapeResults).length > 0 && (
<div className="mt-4"> <div className="mt-4">
<h2 className="text-2xl font-bold mb-4">Scrape Results</h2> <h2 className="text-base font-bold ">Scrape Results</h2>
<div className="grid grid-cols-2 gap-4"> <p className="text-sm text-gray-500">
You can do whatever you want with the scrape results. Here is a
basic showcase of the markdown.
</p>
<div className="flex flex-col gap-4 mt-4 w-full">
{Object.entries(scrapeResults).map(([url, result]) => ( {Object.entries(scrapeResults).map(([url, result]) => (
<Card key={url} className="overflow-hidden"> <Card key={url} className="relative p-4 w-full">
<CardContent> <CardTitle className="text-sm font-normal flex flex-col">
<div className="max-h-60 overflow-y-auto"> <span>{result.data.metadata.title}</span>
<div className="text-base font-bold py-2"> <span className="text-xs text-gray-500">
{url {url
.replace(/^(https?:\/\/)?(www\.)?/, "") .replace(/^(https?:\/\/)?(www\.)?/, "")
.replace(/\/$/, "")} .replace(/\/$/, "")}
</div> </span>
</CardTitle>
<CardContent className="relative px-0 pt-2 !text-xs w-full">
<div className=" overflow-y-auto h-32 bg-zinc-100 rounded-md p-2 w-full">
{result.success ? ( {result.success ? (
<> <>
<pre className="text-xs whitespace-pre-wrap"> <pre className="text-xs whitespace-pre-wrap">
{result.data.markdown.trim().slice(0, 200)}... {result.data.markdown.trim()}
</pre> </pre>
</> </>
) : ( ) : (
<p className="text-red-500">Failed to scrape this URL</p> <>
<p className="text-red-500">
Failed to scrape this URL
</p>
<p className="text-zinc-500 font-mono">
{result.toString()}
</p>
</>
)} )}
</div> </div>
</CardContent> </CardContent>
<CardFooter className="flex justify-center">
<Button
variant="outline"
size="sm"
onClick={(event) => {
navigator.clipboard.writeText(result.data.markdown);
const button = event.currentTarget as HTMLButtonElement;
const originalText = button.textContent;
button.textContent = "Copied!";
setTimeout(() => {
button.textContent = originalText;
}, 2000);
}}
>
Copy Markdown
</Button>
</CardFooter>
</Card> </Card>
))} ))}
</div> </div>
+2
View File
@@ -13,6 +13,8 @@ x-common-service: &common-service
- PORT=${PORT:-3002} - PORT=${PORT:-3002}
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
- OPENAI_API_KEY=${OPENAI_API_KEY} - OPENAI_API_KEY=${OPENAI_API_KEY}
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
- MODEL_NAME=${MODEL_NAME:-gpt-4o}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- SERPER_API_KEY=${SERPER_API_KEY} - SERPER_API_KEY=${SERPER_API_KEY}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY} - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
@@ -4,12 +4,12 @@
2. Build Docker images, and host it in your Docker Registry (replace the target registry with your own) 2. Build Docker images, and host it in your Docker Registry (replace the target registry with your own)
1. API (which is also used as a worker image) 1. API (which is also used as a worker image)
1. ```bash 1. ```bash
docker build -t ghcr.io/winkk-dev/firecrawl:latest ../../apps/api docker build --no-cache -t ghcr.io/winkk-dev/firecrawl:latest ../../../apps/api
docker push ghcr.io/winkk-dev/firecrawl:latest docker push ghcr.io/winkk-dev/firecrawl:latest
``` ```
2. Playwright 2. Playwright
1. ```bash 1. ```bash
docker build -t ghcr.io/winkk-dev/firecrawl-playwright:latest ../../apps/playwright-service docker build --no-cache -t ghcr.io/winkk-dev/firecrawl-playwright:latest ../../../apps/playwright-service
docker push ghcr.io/winkk-dev/firecrawl-playwright:latest docker push ghcr.io/winkk-dev/firecrawl-playwright:latest
``` ```
3. Replace the image in [worker.yaml](worker.yaml), [api.yaml](api.yaml) and [playwright-service.yaml](playwright-service.yaml) 3. Replace the image in [worker.yaml](worker.yaml), [api.yaml](api.yaml) and [playwright-service.yaml](playwright-service.yaml)
+29 -10
View File
@@ -15,16 +15,35 @@ spec:
imagePullSecrets: imagePullSecrets:
- name: docker-registry-secret - name: docker-registry-secret
containers: containers:
- name: api - name: api
image: ghcr.io/winkk-dev/firecrawl:latest image: ghcr.io/winkk-dev/firecrawl:latest
args: [ "pnpm", "run", "start:production" ] imagePullPolicy: Always
ports: args: [ "pnpm", "run", "start:production" ]
- containerPort: 3002 ports:
envFrom: - containerPort: 3002
- configMapRef: envFrom:
name: firecrawl-config - configMapRef:
- secretRef: name: firecrawl-config
name: firecrawl-secret #- secretRef:
# name: firecrawl-secret
livenessProbe:
httpGet:
path: /v0/health/liveness
port: 3002
initialDelaySeconds: 30
periodSeconds: 30
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 3
readinessProbe:
httpGet:
path: /v0/health/readiness
port: 3002
initialDelaySeconds: 30
periodSeconds: 30
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 3
--- ---
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
@@ -7,8 +7,7 @@ data:
PORT: "3002" PORT: "3002"
HOST: "0.0.0.0" HOST: "0.0.0.0"
REDIS_URL: "redis://redis:6379" REDIS_URL: "redis://redis:6379"
PLAYWRIGHT_MICROSERVICE_URL: "http://playwright-service:3000" REDIS_RATE_LIMIT_URL: "redis://redis:6379"
PLAYWRIGHT_MICROSERVICE_URL: "http://playwright-service:3000/html"
USE_DB_AUTHENTICATION: "false" USE_DB_AUTHENTICATION: "false"
SUPABASE_ANON_TOKEN: "" HDX_NODE_BETA_MODE: "1"
SUPABASE_URL: ""
SUPABASE_SERVICE_TOKEN: ""
@@ -1,3 +1,10 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: playwright-service-config
data:
PORT: "3000"
---
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
@@ -15,13 +22,32 @@ spec:
imagePullSecrets: imagePullSecrets:
- name: docker-registry-secret - name: docker-registry-secret
containers: containers:
- name: playwright-service - name: playwright-service
image: ghcr.io/winkk-dev/firecrawl-playwright:latest image: ghcr.io/winkk-dev/firecrawl-playwright:latest
ports: imagePullPolicy: Always
- containerPort: 3000 ports:
envFrom: - containerPort: 3000
- configMapRef: envFrom:
name: firecrawl-config - configMapRef:
name: playwright-service-config
livenessProbe:
httpGet:
path: /health/liveness
port: 3000
initialDelaySeconds: 30
periodSeconds: 30
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 3
readinessProbe:
httpGet:
path: /health/readiness
port: 3000
initialDelaySeconds: 30
periodSeconds: 30
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 3
--- ---
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
@@ -15,10 +15,12 @@ spec:
imagePullSecrets: imagePullSecrets:
- name: docker-registry-secret - name: docker-registry-secret
containers: containers:
- name: worker - name: worker
image: ghcr.io/winkk-dev/firecrawl:latest image: ghcr.io/winkk-dev/firecrawl:latest
envFrom: imagePullPolicy: Always
- configMapRef: args: [ "pnpm", "run", "workers" ]
name: firecrawl-config envFrom:
- secretRef: - configMapRef:
name: firecrawl-secret name: firecrawl-config
#- secretRef:
# name: firecrawl-secret