diff --git a/apps/api/.env.example b/apps/api/.env.example index 5d8e746d..08ff7d7f 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -57,3 +57,14 @@ SELF_HOSTED_WEBHOOK_URL= # Resend API Key for transactional emails RESEND_API_KEY= + +# LOGGING_LEVEL determines the verbosity of logs that the system will output. +# Available levels are: +# NONE - No logs will be output. +# ERROR - For logging error messages that indicate a failure in a specific operation. +# WARN - For logging potentially harmful situations that are not necessarily errors. +# INFO - For logging informational messages that highlight the progress of the application. +# DEBUG - For logging detailed information on the flow through the system, primarily used for debugging. +# TRACE - For logging more detailed information than the DEBUG level. +# Set LOGGING_LEVEL to one of the above options to control logging output. +LOGGING_LEVEL=INFO diff --git a/apps/api/.gitignore b/apps/api/.gitignore index 66bccfed..edc2faf4 100644 --- a/apps/api/.gitignore +++ b/apps/api/.gitignore @@ -3,4 +3,6 @@ .env *.csv dump.rdb -/mongo-data \ No newline at end of file +/mongo-data + +/.next/ diff --git a/apps/api/fly.toml b/apps/api/fly.toml index 286bc086..94108e5f 100644 --- a/apps/api/fly.toml +++ b/apps/api/fly.toml @@ -4,15 +4,15 @@ # app = 'firecrawl-scraper-js' -primary_region = 'mia' +primary_region = 'iad' kill_signal = 'SIGINT' kill_timeout = '30s' [build] [processes] - app = 'node --max-old-space-size=4096 dist/src/index.js' - worker = 'node --max-old-space-size=4096 dist/src/services/queue-worker.js' + app = 'node --max-old-space-size=8192 dist/src/index.js' + worker = 'node --max-old-space-size=8192 dist/src/services/queue-worker.js' [http_service] internal_port = 8080 @@ -24,8 +24,8 @@ kill_timeout = '30s' [http_service.concurrency] type = "requests" - hard_limit = 100 - soft_limit = 50 + hard_limit = 200 + soft_limit = 75 [[http_service.checks]] grace_period = "20s" diff --git a/apps/api/package.json b/apps/api/package.json index da1b2b33..15e97377 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -26,6 +26,7 @@ "license": "ISC", "devDependencies": { "@flydotio/dockerfile": "^0.4.10", + "@jest/globals": "^29.7.0", "@tsconfig/recommended": "^1.0.3", "@types/body-parser": "^1.19.2", "@types/bull": "^4.10.0", @@ -63,6 +64,7 @@ "axios": "^1.3.4", "bottleneck": "^2.19.5", "bull": "^4.15.0", + "cacheable-lookup": "^6.1.0", "cheerio": "^1.0.0-rc.12", "cohere": "^1.1.1", "cors": "^2.8.5", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 02d8363b..ec83e18b 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -59,6 +59,9 @@ importers: bull: specifier: ^4.15.0 version: 4.15.0 + cacheable-lookup: + specifier: ^6.1.0 + version: 6.1.0 cheerio: specifier: ^1.0.0-rc.12 version: 1.0.0-rc.12 @@ -189,6 +192,9 @@ importers: '@flydotio/dockerfile': specifier: ^0.4.10 version: 0.4.11 + '@jest/globals': + specifier: ^29.7.0 + version: 29.7.0 '@tsconfig/recommended': specifier: ^1.0.3 version: 1.0.6 @@ -1937,6 +1943,10 @@ packages: resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==} engines: {node: '>= 0.8'} + cacheable-lookup@6.1.0: + resolution: {integrity: sha512-KJ/Dmo1lDDhmW2XDPMo+9oiy/CeqosPguPCrgcVzKyZrL6pM1gU2GmPY/xo6OQPTUaA/c0kwHuywB4E6nmT9ww==} + engines: {node: '>=10.6.0'} + call-bind@1.0.7: resolution: {integrity: sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==} engines: {node: '>= 0.4'} @@ -4369,8 +4379,8 @@ packages: engines: {node: '>=14.17'} hasBin: true - typescript@5.5.3: - resolution: {integrity: sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==} + typescript@5.5.4: + resolution: {integrity: sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==} engines: {node: '>=14.17'} hasBin: true @@ -6917,6 +6927,8 @@ snapshots: bytes@3.1.2: {} + cacheable-lookup@6.1.0: {} + call-bind@1.0.7: dependencies: es-define-property: 1.0.0 @@ -8927,7 +8939,7 @@ snapshots: csv-parse: 5.5.6 gpt3-tokenizer: 1.1.5 openai: 3.3.0 - typescript: 5.5.3 + typescript: 5.5.4 uuid: 9.0.1 zod: 3.23.8 transitivePeerDependencies: @@ -9519,7 +9531,7 @@ snapshots: typescript@5.4.5: {} - typescript@5.5.3: {} + typescript@5.5.4: {} typesense@1.8.2(@babel/runtime@7.24.6): dependencies: diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts index 3e324d39..019bc968 100644 --- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -858,7 +858,6 @@ describe("E2E Tests for API Routes", () => { await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again } } - console.log(crawlData) expect(crawlData.length).toBeGreaterThan(0); expect(crawlData).toEqual(expect.arrayContaining([ expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }), diff --git a/apps/api/src/controllers/admin/queue.ts b/apps/api/src/controllers/admin/queue.ts new file mode 100644 index 00000000..cb5f99ed --- /dev/null +++ b/apps/api/src/controllers/admin/queue.ts @@ -0,0 +1,87 @@ +import { Request, Response } from "express"; + +import { Job } from "bull"; +import { Logger } from "../../lib/logger"; +import { getWebScraperQueue } from "../../services/queue-service"; +import { checkAlerts } from "../../services/alerts"; + +export async function cleanBefore24hCompleteJobsController( + req: Request, + res: Response +) { + Logger.info("🐂 Cleaning jobs older than 24h"); + try { + const webScraperQueue = getWebScraperQueue(); + const batchSize = 10; + const numberOfBatches = 9; // Adjust based on your needs + const completedJobsPromises: Promise[] = []; + for (let i = 0; i < numberOfBatches; i++) { + completedJobsPromises.push( + webScraperQueue.getJobs( + ["completed"], + i * batchSize, + i * batchSize + batchSize, + true + ) + ); + } + const completedJobs: Job[] = ( + await Promise.all(completedJobsPromises) + ).flat(); + const before24hJobs = + completedJobs.filter( + (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 + ) || []; + + let count = 0; + + if (!before24hJobs) { + return res.status(200).send(`No jobs to remove.`); + } + + for (const job of before24hJobs) { + try { + await job.remove(); + count++; + } catch (jobError) { + Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`); + } + } + return res.status(200).send(`Removed ${count} completed jobs.`); + } catch (error) { + Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`); + return res.status(500).send("Failed to clean jobs"); + } +} + + +export async function checkQueuesController(req: Request, res: Response) { + try { + await checkAlerts(); + return res.status(200).send("Alerts initialized"); + } catch (error) { + Logger.debug(`Failed to initialize alerts: ${error}`); + return res.status(500).send("Failed to initialize alerts"); + } + } + + // Use this as a "health check" that way we dont destroy the server +export async function queuesController(req: Request, res: Response) { + try { + const webScraperQueue = getWebScraperQueue(); + + const [webScraperActive] = await Promise.all([ + webScraperQueue.getActiveCount(), + ]); + + const noActiveJobs = webScraperActive === 0; + // 200 if no active jobs, 503 if there are active jobs + return res.status(noActiveJobs ? 200 : 500).json({ + webScraperActive, + noActiveJobs, + }); + } catch (error) { + Logger.error(error); + return res.status(500).json({ error: error.message }); + } + } \ No newline at end of file diff --git a/apps/api/src/controllers/admin/redis-health.ts b/apps/api/src/controllers/admin/redis-health.ts new file mode 100644 index 00000000..e35d6db9 --- /dev/null +++ b/apps/api/src/controllers/admin/redis-health.ts @@ -0,0 +1,86 @@ +import { Request, Response } from "express"; +import Redis from "ioredis"; +import { Logger } from "../../lib/logger"; +import { sendSlackWebhook } from "../../services/alerts/slack"; +import { redisRateLimitClient } from "../../services/rate-limiter"; + +export async function redisHealthController(req: Request, res: Response) { + const retryOperation = async (operation, retries = 3) => { + for (let attempt = 1; attempt <= retries; attempt++) { + try { + return await operation(); + } catch (error) { + if (attempt === retries) throw error; + Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`); + await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying + } + } + }; + + try { + const queueRedis = new Redis(process.env.REDIS_URL); + + const testKey = "test"; + const testValue = "test"; + + // Test queueRedis + let queueRedisHealth; + try { + await retryOperation(() => queueRedis.set(testKey, testValue)); + queueRedisHealth = await retryOperation(() => queueRedis.get(testKey)); + await retryOperation(() => queueRedis.del(testKey)); + } catch (error) { + Logger.error(`queueRedis health check failed: ${error}`); + queueRedisHealth = null; + } + + // Test redisRateLimitClient + let redisRateLimitHealth; + try { + await retryOperation(() => redisRateLimitClient.set(testKey, testValue)); + redisRateLimitHealth = await retryOperation(() => + redisRateLimitClient.get(testKey) + ); + await retryOperation(() => redisRateLimitClient.del(testKey)); + } catch (error) { + Logger.error(`redisRateLimitClient health check failed: ${error}`); + redisRateLimitHealth = null; + } + + const healthStatus = { + queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy", + redisRateLimitClient: + redisRateLimitHealth === testValue ? "healthy" : "unhealthy", + }; + + if ( + healthStatus.queueRedis === "healthy" && + healthStatus.redisRateLimitClient === "healthy" + ) { + Logger.info("Both Redis instances are healthy"); + return res.status(200).json({ status: "healthy", details: healthStatus }); + } else { + Logger.info( + `Redis instances health check: ${JSON.stringify(healthStatus)}` + ); + await sendSlackWebhook( + `[REDIS DOWN] Redis instances health check: ${JSON.stringify( + healthStatus + )}`, + true + ); + return res + .status(500) + .json({ status: "unhealthy", details: healthStatus }); + } + } catch (error) { + Logger.error(`Redis health check failed: ${error}`); + await sendSlackWebhook( + `[REDIS DOWN] Redis instances health check: ${error.message}`, + true + ); + return res + .status(500) + .json({ status: "unhealthy", message: error.message }); + } +} diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 56a5ec61..5dff80b8 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -6,6 +6,7 @@ import { withAuth } from "../../src/lib/withAuth"; import { RateLimiterRedis } from "rate-limiter-flexible"; import { setTraceAttributes } from '@hyperdx/node-opentelemetry'; import { sendNotification } from "../services/notification/email_notification"; +import { Logger } from "../lib/logger"; export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise { return withAuth(supaAuthenticateUser)(req, res, mode); @@ -17,7 +18,7 @@ function setTrace(team_id: string, api_key: string) { api_key }); } catch (error) { - console.error('Error setting trace attributes:', error); + Logger.error(`Error setting trace attributes: ${error.message}`); } } @@ -82,12 +83,15 @@ export async function supaAuthenticateUser( // $$ language plpgsql; if (error) { - console.error('Error fetching key and price_id:', error); + Logger.warn(`Error fetching key and price_id: ${error.message}`); } else { // console.log('Key and Price ID:', data); } + + if (error || !data || data.length === 0) { + Logger.warn(`Error fetching api key: ${error.message} or data is empty`); return { success: false, error: "Unauthorized: Invalid token", @@ -135,7 +139,7 @@ export async function supaAuthenticateUser( try { await rateLimiter.consume(team_endpoint_token); } catch (rateLimiterRes) { - console.error(rateLimiterRes); + Logger.error(`Rate limit exceeded: ${rateLimiterRes}`); const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1; const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext); @@ -177,7 +181,10 @@ export async function supaAuthenticateUser( .select("*") .eq("key", normalizedApi); + + if (error || !data || data.length === 0) { + Logger.warn(`Error fetching api key: ${error.message} or data is empty`); return { success: false, error: "Unauthorized: Invalid token", @@ -190,7 +197,6 @@ export async function supaAuthenticateUser( return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""}; } - function getPlanByPriceId(price_id: string) { switch (price_id) { case process.env.STRIPE_PRICE_ID_STARTER: @@ -199,11 +205,14 @@ function getPlanByPriceId(price_id: string) { return 'standard'; case process.env.STRIPE_PRICE_ID_SCALE: return 'scale'; - case process.env.STRIPE_PRICE_ID_HOBBY || process.env.STRIPE_PRICE_ID_HOBBY_YEARLY: + case process.env.STRIPE_PRICE_ID_HOBBY: + case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY: return 'hobby'; - case process.env.STRIPE_PRICE_ID_STANDARD_NEW || process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY: + case process.env.STRIPE_PRICE_ID_STANDARD_NEW: + case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY: return 'standardnew'; - case process.env.STRIPE_PRICE_ID_GROWTH || process.env.STRIPE_PRICE_ID_GROWTH_YEARLY: + case process.env.STRIPE_PRICE_ID_GROWTH: + case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY: return 'growth'; default: return 'free'; diff --git a/apps/api/src/controllers/crawl-cancel.ts b/apps/api/src/controllers/crawl-cancel.ts index ff4b2c58..d0c109ec 100644 --- a/apps/api/src/controllers/crawl-cancel.ts +++ b/apps/api/src/controllers/crawl-cancel.ts @@ -5,6 +5,7 @@ import { addWebScraperJob } from "../../src/services/queue-jobs"; import { getWebScraperQueue } from "../../src/services/queue-service"; import { supabase_service } from "../../src/services/supabase"; import { billTeam } from "../../src/services/billing/credit_billing"; +import { Logger } from "../../src/lib/logger"; export async function crawlCancelController(req: Request, res: Response) { try { @@ -43,7 +44,7 @@ export async function crawlCancelController(req: Request, res: Response) { const { partialDocs } = await job.progress(); if (partialDocs && partialDocs.length > 0 && jobState === "active") { - console.log("Billing team for partial docs..."); + Logger.info("Billing team for partial docs..."); // Note: the credits that we will bill them here might be lower than the actual // due to promises that are not yet resolved await billTeam(team_id, partialDocs.length); @@ -55,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) { await job.discard(); await job.moveToFailed(Error("Job cancelled by user"), true); } catch (error) { - console.error(error); + Logger.error(error); } const newJobState = await job.getState(); @@ -64,7 +65,7 @@ export async function crawlCancelController(req: Request, res: Response) { status: "cancelled" }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/crawl-status.ts index a55003cc..5aafa433 100644 --- a/apps/api/src/controllers/crawl-status.ts +++ b/apps/api/src/controllers/crawl-status.ts @@ -4,6 +4,7 @@ import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; import { getWebScraperQueue } from "../../src/services/queue-service"; import { supabaseGetJobById } from "../../src/lib/supabase-jobs"; +import { Logger } from "../../src/lib/logger"; export async function crawlStatusController(req: Request, res: Response) { try { @@ -44,7 +45,7 @@ export async function crawlStatusController(req: Request, res: Response) { partial_data: jobStatus == 'completed' ? [] : partialDocs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 89358fcc..9480c63b 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -10,6 +10,8 @@ import { logCrawl } from "../../src/services/logging/crawl_log"; import { validateIdempotencyKey } from "../../src/services/idempotency/validate"; import { createIdempotencyKey } from "../../src/services/idempotency/create"; import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values"; +import { v4 as uuidv4 } from "uuid"; +import { Logger } from "../../src/lib/logger"; export async function crawlController(req: Request, res: Response) { try { @@ -30,7 +32,7 @@ export async function crawlController(req: Request, res: Response) { try { createIdempotencyKey(req); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } @@ -60,10 +62,11 @@ export async function crawlController(req: Request, res: Response) { const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions }; const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions }; - if (mode === "single_urls" && !url.includes(",")) { + if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? try { const a = new WebScraperDataProvider(); await a.setOptions({ + jobId: uuidv4(), mode: "single_urls", urls: [url], crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true }, @@ -83,7 +86,7 @@ export async function crawlController(req: Request, res: Response) { documents: docs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } @@ -101,7 +104,7 @@ export async function crawlController(req: Request, res: Response) { res.json({ jobId: job.id }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 2c3dc4ea..7c5c804d 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -3,6 +3,7 @@ import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; +import { Logger } from "../../src/lib/logger"; export async function crawlPreviewController(req: Request, res: Response) { try { @@ -39,7 +40,7 @@ export async function crawlPreviewController(req: Request, res: Response) { res.json({ jobId: job.id }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/liveness.ts b/apps/api/src/controllers/liveness.ts new file mode 100644 index 00000000..8ff1a96f --- /dev/null +++ b/apps/api/src/controllers/liveness.ts @@ -0,0 +1,6 @@ +import { Request, Response } from "express"; + +export async function livenessController(req: Request, res: Response) { + //TODO: add checks if the application is live and healthy like checking the redis connection + res.status(200).json({ status: "ok" }); +} diff --git a/apps/api/src/controllers/readiness.ts b/apps/api/src/controllers/readiness.ts new file mode 100644 index 00000000..cdb1f02c --- /dev/null +++ b/apps/api/src/controllers/readiness.ts @@ -0,0 +1,6 @@ +import { Request, Response } from "express"; + +export async function readinessController(req: Request, res: Response) { + // TODO: add checks when the application is ready to serve traffic + res.status(200).json({ status: "ok" }); +} diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 267fdb50..7b4ccfd1 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -12,8 +12,11 @@ import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOri import { addWebScraperJob } from '../services/queue-jobs'; import { getWebScraperQueue } from '../services/queue-service'; import { supabase_service } from '../services/supabase'; +import { v4 as uuidv4 } from "uuid"; +import { Logger } from '../lib/logger'; export async function scrapeHelper( + jobId: string, req: Request, team_id: string, crawlerOptions: any, @@ -148,7 +151,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(402).json({ error: "Insufficient credits" }); } } catch (error) { - console.error(error); + Logger.error(error); earlyReturn = true; return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." }); } @@ -163,8 +166,11 @@ export async function scrapeController(req: Request, res: Response) { checkCredits(); } + const jobId = uuidv4(); + const startTime = new Date().getTime(); const result = await scrapeHelper( + jobId, req, team_id, crawlerOptions, @@ -205,6 +211,7 @@ export async function scrapeController(req: Request, res: Response) { } logJob({ + job_id: jobId, success: result.success, message: result.error, num_docs: 1, @@ -224,7 +231,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(result.returnCode).json(result); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 8cb6d55b..dfd9b8b9 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -7,8 +7,11 @@ import { logJob } from "../services/logging/log_job"; import { PageOptions, SearchOptions } from "../lib/entities"; import { search } from "../search"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; +import { v4 as uuidv4 } from "uuid"; +import { Logger } from "../lib/logger"; export async function searchHelper( + jobId: string, req: Request, team_id: string, crawlerOptions: any, @@ -75,6 +78,7 @@ export async function searchHelper( const a = new WebScraperDataProvider(); await a.setOptions({ + jobId, mode: "single_urls", urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7), crawlerOptions: { @@ -148,6 +152,8 @@ export async function searchController(req: Request, res: Response) { const searchOptions = req.body.searchOptions ?? { limit: 7 }; + const jobId = uuidv4(); + try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1); @@ -155,11 +161,12 @@ export async function searchController(req: Request, res: Response) { return res.status(402).json({ error: "Insufficient credits" }); } } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: "Internal server error" }); } const startTime = new Date().getTime(); const result = await searchHelper( + jobId, req, team_id, crawlerOptions, @@ -169,6 +176,7 @@ export async function searchController(req: Request, res: Response) { const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; logJob({ + job_id: jobId, success: result.success, message: result.error, num_docs: result.data ? result.data.length : 0, @@ -183,7 +191,7 @@ export async function searchController(req: Request, res: Response) { }); return res.status(result.returnCode).json(result); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/status.ts index 231885f4..3d7fccbb 100644 --- a/apps/api/src/controllers/status.ts +++ b/apps/api/src/controllers/status.ts @@ -1,6 +1,7 @@ import { Request, Response } from "express"; import { getWebScraperQueue } from "../../src/services/queue-service"; import { supabaseGetJobById } from "../../src/lib/supabase-jobs"; +import { Logger } from "../../src/lib/logger"; export async function crawlJobStatusPreviewController(req: Request, res: Response) { try { @@ -35,7 +36,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons partial_data: jobStatus == 'completed' ? [] : partialDocs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/example.ts b/apps/api/src/example.ts index 0c30ea33..edf0faef 100644 --- a/apps/api/src/example.ts +++ b/apps/api/src/example.ts @@ -4,6 +4,7 @@ async function example() { const example = new WebScraperDataProvider(); await example.setOptions({ + jobId: "TEST", mode: "crawl", urls: ["https://mendable.ai"], crawlerOptions: {}, diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 88ec4418..ebe6ef38 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -7,21 +7,30 @@ import { v0Router } from "./routes/v0"; import { initSDK } from "@hyperdx/node-opentelemetry"; import cluster from "cluster"; import os from "os"; -import { Job } from "bull"; -import { sendSlackWebhook } from "./services/alerts/slack"; -import { checkAlerts } from "./services/alerts"; -import Redis from "ioredis"; -import { redisRateLimitClient } from "./services/rate-limiter"; +import { Logger } from "./lib/logger"; +import { adminRouter } from "./routes/admin"; +import { ScrapeEvents } from "./lib/scrape-events"; +import http from 'node:http'; +import https from 'node:https'; +import CacheableLookup from 'cacheable-lookup'; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { ExpressAdapter } = require("@bull-board/express"); const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; -console.log(`Number of CPUs: ${numCPUs} available`); +Logger.info(`Number of CPUs: ${numCPUs} available`); + +const cacheable = new CacheableLookup({ + // this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme + lookup:false +}); + +cacheable.install(http.globalAgent); +cacheable.install(https.globalAgent) if (cluster.isMaster) { - console.log(`Master ${process.pid} is running`); + Logger.info(`Master ${process.pid} is running`); // Fork workers. for (let i = 0; i < numCPUs; i++) { @@ -30,8 +39,8 @@ if (cluster.isMaster) { cluster.on("exit", (worker, code, signal) => { if (code !== null) { - console.log(`Worker ${worker.process.pid} exited`); - console.log("Starting a new worker"); + Logger.info(`Worker ${worker.process.pid} exited`); + Logger.info("Starting a new worker"); cluster.fork(); } }); @@ -45,7 +54,6 @@ if (cluster.isMaster) { app.use(cors()); // Add this line to enable CORS - const serverAdapter = new ExpressAdapter(); serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); @@ -70,6 +78,7 @@ if (cluster.isMaster) { // register router app.use(v0Router); + app.use(adminRouter); const DEFAULT_PORT = process.env.PORT ?? 3002; const HOST = process.env.HOST ?? "localhost"; @@ -81,14 +90,9 @@ if (cluster.isMaster) { function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { - console.log(`Worker ${process.pid} listening on port ${port}`); - console.log( - `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` - ); - console.log(""); - console.log("1. Make sure Redis is running on port 6379 by default"); - console.log( - "2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 " + Logger.info(`Worker ${process.pid} listening on port ${port}`); + Logger.info( + `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` ); }); return server; @@ -98,27 +102,6 @@ if (cluster.isMaster) { startServer(); } - // Use this as a "health check" that way we dont destroy the server - app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { - try { - const webScraperQueue = getWebScraperQueue(); - - const [webScraperActive] = await Promise.all([ - webScraperQueue.getActiveCount(), - ]); - - const noActiveJobs = webScraperActive === 0; - // 200 if no active jobs, 503 if there are active jobs - return res.status(noActiveJobs ? 200 : 500).json({ - webScraperActive, - noActiveJobs, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } - }); - app.get(`/serverHealthCheck`, async (req, res) => { try { const webScraperQueue = getWebScraperQueue(); @@ -132,7 +115,7 @@ if (cluster.isMaster) { waitingJobs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } }); @@ -177,13 +160,13 @@ if (cluster.isMaster) { }); if (!response.ok) { - console.error("Failed to send Slack notification"); + Logger.error("Failed to send Slack notification"); } } }, timeout); } } catch (error) { - console.error(error); + Logger.debug(error); } }; @@ -191,140 +174,18 @@ if (cluster.isMaster) { } }); - app.get( - `/admin/${process.env.BULL_AUTH_KEY}/check-queues`, - async (req, res) => { - try { - await checkAlerts(); - return res.status(200).send("Alerts initialized"); - } catch (error) { - console.error("Failed to initialize alerts:", error); - return res.status(500).send("Failed to initialize alerts"); - } - } - ); - - app.get( - `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, - async (req, res) => { - try { - const webScraperQueue = getWebScraperQueue(); - const batchSize = 10; - const numberOfBatches = 9; // Adjust based on your needs - const completedJobsPromises: Promise[] = []; - for (let i = 0; i < numberOfBatches; i++) { - completedJobsPromises.push( - webScraperQueue.getJobs( - ["completed"], - i * batchSize, - i * batchSize + batchSize, - true - ) - ); - } - const completedJobs: Job[] = ( - await Promise.all(completedJobsPromises) - ).flat(); - const before24hJobs = - completedJobs.filter( - (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 - ) || []; - - let count = 0; - - if (!before24hJobs) { - return res.status(200).send(`No jobs to remove.`); - } - - for (const job of before24hJobs) { - try { - await job.remove(); - count++; - } catch (jobError) { - console.error(`Failed to remove job with ID ${job.id}:`, jobError); - } - } - return res.status(200).send(`Removed ${count} completed jobs.`); - } catch (error) { - console.error("Failed to clean last 24h complete jobs:", error); - return res.status(500).send("Failed to clean jobs"); - } - } - ); - app.get("/is-production", (req, res) => { res.send({ isProduction: global.isProduction }); }); - app.get( - `/admin/${process.env.BULL_AUTH_KEY}/redis-health`, - async (req, res) => { - try { - const queueRedis = new Redis(process.env.REDIS_URL); - - const testKey = "test"; - const testValue = "test"; - - // Test queueRedis - let queueRedisHealth; - try { - await queueRedis.set(testKey, testValue); - queueRedisHealth = await queueRedis.get(testKey); - await queueRedis.del(testKey); - } catch (error) { - console.error("queueRedis health check failed:", error); - queueRedisHealth = null; - } - - // Test redisRateLimitClient - let redisRateLimitHealth; - try { - await redisRateLimitClient.set(testKey, testValue); - redisRateLimitHealth = await redisRateLimitClient.get(testKey); - await redisRateLimitClient.del(testKey); - } catch (error) { - console.error("redisRateLimitClient health check failed:", error); - redisRateLimitHealth = null; - } - - const healthStatus = { - queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy", - redisRateLimitClient: - redisRateLimitHealth === testValue ? "healthy" : "unhealthy", - }; - - if ( - healthStatus.queueRedis === "healthy" && - healthStatus.redisRateLimitClient === "healthy" - ) { - console.log("Both Redis instances are healthy"); - return res - .status(200) - .json({ status: "healthy", details: healthStatus }); - } else { - console.log("Redis instances health check:", healthStatus); - await sendSlackWebhook( - `[REDIS DOWN] Redis instances health check: ${JSON.stringify( - healthStatus - )}`, - true - ); - return res - .status(500) - .json({ status: "unhealthy", details: healthStatus }); - } - } catch (error) { - console.error("Redis health check failed:", error); - await sendSlackWebhook( - `[REDIS DOWN] Redis instances health check: ${error.message}`, - true - ); - return res - .status(500) - .json({ status: "unhealthy", message: error.message }); - } - } - ); - - console.log(`Worker ${process.pid} started`); + Logger.info(`Worker ${process.pid} started`); } + +const wsq = getWebScraperQueue(); + +wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); +wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active")); +wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed")); +wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); +wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); +wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index 2156fb3c..85a7e995 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -4,6 +4,7 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation import { generateOpenAICompletions } from "./models"; import { Document, ExtractorOptions } from "../entities"; +import { Logger } from "../logger"; // Generate completion using OpenAI export async function generateCompletions( @@ -44,7 +45,7 @@ export async function generateCompletions( return completionResult; } catch (error) { - console.error(`Error generating completions: ${error}`); + Logger.error(`Error generating completions: ${error}`); throw new Error(`Error generating completions: ${error.message}`); } default: diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index 8de8ee4b..e696a8cd 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -48,7 +48,7 @@ function prepareOpenAIDoc( export async function generateOpenAICompletions({ client, - model = "gpt-4o", + model = process.env.MODEL_NAME || "gpt-4o", document, schema, //TODO - add zod dynamic type checking prompt = defaultPrompt, diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts index 6ae5f99f..3b303781 100644 --- a/apps/api/src/lib/default-values.ts +++ b/apps/api/src/lib/default-values.ts @@ -1,6 +1,6 @@ export const defaultOrigin = "api"; -export const defaultTimeout = 30000; // 30 seconds +export const defaultTimeout = 45000; // 45 seconds export const defaultPageOptions = { onlyMainContent: false, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index f60e197f..9ffa4810 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -56,6 +56,7 @@ export type CrawlerOptions = { } export type WebScraperOptions = { + jobId: string; urls: string[]; mode: "single_urls" | "sitemap" | "crawl"; crawlerOptions?: CrawlerOptions; @@ -138,4 +139,5 @@ export interface FireEngineOptions{ engine?: string; blockMedia?: boolean; blockAds?: boolean; + disableJsDom?: boolean; } diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts new file mode 100644 index 00000000..872dbf51 --- /dev/null +++ b/apps/api/src/lib/logger.ts @@ -0,0 +1,53 @@ +enum LogLevel { + NONE = 'NONE', // No logs will be output. + ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation. + WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors. + INFO = 'INFO', // For logging informational messages that highlight the progress of the application. + DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging. + TRACE = 'TRACE' // For logging more detailed information than the DEBUG level. +} +export class Logger { + static colors = { + ERROR: '\x1b[31m%s\x1b[0m', // Red + WARN: '\x1b[33m%s\x1b[0m', // Yellow + INFO: '\x1b[34m%s\x1b[0m', // Blue + DEBUG: '\x1b[36m%s\x1b[0m', // Cyan + TRACE: '\x1b[35m%s\x1b[0m' // Magenta + }; + + static log (message: string, level: LogLevel) { + const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.INFO; + const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE]; + const currentLevelIndex = levels.indexOf(logLevel); + const messageLevelIndex = levels.indexOf(level); + + if (currentLevelIndex >= messageLevelIndex) { + const color = Logger.colors[level]; + console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`); + + // if (process.env.USE_DB_AUTH) { + // save to supabase? another place? + // supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean }); + // } + } + } + static error(message: string | any) { + Logger.log(message, LogLevel.ERROR); + } + + static warn(message: string) { + Logger.log(message, LogLevel.WARN); + } + + static info(message: string) { + Logger.log(message, LogLevel.INFO); + } + + static debug(message: string) { + Logger.log(message, LogLevel.DEBUG); + } + + static trace(message: string) { + Logger.log(message, LogLevel.TRACE); + } +} diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts new file mode 100644 index 00000000..ab4ef681 --- /dev/null +++ b/apps/api/src/lib/scrape-events.ts @@ -0,0 +1,84 @@ +import { Job, JobId } from "bull"; +import type { baseScrapers } from "../scraper/WebScraper/single_url"; +import { supabase_service as supabase } from "../services/supabase"; +import { Logger } from "./logger"; + +export type ScrapeErrorEvent = { + type: "error", + message: string, + stack?: string, +} + +export type ScrapeScrapeEvent = { + type: "scrape", + url: string, + worker?: string, + method: (typeof baseScrapers)[number], + result: null | { + success: boolean, + response_code?: number, + response_size?: number, + error?: string | object, + // proxy?: string, + time_taken: number, + }, +} + +export type ScrapeQueueEvent = { + type: "queue", + event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed" | "failed", + worker?: string, +} + +export type ScrapeEvent = ScrapeErrorEvent | ScrapeScrapeEvent | ScrapeQueueEvent; + +export class ScrapeEvents { + static async insert(jobId: string, content: ScrapeEvent) { + if (jobId === "TEST") return null; + + if (process.env.USE_DB_AUTHENTICATION) { + try { + const result = await supabase.from("scrape_events").insert({ + job_id: jobId, + type: content.type, + content: content, + // created_at + }).select().single(); + return (result.data as any).id; + } catch (error) { + Logger.error(`Error inserting scrape event: ${error}`); + return null; + } + } + + return null; + } + + static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) { + if (logId === null) return; + + try { + const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; + await supabase.from("scrape_events").update({ + content: { + ...previousLog.content, + result, + } + }).eq("id", logId); + } catch (error) { + Logger.error(`Error updating scrape result: ${error}`); + } + } + + static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) { + try { + await this.insert(((job as any).id ? (job as any).id : job) as string, { + type: "queue", + event, + worker: process.env.FLY_MACHINE_ID, + }); + } catch (error) { + Logger.error(`Error logging job event: ${error}`); + } + } +} diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index ea5aa4d8..353c144b 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -1,4 +1,5 @@ import { AuthResponse } from "../../src/types"; +import { Logger } from "./logger"; let warningCount = 0; @@ -8,7 +9,7 @@ export function withAuth( return async function (...args: U): Promise { if (process.env.USE_DB_AUTHENTICATION === "false") { if (warningCount < 5) { - console.warn("WARNING - You're bypassing authentication"); + Logger.warn("You're bypassing authentication"); warningCount++; } return { success: true } as T; @@ -16,7 +17,7 @@ export function withAuth( try { return await originalFunction(...args); } catch (error) { - console.error("Error in withAuth function: ", error); + Logger.error(`Error in withAuth function: ${error}`); return { success: false, error: error.message } as T; } } diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 3c98e11e..5e7d2279 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -10,6 +10,8 @@ import { DocumentUrl, Progress } from "../lib/entities"; import { billTeam } from "../services/billing/credit_billing"; import { Document } from "../lib/entities"; import { supabase_service } from "../services/supabase"; +import { Logger } from "../lib/logger"; +import { ScrapeEvents } from "../lib/scrape-events"; export async function startWebScraperPipeline({ job, @@ -23,6 +25,7 @@ export async function startWebScraperPipeline({ crawlerOptions: job.data.crawlerOptions, pageOptions: job.data.pageOptions, inProgress: (progress) => { + Logger.debug(`🐂 Job in progress ${job.id}`); if (progress.currentDocument) { partialDocs.push(progress.currentDocument); if (partialDocs.length > 50) { @@ -32,9 +35,12 @@ export async function startWebScraperPipeline({ } }, onSuccess: (result) => { + Logger.debug(`🐂 Job completed ${job.id}`); saveJob(job, result); }, onError: (error) => { + Logger.error(`🐂 Job failed ${job.id}`); + ScrapeEvents.logJobEvent(job, "failed"); job.moveToFailed(error); }, team_id: job.data.team_id, @@ -56,6 +62,7 @@ export async function runWebScraper({ const provider = new WebScraperDataProvider(); if (mode === "crawl") { await provider.setOptions({ + jobId: bull_job_id, mode: mode, urls: [url], crawlerOptions: crawlerOptions, @@ -64,6 +71,7 @@ export async function runWebScraper({ }); } else { await provider.setOptions({ + jobId: bull_job_id, mode: mode, urls: url.split(","), crawlerOptions: crawlerOptions, @@ -108,7 +116,6 @@ export async function runWebScraper({ // this return doesn't matter too much for the job completion result return { success: true, message: "", docs: filteredDocs }; } catch (error) { - console.error("Error running web scraper", error); onError(error); return { success: false, message: error.message, docs: [] }; } @@ -135,7 +142,8 @@ const saveJob = async (job: Job, result: any) => { // I think the job won't exist here anymore } } + ScrapeEvents.logJobEvent(job, "completed"); } catch (error) { - console.error("Failed to update job status:", error); + Logger.error(`🐂 Failed to update job status: ${error}`); } }; diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts new file mode 100644 index 00000000..77d1bf46 --- /dev/null +++ b/apps/api/src/routes/admin.ts @@ -0,0 +1,29 @@ +import express from "express"; +import { redisHealthController } from "../controllers/admin/redis-health"; +import { + checkQueuesController, + cleanBefore24hCompleteJobsController, + queuesController, +} from "../controllers/admin/queue"; + +export const adminRouter = express.Router(); + +adminRouter.get( + `/admin/${process.env.BULL_AUTH_KEY}/redis-health`, + redisHealthController +); + +adminRouter.get( + `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, + cleanBefore24hCompleteJobsController +); + +adminRouter.get( + `/admin/${process.env.BULL_AUTH_KEY}/check-queues`, + checkQueuesController +); + +adminRouter.get( + `/admin/${process.env.BULL_AUTH_KEY}/queues`, + queuesController +); diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts index a9a3a9bf..9c68d9bb 100644 --- a/apps/api/src/routes/v0.ts +++ b/apps/api/src/routes/v0.ts @@ -7,6 +7,8 @@ import { crawlJobStatusPreviewController } from "../../src/controllers/status"; import { searchController } from "../../src/controllers/search"; import { crawlCancelController } from "../../src/controllers/crawl-cancel"; import { keyAuthController } from "../../src/controllers/keyAuth"; +import { livenessController } from "../controllers/liveness"; +import { readinessController } from "../controllers/readiness"; export const v0Router = express.Router(); @@ -23,3 +25,6 @@ v0Router.get("/v0/keyAuth", keyAuthController); // Search routes v0Router.post("/v0/search", searchController); +// Health/Probe routes +v0Router.get("/v0/health/liveness", livenessController); +v0Router.get("/v0/health/readiness", readinessController); diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts index 32c8b0a0..20419ffa 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -42,6 +42,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -76,6 +77,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -104,6 +106,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -133,6 +136,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -161,6 +165,7 @@ describe('WebCrawler', () => { // Setup the crawler with the specific test case options const crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -194,6 +199,7 @@ describe('WebCrawler', () => { const limit = 2; // Set a limit for the number of links crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], diff --git a/apps/api/src/scraper/WebScraper/__tests__/dns.test.ts b/apps/api/src/scraper/WebScraper/__tests__/dns.test.ts new file mode 100644 index 00000000..968ed121 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/__tests__/dns.test.ts @@ -0,0 +1,15 @@ +import CacheableLookup from 'cacheable-lookup'; +import https from 'node:https'; +import axios from "axios"; + +describe("DNS", () => { + it("cached dns", async () => { + const cachedDns = new CacheableLookup(); + cachedDns.install(https.globalAgent); + jest.spyOn(cachedDns, "lookupAsync"); + + const res = await axios.get("https://example.com"); + expect(res.status).toBe(200); + expect(cachedDns.lookupAsync).toHaveBeenCalled(); + }); +}); diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 8a9df227..4b720835 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -15,8 +15,8 @@ describe('scrapSingleUrl', () => { const pageOptionsWithHtml: PageOptions = { includeHtml: true }; const pageOptionsWithoutHtml: PageOptions = { includeHtml: false }; - const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml); - const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml); + const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml); + const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml); expect(resultWithHtml.html).toBeDefined(); expect(resultWithoutHtml.html).toBeUndefined(); @@ -27,7 +27,7 @@ it('should return a list of links on the mendable.ai page', async () => { const url = 'https://mendable.ai'; const pageOptions: PageOptions = { includeHtml: true }; - const result = await scrapSingleUrl(url, pageOptions); + const result = await scrapSingleUrl("TEST", url, pageOptions); // Check if the result contains a list of links expect(result.linksOnPage).toBeDefined(); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 59b53642..5ee8cda8 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -8,8 +8,10 @@ import { scrapSingleUrl } from "./single_url"; import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; import { axiosTimeout } from "../../../src/lib/timeout"; +import { Logger } from "../../../src/lib/logger"; export class WebCrawler { + private jobId: string; private initialUrl: string; private baseUrl: string; private includes: string[]; @@ -26,6 +28,7 @@ export class WebCrawler { private allowExternalContentLinks: boolean; constructor({ + jobId, initialUrl, includes, excludes, @@ -36,6 +39,7 @@ export class WebCrawler { allowBackwardCrawling = false, allowExternalContentLinks = false }: { + jobId: string; initialUrl: string; includes?: string[]; excludes?: string[]; @@ -46,6 +50,7 @@ export class WebCrawler { allowBackwardCrawling?: boolean; allowExternalContentLinks?: boolean; }) { + this.jobId = jobId; this.initialUrl = initialUrl; this.baseUrl = new URL(initialUrl).origin; this.includes = includes ?? []; @@ -64,7 +69,7 @@ export class WebCrawler { private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { return sitemapLinks .filter((link) => { - const url = new URL(link); + const url = new URL(link.trim(), this.baseUrl); const path = url.pathname; const depth = getURLDepth(url.toString()); @@ -116,7 +121,7 @@ export class WebCrawler { const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; // Check if the link is disallowed by robots.txt if (!isAllowed) { - console.log(`Link disallowed by robots.txt: ${link}`); + Logger.debug(`Link disallowed by robots.txt: ${link}`); return false; } @@ -133,15 +138,19 @@ export class WebCrawler { limit: number = 10000, maxDepth: number = 10 ): Promise<{ url: string, html: string }[]> { + + Logger.debug(`Crawler starting with ${this.initialUrl}`); // Fetch and parse robots.txt try { const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout }); this.robots = robotsParser(this.robotsTxtUrl, response.data); + Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`); } catch (error) { - console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); + Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); } - if(!crawlerOptions?.ignoreSitemap){ + if (!crawlerOptions?.ignoreSitemap){ + Logger.debug(`Fetching sitemap links from ${this.initialUrl}`); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); @@ -175,6 +184,7 @@ export class WebCrawler { inProgress?: (progress: Progress) => void, ): Promise<{ url: string, html: string }[]> { const queue = async.queue(async (task: string, callback) => { + Logger.debug(`Crawling ${task}`); if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) { if (callback && typeof callback === "function") { callback(); @@ -216,16 +226,18 @@ export class WebCrawler { } }, concurrencyLimit); + Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`); queue.push( urls.filter( (url) => !this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent") ), (err) => { - if (err) console.error(err); + if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`); } ); await queue.drain(); + Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`); return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } @@ -253,7 +265,7 @@ export class WebCrawler { // If it is the first link, fetch with single url if (this.visited.size === 1) { - const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); + const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true }); content = page.html ?? ""; pageStatusCode = page.metadata?.pageStatusCode; pageError = page.metadata?.pageError || undefined; @@ -282,7 +294,6 @@ export class WebCrawler { const urlObj = new URL(fullUrl); const path = urlObj.pathname; - if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS if (this.isInternalLink(fullUrl) && this.noSections(fullUrl) && @@ -452,7 +463,7 @@ export class WebCrawler { sitemapLinks = await getLinksFromSitemap({ sitemapUrl }); } } catch (error) { - console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); + Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' }); if (response) { sitemapLinks = response; @@ -467,7 +478,7 @@ export class WebCrawler { sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap }); } } catch (error) { - console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); + Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); } } diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts index f8b2503e..e2f8d8cc 100644 --- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -1,10 +1,12 @@ +import { Logger } from "../../../lib/logger"; + export async function handleCustomScraping( text: string, url: string ): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> { // Check for Readme Docs special case if (text.includes(' { const existingHTML = allHtmls ? allHtmls[i + index] : ""; const result = await scrapSingleUrl( + this.jobId, url, this.pageOptions, this.extractorOptions, @@ -89,14 +92,14 @@ export class WebScraperDataProvider { const job = await getWebScraperQueue().getJob(this.bullJobId); const jobStatus = await job.getState(); if (jobStatus === "failed") { - console.error( + Logger.info( "Job has failed or has been cancelled by the user. Stopping the job..." ); return [] as Document[]; } } } catch (error) { - console.error(error); + Logger.error(error.message); return [] as Document[]; } } @@ -165,6 +168,7 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { const crawler = new WebCrawler({ + jobId: this.jobId, initialUrl: this.urls[0], includes: this.includes, excludes: this.excludes, @@ -270,7 +274,7 @@ export class WebScraperDataProvider { this.mode === "single_urls" && links.length > 0 ? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch( (error) => { - console.error("Failed to fetch sitemap data:", error); + Logger.debug(`Failed to fetch sitemap data: ${error}`); return null; } ) @@ -460,7 +464,7 @@ export class WebScraperDataProvider { let documents: Document[] = []; for (const url of urls) { const normalizedUrl = this.normalizeUrl(url); - console.log( + Logger.debug( "Getting cached document for web-scraper-cache:" + normalizedUrl ); const cachedDocumentString = await getValue( @@ -499,6 +503,7 @@ export class WebScraperDataProvider { throw new Error("Urls are required"); } + this.jobId = options.jobId; this.bullJobId = options.bullJobId; this.urls = options.urls; this.mode = options.mode; diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts index 4c31438c..c9ddf93a 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts @@ -2,6 +2,7 @@ import axios from "axios"; import { logScrape } from "../../../services/logging/scrape_log"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with Axios @@ -34,9 +35,7 @@ export async function scrapWithFetch( }); if (response.status !== 200) { - console.error( - `[Axios] Error fetching url: ${url} with status: ${response.status}` - ); + Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}`); logParams.error_message = response.statusText; logParams.response_code = response.status; return { @@ -63,10 +62,10 @@ export async function scrapWithFetch( } catch (error) { if (error.code === "ECONNABORTED") { logParams.error_message = "Request timed out"; - console.log(`[Axios] Request timed out for ${url}`); + Logger.debug(`⛏️ Axios: Request timed out for ${url}`); } else { logParams.error_message = error.message || error; - console.error(`[Axios] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`); } return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } finally { diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index e547c019..0f4c2320 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -4,6 +4,7 @@ import { logScrape } from "../../../services/logging/scrape_log"; import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with Fire-Engine @@ -59,12 +60,10 @@ export async function scrapWithFireEngine({ let engine = engineParam; // do we want fireEngineOptions as first choice? - console.log( - `[Fire-Engine][${engine}] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` + Logger.info( + `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }` ); - // console.log(fireEngineOptionsParam) - const response = await axios.post( process.env.FIRE_ENGINE_BETA_URL + endpoint, { @@ -84,15 +83,15 @@ export async function scrapWithFireEngine({ ); if (response.status !== 200) { - console.error( - `[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}` + Logger.debug( + `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}` ); logParams.error_message = response.data?.pageError; logParams.response_code = response.data?.pageStatusCode; if(response.data && response.data?.pageStatusCode !== 200) { - console.error(`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`); + Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`); } return { @@ -130,10 +129,10 @@ export async function scrapWithFireEngine({ } } catch (error) { if (error.code === "ECONNABORTED") { - console.log(`[Fire-Engine] Request timed out for ${url}`); + Logger.debug(`⛏️ Fire-Engine: Request timed out for ${url}`); logParams.error_message = "Request timed out"; } else { - console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`); logParams.error_message = error.message || error; } return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message }; diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts index 11c3c5ad..4b3180a3 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts @@ -3,6 +3,7 @@ import { logScrape } from "../../../services/logging/scrape_log"; import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with Playwright @@ -51,8 +52,8 @@ export async function scrapWithPlaywright( ); if (response.status !== 200) { - console.error( - `[Playwright] Error fetching url: ${url} with status: ${response.status}` + Logger.debug( + `⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}` ); logParams.error_message = response.data?.pageError; logParams.response_code = response.data?.pageStatusCode; @@ -86,8 +87,8 @@ export async function scrapWithPlaywright( }; } catch (jsonError) { logParams.error_message = jsonError.message || jsonError; - console.error( - `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` + Logger.debug( + `⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}` ); return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } @@ -95,10 +96,10 @@ export async function scrapWithPlaywright( } catch (error) { if (error.code === "ECONNABORTED") { logParams.error_message = "Request timed out"; - console.log(`[Playwright] Request timed out for ${url}`); + Logger.debug(`⛏️ Playwright: Request timed out for ${url}`); } else { logParams.error_message = error.message || error; - console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`); } return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } finally { diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts index 9a1f0b35..554bfe22 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts @@ -3,6 +3,7 @@ import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; import { ScrapingBeeClient } from "scrapingbee"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with ScrapingBee @@ -56,8 +57,8 @@ export async function scrapWithScrapingBee( text = decoder.decode(response.data); logParams.success = true; } catch (decodeError) { - console.error( - `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` + Logger.debug( + `⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}` ); logParams.error_message = decodeError.message || decodeError; } @@ -72,7 +73,7 @@ export async function scrapWithScrapingBee( }; } } catch (error) { - console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`); logParams.error_message = error.message || error; logParams.response_code = error.response?.status; return { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 8fbd31e4..4a44b23f 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -17,17 +17,20 @@ import { scrapWithFireEngine } from "./scrapers/fireEngine"; import { scrapWithPlaywright } from "./scrapers/playwright"; import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; import { extractLinks } from "./utils/utils"; +import { Logger } from "../../lib/logger"; +import { ScrapeEvents } from "../../lib/scrape-events"; +import { clientSideError } from "../../strings"; dotenv.config(); -const baseScrapers = [ +export const baseScrapers = [ "fire-engine", "fire-engine;chrome-cdp", "scrapingBee", - "playwright", + process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", "scrapingBeeLoad", "fetch", -] as const; +].filter(Boolean); export async function generateRequestParams( url: string, @@ -48,7 +51,7 @@ export async function generateRequestParams( return defaultParams; } } catch (error) { - console.error(`Error generating URL key: ${error}`); + Logger.error(`Error generating URL key: ${error}`); return defaultParams; } } @@ -82,22 +85,22 @@ function getScrapingFallbackOrder( }); let defaultOrder = [ + !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine", + !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp", "scrapingBee", - "fire-engine", - "fire-engine;chrome-cdp", - "playwright", + process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", "scrapingBeeLoad", "fetch", - ]; + ].filter(Boolean); if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { defaultOrder = [ "fire-engine", - "playwright", + process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", ...defaultOrder.filter( (scraper) => scraper !== "fire-engine" && scraper !== "playwright" ), - ]; + ].filter(Boolean); } const filteredDefaultOrder = defaultOrder.filter( @@ -117,6 +120,7 @@ function getScrapingFallbackOrder( export async function scrapSingleUrl( + jobId: string, urlToScrap: string, pageOptions: PageOptions = { onlyMainContent: true, @@ -144,6 +148,15 @@ export async function scrapSingleUrl( } = { text: "", screenshot: "", metadata: {} }; let screenshot = ""; + const timer = Date.now(); + const logInsertPromise = ScrapeEvents.insert(jobId, { + type: "scrape", + url, + worker: process.env.FLY_MACHINE_ID, + method, + result: null, + }); + switch (method) { case "fire-engine": case "fire-engine;chrome-cdp": @@ -154,7 +167,6 @@ export async function scrapSingleUrl( } if (process.env.FIRE_ENGINE_BETA_URL) { - console.log(`Scraping ${url} with Fire Engine`); const response = await scrapWithFireEngine({ url, waitFor: pageOptions.waitFor, @@ -254,8 +266,19 @@ export async function scrapSingleUrl( } //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); + const text = await parseMarkdown(cleanedHtml); + + const insertedLogId = await logInsertPromise; + ScrapeEvents.updateScrapeResult(insertedLogId, { + response_size: scraperResponse.text.length, + success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100), + error: scraperResponse.metadata.pageError, + response_code: scraperResponse.metadata.pageStatusCode, + time_taken: Date.now() - timer, + }); + return { - text: await parseMarkdown(cleanedHtml), + text, html: cleanedHtml, rawHtml: scraperResponse.text, screenshot: scraperResponse.screenshot, @@ -277,7 +300,7 @@ export async function scrapSingleUrl( try { urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); } catch (error) { - console.error(`Invalid URL key, trying: ${urlToScrap}`); + Logger.error(`Invalid URL key, trying: ${urlToScrap}`); } const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? ""; const scrapersInOrder = getScrapingFallbackOrder( @@ -289,7 +312,7 @@ export async function scrapSingleUrl( for (const scraper of scrapersInOrder) { // If exists text coming from crawler, use it - if (existingHtml && existingHtml.trim().length >= 100) { + if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) { let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions); text = await parseMarkdown(cleanedHtml); html = cleanedHtml; @@ -311,12 +334,18 @@ export async function scrapSingleUrl( pageError = undefined; } - if (text && text.trim().length >= 100) break; - if (pageStatusCode && pageStatusCode == 404) break; - const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; - if (nextScraperIndex < scrapersInOrder.length) { - console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`); + if (text && text.trim().length >= 100) { + Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`); + break; } + if (pageStatusCode && pageStatusCode == 404) { + Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`); + break; + } + // const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; + // if (nextScraperIndex < scrapersInOrder.length) { + // Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`); + // } } if (!text) { @@ -372,7 +401,12 @@ export async function scrapSingleUrl( return document; } catch (error) { - console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); + Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); + ScrapeEvents.insert(jobId, { + type: "error", + message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error), + stack: error.stack, + }); return { content: "", markdown: "", diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 1dfbf3a1..3dfc9a1c 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -3,6 +3,7 @@ import { axiosTimeout } from "../../lib/timeout"; import { parseStringPromise } from "xml2js"; import { scrapWithFireEngine } from "./scrapers/fireEngine"; import { WebCrawler } from "./crawler"; +import { Logger } from "../../lib/logger"; export async function getLinksFromSitemap( { @@ -22,11 +23,11 @@ export async function getLinksFromSitemap( const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; } else if (mode === 'fire-engine') { - const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { method: "get", mobileProxy: true },options:{endpoint:"request"} }); + const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"tlsclient", disableJsDom: true, mobileProxy: true } }); content = response.html; } } catch (error) { - console.error(`Request failed for ${sitemapUrl}: ${error}`); + Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`); return allUrls; } @@ -48,7 +49,7 @@ export async function getLinksFromSitemap( } } } catch (error) { - console.error(`Error processing ${sitemapUrl}: ${error}`); + Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`); } return allUrls; diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts index 4449285d..c09cc5b3 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts @@ -1,3 +1,4 @@ +import { Logger } from '../../../../lib/logger'; import { isUrlBlocked } from '../blocklist'; describe('isUrlBlocked', () => { @@ -19,7 +20,7 @@ describe('isUrlBlocked', () => { blockedUrls.forEach(url => { if (!isUrlBlocked(url)) { - console.log(`URL not blocked: ${url}`); + Logger.debug(`URL not blocked: ${url}`); } expect(isUrlBlocked(url)).toBe(true); }); diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index 633fd5d0..0bdf9876 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -1,3 +1,5 @@ +import { Logger } from "../../../lib/logger"; + const socialMediaBlocklist = [ 'facebook.com', 'x.com', @@ -59,7 +61,7 @@ export function isUrlBlocked(url: string): boolean { return isBlocked; } catch (e) { // If an error occurs (e.g., invalid URL), return false - console.error(`Error parsing the following URL: ${url}`); + Logger.error(`Error parsing the following URL: ${url}`); return false; } } diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index 89836b4a..c688061d 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -212,4 +212,24 @@ export const urlSpecificParams = { engine: "playwright", } }, + "mendable.ai":{ + defaultScraper: "fire-engine", + params:{ + fireEngineOptions:{ + mobileProxy: true, + method: "get", + engine: "chrome-cdp", + }, + }, + }, + "developer.apple.com":{ + defaultScraper: "fire-engine", + params:{ + engine: "playwright", + wait: 2000, + fireEngineOptions: { + blockMedia: false, + } + }, + }, }; diff --git a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts index fb8f65d9..318c2340 100644 --- a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts +++ b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts @@ -1,5 +1,6 @@ import Anthropic from '@anthropic-ai/sdk'; import axios from 'axios'; +import { Logger } from '../../../lib/logger'; export async function getImageDescription( imageUrl: string, @@ -82,7 +83,7 @@ export async function getImageDescription( } } } catch (error) { - console.error("Error generating image alt text:", error?.message); + Logger.error(`Error generating image alt text: ${error}`); return ""; } } diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index 3f2052c0..9496d569 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -1,4 +1,6 @@ import { CheerioAPI } from "cheerio"; +import { Logger } from "../../../lib/logger"; + interface Metadata { title?: string; description?: string; @@ -105,7 +107,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null; } catch (error) { - console.error("Error extracting metadata:", error); + Logger.error(`Error extracting metadata: ${error}`); } return { diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 3e01571e..660d27eb 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -7,14 +7,20 @@ import pdf from "pdf-parse"; import path from "path"; import os from "os"; import { axiosTimeout } from "../../../lib/timeout"; +import { Logger } from "../../../lib/logger"; dotenv.config(); export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { - const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url); - const content = await processPdfToText(tempFilePath, parsePDF); - fs.unlinkSync(tempFilePath); // Clean up the temporary file - return { content, pageStatusCode, pageError }; + try { + const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url); + const content = await processPdfToText(tempFilePath, parsePDF); + fs.unlinkSync(tempFilePath); // Clean up the temporary file + return { content, pageStatusCode, pageError }; + } catch (error) { + Logger.error(`Failed to fetch and process PDF: ${error.message}`); + return { content: "", pageStatusCode: 500, pageError: error.message }; + } } async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> { @@ -39,6 +45,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro let content = ""; if (process.env.LLAMAPARSE_API_KEY && parsePDF) { + Logger.debug("Processing pdf document w/ LlamaIndex"); const apiKey = process.env.LLAMAPARSE_API_KEY; const headers = { Authorization: `Bearer ${apiKey}`, @@ -81,7 +88,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds } } catch (error) { - console.error("Error fetching result w/ LlamaIndex"); + Logger.debug("Error fetching result w/ LlamaIndex"); attempt++; await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying // You may want to handle specific errors differently @@ -93,7 +100,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro } content = resultResponse.data[resultType]; } catch (error) { - console.error("Error processing pdf document w/ LlamaIndex(2)"); + Logger.debug("Error processing pdf document w/ LlamaIndex(2)"); content = await processPdf(filePath); } } else if (parsePDF) { diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts index 25b43f0a..f84db63f 100644 --- a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts +++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../../lib/logger"; import { Document } from "../../../lib/entities"; export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => { @@ -39,7 +40,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] return documents; } catch (error) { - console.error("Error replacing paths with absolute paths", error); + Logger.debug(`Error replacing paths with absolute paths: ${error}`); return documents; } }; @@ -78,7 +79,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen return documents; } catch (error) { - console.error("Error replacing img paths with absolute paths", error); + Logger.error(`Error replacing img paths with absolute paths: ${error}`); return documents; } }; \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts index 3aa021a6..dd5906b0 100644 --- a/apps/api/src/scraper/WebScraper/utils/utils.ts +++ b/apps/api/src/scraper/WebScraper/utils/utils.ts @@ -1,5 +1,6 @@ import axios from "axios"; import * as cheerio from "cheerio"; +import { Logger } from "../../../lib/logger"; export async function attemptScrapWithRequests( @@ -9,13 +10,13 @@ export async function attemptScrapWithRequests( const response = await axios.get(urlToScrap, { timeout: 15000 }); if (!response.data) { - console.log("Failed normal requests as well"); + Logger.debug("Failed normal requests as well"); return null; } return response.data; } catch (error) { - console.error(`Error in attemptScrapWithRequests: ${error}`); + Logger.debug(`Error in attemptScrapWithRequests: ${error}`); return null; } } diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index 6bfa1a39..060f4bd8 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -2,6 +2,7 @@ import axios from 'axios'; import * as cheerio from 'cheerio'; import * as querystring from 'querystring'; import { SearchResult } from '../../src/lib/entities'; +import { Logger } from '../../src/lib/logger'; const _useragent_list = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', @@ -96,7 +97,7 @@ export async function google_search(term: string, advanced = false, num_results await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); } catch (error) { if (error.message === 'Too many requests') { - console.warn('Too many requests, breaking the loop'); + Logger.warn('Too many requests, breaking the loop'); break; } throw error; @@ -107,7 +108,7 @@ export async function google_search(term: string, advanced = false, num_results } } if (attempts >= maxAttempts) { - console.warn('Max attempts reached, breaking the loop'); + Logger.warn('Max attempts reached, breaking the loop'); } return results } diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index 88cbf812..f5bc06e3 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../src/lib/logger"; import { SearchResult } from "../../src/lib/entities"; import { google_search } from "./googlesearch"; import { serper_search } from "./serper"; @@ -47,7 +48,7 @@ export async function search({ timeout ); } catch (error) { - console.error("Error in search function: ", error); + Logger.error(`Error in search function: ${error}`); return [] } // if process.env.SERPER_API_KEY is set, use serper diff --git a/apps/api/src/services/alerts/index.ts b/apps/api/src/services/alerts/index.ts index 1cfb5906..88b3c726 100644 --- a/apps/api/src/services/alerts/index.ts +++ b/apps/api/src/services/alerts/index.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../../src/lib/logger"; import { getWebScraperQueue } from "../queue-service"; import { sendSlackWebhook } from "./slack"; @@ -9,13 +10,13 @@ export async function checkAlerts() { process.env.ALERT_NUM_ACTIVE_JOBS && process.env.ALERT_NUM_WAITING_JOBS ) { - console.info("Initializing alerts"); + Logger.info("Initializing alerts"); const checkActiveJobs = async () => { try { const webScraperQueue = getWebScraperQueue(); const activeJobs = await webScraperQueue.getActiveCount(); if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) { - console.warn( + Logger.warn( `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.` ); sendSlackWebhook( @@ -23,12 +24,12 @@ export async function checkAlerts() { true ); } else { - console.info( + Logger.info( `Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}` ); } } catch (error) { - console.error("Failed to check active jobs:", error); + Logger.error(`Failed to check active jobs: ${error}`); } }; @@ -38,7 +39,7 @@ export async function checkAlerts() { const paused = await webScraperQueue.getPausedCount(); if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) { - console.warn( + Logger.warn( `Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.` ); sendSlackWebhook( @@ -57,6 +58,6 @@ export async function checkAlerts() { // setInterval(checkAll, 10000); // Run every } } catch (error) { - console.error("Failed to initialize alerts:", error); + Logger.error(`Failed to initialize alerts: ${error}`); } } diff --git a/apps/api/src/services/alerts/slack.ts b/apps/api/src/services/alerts/slack.ts index f65035b1..96bf1c09 100644 --- a/apps/api/src/services/alerts/slack.ts +++ b/apps/api/src/services/alerts/slack.ts @@ -1,4 +1,5 @@ import axios from "axios"; +import { Logger } from "../../../src/lib/logger"; export async function sendSlackWebhook( message: string, @@ -16,8 +17,8 @@ export async function sendSlackWebhook( "Content-Type": "application/json", }, }); - console.log("Webhook sent successfully:", response.data); + Logger.log("Webhook sent successfully:", response.data); } catch (error) { - console.error("Error sending webhook:", error); + Logger.debug(`Error sending webhook: ${error}`); } } diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 82668111..9369cdbb 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -2,6 +2,7 @@ import { NotificationType } from "../../types"; import { withAuth } from "../../lib/withAuth"; import { sendNotification } from "../notification/email_notification"; import { supabase_service } from "../supabase"; +import { Logger } from "../../lib/logger"; const FREE_CREDITS = 500; @@ -12,7 +13,7 @@ export async function supaBillTeam(team_id: string, credits: number) { if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } - console.log(`Billing team ${team_id} for ${credits} credits`); + Logger.info(`Billing team ${team_id} for ${credits} credits`); // When the API is used, you can log the credit usage in the credit_usage table: // team_id: The ID of the team using the API. // subscription_id: The ID of the team's active subscription. @@ -218,7 +219,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { 0 ); - console.log("totalCreditsUsed", totalCreditsUsed); + Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`); const end = new Date(); end.setDate(end.getDate() + 30); @@ -262,14 +263,14 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { }); if (creditUsageError) { - console.error("Error calculating credit usage:", creditUsageError); + Logger.error(`Error calculating credit usage: ${creditUsageError}`); } if (creditUsages && creditUsages.length > 0) { totalCreditsUsed = creditUsages[0].total_credits_used; } } catch (error) { - console.error("Error calculating credit usage:", error); + Logger.error(`Error calculating credit usage: ${error}`); } // Adjust total credits used by subtracting coupon value diff --git a/apps/api/src/services/idempotency/create.ts b/apps/api/src/services/idempotency/create.ts index ec3e18e7..291e77d9 100644 --- a/apps/api/src/services/idempotency/create.ts +++ b/apps/api/src/services/idempotency/create.ts @@ -1,5 +1,6 @@ import { Request } from "express"; import { supabase_service } from "../supabase"; +import { Logger } from "../../../src/lib/logger"; export async function createIdempotencyKey( req: Request, @@ -14,7 +15,7 @@ export async function createIdempotencyKey( .insert({ key: idempotencyKey }); if (error) { - console.error("Failed to create idempotency key:", error); + Logger.error(`Failed to create idempotency key: ${error}`); throw error; } diff --git a/apps/api/src/services/idempotency/validate.ts b/apps/api/src/services/idempotency/validate.ts index 1ca348bb..4d58a31d 100644 --- a/apps/api/src/services/idempotency/validate.ts +++ b/apps/api/src/services/idempotency/validate.ts @@ -1,6 +1,7 @@ import { Request } from "express"; import { supabase_service } from "../supabase"; import { validate as isUuid } from 'uuid'; +import { Logger } from "../../../src/lib/logger"; export async function validateIdempotencyKey( req: Request, @@ -13,7 +14,7 @@ export async function validateIdempotencyKey( // Ensure idempotencyKey is treated as a string const key = Array.isArray(idempotencyKey) ? idempotencyKey[0] : idempotencyKey; if (!isUuid(key)) { - console.error("Invalid idempotency key provided in the request headers."); + Logger.debug("Invalid idempotency key provided in the request headers."); return false; } @@ -23,7 +24,7 @@ export async function validateIdempotencyKey( .eq("key", idempotencyKey); if (error) { - console.error(error); + Logger.error(`Error validating idempotency key: ${error}`); } if (!data || data.length === 0) { diff --git a/apps/api/src/services/logging/crawl_log.ts b/apps/api/src/services/logging/crawl_log.ts index 1224bd44..68008e02 100644 --- a/apps/api/src/services/logging/crawl_log.ts +++ b/apps/api/src/services/logging/crawl_log.ts @@ -1,4 +1,5 @@ import { supabase_service } from "../supabase"; +import { Logger } from "../../../src/lib/logger"; import "dotenv/config"; export async function logCrawl(job_id: string, team_id: string) { @@ -13,7 +14,7 @@ export async function logCrawl(job_id: string, team_id: string) { }, ]); } catch (error) { - console.error("Error logging crawl job:\n", error); + Logger.error(`Error logging crawl job to supabase:\n${error}`); } } } diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 9764493f..93d0b311 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -3,6 +3,7 @@ import { supabase_service } from "../supabase"; import { FirecrawlJob } from "../../types"; import { posthog } from "../posthog"; import "dotenv/config"; +import { Logger } from "../../lib/logger"; export async function logJob(job: FirecrawlJob) { try { @@ -68,9 +69,9 @@ export async function logJob(job: FirecrawlJob) { posthog.capture(phLog); } if (error) { - console.error("Error logging job:\n", error); + Logger.error(`Error logging job: ${error.message}`); } } catch (error) { - console.error("Error logging job:\n", error); + Logger.error(`Error logging job: ${error.message}`); } } diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index 728a67d0..208159da 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -2,15 +2,16 @@ import "dotenv/config"; import { ScrapeLog } from "../../types"; import { supabase_service } from "../supabase"; import { PageOptions } from "../../lib/entities"; +import { Logger } from "../../lib/logger"; export async function logScrape( scrapeLog: ScrapeLog, pageOptions?: PageOptions ) { if (process.env.USE_DB_AUTHENTICATION === "false") { + Logger.debug("Skipping logging scrape to Supabase"); return; } - try { // Only log jobs in production // if (process.env.ENV !== "production") { @@ -43,9 +44,9 @@ export async function logScrape( ]); if (error) { - console.error("Error logging proxy:\n", error); + Logger.error(`Error logging proxy:\n${error}`); } } catch (error) { - console.error("Error logging proxy:\n", error); + Logger.error(`Error logging proxy:\n${error}`); } } diff --git a/apps/api/src/services/logtail.ts b/apps/api/src/services/logtail.ts index 8b86a6b1..d9af3c7a 100644 --- a/apps/api/src/services/logtail.ts +++ b/apps/api/src/services/logtail.ts @@ -1,19 +1,20 @@ import { Logtail } from "@logtail/node"; import "dotenv/config"; +import { Logger } from "../lib/logger"; // A mock Logtail class to handle cases where LOGTAIL_KEY is not provided class MockLogtail { info(message: string, context?: Record): void { - console.log(message, context); + Logger.debug(`${message} - ${context}`); } error(message: string, context: Record = {}): void { - console.error(message, context); + Logger.error(`${message} - ${context}`); } } // Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class // Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => { - console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more."); + Logger.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more."); return new MockLogtail(); })(); diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index e5102acd..a63d78ff 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -2,6 +2,7 @@ import { supabase_service } from "../supabase"; import { withAuth } from "../../lib/withAuth"; import { Resend } from "resend"; import { NotificationType } from "../../types"; +import { Logger } from "../../../src/lib/logger"; const emailTemplates: Record< NotificationType, @@ -52,11 +53,11 @@ async function sendEmailNotification( }); if (error) { - console.error("Error sending email: ", error); + Logger.debug(`Error sending email: ${error}`); return { success: false }; } } catch (error) { - console.error("Error sending email (2): ", error); + Logger.debug(`Error sending email (2): ${error}`); return { success: false }; } } @@ -70,7 +71,28 @@ export async function sendNotificationInternal( if (team_id === "preview") { return { success: true }; } + + const fifteenDaysAgo = new Date(); + fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15); + const { data, error } = await supabase_service + .from("user_notifications") + .select("*") + .eq("team_id", team_id) + .eq("notification_type", notificationType) + .gte("sent_date", fifteenDaysAgo.toISOString()); + + if (error) { + Logger.debug(`Error fetching notifications: ${error}`); + return { success: false }; + } + + if (data.length !== 0) { + // Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} in the last 15 days`); + return { success: false }; + } + + const { data: recentData, error: recentError } = await supabase_service .from("user_notifications") .select("*") .eq("team_id", team_id) @@ -78,14 +100,16 @@ export async function sendNotificationInternal( .gte("sent_date", startDateString) .lte("sent_date", endDateString); - if (error) { - console.error("Error fetching notifications: ", error); + if (recentError) { + Logger.debug(`Error fetching recent notifications: ${recentError}`); return { success: false }; } - if (data.length !== 0) { + if (recentData.length !== 0) { + // Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} within the specified date range`); return { success: false }; } else { + console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`); // get the emails from the user with the team_id const { data: emails, error: emailsError } = await supabase_service .from("users") @@ -93,7 +117,7 @@ export async function sendNotificationInternal( .eq("team_id", team_id); if (emailsError) { - console.error("Error fetching emails: ", emailsError); + Logger.debug(`Error fetching emails: ${emailsError}`); return { success: false }; } @@ -112,7 +136,7 @@ export async function sendNotificationInternal( ]); if (insertError) { - console.error("Error inserting notification record: ", insertError); + Logger.debug(`Error inserting notification record: ${insertError}`); return { success: false }; } diff --git a/apps/api/src/services/posthog.ts b/apps/api/src/services/posthog.ts index 5ec16e2e..a7419883 100644 --- a/apps/api/src/services/posthog.ts +++ b/apps/api/src/services/posthog.ts @@ -1,5 +1,6 @@ import { PostHog } from 'posthog-node'; import "dotenv/config"; +import { Logger } from '../../src/lib/logger'; export default function PostHogClient() { const posthogClient = new PostHog(process.env.POSTHOG_API_KEY, { @@ -19,7 +20,7 @@ class MockPostHog { export const posthog = process.env.POSTHOG_API_KEY ? PostHogClient() : (() => { - console.warn( + Logger.warn( "POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more." ); return new MockPostHog(); diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index 6c817a4a..d531c2db 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -1,5 +1,6 @@ import Queue from "bull"; import { Queue as BullQueue } from "bull"; +import { Logger } from "../lib/logger"; let webScraperQueue: BullQueue; @@ -16,7 +17,7 @@ export function getWebScraperQueue() { attempts: 5 } }); - console.log("Web scraper queue created"); + Logger.info("Web scraper queue created"); } return webScraperQueue; } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 2105863a..cda690dd 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -7,8 +7,10 @@ import { callWebhook } from "./webhook"; import { logJob } from "./logging/log_job"; import { initSDK } from '@hyperdx/node-opentelemetry'; import { Job } from "bull"; +import { Logger } from "../lib/logger"; +import { ScrapeEvents } from "../lib/scrape-events"; -if(process.env.ENV === 'production') { +if (process.env.ENV === 'production') { initSDK({ consoleCapture: true, additionalInstrumentations: [], @@ -18,7 +20,8 @@ if(process.env.ENV === 'production') { const wsq = getWebScraperQueue(); async function processJob(job: Job, done) { - console.log("taking job", job.id); + Logger.debug(`🐂 Worker taking job ${job.id}`); + try { job.progress({ current: 1, @@ -60,18 +63,18 @@ async function processJob(job: Job, done) { pageOptions: job.data.pageOptions, origin: job.data.origin, }); - console.log("job done", job.id); + Logger.debug(`🐂 Job done ${job.id}`); done(null, data); } catch (error) { - console.log("job errored", job.id, error); + Logger.error(`🐂 Job errored ${job.id} - ${error}`); if (await getWebScraperQueue().isPaused(false)) { - console.log("queue is paused, ignoring"); + Logger.debug("🐂Queue is paused, ignoring"); return; } if (error instanceof CustomError) { // Here we handle the error, then save the failed job - console.error(error.message); // or any other error handling + Logger.error(error.message); // or any other error handling logtail.error("Custom error while ingesting", { job_id: job.id, @@ -79,7 +82,7 @@ async function processJob(job: Job, done) { dataIngestionJob: error.dataIngestionJob, }); } - console.log(error); + Logger.error(error); logtail.error("Overall error ingesting", { job_id: job.id, @@ -117,3 +120,10 @@ wsq.process( Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)), processJob ); + +wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); +wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active")); +wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed")); +wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); +wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); +wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); diff --git a/apps/api/src/services/redis.ts b/apps/api/src/services/redis.ts index b720a330..173d2b78 100644 --- a/apps/api/src/services/redis.ts +++ b/apps/api/src/services/redis.ts @@ -1,14 +1,15 @@ import Redis from "ioredis"; import { redisRateLimitClient } from "./rate-limiter"; +import { Logger } from "../lib/logger"; // Listen to 'error' events to the Redis connection redisRateLimitClient.on("error", (error) => { try { if (error.message === "ECONNRESET") { - console.log("Connection to Redis Session Store timed out."); + Logger.error("Connection to Redis Session Rate Limit Store timed out."); } else if (error.message === "ECONNREFUSED") { - console.log("Connection to Redis Session Store refused!"); - } else console.log(error); + Logger.error("Connection to Redis Session Rate Limit Store refused!"); + } else Logger.error(error); } catch (error) {} }); @@ -16,15 +17,15 @@ redisRateLimitClient.on("error", (error) => { redisRateLimitClient.on("reconnecting", (err) => { try { if (redisRateLimitClient.status === "reconnecting") - console.log("Reconnecting to Redis Session Store..."); - else console.log("Error reconnecting to Redis Session Store."); + Logger.info("Reconnecting to Redis Session Rate Limit Store..."); + else Logger.error("Error reconnecting to Redis Session Rate Limit Store."); } catch (error) {} }); // Listen to the 'connect' event to Redis redisRateLimitClient.on("connect", (err) => { try { - if (!err) console.log("Connected to Redis Session Store!"); + if (!err) Logger.info("Connected to Redis Session Rate Limit Store!"); } catch (error) {} }); diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index fa6404d7..d34f7b52 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -1,4 +1,5 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; +import { Logger } from "../lib/logger"; // SupabaseService class initializes the Supabase client conditionally based on environment variables. class SupabaseService { @@ -10,13 +11,13 @@ class SupabaseService { // Only initialize the Supabase client if both URL and Service Token are provided. if (process.env.USE_DB_AUTHENTICATION === "false") { // Warn the user that Authentication is disabled by setting the client to null - console.warn( - "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m" + Logger.warn( + "Authentication is disabled. Supabase client will not be initialized." ); this.client = null; } else if (!supabaseUrl || !supabaseServiceToken) { - console.error( - "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m" + Logger.error( + "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable" ); } else { this.client = createClient(supabaseUrl, supabaseServiceToken); @@ -35,10 +36,15 @@ export const supabase_service: SupabaseClient = new Proxy( new SupabaseService(), { get: function (target, prop, receiver) { + if (process.env.USE_DB_AUTHENTICATION === "false") { + Logger.debug( + "Attempted to access Supabase client when it's not configured." + ); + } const client = target.getClient(); // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. if (client === null) { - console.error( + Logger.error( "Attempted to access Supabase client when it's not configured." ); return () => { diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 18378546..b0222ea3 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../src/lib/logger"; import { supabase_service } from "./supabase"; export const callWebhook = async (teamId: string, jobId: string,data: any) => { @@ -15,10 +16,7 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => { .eq("team_id", teamId) .limit(1); if (error) { - console.error( - `Error fetching webhook URL for team ID: ${teamId}`, - error.message - ); + Logger.error(`Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`); return null; } @@ -53,9 +51,6 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => { }), }); } catch (error) { - console.error( - `Error sending webhook for team ID: ${teamId}`, - error.message - ); + Logger.debug(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`); } }; diff --git a/apps/api/src/strings.ts b/apps/api/src/strings.ts index e7a6f21e..8edc57f1 100644 --- a/apps/api/src/strings.ts +++ b/apps/api/src/strings.ts @@ -1,2 +1,4 @@ export const errorNoResults = "No results found, please check the URL or contact us at help@mendable.ai to file a ticket."; + +export const clientSideError = "client-side exception has occurred" \ No newline at end of file diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index bd6b14e3..c9099d3b 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -5,7 +5,7 @@ the HTML content of a specified URL. It supports optional proxy settings and med from os import environ -from fastapi import FastAPI +from fastapi import FastAPI, Response from fastapi.responses import JSONResponse from playwright.async_api import Browser, async_playwright from pydantic import BaseModel @@ -39,14 +39,28 @@ async def shutdown_event(): """Event handler for application shutdown to close the browser.""" await browser.close() +@app.get("/health/liveness") +def liveness_probe(): + """Endpoint for liveness probe.""" + return JSONResponse(content={"status": "ok"}, status_code=200) + + +@app.get("/health/readiness") +async def readiness_probe(): + """Endpoint for readiness probe. Checks if the browser instance is ready.""" + if browser: + return JSONResponse(content={"status": "ok"}, status_code=200) + return JSONResponse(content={"status": "Service Unavailable"}, status_code=503) + + @app.post("/html") async def root(body: UrlModel): """ Endpoint to fetch and return HTML content of a given URL. - + Args: body (UrlModel): The URL model containing the target URL, wait time, and timeout. - + Returns: JSONResponse: The HTML content of the page. """ diff --git a/apps/test-suite/utils/supabase.ts b/apps/test-suite/utils/supabase.ts index aa19a8c9..abf7fd78 100644 --- a/apps/test-suite/utils/supabase.ts +++ b/apps/test-suite/utils/supabase.ts @@ -1,5 +1,6 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; import "dotenv/config"; + // SupabaseService class initializes the Supabase client conditionally based on environment variables. class SupabaseService { private client: SupabaseClient | null = null; @@ -11,12 +12,12 @@ class SupabaseService { if (process.env.USE_DB_AUTHENTICATION === "false") { // Warn the user that Authentication is disabled by setting the client to null console.warn( - "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m" + "Authentication is disabled. Supabase client will not be initialized." ); this.client = null; } else if (!supabaseUrl || !supabaseServiceToken) { console.error( - "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m" + "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable" ); } else { this.client = createClient(supabaseUrl, supabaseServiceToken); @@ -35,6 +36,11 @@ export const supabase_service: SupabaseClient = new Proxy( new SupabaseService(), { get: function (target, prop, receiver) { + if (process.env.USE_DB_AUTHENTICATION === "false") { + console.debug( + "Attempted to access Supabase client when it's not configured." + ); + } const client = target.getClient(); // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. if (client === null) { diff --git a/apps/ui/ingestion-ui/src/App.css b/apps/ui/ingestion-ui/src/App.css deleted file mode 100644 index b9d355df..00000000 --- a/apps/ui/ingestion-ui/src/App.css +++ /dev/null @@ -1,42 +0,0 @@ -#root { - max-width: 1280px; - margin: 0 auto; - padding: 2rem; - text-align: center; -} - -.logo { - height: 6em; - padding: 1.5em; - will-change: filter; - transition: filter 300ms; -} -.logo:hover { - filter: drop-shadow(0 0 2em #646cffaa); -} -.logo.react:hover { - filter: drop-shadow(0 0 2em #61dafbaa); -} - -@keyframes logo-spin { - from { - transform: rotate(0deg); - } - to { - transform: rotate(360deg); - } -} - -@media (prefers-reduced-motion: no-preference) { - a:nth-of-type(2) .logo { - animation: logo-spin infinite 20s linear; - } -} - -.card { - padding: 2em; -} - -.read-the-docs { - color: #888; -} diff --git a/apps/ui/ingestion-ui/src/App.tsx b/apps/ui/ingestion-ui/src/App.tsx index f0d98a3a..eb0e6954 100644 --- a/apps/ui/ingestion-ui/src/App.tsx +++ b/apps/ui/ingestion-ui/src/App.tsx @@ -1,4 +1,3 @@ -import "./App.css"; import FirecrawlComponent from "./components/ingestion"; function App() { diff --git a/apps/ui/ingestion-ui/src/assets/react.svg b/apps/ui/ingestion-ui/src/assets/react.svg deleted file mode 100644 index 6c87de9b..00000000 --- a/apps/ui/ingestion-ui/src/assets/react.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/apps/ui/ingestion-ui/src/components/ingestion.tsx b/apps/ui/ingestion-ui/src/components/ingestion.tsx index a17ee43a..6ca839e5 100644 --- a/apps/ui/ingestion-ui/src/components/ingestion.tsx +++ b/apps/ui/ingestion-ui/src/components/ingestion.tsx @@ -15,11 +15,12 @@ import { CollapsibleContent, CollapsibleTrigger, } from "@/components/ui/collapsible"; -import { ChevronDown } from "lucide-react"; +import { ChevronDown, ChevronLeft, ChevronRight } from "lucide-react"; -// Hardcoded values (not recommended for production) +//! Hardcoded values (not recommended for production) +//! Highly recommended to move all Firecrawl API calls to the backend (e.g. Next.js API route) const FIRECRAWL_API_URL = "https://api.firecrawl.dev"; // Replace with your actual API URL whether it is local or using Firecrawl Cloud -const FIRECRAWL_API_KEY = ""; // Replace with your actual API key +const FIRECRAWL_API_KEY = "fc-YOUR_API_KEY"; // Replace with your actual API key interface FormData { url: string; @@ -100,7 +101,8 @@ export default function FirecrawlComponent() { const [elapsedTime, setElapsedTime] = useState(0); const [showCrawlStatus, setShowCrawlStatus] = useState(false); const [isScraping, setIsScraping] = useState(false); - const [showAllUrls, setShowAllUrls] = useState(false); + const [currentPage, setCurrentPage] = useState(1); + const urlsPerPage = 10; useEffect(() => { let timer: NodeJS.Timeout; @@ -289,6 +291,7 @@ export default function FirecrawlComponent() { const data: ScrapeResult = await response.json(); newScrapeResults[url] = data; setCrawlStatus((prev) => ({ ...prev, current: index + 1 })); + setScrapeResults({ ...scrapeResults, ...newScrapeResults }); } catch (error) { console.error(`Error scraping ${url}:`, error); newScrapeResults[url] = { @@ -312,22 +315,35 @@ export default function FirecrawlComponent() { } } - setScrapeResults(newScrapeResults); setLoading(false); setIsScraping(false); }; + const handlePageChange = (newPage: number) => { + setCurrentPage(newPage); + }; + + const paginatedUrls = crawledUrls.slice( + (currentPage - 1) * urlsPerPage, + currentPage * urlsPerPage + ); + return (
- - - Extract web content with Firecrawl 🔥 + + + Extract web content + + Powered by Firecrawl 🔥 +
- Use this component to quickly build your own UI for Firecrawl. Plug - in your API key and the component will handle the rest. Learn more - on the{" "} + Use this component to quickly give your users the ability to connect + their AI apps to web data with Firecrawl. Learn more on the{" "}
- +
{selectedUrls.length === crawledUrls.length - ? "Unselect All" - : "Select All"} + ? `Unselect All (${selectedUrls.length})` + : `Select All (${selectedUrls.length})`} )} @@ -503,41 +548,57 @@ export default function FirecrawlComponent() { !isScraping && ( <>
    - {(showAllUrls ? crawledUrls : crawledUrls.slice(0, 10)).map( - (url, index) => ( -
  • - - setSelectedUrls((prev) => - prev.includes(url) - ? prev.filter((u) => u !== url) - : [...prev, url] - ) - } - /> - {url} -
  • - ) - )} -
- {crawledUrls.length > 10 && ( -
- -
- )} + + setSelectedUrls((prev) => + prev.includes(url) + ? prev.filter((u) => u !== url) + : [...prev, url] + ) + } + /> + + {url.length > 70 ? `${url.slice(0, 70)}...` : url} + + + ))} + +
+ + + Page {currentPage} of{" "} + {Math.ceil(crawledUrls.length / urlsPerPage)} + + +
)} - + {crawledUrls.length > 0 && !scrapingSelectedLoading && ( - ))}
diff --git a/docker-compose.yaml b/docker-compose.yaml index 98b00041..4974e8b8 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -13,6 +13,8 @@ x-common-service: &common-service - PORT=${PORT:-3002} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} - OPENAI_API_KEY=${OPENAI_API_KEY} + - OPENAI_BASE_URL=${OPENAI_BASE_URL} + - MODEL_NAME=${MODEL_NAME:-gpt-4o} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} - SERPER_API_KEY=${SERPER_API_KEY} - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY} diff --git a/examples/kubernetes/cluster-install/README.md b/examples/kubernetes/cluster-install/README.md index f874d829..736ae038 100644 --- a/examples/kubernetes/cluster-install/README.md +++ b/examples/kubernetes/cluster-install/README.md @@ -4,12 +4,12 @@ 2. Build Docker images, and host it in your Docker Registry (replace the target registry with your own) 1. API (which is also used as a worker image) 1. ```bash - docker build -t ghcr.io/winkk-dev/firecrawl:latest ../../apps/api + docker build --no-cache -t ghcr.io/winkk-dev/firecrawl:latest ../../../apps/api docker push ghcr.io/winkk-dev/firecrawl:latest ``` 2. Playwright 1. ```bash - docker build -t ghcr.io/winkk-dev/firecrawl-playwright:latest ../../apps/playwright-service + docker build --no-cache -t ghcr.io/winkk-dev/firecrawl-playwright:latest ../../../apps/playwright-service docker push ghcr.io/winkk-dev/firecrawl-playwright:latest ``` 3. Replace the image in [worker.yaml](worker.yaml), [api.yaml](api.yaml) and [playwright-service.yaml](playwright-service.yaml) diff --git a/examples/kubernetes/cluster-install/api.yaml b/examples/kubernetes/cluster-install/api.yaml index cdc69c3d..54ecfbf6 100644 --- a/examples/kubernetes/cluster-install/api.yaml +++ b/examples/kubernetes/cluster-install/api.yaml @@ -15,16 +15,35 @@ spec: imagePullSecrets: - name: docker-registry-secret containers: - - name: api - image: ghcr.io/winkk-dev/firecrawl:latest - args: [ "pnpm", "run", "start:production" ] - ports: - - containerPort: 3002 - envFrom: - - configMapRef: - name: firecrawl-config - - secretRef: - name: firecrawl-secret + - name: api + image: ghcr.io/winkk-dev/firecrawl:latest + imagePullPolicy: Always + args: [ "pnpm", "run", "start:production" ] + ports: + - containerPort: 3002 + envFrom: + - configMapRef: + name: firecrawl-config + #- secretRef: + # name: firecrawl-secret + livenessProbe: + httpGet: + path: /v0/health/liveness + port: 3002 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /v0/health/readiness + port: 3002 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 --- apiVersion: v1 kind: Service diff --git a/examples/kubernetes/cluster-install/configmap.yaml b/examples/kubernetes/cluster-install/configmap.yaml index b415d562..b56cfbcd 100644 --- a/examples/kubernetes/cluster-install/configmap.yaml +++ b/examples/kubernetes/cluster-install/configmap.yaml @@ -7,8 +7,7 @@ data: PORT: "3002" HOST: "0.0.0.0" REDIS_URL: "redis://redis:6379" - PLAYWRIGHT_MICROSERVICE_URL: "http://playwright-service:3000" + REDIS_RATE_LIMIT_URL: "redis://redis:6379" + PLAYWRIGHT_MICROSERVICE_URL: "http://playwright-service:3000/html" USE_DB_AUTHENTICATION: "false" - SUPABASE_ANON_TOKEN: "" - SUPABASE_URL: "" - SUPABASE_SERVICE_TOKEN: "" + HDX_NODE_BETA_MODE: "1" diff --git a/examples/kubernetes/cluster-install/playwright-service.yaml b/examples/kubernetes/cluster-install/playwright-service.yaml index ce794253..43cf15f0 100644 --- a/examples/kubernetes/cluster-install/playwright-service.yaml +++ b/examples/kubernetes/cluster-install/playwright-service.yaml @@ -1,3 +1,10 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: playwright-service-config +data: + PORT: "3000" +--- apiVersion: apps/v1 kind: Deployment metadata: @@ -15,13 +22,32 @@ spec: imagePullSecrets: - name: docker-registry-secret containers: - - name: playwright-service - image: ghcr.io/winkk-dev/firecrawl-playwright:latest - ports: - - containerPort: 3000 - envFrom: - - configMapRef: - name: firecrawl-config + - name: playwright-service + image: ghcr.io/winkk-dev/firecrawl-playwright:latest + imagePullPolicy: Always + ports: + - containerPort: 3000 + envFrom: + - configMapRef: + name: playwright-service-config + livenessProbe: + httpGet: + path: /health/liveness + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health/readiness + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 --- apiVersion: v1 kind: Service diff --git a/examples/kubernetes/cluster-install/worker.yaml b/examples/kubernetes/cluster-install/worker.yaml index 2b3b2e79..8e992cf1 100644 --- a/examples/kubernetes/cluster-install/worker.yaml +++ b/examples/kubernetes/cluster-install/worker.yaml @@ -15,10 +15,12 @@ spec: imagePullSecrets: - name: docker-registry-secret containers: - - name: worker - image: ghcr.io/winkk-dev/firecrawl:latest - envFrom: - - configMapRef: - name: firecrawl-config - - secretRef: - name: firecrawl-secret + - name: worker + image: ghcr.io/winkk-dev/firecrawl:latest + imagePullPolicy: Always + args: [ "pnpm", "run", "workers" ] + envFrom: + - configMapRef: + name: firecrawl-config + #- secretRef: + # name: firecrawl-secret