diff --git a/.github/workflows/fly-direct.yml b/.github/workflows/fly-direct.yml index d395ff31..8ec675fa 100644 --- a/.github/workflows/fly-direct.yml +++ b/.github/workflows/fly-direct.yml @@ -28,6 +28,7 @@ jobs: deploy: name: Deploy app runs-on: ubuntu-latest + timeout-minutes: 15 steps: - uses: actions/checkout@v3 - uses: superfly/flyctl-actions/setup-flyctl@master diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 9a5933b6..3a87849c 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -3,6 +3,7 @@ import { getRateLimiter } from "../services/rate-limiter"; import { AuthResponse, NotificationType, + PlanType, RateLimiterMode, } from "../types"; import { supabase_service } from "../services/supabase"; @@ -101,7 +102,7 @@ export async function supaAuthenticateUser( team_id?: string; error?: string; status?: number; - plan?: string; + plan?: PlanType; }> { const authHeader = req.headers.authorization; if (!authHeader) { @@ -349,10 +350,10 @@ export async function supaAuthenticateUser( return { success: true, team_id: subscriptionData.team_id, - plan: subscriptionData.plan ?? "", + plan: (subscriptionData.plan ?? 
"") as PlanType, }; } -function getPlanByPriceId(price_id: string) { +function getPlanByPriceId(price_id: string): PlanType { switch (price_id) { case process.env.STRIPE_PRICE_ID_STARTER: return "starter"; diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index e3a5c889..aefdb5e5 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -14,10 +14,11 @@ import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl import { getScrapeQueue } from "../../../src/services/queue-service"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import * as Sentry from "@sentry/node"; +import { getJobPriority } from "../../lib/job-priority"; export async function crawlController(req: Request, res: Response) { try { - const { success, team_id, error, status } = await authenticateUser( + const { success, team_id, error, status, plan } = await authenticateUser( req, res, RateLimiterMode.Crawl @@ -136,6 +137,7 @@ export async function crawlController(req: Request, res: Response) { crawlerOptions, pageOptions, team_id, + plan, createdAt: Date.now(), }; @@ -151,7 +153,15 @@ export async function crawlController(req: Request, res: Response) { ? 
null : await crawler.tryGetSitemap(); + if (sitemap !== null && sitemap.length > 0) { + let jobPriority = 20; + // If it is over 1000, we need to get the job priority, + // otherwise we can use the default priority of 20 + if(sitemap.length > 1000){ + // set base to 21 + jobPriority = await getJobPriority({plan, team_id, basePriority: 21}) + } const jobs = sitemap.map((x) => { const url = x.url; const uuid = uuidv4(); @@ -169,7 +179,7 @@ export async function crawlController(req: Request, res: Response) { }, opts: { jobId: uuid, - priority: 20, + priority: jobPriority, }, }; }); @@ -192,6 +202,10 @@ export async function crawlController(req: Request, res: Response) { } } else { await lockURL(id, sc, url); + + // Not needed, first one should be 15. + // const jobPriority = await getJobPriority({plan, team_id, basePriority: 10}) + const job = await addScrapeJob( { url, diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index c5084b28..f8706867 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -11,7 +11,7 @@ import * as Sentry from "@sentry/node"; export async function crawlPreviewController(req: Request, res: Response) { try { - const { success, error, status } = await authenticateUser( + const { success, error, status, team_id:a, plan } = await authenticateUser( req, res, RateLimiterMode.Preview @@ -89,6 +89,7 @@ export async function crawlPreviewController(req: Request, res: Response) { crawlerOptions, pageOptions, team_id, + plan, robots, createdAt: Date.now(), }; diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 0640918b..be99e8c1 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -2,7 +2,7 @@ import { ExtractorOptions, PageOptions } from './../../lib/entities'; import { Request, Response } from "express"; import { billTeam, checkTeamCredits } from 
"../../services/billing/credit_billing"; import { authenticateUser } from "../auth"; -import { RateLimiterMode } from "../../types"; +import { PlanType, RateLimiterMode } from "../../types"; import { logJob } from "../../services/logging/log_job"; import { Document } from "../../lib/entities"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function @@ -13,6 +13,7 @@ import { getScrapeQueue } from '../../services/queue-service'; import { v4 as uuidv4 } from "uuid"; import { Logger } from '../../lib/logger'; import * as Sentry from "@sentry/node"; +import { getJobPriority } from '../../lib/job-priority'; export async function scrapeHelper( jobId: string, @@ -22,7 +23,7 @@ export async function scrapeHelper( pageOptions: PageOptions, extractorOptions: ExtractorOptions, timeout: number, - plan?: string + plan?: PlanType ): Promise<{ success: boolean; error?: string; @@ -38,6 +39,8 @@ export async function scrapeHelper( return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; } + const jobPriority = await getJobPriority({plan, team_id, basePriority: 10}) + const job = await addScrapeJob({ url, mode: "single_urls", @@ -46,7 +49,7 @@ export async function scrapeHelper( pageOptions, extractorOptions, origin: req.body.origin ?? 
defaultOrigin, - }, {}, jobId); + }, {}, jobId, jobPriority); let doc; diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index b1c68b53..825abbe1 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -2,7 +2,7 @@ import { Request, Response } from "express"; import { WebScraperDataProvider } from "../../scraper/WebScraper"; import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing"; import { authenticateUser } from "../auth"; -import { RateLimiterMode } from "../../types"; +import { PlanType, RateLimiterMode } from "../../types"; import { logJob } from "../../services/logging/log_job"; import { PageOptions, SearchOptions } from "../../lib/entities"; import { search } from "../../search"; @@ -12,6 +12,7 @@ import { Logger } from "../../lib/logger"; import { getScrapeQueue } from "../../services/queue-service"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import * as Sentry from "@sentry/node"; +import { getJobPriority } from "../../lib/job-priority"; export async function searchHelper( jobId: string, @@ -20,6 +21,7 @@ export async function searchHelper( crawlerOptions: any, pageOptions: PageOptions, searchOptions: SearchOptions, + plan: PlanType ): Promise<{ success: boolean; error?: string; @@ -76,6 +78,8 @@ export async function searchHelper( return { success: true, error: "No search results found", returnCode: 200 }; } + const jobPriority = await getJobPriority({plan, team_id, basePriority: 20}); + // filter out social media links const jobDatas = res.map(x => { @@ -92,7 +96,7 @@ export async function searchHelper( }, opts: { jobId: uuid, - priority: 20, + priority: jobPriority, } }; }) @@ -135,7 +139,7 @@ export async function searchHelper( export async function searchController(req: Request, res: Response) { try { // make sure to authenticate user first, Bearer - const { success, team_id, error, status } = await authenticateUser( 
+ const { success, team_id, error, status, plan } = await authenticateUser( req, res, RateLimiterMode.Search @@ -176,6 +180,7 @@ export async function searchController(req: Request, res: Response) { crawlerOptions, pageOptions, searchOptions, + plan ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index bc403ddc..a30005c4 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -216,6 +216,8 @@ if (cluster.isMaster) { Logger.info(`Worker ${process.pid} started`); } + + // const sq = getScrapeQueue(); // sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); diff --git a/apps/api/src/lib/__tests__/job-priority.test.ts b/apps/api/src/lib/__tests__/job-priority.test.ts new file mode 100644 index 00000000..82477379 --- /dev/null +++ b/apps/api/src/lib/__tests__/job-priority.test.ts @@ -0,0 +1,134 @@ +import { + getJobPriority, + addJobPriority, + deleteJobPriority, +} from "../job-priority"; +import { redisConnection } from "../../services/queue-service"; +import { PlanType } from "../../types"; + +jest.mock("../../services/queue-service", () => ({ + redisConnection: { + sadd: jest.fn(), + srem: jest.fn(), + scard: jest.fn(), + expire: jest.fn(), + }, +})); + +describe("Job Priority Tests", () => { + afterEach(() => { + jest.clearAllMocks(); + }); + + test("addJobPriority should add job_id to the set and set expiration", async () => { + const team_id = "team1"; + const job_id = "job1"; + await addJobPriority(team_id, job_id); + expect(redisConnection.sadd).toHaveBeenCalledWith( + `limit_team_id:${team_id}`, + job_id + ); + expect(redisConnection.expire).toHaveBeenCalledWith( + `limit_team_id:${team_id}`, + 60 + ); + }); + + test("deleteJobPriority should remove job_id from the set", async () => { + const team_id = "team1"; + const job_id = "job1"; + await deleteJobPriority(team_id, job_id); + expect(redisConnection.srem).toHaveBeenCalledWith( + 
`limit_team_id:${team_id}`, + job_id + ); + }); + + test("getJobPriority should return correct priority based on plan and set length", async () => { + const team_id = "team1"; + const plan: PlanType = "standard"; + (redisConnection.scard as jest.Mock).mockResolvedValue(150); + + const priority = await getJobPriority({ plan, team_id }); + expect(priority).toBe(10); + + (redisConnection.scard as jest.Mock).mockResolvedValue(250); + const priorityExceeded = await getJobPriority({ plan, team_id }); + expect(priorityExceeded).toBe(20); // basePriority + Math.ceil((250 - 200) * 0.2) + }); + + test("getJobPriority should handle different plans correctly", async () => { + const team_id = "team1"; + + (redisConnection.scard as jest.Mock).mockResolvedValue(50); + let plan: PlanType = "hobby"; + let priority = await getJobPriority({ plan, team_id }); + expect(priority).toBe(10); + + (redisConnection.scard as jest.Mock).mockResolvedValue(150); + plan = "hobby"; + priority = await getJobPriority({ plan, team_id }); + expect(priority).toBe(25); // basePriority + Math.ceil((150 - 100) * 0.3) + + (redisConnection.scard as jest.Mock).mockResolvedValue(25); + plan = "free"; + priority = await getJobPriority({ plan, team_id }); + expect(priority).toBe(10); + + (redisConnection.scard as jest.Mock).mockResolvedValue(60); + plan = "free"; + priority = await getJobPriority({ plan, team_id }); + expect(priority).toBe(28); // basePriority + Math.ceil((60 - 25) * 0.5) + }); + + test("addJobPriority should reset expiration time when adding new job", async () => { + const team_id = "team1"; + const job_id1 = "job1"; + const job_id2 = "job2"; + + await addJobPriority(team_id, job_id1); + expect(redisConnection.expire).toHaveBeenCalledWith( + `limit_team_id:${team_id}`, + 60 + ); + + // Clear the mock calls + (redisConnection.expire as jest.Mock).mockClear(); + + // Add another job + await addJobPriority(team_id, job_id2); + expect(redisConnection.expire).toHaveBeenCalledWith( + 
`limit_team_id:${team_id}`, + 60 + ); + }); + + test("Set should expire after 60 seconds", async () => { + const team_id = "team1"; + const job_id = "job1"; + + jest.useFakeTimers(); + + await addJobPriority(team_id, job_id); + expect(redisConnection.expire).toHaveBeenCalledWith( + `limit_team_id:${team_id}`, + 60 + ); + + // Fast-forward time by 59 seconds + jest.advanceTimersByTime(59000); + + // The set should still exist + expect(redisConnection.scard).not.toHaveBeenCalled(); + + // Fast-forward time by 2 more seconds (total 61 seconds) + jest.advanceTimersByTime(2000); + + // Check if the set has been removed (scard should return 0) + (redisConnection.scard as jest.Mock).mockResolvedValue(0); + const setSize = await redisConnection.scard(`limit_team_id:${team_id}`); + expect(setSize).toBe(0); + + jest.useRealTimers(); + }); +}); diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 6640678d..9240018e 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -6,6 +6,7 @@ export type StoredCrawl = { crawlerOptions: any; pageOptions: any; team_id: string; + plan: string; robots?: string; cancelled?: boolean; createdAt: number; diff --git a/apps/api/src/lib/job-priority.ts b/apps/api/src/lib/job-priority.ts new file mode 100644 index 00000000..bb6158f9 --- /dev/null +++ b/apps/api/src/lib/job-priority.ts @@ -0,0 +1,105 @@ +import { redisConnection } from "../../src/services/queue-service"; +import { PlanType } from "../../src/types"; +import { Logger } from "./logger"; + +const SET_KEY_PREFIX = "limit_team_id:"; +/** + * Adds `job_id` to the team's Redis set (`limit_team_id:<team_id>`) and resets the set's TTL to 60 seconds. + * Redis failures are logged rather than rethrown, so callers are not interrupted by bookkeeping errors. + */ +export async function addJobPriority(team_id, job_id) { + try { + const setKey = SET_KEY_PREFIX + team_id; + + // Add scrape job id to the set + await redisConnection.sadd(setKey, job_id); + + // This approach will reset the expiration time to 60 seconds every time a new job is added to the set. 
+ await redisConnection.expire(setKey, 60); + } catch (e) { + Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id} - ${e}`); + } +} + +/** + * Removes `job_id` from the team's Redis set (`limit_team_id:<team_id>`). + * Redis failures are logged rather than rethrown, so callers are not interrupted by bookkeeping errors. + */ +export async function deleteJobPriority(team_id, job_id) { + try { + const setKey = SET_KEY_PREFIX + team_id; + + // remove job_id from the set + await redisConnection.srem(setKey, job_id); + } catch (e) { + Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id} - ${e}`); + } +} + +/** + * Derives a queue priority number from the size of the team's Redis job set. + * Up to the plan's bucket limit the result is `basePriority`; past it, + * `ceil(excess * planModifier)` is added, so the number grows with the set size. + * @returns The computed priority, or `basePriority` when the Redis lookup fails. + */ +export async function getJobPriority({ + plan, + team_id, + basePriority = 10, +}: { + plan: PlanType; + team_id: string; + basePriority?: number; +}): Promise<number> { + try { + const setKey = SET_KEY_PREFIX + team_id; + + // Get the length of the set + const setLength = await redisConnection.scard(setKey); + + // Determine the priority based on the plan and set length + let planModifier = 1; + let bucketLimit = 0; + + switch (plan) { + case "free": + bucketLimit = 25; + planModifier = 0.5; + break; + case "hobby": + bucketLimit = 100; + planModifier = 0.3; + break; + case "standard": + case "standardnew": + bucketLimit = 200; + planModifier = 0.2; + break; + case "growth": + case "growthdouble": + bucketLimit = 400; + planModifier = 0.1; + break; + + default: + bucketLimit = 25; + planModifier = 1; + break; + } + + // If the set size is within the bucket limit, just return the base priority + if (setLength <= bucketLimit) { + return basePriority; + } else { + // Otherwise, add ceil(excess * planModifier) to the base priority + return Math.ceil( + basePriority + Math.ceil((setLength - bucketLimit) * planModifier) + ); + } + } catch (e) { + Logger.error( + `Get job priority failed: ${team_id}, ${plan}, ${basePriority} - ${e}` + ); + return basePriority; + } +} diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 24e1ba85..2b476f52 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -8,10 +8,11 @@ async function addScrapeJobRaw( webScraperOptions: any, options: any, jobId: string, + jobPriority: number = 
10 ): Promise { return await getScrapeQueue().add(jobId, webScraperOptions, { ...options, - priority: webScraperOptions.crawl_id ? 20 : 10, + priority: jobPriority, jobId, }); } @@ -20,7 +21,9 @@ export async function addScrapeJob( webScraperOptions: WebScraperOptions, options: any = {}, jobId: string = uuidv4(), + jobPriority: number = 10 ): Promise { + if (Sentry.isInitialized()) { const size = JSON.stringify(webScraperOptions).length; return await Sentry.startSpan({ @@ -39,10 +42,10 @@ export async function addScrapeJob( baggage: Sentry.spanToBaggageHeader(span), size, }, - }, options, jobId); + }, options, jobId, jobPriority); }); } else { - return await addScrapeJobRaw(webScraperOptions, options, jobId); + return await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority); } } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index c2e3ba3c..31d70a0b 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -21,6 +21,8 @@ import { addCrawlJob, addCrawlJobDone, crawlToCrawler, finishCrawl, getCrawl, ge import { StoredCrawl } from "../lib/crawl-redis"; import { addScrapeJob } from "./queue-jobs"; import { supabaseGetJobById } from "../../src/lib/supabase-jobs"; +import { addJobPriority, deleteJobPriority, getJobPriority } from "../../src/lib/job-priority"; +import { PlanType } from "../types"; if (process.env.ENV === "production") { initSDK({ @@ -50,6 +52,7 @@ const processJobInternal = async (token: string, job: Job) => { await job.extendLock(token, jobLockExtensionTime); }, jobLockExtendInterval); + await addJobPriority(job.data.team_id, job.id ); let err = null; try { const result = await processJob(job, token); @@ -67,6 +70,7 @@ const processJobInternal = async (token: string, job: Job) => { err = error; await job.moveToFailed(error, token, false); } finally { + await deleteJobPriority(job.data.team_id, job.id ); clearInterval(extendLockInterval); } @@ -251,6 +255,16 
@@ async function processJob(job: Job, token: string) { for (const link of links) { if (await lockURL(job.data.crawl_id, sc, link)) { + + // This seems to work really well + const jobPriority = await getJobPriority({plan:sc.plan as PlanType, team_id: sc.team_id, basePriority: job.data.crawl_id ? 20 : 10}) + const jobId = uuidv4(); + + // console.log("plan: ", sc.plan); + // console.log("team_id: ", sc.team_id) + // console.log("base priority: ", job.data.crawl_id ? 20 : 10) + // console.log("job priority: " , jobPriority, "\n\n\n") + const newJob = await addScrapeJob({ url: link, mode: "single_urls", @@ -260,7 +274,7 @@ async function processJob(job: Job, token: string) { origin: job.data.origin, crawl_id: job.data.crawl_id, v1: job.data.v1, - }); + }, {}, jobId, jobPriority); await addCrawlJob(job.data.crawl_id, newJob.id); } diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 1da2c703..c57969f2 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -116,8 +116,8 @@ export interface AuthResponse { team_id?: string; error?: string; status?: number; - plan?: string; api_key?: string; + plan?: PlanType; } @@ -140,4 +140,15 @@ export type ScrapeLog = { html?: string; ipv4_support?: boolean | null; ipv6_support?: boolean | null; -}; \ No newline at end of file +}; + +export type PlanType = + | "starter" + | "standard" + | "scale" + | "hobby" + | "standardnew" + | "growth" + | "growthdouble" + | "free" + | ""; \ No newline at end of file diff --git a/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb b/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb new file mode 100644 index 00000000..ee14f147 --- /dev/null +++ b/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb @@ -0,0 +1,259 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Web Scraping and Extraction with Firecrawl and 
Claude\n", + "\n", + "This notebook demonstrates how to use Firecrawl to scrape web content and Claude to extract structured data from it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Import Required Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import json\n", + "from firecrawl import FirecrawlApp\n", + "from anthropic import Anthropic\n", + "from dotenv import load_dotenv\n", + "\n", + "# Load environment variables\n", + "load_dotenv()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Set Up API Keys and URL" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "URL to scrape: https://mendable.ai\n" + ] + } + ], + "source": [ + "# Retrieve API keys from environment variables\n", + "anthropic_api_key = os.getenv(\"ANTHROPIC_API_KEY\")\n", + "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", + "\n", + "# Set the URL to scrape\n", + "url = \"https://mendable.ai\" # Replace with the actual URL you want to scrape\n", + "\n", + "print(f\"URL to scrape: {url}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Initialize Firecrawl and Anthropic Clients" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Firecrawl and Anthropic clients initialized.\n" + ] + } + ], + "source": [ + "# Initialize FirecrawlApp and Anthropic client\n", + "firecrawl_app = FirecrawlApp(api_key=firecrawl_api_key)\n", + "anthropic_client = Anthropic(api_key=anthropic_api_key)\n", + "\n", + "print(\"Firecrawl and Anthropic clients 
initialized.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Scrape the URL using Firecrawl" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Page content scraped. Length: 16199 characters\n" + ] + } + ], + "source": [ + "# Scrape the URL using Firecrawl\n", + "page_content = firecrawl_app.scrape_url(url, params={\"pageOptions\": {\"onlyMainContent\": True}})\n", + "\n", + "print(f\"Page content scraped. Length: {len(page_content['content'])} characters\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Prepare the Prompt for Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prompt prepared for Claude.\n" + ] + } + ], + "source": [ + "# Prepare the prompt for Claude\n", + "prompt = f\"\"\"Analyze the following webpage content and extract the following information:\n", + "1. The title of the page\n", + "2. Whether the company is part of Y Combinator (YC)\n", + "3. 
Whether the company/product is open source\n", + "\n", + "Return the information in JSON format with the following schema:\n", + "{{\n", + " \"main_header_title\": string,\n", + " \"is_yc_company\": boolean,\n", + " \"is_open_source\": boolean\n", + "}}\n", + "\n", + "Webpage content:\n", + "{page_content['content']}\n", + "\n", + "Return only the JSON, nothing else.\"\"\"\n", + "\n", + "print(\"Prompt prepared for Claude.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Query Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Claude response received.\n" + ] + } + ], + "source": [ + "# Query Claude\n", + "response = anthropic_client.messages.create(\n", + " model=\"claude-3-opus-20240229\",\n", + " max_tokens=1000,\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt}\n", + " ]\n", + ")\n", + "\n", + "print(\"Claude response received.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Parse and Display the Result" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"title\": \"Just in time answers for Sales and Support\",\n", + " \"is_yc_company\": true,\n", + " \"is_open_source\": false\n", + "}\n" + ] + } + ], + "source": [ + "# Parse and print the result\n", + "result = json.loads(response.content[0].text)\n", + "print(json.dumps(result, indent=2))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + 
"nbformat": 4, + "nbformat_minor": 4 +}