Files
firecrawl/apps/api/src/lib/extract/extract-redis.ts
T
Nicolas 6634d236bf (feat/fire-1) FIRE-1 (#1462)
* wip

* integrating smart-scrape

* integrate smartscrape into llmExtract

* wip

* smart scrape multiple links

* fixes

* fix

* wip

* it worked!

* wip. there's a bug on the batchExtract TypeError: Converting circular structure to JSON

* wip

* retry model

* retry models

* feat/scrape+json+extract interfaces ready

* vertex -> googleapi

* fix/transformArrayToObject. required params on schema is still a bug

* change model

* o3-mini -> gemini

* Update extractSmartScrape.ts

* sessionId

* sessionId

* Nick: f-0 start

* Update extraction-service-f0.ts

* Update types.ts

* Nick:

* Update queue-worker.ts

* Nick: new interface

* rename analyzeSchemaAndPrompt -> F0

* refactor: rename agent ID to model in types and extract logic

* agent

* id->model

* id->model

* refactor: standardize agent model handling and validation across extraction logic

* livecast agent

* (feat/f1) sdks (#1459)

* feat: add FIRE-1 agent support to Python and JavaScript SDKs

Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev>

* feat: add FIRE-1 agent support to scrape methods in both SDKs

Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev>

* feat: add prompt and sessionId to AgentOptions interface

Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev>

* Update index.ts

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: hello@sideguide.dev <hello@sideguide.dev>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>

* feat(v1): rate limits

* Update types.ts

* Update llmExtract.ts

* add cost tracking

* remove

* Update requests.http

* fix smart scrape cost calc

* log sm cost

* fix counts

* fix

* expose cost tracking

* models fix

* temp: skipLibcheck

* get rid of it

* fix ts

* dont skip lib check

* Update extractSmartScrape.ts

* Update queue-worker.ts

* Update smartScrape.ts

* Update requests.http

* fix(rate-limiter):

* types: fire-1 refine

* bill 150

* fix credits used on crawl

* ban from crawl

* route cost limit warning

* Update generic-ai.ts

* genres

* Update llmExtract.ts

* test server diff

* cletu

---------

Co-authored-by: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Co-authored-by: Thomas Kosmas <thomas510111@gmail.com>
Co-authored-by: Ademílson F. Tonato <ademilsonft@outlook.com>
Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: hello@sideguide.dev <hello@sideguide.dev>
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
2025-04-15 00:19:45 -07:00

121 lines
3.4 KiB
TypeScript

import { redisConnection } from "../../services/queue-service";
import { logger as _logger } from "../logger";
import { CostTracking } from "./extraction-service";
/**
 * String identifiers for the steps of an extract job.
 * Used as the type of the `step` field on ExtractedStep records.
 */
export enum ExtractStep {
INITIAL = "initial",
MAP = "map",
MAP_RERANK = "map-rerank",
MULTI_ENTITY = "multi-entity",
MULTI_ENTITY_SCRAPE = "multi-entity-scrape",
MULTI_ENTITY_EXTRACT = "multi-entity-extract",
SCRAPE = "scrape",
EXTRACT = "extract",
COMPLETE = "complete",
}
/**
 * Record of a single step of an extract job, as kept in a stored extract's
 * `steps` list.
 */
export type ExtractedStep = {
// Which step this record describes.
step: ExtractStep;
// Start / finish timestamps — presumably epoch milliseconds; TODO confirm against callers.
startedAt: number;
finishedAt: number;
// Error captured for this step, if any (untyped).
error?: any;
// Links associated with this step; truncated to STEPS_MAX_DISCOVERED_LINKS
// entries when the extract is written to Redis.
discoveredLinks?: string[];
};
/**
 * Shape of the extract record that is JSON-serialized and stored in Redis
 * under the key `"extract:" + id`.
 */
export type StoredExtract = {
id: string;
team_id: string;
// Creation timestamp — presumably epoch milliseconds; TODO confirm against callers.
createdAt: number;
status: "processing" | "completed" | "failed" | "cancelled";
// Error attached to the extract as a whole, if any (untyped).
error?: any;
// NOTE(review): the show* flags presumably control which optional sections
// are exposed to the API consumer — confirm where they are read.
showSteps?: boolean;
steps?: ExtractedStep[];
showLLMUsage?: boolean;
showSources?: boolean;
// NOTE(review): unit is not visible here (tokens? calls?) — confirm.
llmUsage?: number;
showCostTracking?: boolean;
costTracking?: CostTracking;
// Map from a string key to a list of strings; the meaning of keys and
// values is not visible in this file — verify against the writer.
sources?: {
[key: string]: string[];
};
};
// Reduce TTL to 6 hours instead of 24
// Lifetime of a stored extract key, in seconds (used as the Redis expiry
// whenever an extract is saved or updated).
const EXTRACT_TTL = 6 * 60 * 60;
// Maximum number of discoveredLinks entries kept per step when an extract is
// written to Redis.
const STEPS_MAX_DISCOVERED_LINKS = 100;
/**
 * Persists an extract record to Redis under the key `"extract:" + id`.
 *
 * Every step is reduced to its known fields and its `discoveredLinks` list is
 * truncated to the first STEPS_MAX_DISCOVERED_LINKS entries before storing.
 * The value and its expiry (EXTRACT_TTL seconds) are written with a single
 * `SET ... EX` command, so the key can never be left without a TTL if a
 * follow-up `EXPIRE` call were to fail.
 *
 * @param id - Extract identifier; used to build the Redis key.
 * @param extract - Record to store. It is not modified.
 */
export async function saveExtract(
  id: string,
  extract: StoredExtract,
): Promise<void> {
  _logger.debug("Saving extract " + id + " to Redis...");

  const key = "extract:" + id;

  // Only store essential data.
  const minimalExtract = {
    ...extract,
    steps: extract.steps?.map((step) => ({
      step: step.step,
      startedAt: step.startedAt,
      finishedAt: step.finishedAt,
      error: step.error,
      // Cap the number of links kept per step to bound the payload size.
      discoveredLinks: step.discoveredLinks?.slice(
        0,
        STEPS_MAX_DISCOVERED_LINKS,
      ),
    })),
  };

  // Atomic write + TTL (replaces the previous separate set/expire pair).
  await redisConnection.set(
    key,
    JSON.stringify(minimalExtract),
    "EX",
    EXTRACT_TTL,
  );
}
/**
 * Loads the extract stored in Redis under `"extract:" + id`.
 *
 * @param id - Extract identifier.
 * @returns The JSON-parsed record, or `null` when Redis returns nothing
 *   (or an empty string) for the key. The parsed value is not validated
 *   against the StoredExtract shape.
 */
export async function getExtract(id: string): Promise<StoredExtract | null> {
  const serialized = await redisConnection.get("extract:" + id);
  if (!serialized) {
    return null;
  }
  return JSON.parse(serialized);
}
/**
 * Merges a partial update into the extract stored under `"extract:" + id`
 * and writes the result back, resetting its expiry to EXTRACT_TTL seconds.
 *
 * Does nothing when no extract is currently stored for `id`.
 *
 * Steps handling:
 * - Steps supplied in `extract` are appended to the stored steps and only the
 *   most recent 5 are kept, to prevent unbounded growth.
 * - When `extract` carries no `steps`, the stored steps are preserved rather
 *   than being overwritten with `undefined`.
 * - Each step's `discoveredLinks` is truncated to the first
 *   STEPS_MAX_DISCOVERED_LINKS entries.
 *
 * The `extract` argument is never mutated.
 *
 * NOTE(review): this is a read-modify-write over two separate Redis calls, so
 * concurrent updates to the same id can overwrite each other.
 *
 * @param id - Extract identifier; used to build the Redis key.
 * @param extract - Fields to overwrite on the stored record.
 */
export async function updateExtract(
  id: string,
  extract: Partial<StoredExtract>,
): Promise<void> {
  const current = await getExtract(id);
  if (!current) return;

  // Keep only the last few steps to prevent unbounded growth.
  const maxStoredSteps = 5;

  // Fall back to the stored steps when the update does not touch them;
  // otherwise append the new steps and keep the most recent ones.
  const mergedSteps = extract.steps
    ? [...(current.steps ?? []), ...extract.steps].slice(-maxStoredSteps)
    : current.steps;

  const key = "extract:" + id;

  const minimalExtract = {
    ...current,
    ...extract,
    steps: mergedSteps?.map((step) => ({
      step: step.step,
      startedAt: step.startedAt,
      finishedAt: step.finishedAt,
      error: step.error,
      // Cap the number of links kept per step to bound the payload size.
      discoveredLinks: step.discoveredLinks?.slice(
        0,
        STEPS_MAX_DISCOVERED_LINKS,
      ),
    })),
  };

  // Atomic write + TTL refresh (replaces the previous separate set/expire pair).
  await redisConnection.set(
    key,
    JSON.stringify(minimalExtract),
    "EX",
    EXTRACT_TTL,
  );
}
/**
 * Computes when the extract stored under `"extract:" + id` will expire.
 *
 * The remaining lifetime is read with `PTTL` (milliseconds) and added to the
 * current time as an epoch offset. The arithmetic is done on epoch
 * milliseconds rather than through `Date#setMilliseconds`, which operates on
 * local-time fields and can be off by the DST shift when a transition falls
 * inside the TTL window. "Now" is sampled after the Redis round trip so the
 * network latency does not skew the result.
 *
 * NOTE(review): Redis PTTL returns a negative value when the key is missing
 * or has no expiry; in that case the returned date is approximately "now".
 *
 * @param id - Extract identifier.
 * @returns Expiry time, truncated to whole seconds.
 */
export async function getExtractExpiry(id: string): Promise<Date> {
  const ttl = await redisConnection.pttl("extract:" + id);
  const expiresAtMs = Date.now() + ttl;
  // Drop the sub-second part.
  return new Date(Math.floor(expiresAtMs / 1000) * 1000);
}