6634d236bf
* wip * integrating smart-scrape * integrate smartscrape into llmExtract * wip * smart scrape multiple links * fixes * fix * wip * it worked! * wip. there's a bug on the batchExtract TypeError: Converting circular structure to JSON * wip * retry model * retry models * feat/scrape+json+extract interfaces ready * vertex -> googleapi * fix/transformArrayToObject. required params on schema is still a bug * change model * o3-mini -> gemini * Update extractSmartScrape.ts * sessionId * sessionId * Nick: f-0 start * Update extraction-service-f0.ts * Update types.ts * Nick: * Update queue-worker.ts * Nick: new interface * rename analyzeSchemaAndPrompt -> F0 * refactor: rename agent ID to model in types and extract logic * agent * id->model * id->model * refactor: standardize agent model handling and validation across extraction logic * livecast agent * (feat/f1) sdks (#1459) * feat: add FIRE-1 agent support to Python and JavaScript SDKs Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev> * feat: add FIRE-1 agent support to scrape methods in both SDKs Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev> * feat: add prompt and sessionId to AgentOptions interface Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev> * Update index.ts --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: hello@sideguide.dev <hello@sideguide.dev> Co-authored-by: Nicolas <nicolascamara29@gmail.com> * feat(v1): rate limits * Update types.ts * Update llmExtract.ts * add cost tracking * remove * Update requests.http * fix smart scrape cost calc * log sm cost * fix counts * fix * expose cost tracking * models fix * temp: skipLibcheck * get rid of it * fix ts * dont skip lib check * Update extractSmartScrape.ts * Update queue-worker.ts * Update smartScrape.ts * Update requests.http * fix(rate-limiter): * types: fire-1 refine * bill 150 * fix credits used on crawl * ban from crawl * route cost limit warning * Update 
generic-ai.ts * genres * Update llmExtract.ts * test server diff * cletu --------- Co-authored-by: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Co-authored-by: Thomas Kosmas <thomas510111@gmail.com> Co-authored-by: Ademílson F. Tonato <ademilsonft@outlook.com> Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: hello@sideguide.dev <hello@sideguide.dev> Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
165 lines
5.0 KiB
TypeScript
165 lines
5.0 KiB
TypeScript
import { z } from "zod";
|
|
import { logger } from "../../../lib/logger";
|
|
import { robustFetch } from "./fetch";
|
|
import fs from "fs/promises";
|
|
import { configDotenv } from "dotenv";
|
|
|
|
configDotenv();
|
|
|
|
/**
 * Token accounting entry: integer input/output token counts and a total
 * cost that is either a number or `null`.
 *
 * Declared at module scope so the schema object is built once when the
 * module loads rather than on every call.
 *
 * NOTE(review): no live code visible in this file references this schema;
 * it appears to be kept for a per-model usage map — confirm before removing.
 */
const tokenUsageDetailSchema = z.object({
  input_tokens: z.number().int(),
  output_tokens: z.number().int(),
  // `nullable()` accepts a number or an explicit null.
  total_cost: z.number().nullable(),
});
|
|
|
|
/**
 * Validates one entry of the `scrapedPages` array in a smart-scrape result.
 *
 * - `html`   — string payload for the page.
 * - `reason` — string supplied alongside the page.
 * - `page`   — page identifier; accepted as either a string or a number.
 */
const scrapedPageSchema = z.object({
  html: z.string(),
  reason: z.string(),
  page: z.union([z.string(), z.number()]),
});
|
|
|
|
/**
 * Validates the body returned by the `/smart-scrape` endpoint.
 *
 * `tokenUsage` is validated as a single number. An earlier shape — a record
 * keyed by model name whose values match `tokenUsageDetailSchema` — is kept
 * below as a commented-out alternative.
 */
const smartScrapeResultSchema = z.object({
  sessionId: z.string(),
  success: z.boolean(),
  scrapedPages: z.array(scrapedPageSchema),
  tokenUsage: z.number(),
  // Alternative (disabled) per-model shape for `tokenUsage`:
  //   z.record(
  //     z.string(),             // key: model name
  //     tokenUsageDetailSchema, // value: usage detail
  //   ),
});
|
|
|
|
/**
 * Static type of a validated smart-scrape response, derived from
 * `smartScrapeResultSchema` so the runtime check and the compile-time type
 * cannot drift apart.
 */
export type SmartScrapeResult = z.infer<typeof smartScrapeResultSchema>;
|
|
|
|
/** Error message used to signal that the scrape was stopped for cost reasons. */
const COST_LIMIT_EXCEEDED_MESSAGE = "Cost limit exceeded";

/**
 * Loose view of a smart-scrape response, used only to detect failure
 * payloads (`success: false` plus an `error` string) that arrive in place of
 * a normal result.
 */
interface SmartScrapeFailureView {
  success: boolean;
  error?: string;
  details?: string;
  tokenUsage?: unknown;
}

/** Flat, log-friendly description of a caught value. */
interface ErrorLogSummary {
  message: string;
  name: string;
  stack?: string;
  cause?: unknown;
}

/** Reduces an error's `cause` to primitives so it can be logged safely. */
function summarizeCauseForLog(cause: unknown): unknown {
  if (!cause) {
    return undefined;
  }

  if (cause instanceof Error) {
    return { message: cause.message, name: cause.name, stack: cause.stack };
  }

  if (typeof cause === "object") {
    // Keep only primitive fields; nested objects are dropped because they
    // are the usual source of circular references.
    const primitives = Object.fromEntries(
      Object.entries(cause).filter(
        ([, value]) => value !== null && typeof value !== "object",
      ),
    );
    const nested = (cause as { error?: unknown }).error;
    const nestedMessage =
      nested != null ? (nested as { message?: unknown }).message : undefined;
    return { ...primitives, error: nestedMessage || nested };
  }

  return String(cause);
}

/** Builds the log summary for any caught value, `Error` or not. */
function summarizeErrorForLog(error: unknown): ErrorLogSummary {
  if (!(error instanceof Error)) {
    return { message: String(error), name: "Unknown" };
  }

  return {
    message: error.message,
    name: error.name,
    stack: error.stack,
    cause: summarizeCauseForLog(error.cause),
  };
}

/** Serializes a log summary without ever throwing from inside a `catch`. */
function stringifyErrorSummary(summary: ErrorLogSummary): string {
  try {
    return JSON.stringify(summary);
  } catch {
    // `cause.error` can still be an arbitrary object (possibly circular).
    // Fall back to the primitive fields so the original failure is not
    // masked by a serialization error.
    return JSON.stringify({
      message: summary.message,
      name: summary.name,
      stack: summary.stack,
      cause: "[unserializable]",
    });
  }
}

/**
 * Sends a POST request to the internal `/smart-scrape` endpoint
 * (`${SMART_SCRAPE_API_URL}/smart-scrape`) and returns the validated result.
 *
 * @param url The URL of the page to scrape.
 * @param prompt The prompt guiding the scrape.
 * @param sessionId Optional identifier forwarded to the service as
 *   `userProvidedId`.
 * @returns A promise that resolves to an object matching `SmartScrapeResult`.
 * @throws An `Error` with message `"Cost limit exceeded"` and
 *   `cause: { tokenUsage }` when the service reports that the cost limit was
 *   hit. This error is propagated as-is so callers can match on the message
 *   and read `cause.tokenUsage` directly.
 * @throws An `Error` with message `Failed to smart scrape URL: <url>` and the
 *   underlying failure as `cause` for every other failure.
 */
export async function smartScrape(
  url: string,
  prompt: string,
  sessionId?: string,
): Promise<SmartScrapeResult> {
  try {
    logger.info("Initiating smart scrape request", { url, prompt });

    // The schema type is passed as the generic parameter to robustFetch and
    // the schema instance is passed for validation.
    const response = await robustFetch<typeof smartScrapeResultSchema>({
      url: `${process.env.SMART_SCRAPE_API_URL}/smart-scrape`,
      method: "POST",
      body: {
        url,
        prompt,
        // Normalizes a runtime `null` to `undefined`.
        userProvidedId: sessionId ?? undefined,
        models: {
          thinkingModel: {
            model: "gemini-2.5-pro-preview-03-25",
            provider: "vertex",
            supportTools: true,
            toolChoice: "required",
            cost: {
              input: 1.3,
              output: 5,
            },
          },
          toolModel: {
            model: "gemini-2.0-flash",
            provider: "google",
          },
        },
      },
      schema: smartScrapeResultSchema,
      logger,
      mock: null, // Keep mock null if not mocking
    });

    // The service may answer with a failure payload instead of a result;
    // inspect the response through a looser view to detect it.
    const failure: SmartScrapeFailureView = response;

    if (failure && failure.success === false && failure.error) {
      if (failure.error === COST_LIMIT_EXCEEDED_MESSAGE) {
        throw new Error(COST_LIMIT_EXCEEDED_MESSAGE, {
          cause: { tokenUsage: failure.tokenUsage },
        });
      }

      logger.error("Smart scrape returned error response", {
        url,
        prompt,
        error: failure.error,
        details: failure.details || "No details provided",
      });
      throw new Error(
        `Smart scrape failed: ${failure.error}${failure.details ? ` - ${failure.details}` : ""}`,
      );
    }

    logger.info("Smart scrape successful", {
      url,
      prompt,
      sessionId: response.sessionId,
    });

    logger.info("Smart scrape cost $" + response.tokenUsage);

    return response;
  } catch (error) {
    // The cost-limit error carries structured data (`cause.tokenUsage`) that
    // callers need. Wrapping it would hide the message and push the usage
    // down to `cause.cause`, so let it through untouched.
    if (
      error instanceof Error &&
      error.message === COST_LIMIT_EXCEEDED_MESSAGE
    ) {
      throw error;
    }

    logger.error("Smart scrape request failed", {
      url,
      prompt,
      error: stringifyErrorSummary(summarizeErrorForLog(error)),
    });

    // Rethrow with context; the original failure stays reachable via `cause`.
    throw new Error(`Failed to smart scrape URL: ${url}`, { cause: error });
  }
}
|