Implement investor processing and querying functionality
- Added InvestorProcessor class for processing CSV data in batches and saving to SQL and vector databases.
- Introduced QueryProcessor class for querying investor information from SQL and vector databases.
- Integrated OpenAI's ChatGPT for structured output generation.
- Implemented data cleaning and control character removal in CSV processing.
- Added asynchronous processing capabilities for batch handling.
- Established connection to ChromaDB for vector storage of investor descriptions.
- Defined structured output schemas using Pydantic for investor data validation.
- Enhanced settings management for API key and database configurations.
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
from typing import Optional
|
||||
|
||||
import chromadb
|
||||
from langchain_openai import ChatOpenAI
|
||||
from pydantic_schemas import Investor, InvestorList
|
||||
from settings import settings
|
||||
|
||||
# Add these imports for your databases
|
||||
# from sqlalchemy.ext.asyncio import AsyncSession
|
||||
# from your_vector_db import VectorDBClient
|
||||
|
||||
|
||||
class QueryProcessor:
    """Look up investor data via SQL, a Chroma collection, or a structured LLM.

    The processor owns three independent back ends:

    * ``sql_session`` -- an optional, caller-supplied session object.
    * ``collection`` -- the ``investor_descriptions`` Chroma collection.
    * ``structured_llm`` -- a chat model constrained to emit ``InvestorList``.
    """

    def __init__(
        self,
        sql_session: Optional[object] = None,
        vector_db_client: Optional[object] = None,
    ):
        """Wire up the LLM and both database handles.

        Args:
            sql_session: Session used by ``query_sql_database``; when omitted,
                SQL look-ups return ``None``.
            vector_db_client: Chroma client to use. When omitted, a persistent
                client rooted at ``./chroma_db`` is created.
        """
        self.llm = ChatOpenAI(
            api_key=settings.OPENROUTER_API_KEY,
            base_url="https://openrouter.ai/api/v1",
            model="openai/gpt-oss-120b:free",
            temperature=0,
        )
        self.structured_llm = self.llm.with_structured_output(InvestorList)
        self.sql_session = sql_session

        # Only fall back to the on-disk client when the caller did not inject
        # one; previously the injected client was silently overwritten.
        if vector_db_client is None:
            vector_db_client = chromadb.PersistentClient(path="./chroma_db")
        self.vector_db_client = vector_db_client
        self.collection = self.vector_db_client.get_or_create_collection(
            name="investor_descriptions",
            metadata={
                "description": "Investor descriptions and investment thesis focus"
            },
        )

    def query_sql_database(self, query: str) -> "Optional[InvestorList]":
        """Run ``query`` on the SQL session and wrap the rows in an InvestorList.

        Returns:
            ``None`` when no SQL session was configured, otherwise an
            ``InvestorList`` built from ``result.scalars().all()``.
        """
        if self.sql_session is None:
            return None

        # NOTE(review): SQLAlchemy 2.x rejects bare strings passed to
        # ``execute``; callers probably need to wrap the statement in
        # ``sqlalchemy.text`` (not imported by this file) -- confirm.
        result = self.sql_session.execute(query)
        investors = list(result.scalars().all())
        return InvestorList(investors=investors)

    def query_vector_database(
        self, query: str, n_results: int = 10
    ) -> "Optional[InvestorList]":
        """Search the Chroma collection for investors matching ``query``.

        Args:
            query: Free-text search string.
            n_results: Maximum number of matches requested from the collection.

        Returns:
            ``None`` when no client/collection is available, otherwise an
            ``InvestorList`` with one ``Investor`` per match that has metadata.
        """
        if self.vector_db_client is None or self.collection is None:
            return None

        # Queries are issued against the collection, not the client.
        results = self.collection.query(query_texts=[query], n_results=n_results)

        # Results are grouped per query text; exactly one text was sent, so
        # only the first group matters. Entries without metadata are skipped.
        metadata_groups = results.get("metadatas") or []
        matches = metadata_groups[0] if metadata_groups else []

        # Assumes each metadata dict carries the Investor fields -- TODO
        # confirm against the ingestion code.
        investors = [Investor(**metadata) for metadata in matches if metadata]
        return InvestorList(investors=investors)

    def process_query(self, question: str) -> "InvestorList":
        """Send ``question`` to the structured LLM and return its answer."""
        # The structured-output wrapper is a Runnable, so it is driven through
        # ``invoke`` rather than the legacy ``predict`` helper.
        return self.structured_llm.invoke(question)
|
||||
Reference in New Issue
Block a user