diff --git a/app/__pycache__/main.cpython-312.pyc b/app/__pycache__/main.cpython-312.pyc index 8132a27..b7576a7 100644 Binary files a/app/__pycache__/main.cpython-312.pyc and b/app/__pycache__/main.cpython-312.pyc differ diff --git a/app/db/__pycache__/__init__.cpython-312.pyc b/app/db/__pycache__/__init__.cpython-312.pyc index 3386371..811dde4 100644 Binary files a/app/db/__pycache__/__init__.cpython-312.pyc and b/app/db/__pycache__/__init__.cpython-312.pyc differ diff --git a/app/db/__pycache__/db.cpython-312.pyc b/app/db/__pycache__/db.cpython-312.pyc index 027f602..952f14e 100644 Binary files a/app/db/__pycache__/db.cpython-312.pyc and b/app/db/__pycache__/db.cpython-312.pyc differ diff --git a/app/db/__pycache__/models.cpython-312.pyc b/app/db/__pycache__/models.cpython-312.pyc index bf0f112..e7219a5 100644 Binary files a/app/db/__pycache__/models.cpython-312.pyc and b/app/db/__pycache__/models.cpython-312.pyc differ diff --git a/app/db/db.py b/app/db/db.py index 27eace0..8097fa8 100644 --- a/app/db/db.py +++ b/app/db/db.py @@ -32,7 +32,6 @@ db_dependency = Annotated[Session, Depends(get_db)] def init_database(): """Initialize the database by creating all tables""" Base.metadata.create_all(bind=engine) - print("Database initialized successfully!") def get_session_sync() -> Session: diff --git a/app/db/models.py b/app/db/models.py index 0d7f8fc..a99459a 100644 --- a/app/db/models.py +++ b/app/db/models.py @@ -55,12 +55,12 @@ class InvestorTable(Base, TimestampMixin): id = Column(Integer, primary_key=True, index=True) name = Column(String, nullable=False) description = Column(Text, nullable=True) - aum = Column(Integer, nullable=False) # Assets Under Management - check_size_lower = Column(Integer, nullable=False) # Lower bound - check_size_upper = Column(Integer, nullable=False) # Upper bound - geographic_focus = Column(String, nullable=False) - stage_focus = Column(Enum(InvestmentStage), nullable=False) - number_of_investments = Column(Integer, default=0) + aum = Column(Integer, nullable=True) # Assets Under Management + check_size_lower = Column(Integer, nullable=True) # Lower bound + check_size_upper = Column(Integer, nullable=True) # Upper bound + geographic_focus = Column(String, nullable=True) + stage_focus = Column(Enum(InvestmentStage), nullable=True) + number_of_investments = Column(Integer, default=0, nullable=True) # Relationship to portfolio companies portfolio_companies = relationship( @@ -80,8 +80,8 @@ class InvestorMember(Base, TimestampMixin): __tablename__ = "investor_members" id = Column(Integer, primary_key=True, index=True) name = Column(String, nullable=False) - role = Column(String, nullable=False) - email = Column(String, nullable=False) + role = Column(String, nullable=True) + email = Column(String, nullable=True) investor_id = Column(Integer, ForeignKey("investors.id")) investor = relationship("InvestorTable", back_populates="team_members") @@ -92,8 +92,8 @@ class CompanyTable(Base, TimestampMixin): id = Column(Integer, primary_key=True, index=True) name = Column(String, nullable=False) - industry = Column(String, nullable=False) - location = Column(String, nullable=False) + industry = Column(String, nullable=True) + location = Column(String, nullable=True) description = Column(String, nullable=True) founded_year = Column(Integer, nullable=True) website = Column(String, nullable=True) @@ -115,8 +115,8 @@ class CompanyMember(Base, TimestampMixin): __tablename__ = "company_members" id = Column(Integer, primary_key=True) name = Column(String) - linkedin = Column(String) - role = Column(String) + linkedin = Column(String, nullable=True) + role = Column(String, nullable=True) company_id = Column(Integer, ForeignKey("companies.id"), nullable=False) company = relationship("CompanyTable", back_populates="members") diff --git a/app/main.py b/app/main.py index ac839bf..c32d579 100644 --- a/app/main.py +++ b/app/main.py @@ -1,7 +1,7 @@ import io import pandas as pd -from db.db import db_dependency, init_database +from db.db import Base, db_dependency, engine from dotenv import load_dotenv from fastapi import FastAPI, File, Form, UploadFile from pydantic import BaseModel @@ -11,6 +11,13 @@ from services.llm_parser import InvestorProcessor from services.querying import QueryProcessor load_dotenv() + + +def init_database(): + """Initialize the database by creating all tables""" + Base.metadata.create_all(bind=engine) + + init_database() app = FastAPI() @@ -34,7 +41,9 @@ def health(): @app.post("/parse-csv", tags=["CSV Upload"], response_model=list[dict]) -async def parse_csv(db: db_dependency, file: UploadFile = File(...), is_investor: int = Form(...)): +async def parse_csv( + db: db_dependency, file: UploadFile = File(...), is_investor: int = Form(...) +): # Read uploaded CSV with pandas content = await file.read() df = pd.read_csv(io.StringIO(content.decode("utf-8"))) diff --git a/app/routers/__pycache__/__init__.cpython-312.pyc b/app/routers/__pycache__/__init__.cpython-312.pyc index 476b0ab..b90f7c0 100644 Binary files a/app/routers/__pycache__/__init__.cpython-312.pyc and b/app/routers/__pycache__/__init__.cpython-312.pyc differ diff --git a/app/routers/__pycache__/companies.cpython-312.pyc b/app/routers/__pycache__/companies.cpython-312.pyc index 1daaddd..2ab18c8 100644 Binary files a/app/routers/__pycache__/companies.cpython-312.pyc and b/app/routers/__pycache__/companies.cpython-312.pyc differ diff --git a/app/routers/__pycache__/investors.cpython-312.pyc b/app/routers/__pycache__/investors.cpython-312.pyc index eeeb4b1..7653798 100644 Binary files a/app/routers/__pycache__/investors.cpython-312.pyc and b/app/routers/__pycache__/investors.cpython-312.pyc differ diff --git a/app/schemas/__pycache__/__init__.cpython-312.pyc b/app/schemas/__pycache__/__init__.cpython-312.pyc index dd51e01..61947b4 100644 Binary files a/app/schemas/__pycache__/__init__.cpython-312.pyc and b/app/schemas/__pycache__/__init__.cpython-312.pyc differ diff --git a/app/schemas/__pycache__/py_schemas.cpython-312.pyc b/app/schemas/__pycache__/py_schemas.cpython-312.pyc index a8e483a..e923ca2 100644 Binary files a/app/schemas/__pycache__/py_schemas.cpython-312.pyc and b/app/schemas/__pycache__/py_schemas.cpython-312.pyc differ diff --git a/app/schemas/__pycache__/router_schemas.cpython-312.pyc b/app/schemas/__pycache__/router_schemas.cpython-312.pyc index c81f5fa..e90b44a 100644 Binary files a/app/schemas/__pycache__/router_schemas.cpython-312.pyc and b/app/schemas/__pycache__/router_schemas.cpython-312.pyc differ diff --git a/app/schemas/py_schemas.py b/app/schemas/py_schemas.py index 5d4d9de..ad902d0 100644 --- a/app/schemas/py_schemas.py +++ b/app/schemas/py_schemas.py @@ -1,7 +1,7 @@ from enum import Enum from typing import List, Optional -from pydantic import BaseModel, field_validator +from pydantic import BaseModel, Field, field_validator class InvestmentStage(str, Enum): @@ -14,98 +14,227 @@ class InvestmentStage(str, Enum): class SectorSchema(BaseModel): - id: int - name: str + """ + Expert parser: Only extract sector information if clearly identifiable. + Leave name empty if uncertain about the sector classification. + """ + + id: int = Field( + ge=0, description="Sector ID, must be 0 or greater. Use 0 if uncertain." + ) + name: str = Field( + description="Sector name. Leave empty string if not clearly identifiable from the data." + ) class Config: from_attributes = True class InvestorMemberSchema(BaseModel): - id: int - name: str - role: str - email: str - investor_id: int + """ + Expert parser: Only extract team member information if clearly identifiable. + Leave fields empty if uncertain about the member details. + """ + + id: int = Field( + ge=0, description="Member ID, must be 0 or greater. Use 0 if uncertain." + ) + name: str = Field( + description="Team member name. Leave empty string if not clearly identifiable." + ) + role: str = Field( + description="Team member role/title. Leave empty string if not clearly identifiable." + ) + email: str = Field( + description="Team member email. Leave empty string if not clearly identifiable or not provided." + ) + investor_id: int = Field( + ge=0, description="Investor ID, must be 0 or greater. Use 0 if uncertain." + ) class Config: from_attributes = True class CompanyMemberSchema(BaseModel): - id: int - name: Optional[str] = None - linkedin: Optional[str] = None - role: Optional[str] = None - company_id: int + """ + Expert parser: Only extract company member information if clearly identifiable. + Leave fields empty if uncertain about the member details. + """ + + id: int = Field( + ge=0, description="Member ID, must be 0 or greater. Use 0 if uncertain." + ) + name: Optional[str] = Field( + default="", + description="Company member name. Leave empty if not clearly identifiable.", + ) + linkedin: Optional[str] = Field( + default="", + description="LinkedIn profile URL. Leave empty if not provided or uncertain.", + ) + role: Optional[str] = Field( + default="", + description="Company member role/title. Leave empty if not clearly identifiable.", + ) + company_id: int = Field( + ge=0, description="Company ID, must be 0 or greater. Use 0 if uncertain." + ) class Config: from_attributes = True class CompanySchema(BaseModel): - id: int - name: str - industry: str - location: str - description: Optional[str] = None # Fixed typo from 'nullabel' - founded_year: Optional[int] = None # Changed from str to int to match model - website: Optional[str] = None + """ + Expert parser: Only extract company information if clearly identifiable. + Leave optional fields empty if uncertain. Integer values must be 0 or greater. + """ + + id: int = Field( + ge=0, description="Company ID, must be 0 or greater. Use 0 if uncertain." + ) + name: str = Field( + description="Company name. Leave empty string if not clearly identifiable." + ) + industry: str = Field( + description="Company industry/sector. Leave empty string if not clearly identifiable." + ) + location: str = Field( + description="Company location/address. Leave empty string if not clearly identifiable." + ) + description: Optional[str] = Field( + default="", + description="Company description. Leave empty if not clearly available or uncertain.", + ) + founded_year: Optional[int] = Field( + default=None, + ge=0, + description="Year company was founded, must be 0 or greater. Leave None if not clearly identifiable or uncertain.", + ) + website: Optional[str] = Field( + default="", + description="Company website URL. Leave empty if not provided or uncertain.", + ) @field_validator("founded_year", mode="before") @classmethod def validate_founded_year(cls, v): - if v is None or v == "Not Available" or v == "": + """Expert parser: Only accept clearly identifiable founding years""" + if v is None or v == "Not Available" or v == "" or v == "Unknown": return None if isinstance(v, str): try: - return int(v) + year = int(v) + return year if year >= 0 else None except ValueError: return None - return v + return v if isinstance(v, int) and v >= 0 else None class Config: from_attributes = True class InvestorSchema(BaseModel): - id: int - name: str - description: Optional[str] = None - aum: int - check_size_lower: int - check_size_upper: int - geographic_focus: str - stage_focus: InvestmentStage - number_of_investments: int = 0 - + """ + Expert parser: Only extract investor information if clearly identifiable. + Leave optional fields empty if uncertain. All numeric values must be 0 or greater. + """ + + id: int = Field( + ge=0, description="Investor ID, must be 0 or greater. Use 0 if uncertain." + ) + name: str = Field( + description="Investor name. Leave empty string if not clearly identifiable." + ) + description: Optional[str] = Field( + default="", + description="Investor description. Leave empty if not clearly available or uncertain.", + ) + aum: int = Field( + ge=0, + description="Assets Under Management in USD, must be 0 or greater. Use 0 if not clearly identifiable or uncertain.", + ) + check_size_lower: int = Field( + ge=0, + description="Lower bound of typical investment check size in USD, must be 0 or greater. Use 0 if not clearly identifiable.", + ) + check_size_upper: int = Field( + ge=0, + description="Upper bound of typical investment check size in USD, must be 0 or greater. Use 0 if not clearly identifiable.", + ) + geographic_focus: str = Field( + description="Geographic investment focus. Leave empty string if not clearly identifiable." + ) + stage_focus: InvestmentStage = Field( + description="Investment stage focus. Use SEED as default if uncertain." + ) + number_of_investments: int = Field( + ge=0, + default=0, + description="Total number of investments made, must be 0 or greater. Use 0 if not clearly identifiable.", + ) class Config: from_attributes = True class InvestorData(BaseModel): - """Comprehensive investor data schema for LLM processing""" + """ + Expert parser: Comprehensive investor data schema for LLM processing. + Only populate fields with clearly identifiable information. Leave lists empty if uncertain. + """ - investor: InvestorSchema - portfolio_companies: List[CompanySchema] = [] - team_members: List[InvestorMemberSchema] = [] # Changed from TeamMember - sectors: List[SectorSchema] = [] + investor: InvestorSchema = Field( + description="Core investor information. Only populate with clearly identifiable data." + ) + portfolio_companies: List[CompanySchema] = Field( + default=[], + description="List of portfolio companies. Leave empty if not clearly identifiable.", + ) + team_members: List[InvestorMemberSchema] = Field( + default=[], + description="List of team members. Leave empty if not clearly identifiable.", + ) + sectors: List[SectorSchema] = Field( + default=[], + description="List of investment sectors. Leave empty if not clearly identifiable.", + ) class Config: from_attributes = True -class CompanyData(BaseModel): # Renamed from CompaniesData for consistency - company: CompanySchema - sectors: List[SectorSchema] = [] - members: List[CompanyMemberSchema] = [] # Changed to match model relationship name - investors: List[InvestorSchema] = [] +class CompanyData(BaseModel): + """ + Expert parser: Comprehensive company data schema for LLM processing. + Only populate fields with clearly identifiable information. Leave lists empty if uncertain. + """ + + company: CompanySchema = Field( + description="Core company information. Only populate with clearly identifiable data." + ) + sectors: List[SectorSchema] = Field( + default=[], + description="List of company sectors. Leave empty if not clearly identifiable.", + ) + members: List[CompanyMemberSchema] = Field( + default=[], + description="List of company members. Leave empty if not clearly identifiable.", + ) + investors: List[InvestorSchema] = Field( + default=[], + description="List of investors. Leave empty if not clearly identifiable.", + ) class Config: from_attributes = True class InvestorList(BaseModel): - investors: List[InvestorData] = [] + """Expert parser: List of investors with clearly identifiable information only.""" + investors: List[InvestorData] = Field( + default=[], + description="List of investors. Leave empty if no clearly identifiable investors.", + ) diff --git a/app/services/__pycache__/__init__.cpython-312.pyc b/app/services/__pycache__/__init__.cpython-312.pyc index 0bd9599..b3e9b6b 100644 Binary files a/app/services/__pycache__/__init__.cpython-312.pyc and b/app/services/__pycache__/__init__.cpython-312.pyc differ diff --git a/app/services/__pycache__/llm_parser.cpython-312.pyc b/app/services/__pycache__/llm_parser.cpython-312.pyc index d575eb3..0837f36 100644 Binary files a/app/services/__pycache__/llm_parser.cpython-312.pyc and b/app/services/__pycache__/llm_parser.cpython-312.pyc differ diff --git a/app/services/__pycache__/querying.cpython-312.pyc b/app/services/__pycache__/querying.cpython-312.pyc index df4cdea..cadd1a4 100644 Binary files a/app/services/__pycache__/querying.cpython-312.pyc and b/app/services/__pycache__/querying.cpython-312.pyc differ