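Make non-essential columns nullable in the database models; add Field descriptions and non-negative (ge=0) constraints to the Pydantic schemas; remove the success print from init_database and define init_database in the FastAPI app module instead of importing it from db.db.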

This commit is contained in:
bolade
2025-09-26 15:24:42 +01:00
parent 0f7beca5e1
commit f2bbcb96f3
17 changed files with 196 additions and 59 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
-1
View File
@@ -32,7 +32,6 @@ db_dependency = Annotated[Session, Depends(get_db)]
def init_database(): def init_database():
"""Initialize the database by creating all tables""" """Initialize the database by creating all tables"""
Base.metadata.create_all(bind=engine) Base.metadata.create_all(bind=engine)
print("Database initialized successfully!")
def get_session_sync() -> Session: def get_session_sync() -> Session:
+12 -12
View File
@@ -55,12 +55,12 @@ class InvestorTable(Base, TimestampMixin):
id = Column(Integer, primary_key=True, index=True) id = Column(Integer, primary_key=True, index=True)
name = Column(String, nullable=False) name = Column(String, nullable=False)
description = Column(Text, nullable=True) description = Column(Text, nullable=True)
aum = Column(Integer, nullable=False) # Assets Under Management aum = Column(Integer, nullable=True) # Assets Under Management
check_size_lower = Column(Integer, nullable=False) # Lower bound check_size_lower = Column(Integer, nullable=True) # Lower bound
check_size_upper = Column(Integer, nullable=False) # Upper bound check_size_upper = Column(Integer, nullable=True) # Upper bound
geographic_focus = Column(String, nullable=False) geographic_focus = Column(String, nullable=True)
stage_focus = Column(Enum(InvestmentStage), nullable=False) stage_focus = Column(Enum(InvestmentStage), nullable=True)
number_of_investments = Column(Integer, default=0) number_of_investments = Column(Integer, default=0, nullable=True)
# Relationship to portfolio companies # Relationship to portfolio companies
portfolio_companies = relationship( portfolio_companies = relationship(
@@ -80,8 +80,8 @@ class InvestorMember(Base, TimestampMixin):
__tablename__ = "investor_members" __tablename__ = "investor_members"
id = Column(Integer, primary_key=True, index=True) id = Column(Integer, primary_key=True, index=True)
name = Column(String, nullable=False) name = Column(String, nullable=False)
role = Column(String, nullable=False) role = Column(String, nullable=True)
email = Column(String, nullable=False) email = Column(String, nullable=True)
investor_id = Column(Integer, ForeignKey("investors.id")) investor_id = Column(Integer, ForeignKey("investors.id"))
investor = relationship("InvestorTable", back_populates="team_members") investor = relationship("InvestorTable", back_populates="team_members")
@@ -92,8 +92,8 @@ class CompanyTable(Base, TimestampMixin):
id = Column(Integer, primary_key=True, index=True) id = Column(Integer, primary_key=True, index=True)
name = Column(String, nullable=False) name = Column(String, nullable=False)
industry = Column(String, nullable=False) industry = Column(String, nullable=True)
location = Column(String, nullable=False) location = Column(String, nullable=True)
description = Column(String, nullable=True) description = Column(String, nullable=True)
founded_year = Column(Integer, nullable=True) founded_year = Column(Integer, nullable=True)
website = Column(String, nullable=True) website = Column(String, nullable=True)
@@ -115,8 +115,8 @@ class CompanyMember(Base, TimestampMixin):
__tablename__ = "company_members" __tablename__ = "company_members"
id = Column(Integer, primary_key=True) id = Column(Integer, primary_key=True)
name = Column(String) name = Column(String)
linkedin = Column(String) linkedin = Column(String, nullable=True)
role = Column(String) role = Column(String, nullable=True)
company_id = Column(Integer, ForeignKey("companies.id"), nullable=False) company_id = Column(Integer, ForeignKey("companies.id"), nullable=False)
company = relationship("CompanyTable", back_populates="members") company = relationship("CompanyTable", back_populates="members")
+11 -2
View File
@@ -1,7 +1,7 @@
import io import io
import pandas as pd import pandas as pd
from db.db import db_dependency, init_database from db.db import Base, db_dependency, engine
from dotenv import load_dotenv from dotenv import load_dotenv
from fastapi import FastAPI, File, Form, UploadFile from fastapi import FastAPI, File, Form, UploadFile
from pydantic import BaseModel from pydantic import BaseModel
@@ -11,6 +11,13 @@ from services.llm_parser import InvestorProcessor
from services.querying import QueryProcessor from services.querying import QueryProcessor
load_dotenv() load_dotenv()
def init_database():
"""Initialize the database by creating all tables"""
Base.metadata.create_all(bind=engine)
init_database() init_database()
app = FastAPI() app = FastAPI()
@@ -34,7 +41,9 @@ def health():
@app.post("/parse-csv", tags=["CSV Upload"], response_model=list[dict]) @app.post("/parse-csv", tags=["CSV Upload"], response_model=list[dict])
async def parse_csv(db: db_dependency, file: UploadFile = File(...), is_investor: int = Form(...)): async def parse_csv(
db: db_dependency, file: UploadFile = File(...), is_investor: int = Form(...)
):
# Read uploaded CSV with pandas # Read uploaded CSV with pandas
content = await file.read() content = await file.read()
df = pd.read_csv(io.StringIO(content.decode("utf-8"))) df = pd.read_csv(io.StringIO(content.decode("utf-8")))
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
+173 -44
View File
@@ -1,7 +1,7 @@
from enum import Enum from enum import Enum
from typing import List, Optional from typing import List, Optional
from pydantic import BaseModel, field_validator from pydantic import BaseModel, Field, field_validator
class InvestmentStage(str, Enum): class InvestmentStage(str, Enum):
@@ -14,98 +14,227 @@ class InvestmentStage(str, Enum):
class SectorSchema(BaseModel): class SectorSchema(BaseModel):
id: int """
name: str Expert parser: Only extract sector information if clearly identifiable.
Leave name empty if uncertain about the sector classification.
"""
id: int = Field(
ge=0, description="Sector ID, must be 0 or greater. Use 0 if uncertain."
)
name: str = Field(
description="Sector name. Leave empty string if not clearly identifiable from the data."
)
class Config: class Config:
from_attributes = True from_attributes = True
class InvestorMemberSchema(BaseModel): class InvestorMemberSchema(BaseModel):
id: int """
name: str Expert parser: Only extract team member information if clearly identifiable.
role: str Leave fields empty if uncertain about the member details.
email: str """
investor_id: int
id: int = Field(
ge=0, description="Member ID, must be 0 or greater. Use 0 if uncertain."
)
name: str = Field(
description="Team member name. Leave empty string if not clearly identifiable."
)
role: str = Field(
description="Team member role/title. Leave empty string if not clearly identifiable."
)
email: str = Field(
description="Team member email. Leave empty string if not clearly identifiable or not provided."
)
investor_id: int = Field(
ge=0, description="Investor ID, must be 0 or greater. Use 0 if uncertain."
)
class Config: class Config:
from_attributes = True from_attributes = True
class CompanyMemberSchema(BaseModel): class CompanyMemberSchema(BaseModel):
id: int """
name: Optional[str] = None Expert parser: Only extract company member information if clearly identifiable.
linkedin: Optional[str] = None Leave fields empty if uncertain about the member details.
role: Optional[str] = None """
company_id: int
id: int = Field(
ge=0, description="Member ID, must be 0 or greater. Use 0 if uncertain."
)
name: Optional[str] = Field(
default="",
description="Company member name. Leave empty if not clearly identifiable.",
)
linkedin: Optional[str] = Field(
default="",
description="LinkedIn profile URL. Leave empty if not provided or uncertain.",
)
role: Optional[str] = Field(
default="",
description="Company member role/title. Leave empty if not clearly identifiable.",
)
company_id: int = Field(
ge=0, description="Company ID, must be 0 or greater. Use 0 if uncertain."
)
class Config: class Config:
from_attributes = True from_attributes = True
class CompanySchema(BaseModel): class CompanySchema(BaseModel):
id: int """
name: str Expert parser: Only extract company information if clearly identifiable.
industry: str Leave optional fields empty if uncertain. Integer values must be 0 or greater.
location: str """
description: Optional[str] = None # Fixed typo from 'nullabel'
founded_year: Optional[int] = None # Changed from str to int to match model id: int = Field(
website: Optional[str] = None ge=0, description="Company ID, must be 0 or greater. Use 0 if uncertain."
)
name: str = Field(
description="Company name. Leave empty string if not clearly identifiable."
)
industry: str = Field(
description="Company industry/sector. Leave empty string if not clearly identifiable."
)
location: str = Field(
description="Company location/address. Leave empty string if not clearly identifiable."
)
description: Optional[str] = Field(
default="",
description="Company description. Leave empty if not clearly available or uncertain.",
)
founded_year: Optional[int] = Field(
default=None,
ge=0,
description="Year company was founded, must be 0 or greater. Leave None if not clearly identifiable or uncertain.",
)
website: Optional[str] = Field(
default="",
description="Company website URL. Leave empty if not provided or uncertain.",
)
@field_validator("founded_year", mode="before") @field_validator("founded_year", mode="before")
@classmethod @classmethod
def validate_founded_year(cls, v): def validate_founded_year(cls, v):
if v is None or v == "Not Available" or v == "": """Expert parser: Only accept clearly identifiable founding years"""
if v is None or v == "Not Available" or v == "" or v == "Unknown":
return None return None
if isinstance(v, str): if isinstance(v, str):
try: try:
return int(v) year = int(v)
return year if year >= 0 else None
except ValueError: except ValueError:
return None return None
return v return v if isinstance(v, int) and v >= 0 else None
class Config: class Config:
from_attributes = True from_attributes = True
class InvestorSchema(BaseModel): class InvestorSchema(BaseModel):
id: int """
name: str Expert parser: Only extract investor information if clearly identifiable.
description: Optional[str] = None Leave optional fields empty if uncertain. All numeric values must be 0 or greater.
aum: int """
check_size_lower: int
check_size_upper: int id: int = Field(
geographic_focus: str ge=0, description="Investor ID, must be 0 or greater. Use 0 if uncertain."
stage_focus: InvestmentStage )
number_of_investments: int = 0 name: str = Field(
description="Investor name. Leave empty string if not clearly identifiable."
)
description: Optional[str] = Field(
default="",
description="Investor description. Leave empty if not clearly available or uncertain.",
)
aum: int = Field(
ge=0,
description="Assets Under Management in USD, must be 0 or greater. Use 0 if not clearly identifiable or uncertain.",
)
check_size_lower: int = Field(
ge=0,
description="Lower bound of typical investment check size in USD, must be 0 or greater. Use 0 if not clearly identifiable.",
)
check_size_upper: int = Field(
ge=0,
description="Upper bound of typical investment check size in USD, must be 0 or greater. Use 0 if not clearly identifiable.",
)
geographic_focus: str = Field(
description="Geographic investment focus. Leave empty string if not clearly identifiable."
)
stage_focus: InvestmentStage = Field(
description="Investment stage focus. Use SEED as default if uncertain."
)
number_of_investments: int = Field(
ge=0,
default=0,
description="Total number of investments made, must be 0 or greater. Use 0 if not clearly identifiable.",
)
class Config: class Config:
from_attributes = True from_attributes = True
class InvestorData(BaseModel): class InvestorData(BaseModel):
"""Comprehensive investor data schema for LLM processing""" """
Expert parser: Comprehensive investor data schema for LLM processing.
Only populate fields with clearly identifiable information. Leave lists empty if uncertain.
"""
investor: InvestorSchema investor: InvestorSchema = Field(
portfolio_companies: List[CompanySchema] = [] description="Core investor information. Only populate with clearly identifiable data."
team_members: List[InvestorMemberSchema] = [] # Changed from TeamMember )
sectors: List[SectorSchema] = [] portfolio_companies: List[CompanySchema] = Field(
default=[],
description="List of portfolio companies. Leave empty if not clearly identifiable.",
)
team_members: List[InvestorMemberSchema] = Field(
default=[],
description="List of team members. Leave empty if not clearly identifiable.",
)
sectors: List[SectorSchema] = Field(
default=[],
description="List of investment sectors. Leave empty if not clearly identifiable.",
)
class Config: class Config:
from_attributes = True from_attributes = True
class CompanyData(BaseModel): # Renamed from CompaniesData for consistency class CompanyData(BaseModel):
company: CompanySchema """
sectors: List[SectorSchema] = [] Expert parser: Comprehensive company data schema for LLM processing.
members: List[CompanyMemberSchema] = [] # Changed to match model relationship name Only populate fields with clearly identifiable information. Leave lists empty if uncertain.
investors: List[InvestorSchema] = [] """
company: CompanySchema = Field(
description="Core company information. Only populate with clearly identifiable data."
)
sectors: List[SectorSchema] = Field(
default=[],
description="List of company sectors. Leave empty if not clearly identifiable.",
)
members: List[CompanyMemberSchema] = Field(
default=[],
description="List of company members. Leave empty if not clearly identifiable.",
)
investors: List[InvestorSchema] = Field(
default=[],
description="List of investors. Leave empty if not clearly identifiable.",
)
class Config: class Config:
from_attributes = True from_attributes = True
class InvestorList(BaseModel): class InvestorList(BaseModel):
investors: List[InvestorData] = [] """Expert parser: List of investors with clearly identifiable information only."""
investors: List[InvestorData] = Field(
default=[],
description="List of investors. Leave empty if no clearly identifiable investors.",
)
Binary file not shown.
Binary file not shown.
Binary file not shown.