Update .gitignore to exclude __pycache__ directories; make several schema fields nullable (e.g. aum, check sizes, geographic_focus, role, email) for greater flexibility; and increase the InvestorProcessor batch size from 15 to 20 for more efficient processing.

This commit is contained in:
bolade
2025-09-26 15:56:29 +01:00
parent f2bbcb96f3
commit abac19c6ae
7 changed files with 32 additions and 29 deletions
+3 -2
View File
@@ -8,8 +8,9 @@
/chroma_db
/*__pycache__*/
*__pycache__
/*.db
/*.cypython-*
*.cypython
Binary file not shown.
+7 -7
View File
@@ -145,31 +145,31 @@ class InvestorSchema(BaseModel):
ge=0, description="Investor ID, must be 0 or greater. Use 0 if uncertain."
)
name: str = Field(
description="Investor name. Leave empty string if not clearly identifiable."
description="Investor name. Do not return any special characters, Just the name as a string."
)
description: Optional[str] = Field(
default="",
description="Investor description. Leave empty if not clearly available or uncertain.",
)
aum: int = Field(
aum: int | None = Field(
ge=0,
description="Assets Under Management in USD, must be 0 or greater. Use 0 if not clearly identifiable or uncertain.",
)
check_size_lower: int = Field(
check_size_lower: int | None = Field(
ge=0,
description="Lower bound of typical investment check size in USD, must be 0 or greater. Use 0 if not clearly identifiable.",
)
check_size_upper: int = Field(
check_size_upper: int | None = Field(
ge=0,
description="Upper bound of typical investment check size in USD, must be 0 or greater. Use 0 if not clearly identifiable.",
)
geographic_focus: str = Field(
description="Geographic investment focus. Leave empty string if not clearly identifiable."
geographic_focus: str | None = Field(
description="Geographic investment focus. Do not return any special characters, Just locations separated by commas. Leave empty if not clearly identifiable.",
)
stage_focus: InvestmentStage = Field(
description="Investment stage focus. Use SEED as default if uncertain."
)
number_of_investments: int = Field(
number_of_investments: int | None = Field(
ge=0,
default=0,
description="Total number of investments made, must be 0 or greater. Use 0 if not clearly identifiable.",
+19 -17
View File
@@ -25,50 +25,51 @@ class SectorSchema(BaseModel):
class InvestorMemberSchema(BaseModel):
id: int
name: str
role: str
email: str
role: str | None
email: str | None
class Config:
from_attributes = True
class CompanyMemberSchema(BaseModel):
id: int
name: Optional[str] = None
linkedin: Optional[str] = None
role: Optional[str] = None
name: Optional[str]
linkedin: Optional[str]
role: Optional[str]
company_id: int
class Config:
from_attributes = True
class CompanySchema(BaseModel):
id: int
name: str
industry: str
location: str
industry: str | None
location: str | None
description: Optional[str]
founded_year: Optional[int]
website: Optional[str]
created_at: Optional[datetime]
updated_at: Optional[datetime]
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
class Config:
from_attributes = True
class InvestorSchema(BaseModel):
id: int
name: str
description: Optional[str]
aum: int
check_size_lower: int
check_size_upper: int
geographic_focus: str
aum: int | None
check_size_lower: int | None
check_size_upper: int | None
geographic_focus: str | None
stage_focus: InvestmentStage
number_of_investments: int
created_at: Optional[datetime]
updated_at: Optional[datetime]
number_of_investments: int | None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
class Config:
from_attributes = True
@@ -95,5 +96,6 @@ class CompanyData(BaseModel): # Renamed from CompaniesData for consistency
class Config:
from_attributes = True
class InvestorList(BaseModel):
investors: List[InvestorData]
Binary file not shown.
+2 -2
View File
@@ -183,7 +183,7 @@ class InvestorProcessor:
try:
# Process rows in batches asynchronously
batch_size = 15 # Adjust batch size as needed
batch_size = 20 # Adjust batch size as needed
rows = [(idx, row) for idx, row in df.iterrows()]
for i in range(0, len(rows), batch_size):
@@ -251,7 +251,7 @@ class InvestorProcessor:
try:
# Process rows in batches asynchronously
batch_size = 15 # Adjust batch size as needed
batch_size = 20 # Adjust batch size as needed
rows = [(idx, row) for idx, row in df.iterrows()]
for i in range(0, len(rows), batch_size):