diff --git a/app/__pycache__/main.cpython-312.pyc b/app/__pycache__/main.cpython-312.pyc index ef94b29..81d7ae9 100644 Binary files a/app/__pycache__/main.cpython-312.pyc and b/app/__pycache__/main.cpython-312.pyc differ diff --git a/app/main.py b/app/main.py index 86f67bd..fa6a869 100644 --- a/app/main.py +++ b/app/main.py @@ -61,16 +61,18 @@ async def parse_csv( - Handles AUM, fund sizes, and check sizes as integers **For companies:** - - Expected columns: Name, Website, Investor, Final Investor Profile (company profile) + - Expected columns: Name, Website, Perplexity Gap Output (or Final Investor Profile) - 100% manual JSON parsing - no LLM needed - - Extracts company details, executives, investors, and client categories - - Automatically links companies to investors in database + - **Only extracts:** founded_year and key_executives + - **Only updates companies already in the database** (syncs with existing records) + - Skips companies not found in the database **Benefits:** - Fast processing (5-10s per record) - Low cost (minimal or no LLM usage) - Accurate data extraction - Automatic database persistence + - Safe: won't create duplicate companies """ # Read uploaded CSV with pandas content = await file.read() diff --git a/app/routers/__pycache__/investors.cpython-312.pyc b/app/routers/__pycache__/investors.cpython-312.pyc index 1bb8a47..a2ced80 100644 Binary files a/app/routers/__pycache__/investors.cpython-312.pyc and b/app/routers/__pycache__/investors.cpython-312.pyc differ diff --git a/app/routers/report_route.py b/app/routers/report_route.py index 8ab7d24..6f422ac 100644 --- a/app/routers/report_route.py +++ b/app/routers/report_route.py @@ -52,7 +52,6 @@ async def generate_investor_report( "website": investor.website, "headquarters": investor.headquarters, "aum": investor.aum, - "geographic_focus": investor.geographic_focus, "portfolio_highlights": investor.portfolio_highlights or [], "investment_thesis": investor.investment_thesis or [], "sectors": [sector.name for sector in investor.sectors], @@ -65,24 +64,22 @@ async def generate_investor_report( } for member in investor.team_members ], - "check_size_lower": None, - "check_size_upper": None, - "investment_stages": [], + "funds": [], } - # Get check sizes and stages from funds + # Get all funds with their data if investor.funds: - # Use the first fund's data or aggregate - fund = investor.funds[0] - investor_data["check_size_lower"] = fund.check_size_lower - investor_data["check_size_upper"] = fund.check_size_upper - - # Aggregate all investment stages from all funds - stages = set() for fund in investor.funds: - for stage in fund.investment_stages: - stages.add(stage.name) - investor_data["investment_stages"] = list(stages) + fund_data = { + "fund_name": fund.fund_name, + "fund_size": fund.fund_size, + "check_size_lower": fund.check_size_lower, + "check_size_upper": fund.check_size_upper, + "geographic_focus": fund.geographic_focus, + "investment_stages": [stage.name for stage in fund.investment_stages], + "sectors": [sector.name for sector in fund.sectors], + } + investor_data["funds"].append(fund_data) # Fetch project data if project_id is provided project_data = None diff --git a/app/services/__pycache__/llm_parser.cpython-312.pyc b/app/services/__pycache__/llm_parser.cpython-312.pyc index 35f2e13..3a13065 100644 Binary files a/app/services/__pycache__/llm_parser.cpython-312.pyc and b/app/services/__pycache__/llm_parser.cpython-312.pyc differ diff --git a/app/services/__pycache__/querying.cpython-312.pyc b/app/services/__pycache__/querying.cpython-312.pyc index 39750b2..0fc1072 100644 Binary files a/app/services/__pycache__/querying.cpython-312.pyc and b/app/services/__pycache__/querying.cpython-312.pyc differ diff --git a/app/services/llm_parser.py b/app/services/llm_parser.py index b86ed1f..872fc44 100644 --- a/app/services/llm_parser.py +++ b/app/services/llm_parser.py @@ -145,16 +145,74 @@ Return the lower and upper bounds in USD.""" """ Manually parse the JSON profile from the CSV. Returns a cleaned dictionary with the investor profile data. + Handles JSON wrapped in markdown code blocks (```json ... ```). + Handles trailing quotes and extra data after JSON. """ if not json_str or pd.isna(json_str): return None try: + # Clean the JSON string + cleaned_json = json_str.strip() + + # Check if it's plain text (no JSON structure) + if not cleaned_json.startswith(("{", "```", "'")): + print(" ⚠️ No JSON structure found - skipping") + return None + + # Remove markdown code block markers if present + if cleaned_json.startswith("```"): + # Remove opening marker (```json or ```Json or ```) + lines = cleaned_json.split("\n") + if lines[0].startswith("```"): + lines = lines[1:] # Remove first line + # Remove closing marker (``` or ```') + if lines and lines[-1].strip() in ("```", "```'", '```"'): + lines = lines[:-1] # Remove last line + cleaned_json = "\n".join(lines).strip() + + # Remove trailing quotes that might be left over + if cleaned_json.endswith(("'", '"')): + cleaned_json = cleaned_json[:-1].strip() + + # Try to find JSON boundaries if there's extra data + # Look for the first { and the last } + start_idx = cleaned_json.find("{") + if start_idx == -1: + print(" ⚠️ No opening brace found - not valid JSON") + return None + + # Find the matching closing brace + # We need to count braces to find the actual end + brace_count = 0 + end_idx = -1 + for i in range(start_idx, len(cleaned_json)): + if cleaned_json[i] == "{": + brace_count += 1 + elif cleaned_json[i] == "}": + brace_count -= 1 + if brace_count == 0: + end_idx = i + 1 + break + + if end_idx == -1: + print(" ⚠️ No matching closing brace found") + return None + + # Extract just the JSON part + cleaned_json = cleaned_json[start_idx:end_idx] + # Parse JSON string - profile = json.loads(json_str) + profile = json.loads(cleaned_json) return profile except json.JSONDecodeError as e: - print(f"Error parsing JSON: {e}") + print(f" ❌ JSON parsing error: {e}") + # Print first 200 chars for debugging + preview = json_str[:200] if len(json_str) > 200 else json_str + print(f" Preview: {preview}...") + return None + except Exception as e: + print(f" ❌ Unexpected error: {e}") return None async def process_investor_profile( @@ -338,34 +396,45 @@ Return the lower and upper bounds in USD.""" if existing_company: # Update only founded_year on existing company company = existing_company + updated_fields = [] + if company_data.get("founded_year"): company.founded_year = company_data["founded_year"] + updated_fields.append( + f"founded_year: {company_data['founded_year']}" + ) + + # Add/update company members (key executives) + # First, remove existing members if updating + db.query(CompanyMember).filter_by(company_id=company.id).delete() + + exec_count = 0 + for exec_data in company_data.get("key_executives", []): + member = CompanyMember( + name=exec_data.get("name"), + role=exec_data.get("title"), + linkedin=exec_data.get( + "source_url" + ), # Store source URL in linkedin field + company_id=company.id, + ) + db.add(member) + exec_count += 1 + + if exec_count > 0: + updated_fields.append(f"{exec_count} executives") + + if updated_fields: + print(f" 📝 Updated: {', '.join(updated_fields)}") + + return company else: - # Company should already be in base database, but if not found, skip - print( - f"⚠️ Company '{company_data['name']}' not found in base database - skipping" - ) + # Company not found in base database, skip + print(" ⚠️ Not in database - skipping") return None - # Add/update company members (key executives) - # First, remove existing members if updating - db.query(CompanyMember).filter_by(company_id=company.id).delete() - - for exec_data in company_data.get("key_executives", []): - member = CompanyMember( - name=exec_data.get("name"), - role=exec_data.get("title"), - linkedin=exec_data.get( - "source_url" - ), # Store source URL in linkedin field - company_id=company.id, - ) - db.add(member) - - return company - except Exception as e: - print(f"Error saving company to database: {e}") + print(f" ❌ Error saving: {e}") db.rollback() return None @@ -789,8 +858,11 @@ Return the lower and upper bounds in USD.""" if pd.notna(row.get("Investor")) else None ) + # Try both column names for flexibility profile_json = ( - row.get("Final Investor Profile", "") + row.get("Perplexity Gap Output", "") + if pd.notna(row.get("Perplexity Gap Output")) + else row.get("Final Investor Profile", "") if pd.notna(row.get("Final Investor Profile")) else None ) diff --git a/app/services/report_gen.py b/app/services/report_gen.py index 1e0c2a5..fcfe220 100644 --- a/app/services/report_gen.py +++ b/app/services/report_gen.py @@ -80,34 +80,70 @@ class ReportGenerator: "thesis": 5, } + # Aggregate data from all funds + all_sectors = set(investor_data.get("sectors", [])) + all_stages = set() + all_geographies = [] + check_ranges = [] + + for fund in investor_data.get("funds", []): + all_sectors.update(fund.get("sectors", [])) + all_stages.update(fund.get("investment_stages", [])) + if fund.get("geographic_focus"): + all_geographies.append(fund["geographic_focus"]) + if fund.get("check_size_lower") and fund.get("check_size_upper"): + check_ranges.append( + { + "lower": fund["check_size_lower"], + "upper": fund["check_size_upper"], + } + ) + # Sector match - investor_sectors = set(investor_data.get("sectors", [])) project_sectors = set(project_data.get("sectors", [])) - if investor_sectors and project_sectors: - if investor_sectors & project_sectors: + if all_sectors and project_sectors: + if all_sectors & project_sectors: score += weights["sector"] - # Stage match - investor_stages = set(investor_data.get("investment_stages", [])) + # Stage match - case insensitive comparison project_stage = project_data.get("stage") - if project_stage and project_stage in investor_stages: - score += weights["stage"] + if project_stage and all_stages: + # Normalize stage names for comparison (case-insensitive) + normalized_stages = { + stage.lower().replace("_", " ") for stage in all_stages + } + project_stage_normalized = project_stage.lower().replace("_", " ") + if project_stage_normalized in normalized_stages: + score += weights["stage"] - # Geography match - investor_geo = (investor_data.get("geographic_focus") or "").lower() + # Geography match - check if any fund matches project_geo = (project_data.get("location") or "").lower() - if investor_geo and project_geo and investor_geo in project_geo: + geo_match = False + if all_geographies: + for geo in all_geographies: + if geo: + geo_lower = geo.lower() + # Match if investor geography is "global" or if there's a location overlap + if "global" in geo_lower or "worldwide" in geo_lower: + geo_match = True + break + if project_geo and ( + geo_lower in project_geo or project_geo in geo_lower + ): + geo_match = True + break + if geo_match: score += weights["geography"] - # Check size match + # Check size match - check if any fund's range matches project_valuation = project_data.get("valuation", 0) - check_lower = investor_data.get("check_size_lower") or 0 - check_upper = investor_data.get("check_size_upper") or float("inf") - if ( - check_lower - and check_upper - and check_lower <= project_valuation <= check_upper - ): + check_match = False + if project_valuation and check_ranges: + for check_range in check_ranges: + if check_range["lower"] <= project_valuation <= check_range["upper"]: + check_match = True + break + if check_match: score += weights["check_size"] # Thesis alignment (simplified) @@ -121,86 +157,126 @@ class ReportGenerator: """Generate detailed match criteria table""" criteria = [] + # Aggregate data from all funds + all_sectors = set(investor_data.get("sectors", [])) + all_stages = set() + all_geographies = [] + check_ranges = [] + + for fund in investor_data.get("funds", []): + all_sectors.update(fund.get("sectors", [])) + all_stages.update(fund.get("investment_stages", [])) + if fund.get("geographic_focus"): + all_geographies.append(fund["geographic_focus"]) + if fund.get("check_size_lower") and fund.get("check_size_upper"): + check_ranges.append( + { + "lower": fund["check_size_lower"], + "upper": fund["check_size_upper"], + "fund_name": fund.get("fund_name", "Unnamed Fund"), + } + ) + # Sector criterion - investor_sectors = investor_data.get("sectors", []) project_sectors = project_data.get("sectors", []) - sector_match = ( - "Perfect" if set(investor_sectors) & set(project_sectors) else "Mismatch" - ) + sector_match = "Perfect" if all_sectors & set(project_sectors) else "Mismatch" criteria.append( { "name": "Sector", - "requirement": "Cybersecurity, B2B SaaS" if project_sectors else "N/A", - "evidence": ", ".join(investor_sectors[:3]) - if investor_sectors - else "N/A", + "requirement": ", ".join(project_sectors) if project_sectors else "N/A", + "evidence": ", ".join(list(all_sectors)[:3]) if all_sectors else "N/A", "match": sector_match, "weight": "30%", } ) - # Stage criterion - investor_stages = investor_data.get("investment_stages", []) + # Stage criterion - case insensitive comparison project_stage = project_data.get("stage", "N/A") - stage_match = "Perfect" if project_stage in investor_stages else "Mismatch" + stage_match = "Mismatch" + if project_stage != "N/A" and all_stages: + # Normalize stage names for comparison + normalized_stages = { + stage.lower().replace("_", " ") for stage in all_stages + } + project_stage_normalized = project_stage.lower().replace("_", " ") + stage_match = ( + "Perfect" + if project_stage_normalized in normalized_stages + else "Mismatch" + ) + elif project_stage == "N/A": + stage_match = "N/A" + criteria.append( { "name": "Stage", "requirement": str(project_stage), - "evidence": ", ".join(investor_stages) if investor_stages else "N/A", + "evidence": ", ".join(all_stages) if all_stages else "N/A", "match": stage_match, "weight": "30%", } ) # Geography criterion - investor_geo = investor_data.get("geographic_focus") or "N/A" project_geo = project_data.get("location") or "N/A" + investor_geo_display = ", ".join(all_geographies) if all_geographies else "N/A" + + # Safe comparison handling None values and "Global" matches + geo_match = "Mismatch" + if project_geo != "N/A" and all_geographies: + for geo in all_geographies: + if geo: + geo_lower = geo.lower() + # Match if investor geography is "global" or if there's a location overlap + if "global" in geo_lower or "worldwide" in geo_lower: + geo_match = "Perfect" + break + if ( + geo_lower in project_geo.lower() + or project_geo.lower() in geo_lower + ): + geo_match = "Strong" + break + elif not all_geographies and project_geo == "N/A": + geo_match = "N/A" - # Safe comparison handling None values - if investor_geo == "N/A" or project_geo == "N/A": - geo_match = ( - "N/A" if investor_geo == "N/A" and project_geo == "N/A" else "Mismatch" - ) - else: - investor_geo_lower = investor_geo.lower() - project_geo_lower = project_geo.lower() - geo_match = ( - "Strong" - if investor_geo_lower in project_geo_lower - or project_geo_lower in investor_geo_lower - else "Mismatch" - ) criteria.append( { "name": "Geography", "requirement": project_geo, - "evidence": investor_geo, + "evidence": investor_geo_display, "match": geo_match, "weight": "20%", } ) # Check Size criterion - check_lower = investor_data.get("check_size_lower") or 0 - check_upper = investor_data.get("check_size_upper") or 0 project_val = project_data.get("valuation", 0) + # Build evidence string from all fund ranges check_evidence = "N/A" - if check_lower and check_upper: - check_evidence = ( - f"€{check_lower / 1000000:.0f}M - €{check_upper / 1000000:.0f}M" - ) - elif check_lower: - check_evidence = f"€{check_lower / 1000000:.0f}M+" + if check_ranges: + evidence_parts = [] + for cr in check_ranges[:3]: # Show up to 3 funds + range_str = ( + f"€{cr['lower'] / 1000000:.0f}M - €{cr['upper'] / 1000000:.0f}M" + ) + if cr["fund_name"]: + evidence_parts.append(f"{cr['fund_name']}: {range_str}") + else: + evidence_parts.append(range_str) + check_evidence = "; ".join(evidence_parts) + + # Check if project valuation matches any fund + check_match = "N/A" + if project_val > 0 and check_ranges: + match_found = any( + cr["lower"] <= project_val <= cr["upper"] for cr in check_ranges + ) + check_match = "Perfect" if match_found else "Mismatch" + elif project_val > 0: + check_match = "Strong" - check_match = ( - "Perfect" - if check_lower and check_upper and check_lower <= project_val <= check_upper - else "Strong" - if project_val > 0 - else "N/A" - ) criteria.append( { "name": "Check Size", diff --git a/app/templates/report.html b/app/templates/report.html index 44d02ff..06c92e7 100644 --- a/app/templates/report.html +++ b/app/templates/report.html @@ -161,13 +161,6 @@

-
-

DACH Region:

-

- {{ investor.geographic_focus or 'N/A' }} -

-
-

AUM (EUR million):

@@ -179,33 +172,47 @@

-
-

- Investment Stage: -

-

- {% if investor.investment_stages %} {{ - investor.investment_stages | join(', ') }} {% else - %} N/A {% endif %} -

-
- -
-

- Est. Investment Size: -

-

- {% if investor.check_size_lower and - investor.check_size_upper %} €{{ - '{:,.0f}'.format(investor.check_size_lower / - 1000000) }}M - €{{ - '{:,.0f}'.format(investor.check_size_upper / - 1000000) }}M {% elif investor.check_size_lower %} - €{{ '{:,.0f}'.format(investor.check_size_lower / - 1000000) }}M+ {% else %} N/A {% endif %} +

+

Number of Funds:

+

+ {{ investor.funds | length if investor.funds else 'N/A' }}

+ +
+

+ Fund Details +

+ {% if investor.funds %} + {% for fund in investor.funds %} +
+

+ {{ fund.fund_name or 'Fund ' + loop.index|string }} +

+
+ {% if fund.fund_size %} +

Fund Size: €{{ '{:,.0f}'.format(fund.fund_size / 1000000) }}M

+ {% endif %} + {% if fund.check_size_lower and fund.check_size_upper %} +

Check Size: €{{ '{:,.0f}'.format(fund.check_size_lower / 1000000) }}M - €{{ '{:,.0f}'.format(fund.check_size_upper / 1000000) }}M

+ {% endif %} + {% if fund.geographic_focus %} +

Geography: {{ fund.geographic_focus }}

+ {% endif %} + {% if fund.investment_stages %} +

Stages: {{ fund.investment_stages | join(', ') }}

+ {% endif %} + {% if fund.sectors %} +

Sectors: {{ fund.sectors[:3] | join(', ') }}

+ {% endif %} +
+
+ {% endfor %} + {% else %} +

No fund information available

+ {% endif %} +
diff --git a/investors.db b/investors.db index c3d9648..ccc9762 100644 Binary files a/investors.db and b/investors.db differ