completed investors linkedin

feat: Add LinkedIn URL support for investor synchronization and update schemas
added linkedin profiles
2025-11-28 07:19:58 +01:00 · 2025-11-28 06:18:04 +00:00 · 2025-11-27 16:44:22 +01:00 · 2025-11-26 08:04:11 +00:00 · 2025-11-11 20:28:20 +01:00 · 2025-11-11 20:27:55 +01:00
33 changed files with 2833 additions and 395 deletions
@@ -13,4 +13,8 @@
 *.cypython
-nohup.out
+nohup.out
 server.log
 server.pid
@@ -1,5 +1,4 @@
 import os
 from pathlib import Path
 from typing import Annotated
 from fastapi import Depends
@@ -162,6 +162,7 @@ class InvestorMember(Base, TimestampMixin):
    role = Column(String, nullable=True)
    title = Column(String, nullable=True)  # Alternative to role
    email = Column(String, nullable=True)
    linkedin = Column(String, nullable=True)  # LinkedIn profile URL
    source_url = Column(String, nullable=True)  # URL where member info was found
    investor_id = Column(Integer, ForeignKey("investors.id"))
@@ -215,6 +216,8 @@ class CompanyTable(Base, TimestampMixin):
    description = Column(String, nullable=True)
    founded_year = Column(Integer, nullable=True)
    website = Column(String, nullable=True)
    product_service = Column(Text, nullable=True)  # Product/service description
    clients = Column(JSON, nullable=True)  # List of client names or client information
    members = relationship(
        "CompanyMember", back_populates="company", cascade="all, delete-orphan"
@@ -296,9 +299,11 @@ class ProjectTable(Base, TimestampMixin):
    stage = Column(Enum(InvestmentStage), nullable=True)
    location = Column(String, nullable=True)
    industry = Column(String, nullable=True)
    description = Column(Text, nullable=True)
    start_date = Column(DateTime, nullable=True)
    end_date = Column(DateTime, nullable=True)
    is_archived = Column(Integer, default=0, nullable=False)  # 0 = active, 1 = archived
    sector = relationship(
        "SectorTable", secondary=project_sector_association, back_populates="projects"
@@ -0,0 +1,730 @@
 """
 LinkedIn Profile Scraper for Investor Members
 This module uses crawl4ai to scrape team pages and find LinkedIn profiles.
 Strategies:
 1. Crawl the source_url (team pages) to extract LinkedIn profile links
 2. Fall back to a DuckDuckGo web search (via ddgs) to find LinkedIn profiles by name
 Key advantages of crawl4ai:
 - Handles JavaScript-rendered pages
 - Better at extracting content from modern websites
 - More reliable than simple requests
 """
 import asyncio
 import logging
 import os
 import re
 from typing import Dict, List, Optional
 from crawl4ai import AsyncWebCrawler
 from ddgs import DDGS
 from dotenv import load_dotenv
 from langchain_openai import ChatOpenAI
 # Setup logging: configure the root logger at import time so the scraper's
 # progress messages are emitted with timestamps and logger names.
 # NOTE(review): this is a module-level side effect; presumably the hosting
 # application may want to own logging configuration instead — confirm.
 logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
 )
 logger = logging.getLogger("linkedin_scraper")
 # Load variables from a .env file (if present) before the key is read below.
 load_dotenv()
 # Read once at import; LinkedInProfileScraper only enables its web-search
 # fallback when this value is set.
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
 class LinkedInProfileScraper:
    """
    LinkedIn profile finder using crawl4ai and LLM-powered web search.
    Strategies:
    1. Crawl source URLs (team pages) to extract LinkedIn links
    2. Fall back to a DuckDuckGo web search (via ddgs) to find profiles by name
    """
    def __init__(
        self,
        rate_limit_delay: float = 0.5,
        use_cache: bool = True,
        use_llm_search: bool = True,
    ):
        """
        Initialize the scraper
        Args:
            rate_limit_delay: Delay between requests in seconds
            use_cache: Whether to cache crawled pages
            use_llm_search: Whether to use LLM-powered web search as fallback
        """
        self.rate_limit_delay = rate_limit_delay
        self.use_cache = use_cache
        self.use_llm_search = use_llm_search and OPENROUTER_API_KEY
        self.page_cache: Dict[str, str] = {}  # Cache crawled pages by URL
        self.html_cache: Dict[str, str] = {}  # Cache HTML separately
        self.profile_cache: Dict[str, Dict] = {}  # Cache results by member
        # Initialize LLM agent if API key available
        if self.use_llm_search:
            self._init_llm_agent()
        else:
            self.llm = None
            self.agent = None
            self.ddg_search = None
            logger.info("LLM search disabled (no OPENROUTER_API_KEY)")
    def _init_llm_agent(self):
        """
        Create the LLM client and the DuckDuckGo search client.
        On any failure the error is logged, all search attributes are reset to
        None and the web-search fallback is switched off, so the instance
        never claims search is enabled without a usable client.
        """
        # Defined on every path so attribute access never raises.
        self.agent = None
        try:
            self.llm = ChatOpenAI(
                api_key=OPENROUTER_API_KEY,
                base_url="https://openrouter.ai/api/v1",
                model="x-ai/grok-4.1-fast:free",
                temperature=0,
            )
            self.ddg_search = DDGS()
            logger.info("LLM search agent initialized")
        except Exception as e:
            logger.error(f"Failed to initialize LLM agent: {e}")
            self.llm = None
            self.ddg_search = None
            # Keep the flag consistent with the missing clients.
            self.use_llm_search = False
    def web_search(self, query: str) -> List[Dict]:
        """Tool to search the web using DuckDuckGo"""
        if not self.ddg_search:
            return []
        try:
            results = list(self.ddg_search.text(query, max_results=10))
            return results
        except Exception as e:
            logger.error(f"Web search error: {e}")
            return []
    async def crawl_page(self, url: str) -> Optional[str]:
        """
        Crawl a webpage and return its content.
        Args:
            url: URL to crawl
        Returns:
            Page content as markdown/text, or None if failed
        """
        if not url:
            return None
        # Check cache first
        if self.use_cache and url in self.page_cache:
            logger.debug(f"Using cached page for {url}")
            return self.page_cache[url]
        try:
            logger.info(f"Crawling: {url}")
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url)
                if result and result.markdown:
                    content = result.markdown
                    # Also get HTML for better link extraction
                    html_content = result.html if hasattr(result, "html") else ""
                    # Cache the results
                    if self.use_cache:
                        self.page_cache[url] = content
                        self.html_cache[url] = html_content
                    return content
        except Exception as e:
            logger.error(f"Error crawling {url}: {e}")
        return None
    def extract_linkedin_urls_from_content(self, content: str) -> List[Dict[str, str]]:
        """
        Extract all LinkedIn profile URLs from content (HTML or markdown).
        Returns:
            List of dicts with 'url', 'context', and 'username'
        """
        linkedin_links = []
        # Pattern for LinkedIn profile URLs (handles country-specific domains)
        linkedin_pattern = (
            r"https?://(?:www\.)?(?:[a-z]{2,3}\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)/?"
        )
        # Find all LinkedIn URLs
        matches = list(re.finditer(linkedin_pattern, content, re.IGNORECASE))
        for match in matches:
            url = match.group(0).rstrip("/")
            # Normalize URL
            url = self._normalize_linkedin_url(url)
            # Get surrounding context (200 chars before and after)
            start = max(0, match.start() - 200)
            end = min(len(content), match.end() + 200)
            context = content[start:end]
            # Clean up context (remove HTML tags for readability)
            context = re.sub(r"<[^>]+>", " ", context)
            context = " ".join(context.split())  # Normalize whitespace
            linkedin_links.append(
                {"url": url, "context": context, "username": match.group(1)}
            )
        # Remove duplicates while preserving order
        seen_urls = set()
        unique_links = []
        for link in linkedin_links:
            if link["url"] not in seen_urls:
                seen_urls.add(link["url"])
                unique_links.append(link)
        return unique_links
    def _normalize_linkedin_url(self, url: str) -> str:
        """Normalize LinkedIn URL to standard format"""
        # Remove trailing slashes
        url = url.rstrip("/")
        # Convert country-specific to www
        url = re.sub(
            r"https?://[a-z]{2,3}\.linkedin\.com", "https://www.linkedin.com", url
        )
        # Ensure https
        if url.startswith("http://"):
            url = url.replace("http://", "https://")
        return url
    def _name_matches_context(self, name: str, context: str) -> float:
        """
        Check if a person's name appears in the context around a LinkedIn URL.
        Returns:
            Confidence score 0-100
        """
        if not name or not context:
            return 0
        context_lower = context.lower()
        name_lower = name.lower()
        # Split name into parts (handle multiple spaces, titles like "Dr.", etc.)
        name_parts = [p for p in name_lower.replace(".", " ").split() if len(p) > 1]
        # Check for full name match
        if name_lower in context_lower:
            return 95
        # Check for name parts in context
        matches = sum(
            1 for part in name_parts if part in context_lower and len(part) > 2
        )
        if len(name_parts) > 0:
            if matches == len(name_parts):
                return 90  # All name parts found
            elif matches >= 2:
                return 75  # At least 2 parts found (first + last typically)
            elif matches == 1 and len(name_parts) <= 2:
                return 50  # Only one part found but name is short
            elif matches == 1:
                return 35  # Only one part found
        return 0
    def _name_matches_username(self, name: str, username: str) -> float:
        """
        Check if LinkedIn username contains parts of the name.
        Returns:
            Confidence score 0-100
        """
        if not name or not username:
            return 0
        name_lower = name.lower()
        username_lower = username.lower().replace("-", " ").replace("_", " ")
        name_parts = [p for p in name_lower.replace(".", " ").split() if len(p) > 2]
        matches = sum(1 for part in name_parts if part in username_lower)
        if len(name_parts) > 0:
            if matches == len(name_parts) and len(name_parts) >= 2:
                return 85  # Full name in username
            elif matches >= 2:
                return 70  # Multiple parts match
            elif matches == 1:
                return 35  # Only one part matches
        return 0
    async def find_linkedin_from_source(
        self, name: str, source_url: str, role: Optional[str] = None
    ) -> Dict:
        """
        Find LinkedIn profile by crawling the source URL (team page).
        Args:
            name: Person's name
            source_url: URL of the team/about page
            role: Person's role (for additional context matching)
        Returns:
            Dict with linkedin_url, confidence, method, notes
        """
        if not source_url:
            return {
                "linkedin_url": None,
                "confidence": 0,
                "method": "source_crawl",
                "notes": "No source URL provided",
            }
        # Crawl the page
        content = await self.crawl_page(source_url)
        if not content:
            return {
                "linkedin_url": None,
                "confidence": 0,
                "method": "source_crawl",
                "notes": f"Failed to crawl {source_url}",
            }
        # Get HTML for better link extraction
        html = self.html_cache.get(source_url, content)
        # Extract all LinkedIn URLs from both HTML and markdown
        linkedin_links = self.extract_linkedin_urls_from_content(html)
        if not linkedin_links:
            linkedin_links = self.extract_linkedin_urls_from_content(content)
        if not linkedin_links:
            return {
                "linkedin_url": None,
                "confidence": 0,
                "method": "source_crawl",
                "notes": f"No LinkedIn URLs found on {source_url}",
            }
        # Score each LinkedIn URL based on name matching
        best_match = None
        best_score = 0
        for link in linkedin_links:
            # Score based on context matching
            context_score = self._name_matches_context(name, link["context"])
            # Score based on username matching
            username_score = self._name_matches_username(name, link["username"])
            # Also check if role appears in context
            role_bonus = 0
            if role and role.lower() in link["context"].lower():
                role_bonus = 10
            # Combined score (take best of context or username, plus role bonus)
            total_score = max(context_score, username_score) + role_bonus
            logger.debug(
                f"  {name} -> {link['url']}: context={context_score}, username={username_score}, role={role_bonus}, total={total_score}"
            )
            if total_score > best_score:
                best_score = total_score
                best_match = link
        if best_match and best_score >= 30:  # Minimum threshold
            return {
                "linkedin_url": best_match["url"],
                "confidence": min(best_score, 100),
                "method": "source_crawl",
                "notes": f"Found on {source_url}",
            }
        return {
            "linkedin_url": None,
            "confidence": 0,
            "method": "source_crawl",
            "notes": f'No matching LinkedIn profile found for "{name}" on {source_url}',
        }
    async def find_linkedin_via_search(
        self, name: str, company: str, role: Optional[str] = None
    ) -> Dict:
        """
        Find LinkedIn profile using web search.
        Args:
            name: Person's name
            company: Company/investor name
            role: Person's role (optional)
        Returns:
            Dict with linkedin_url, confidence, method, notes
        """
        if not self.ddg_search:
            return {
                "linkedin_url": None,
                "confidence": 0,
                "method": "web_search",
                "notes": "Web search not available",
            }
        try:
            # Build search query - search for LinkedIn profile
            query = f"{name} {company} site:linkedin.com/in"
            if role:
                query = f"{name} {role} {company} site:linkedin.com/in"
            logger.debug(f"Searching: {query}")
            results = self.web_search(query)
            if results:
                # Look for LinkedIn profile URLs in results
                linkedin_pattern = r"https?://(?:www\.)?(?:[a-z]{2,3}\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)"
                for result in results:
                    url = result.get("href") or result.get("link") or ""
                    title = result.get("title", "").lower()
                    body = result.get("body", "").lower()
                    match = re.search(linkedin_pattern, url, re.IGNORECASE)
                    if match:
                        linkedin_url = self._normalize_linkedin_url(match.group(0))
                        username = match.group(1)
                        # Score based on name matching in title/body and username
                        context = f"{title} {body}"
                        context_score = self._name_matches_context(name, context)
                        username_score = self._name_matches_username(name, username)
                        total_score = max(context_score, username_score)
                        if total_score >= 30:
                            return {
                                "linkedin_url": linkedin_url,
                                "confidence": min(
                                    total_score, 90
                                ),  # Cap at 90 for search results
                                "method": "web_search",
                                "notes": "Found via web search",
                            }
            return {
                "linkedin_url": None,
                "confidence": 0,
                "method": "web_search",
                "notes": "No matching profile found in search results",
            }
        except Exception as e:
            logger.error(f"Web search error for {name}: {e}")
            return {
                "linkedin_url": None,
                "confidence": 0,
                "method": "web_search",
                "notes": f"Search error: {str(e)}",
            }
    async def find_linkedin_profile(
        self,
        name: str,
        company: str,
        role: Optional[str] = None,
        source_url: Optional[str] = None,
    ) -> Dict:
        """
        Find LinkedIn profile for a person.
        Primary strategy: Crawl source URL to find LinkedIn links.
        Args:
            name: Person's name
            company: Company/investor name
            role: Person's role/title (optional)
            source_url: URL where person info was found (optional)
        Returns:
            Dict with:
                - linkedin_url: Found LinkedIn URL or None
                - confidence: Confidence score (0-100)
                - method: Method used to find the profile
                - notes: Additional information
        """
        cache_key = f"{name}|{company}"
        # Check cache
        if self.use_cache and cache_key in self.profile_cache:
            logger.debug(f"Using cached result for {name}")
            return self.profile_cache[cache_key]
        result = {"linkedin_url": None, "confidence": 0, "method": "none", "notes": ""}
        # Primary strategy: Crawl source URL
        if source_url:
            result = await self.find_linkedin_from_source(name, source_url, role)
            if result["linkedin_url"]:
                if self.use_cache:
                    self.profile_cache[cache_key] = result
                return result
        # Fallback strategy: Web search (if enabled and no result from source crawl)
        if self.use_llm_search and not result.get("linkedin_url"):
            search_result = await self.find_linkedin_via_search(name, company, role)
            if search_result["linkedin_url"]:
                if self.use_cache:
                    self.profile_cache[cache_key] = search_result
                return search_result
        # If no source URL or no match found
        if not result["linkedin_url"]:
            result = {
                "linkedin_url": None,
                "confidence": 0,
                "method": "none",
                "notes": "No source URL available"
                if not source_url
                else result.get("notes", "Not found"),
            }
        if self.use_cache:
            self.profile_cache[cache_key] = result
        return result
    async def batch_find_profiles(
        self, members: List[Dict], progress_callback=None, db_callback=None
    ) -> List[Dict]:
        """
        Find LinkedIn profiles for multiple members efficiently.
        Groups members by source_url to minimize crawling the same page multiple times.
        Args:
            members: List of dicts with 'name', 'company', 'role', 'source_url', 'id'
            progress_callback: Optional callback function(current, total, result)
            db_callback: Optional callback to save to database immediately when profile found
                         Signature: db_callback(member_id, linkedin_url) -> bool
        Returns:
            List of results for each member
        """
        results = []
        total = len(members)
        # Group members by source_url for efficient crawling
        url_groups: Dict[str, List[Dict]] = {}
        no_url_members = []
        for member in members:
            url = member.get("source_url")
            if url:
                if url not in url_groups:
                    url_groups[url] = []
                url_groups[url].append(member)
            else:
                no_url_members.append(member)
        logger.info(
            f"Processing {len(url_groups)} unique source URLs for {total} members"
        )
        logger.info(f"Members with source URLs: {total - len(no_url_members)}")
        logger.info(f"Members without source URLs: {len(no_url_members)}")
        if self.use_llm_search:
            logger.info("Web search fallback: ENABLED")
        else:
            logger.info("Web search fallback: DISABLED")
        processed = 0
        # Process members grouped by URL (efficient - one crawl per page)
        for url, group_members in url_groups.items():
            # Crawl the page once
            content = await self.crawl_page(url)
            html = self.html_cache.get(url, content or "")
            # Extract all LinkedIn URLs from this page
            linkedin_links = []
            if content:
                linkedin_links = self.extract_linkedin_urls_from_content(html)
                if not linkedin_links:
                    linkedin_links = self.extract_linkedin_urls_from_content(content)
            # Match each member in this group
            for member in group_members:
                processed += 1
                result = None
                found_linkedin = False
                if linkedin_links:
                    # Find best matching LinkedIn for this member
                    best_match = None
                    best_score = 0
                    for link in linkedin_links:
                        context_score = self._name_matches_context(
                            member["name"], link["context"]
                        )
                        username_score = self._name_matches_username(
                            member["name"], link["username"]
                        )
                        role_bonus = (
                            10
                            if member.get("role")
                            and member["role"].lower() in link["context"].lower()
                            else 0
                        )
                        total_score = max(context_score, username_score) + role_bonus
                        if total_score > best_score:
                            best_score = total_score
                            best_match = link
                    if best_match and best_score >= 30:
                        result = {
                            "linkedin_url": best_match["url"],
                            "confidence": min(best_score, 100),
                            "method": "source_crawl",
                            "notes": f"Found on {url}",
                            "member_id": member.get("id"),
                            "member_name": member["name"],
                        }
                        found_linkedin = True
                        # Save to database immediately if callback provided
                        if db_callback and member.get("id"):
                            db_callback(member["id"], best_match["url"])
                # If no result from source crawl, try web search IMMEDIATELY
                if not found_linkedin and self.use_llm_search:
                    search_result = await self.find_linkedin_via_search(
                        member["name"], member["company"], member.get("role")
                    )
                    if search_result["linkedin_url"]:
                        result = {
                            "linkedin_url": search_result["linkedin_url"],
                            "confidence": search_result["confidence"],
                            "method": "web_search",
                            "notes": search_result.get("notes", "Found via web search"),
                            "member_id": member.get("id"),
                            "member_name": member["name"],
                        }
                        found_linkedin = True
                        # Save to database immediately
                        if db_callback and member.get("id"):
                            db_callback(member["id"], search_result["linkedin_url"])
                # If still no result, record as not found
                if not found_linkedin:
                    result = {
                        "linkedin_url": None,
                        "confidence": 0,
                        "method": "source_crawl" if content else "none",
                        "notes": f"No match on {url}"
                        if linkedin_links
                        else (
                            f"No LinkedIn URLs on {url}"
                            if content
                            else f"Failed to crawl {url}"
                        ),
                        "member_id": member.get("id"),
                        "member_name": member["name"],
                    }
                results.append(result)
                if progress_callback:
                    progress_callback(processed, total, result)
            # Small delay between different URLs
            await asyncio.sleep(self.rate_limit_delay)
        # Process members without source URLs - do web search immediately for each
        for member in no_url_members:
            processed += 1
            result = None
            # Try web search immediately
            if self.use_llm_search:
                search_result = await self.find_linkedin_via_search(
                    member["name"], member["company"], member.get("role")
                )
                if search_result["linkedin_url"]:
                    result = {
                        "linkedin_url": search_result["linkedin_url"],
                        "confidence": search_result["confidence"],
                        "method": "web_search",
                        "notes": search_result.get("notes", "Found via web search"),
                        "member_id": member.get("id"),
                        "member_name": member["name"],
                    }
                    # Save to database immediately
                    if db_callback and member.get("id"):
                        db_callback(member["id"], search_result["linkedin_url"])
            # If no result from search
            if not result:
                result = {
                    "linkedin_url": None,
                    "confidence": 0,
                    "method": "web_search" if self.use_llm_search else "none",
                    "notes": "No LinkedIn profile found"
                    if self.use_llm_search
                    else "No source URL available",
                    "member_id": member.get("id"),
                    "member_name": member["name"],
                }
            results.append(result)
            if progress_callback:
                progress_callback(processed, total, result)
            # Rate limit between searches
            await asyncio.sleep(self.rate_limit_delay)
        return results
 def format_linkedin_url(url: str) -> str:
    """
    Normalize LinkedIn URL format.
    Trailing slashes are removed, any linkedin.com host (bare, www, or
    country-specific, in any letter case) is rewritten to
    ``https://www.linkedin.com``, and an ``http`` scheme is upgraded to
    ``https``. The path is left untouched. Falsy input is returned as is.
    """
    if not url:
        return url
    url = url.strip().rstrip("/")
    # Case-insensitive host match; the lookahead keeps look-alike hosts such
    # as linkedin.com.example.org from being rewritten.
    url = re.sub(
        r"^https?://(?:[a-z]{2,3}\.)*linkedin\.com(?=[/?#]|$)",
        "https://www.linkedin.com",
        url,
        flags=re.IGNORECASE,
    )
    # Upgrade only the scheme, not any later "http://" inside the URL.
    return re.sub(r"^http://", "https://", url, flags=re.IGNORECASE)
 # Async wrapper for sync contexts
 def run_batch_scraper(
    members: List[Dict],
    rate_limit: float = 0.5,
    progress_callback=None,
    db_callback=None,
 ) -> List[Dict]:
    """
    Synchronous wrapper for batch_find_profiles.
    Uses asyncio.run, so it must be called from synchronous code: asyncio.run
    raises RuntimeError when an event loop is already running in the thread.
    Args:
        members: List of member dicts
        rate_limit: Delay between URL crawls
        progress_callback: Optional progress callback
        db_callback: Optional callback(member_id, linkedin_url) forwarded to
            batch_find_profiles so found profiles can be saved immediately
    Returns:
        List of results
    """
    scraper = LinkedInProfileScraper(rate_limit_delay=rate_limit)
    return asyncio.run(
        scraper.batch_find_profiles(
            members, progress_callback=progress_callback, db_callback=db_callback
        )
    )
@@ -1,11 +1,14 @@
 import io
 import logging
 import pandas as pd
 from db.db import Base, db_dependency, engine
 from dotenv import load_dotenv
-from fastapi import FastAPI, File, Form, UploadFile
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from routers import (
    addition,
    companies,
    folk_crm,
    insight_route,
@@ -13,7 +16,8 @@ from routers import (
    projects,
    report_route,
 )
-from schemas.router_schemas import InvestmentResponse, PaginatedResponse
+from schemas.router_schemas import CompanyData, InvestmentResponse, PaginatedResponse
 from services.company_querying import CompanyQueryProcessor
 from services.llm_parser import InvestorProcessor
 from services.querying import QueryProcessor
@@ -25,10 +29,21 @@ def init_database():
    Base.metadata.create_all(bind=engine)
 logger = logging.getLogger(__name__)
 init_database()
 app = FastAPI()
 # Add CORS middleware to allow frontend requests
 # NOTE(review): wildcard origins together with allow_credentials=True is a
 # very permissive combination — confirm it is restricted before production.
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )
 # Request models
 class QueryRequest(BaseModel):
@@ -42,6 +57,17 @@ class QueryRequest(BaseModel):
        }
 # Request body for the company query endpoint: a single natural-language question.
 class CompanyQueryRequest(BaseModel):
    # Free-text question passed to the company query processor.
    question: str
    class Config:
        # Example payload attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "question": "Find me companies in the fintech sector located in San Francisco."
            }
        }
# Root endpoint: returns a fixed JSON payload, usable as a simple liveness check.
@app.get("/")
 def health():
    return {"Hello": "World"}
@@ -61,16 +87,18 @@ async def parse_csv(
    - Handles AUM, fund sizes, and check sizes as integers
    **For companies:**
-    - Expected columns: Name, Website, Investor, Final Investor Profile (company profile)
+    - Expected columns: Name, Website, Perplexity Gap Output (or Final Investor Profile)
    - 100% manual JSON parsing - no LLM needed
-    - Extracts company details, executives, investors, and client categories
+    - **Only extracts:** founded_year and key_executives
-    - Automatically links companies to investors in database
+    - **Only updates companies already in the database** (syncs with existing records)
    - Skips companies not found in the database
    **Benefits:**
    - Fast processing (5-10s per record)
    - Low cost (minimal or no LLM usage)
    - Accurate data extraction
    - Automatic database persistence
    - Safe: won't create duplicate companies
    """
    # Read uploaded CSV with pandas
    content = await file.read()
@@ -95,21 +123,30 @@ async def parse_csv(
    "/query", response_model=PaginatedResponse[InvestmentResponse], tags=["Querying"]
 )
 async def query_investors(request: QueryRequest):
-    """
+    """Query investors/funds using natural language"""
-    Query investors using natural language.
+    try:
        processor = QueryProcessor()
        result = await processor.process_query(request.question)
        logger.info(f"Query completed successfully with {result.total} results")
        return result
    except Exception as e:
        logger.error(f"Error in query_investors: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    Returns fund-level matches (one row per fund) with investor details.
    This ensures only relevant funds are included in the response.
-    Supports queries like:
+@app.post(
-    - "Show me seed stage investors"
+    "/query-companies", response_model=PaginatedResponse[CompanyData], tags=["Querying"]
-    - "Find fintech investors in Silicon Valley"
+)
-    - "Growth stage investors with $5M+ check sizes"
+async def query_companies(request: CompanyQueryRequest):
-    - "Healthcare investors in Europe"
+    """Query companies using natural language"""
-    """
+    try:
-    processor = QueryProcessor()
+        processor = CompanyQueryProcessor()
-    results = processor.process_query(request.question)
+        result = await processor.process_query(request.question)
-    return results
+        logger.info(f"Company query completed successfully with {result.total} results")
        return result
    except Exception as e:
        logger.error(f"Error in query_companies: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
 app.include_router(investors.router)
@@ -118,6 +155,7 @@ app.include_router(projects.router)
 app.include_router(folk_crm.router)
 app.include_router(insight_route.router)
 app.include_router(report_route.router)
 app.include_router(addition.router)
 if __name__ == "__main__":
    import uvicorn
@@ -0,0 +1,370 @@
 from typing import Optional
 from db.db import get_db
 from db.models import FundTable, InvestorTable, SectorTable
 from fastapi import APIRouter, Depends
 from pydantic import BaseModel
 from sqlalchemy.orm import Session
 router = APIRouter(tags=["Additional Routes"])
 # Response schemas
 class SectorsResponse(BaseModel):
    """Response body of the ``/sectors`` endpoint."""
    sectors: list[str]  # sector names as returned by get_unique_sectors
    total: int  # number of entries in ``sectors``
 class CountryInfo(BaseModel):
    """Schema holding a single country name.

    NOTE(review): not referenced by the routes visible in this module - confirm
    whether it is still needed.
    """
    name: str
 class ContinentInfo(BaseModel):
    """One continent together with the country names grouped under it."""
    name: str  # continent name
    countries: list[str]  # country names; get_arranged_geography passes them sorted
 class GeographyResponse(BaseModel):
    """Response body of the ``/geography`` endpoint."""
    continents: list[ContinentInfo]  # one entry per continent found in the data
    total_continents: int  # number of entries in ``continents``
    total_countries: int  # sum of the lengths of every continent's ``countries`` list
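 # Mapping of country names to the continent they are grouped under.
 # Common aliases are listed as separate keys (e.g. "UAE", "UK", "USA", "US", "Czechia").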
 COUNTRY_TO_CONTINENT = {
    # Africa
    "Algeria": "Africa",
    "Angola": "Africa",
    "Benin": "Africa",
    "Botswana": "Africa",
    "Burkina Faso": "Africa",
    "Burundi": "Africa",
    "Cameroon": "Africa",
    "Cape Verde": "Africa",
    "Central African Republic": "Africa",
    "Chad": "Africa",
    "Comoros": "Africa",
    "Congo": "Africa",
    "Democratic Republic of the Congo": "Africa",
    "Djibouti": "Africa",
    "Egypt": "Africa",
    "Equatorial Guinea": "Africa",
    "Eritrea": "Africa",
    "Eswatini": "Africa",
    "Ethiopia": "Africa",
    "Gabon": "Africa",
    "Gambia": "Africa",
    "Ghana": "Africa",
    "Guinea": "Africa",
    "Guinea-Bissau": "Africa",
    "Ivory Coast": "Africa",
    "Kenya": "Africa",
    "Lesotho": "Africa",
    "Liberia": "Africa",
    "Libya": "Africa",
    "Madagascar": "Africa",
    "Malawi": "Africa",
    "Mali": "Africa",
    "Mauritania": "Africa",
    "Mauritius": "Africa",
    "Morocco": "Africa",
    "Mozambique": "Africa",
    "Namibia": "Africa",
    "Niger": "Africa",
    "Nigeria": "Africa",
    "Rwanda": "Africa",
    "Sao Tome and Principe": "Africa",
    "Senegal": "Africa",
    "Seychelles": "Africa",
    "Sierra Leone": "Africa",
    "Somalia": "Africa",
    "South Africa": "Africa",
    "South Sudan": "Africa",
    "Sudan": "Africa",
    "Tanzania": "Africa",
    "Togo": "Africa",
    "Tunisia": "Africa",
    "Uganda": "Africa",
    "Zambia": "Africa",
    "Zimbabwe": "Africa",
    # Asia
    "Afghanistan": "Asia",
    "Armenia": "Asia",
    "Azerbaijan": "Asia",
    "Bahrain": "Asia",
    "Bangladesh": "Asia",
    "Bhutan": "Asia",
    "Brunei": "Asia",
    "Cambodia": "Asia",
    "China": "Asia",
    "Cyprus": "Asia",
    "Georgia": "Asia",
    "Hong Kong": "Asia",
    "India": "Asia",
    "Indonesia": "Asia",
    "Iran": "Asia",
    "Iraq": "Asia",
    "Israel": "Asia",
    "Japan": "Asia",
    "Jordan": "Asia",
    "Kazakhstan": "Asia",
    "Kuwait": "Asia",
    "Kyrgyzstan": "Asia",
    "Laos": "Asia",
    "Lebanon": "Asia",
    "Malaysia": "Asia",
    "Maldives": "Asia",
    "Mongolia": "Asia",
    "Myanmar": "Asia",
    "Nepal": "Asia",
    "North Korea": "Asia",
    "Oman": "Asia",
    "Pakistan": "Asia",
    "Palestine": "Asia",
    "Philippines": "Asia",
    "Qatar": "Asia",
    "Saudi Arabia": "Asia",
    "Singapore": "Asia",
    "South Korea": "Asia",
    "Sri Lanka": "Asia",
    "Syria": "Asia",
    "Taiwan": "Asia",
    "Tajikistan": "Asia",
    "Thailand": "Asia",
    "Timor-Leste": "Asia",
    "Turkey": "Asia",
    "Turkmenistan": "Asia",
    "United Arab Emirates": "Asia",
    "UAE": "Asia",
    "Uzbekistan": "Asia",
    "Vietnam": "Asia",
    "Yemen": "Asia",
    # Europe
    "Albania": "Europe",
    "Andorra": "Europe",
    "Austria": "Europe",
    "Belarus": "Europe",
    "Belgium": "Europe",
    "Bosnia and Herzegovina": "Europe",
    "Bulgaria": "Europe",
    "Croatia": "Europe",
    "Czech Republic": "Europe",
    "Czechia": "Europe",
    "Denmark": "Europe",
    "Estonia": "Europe",
    "Finland": "Europe",
    "France": "Europe",
    "Germany": "Europe",
    "Greece": "Europe",
    "Hungary": "Europe",
    "Iceland": "Europe",
    "Ireland": "Europe",
    "Italy": "Europe",
    "Kosovo": "Europe",
    "Latvia": "Europe",
    "Liechtenstein": "Europe",
    "Lithuania": "Europe",
    "Luxembourg": "Europe",
    "Malta": "Europe",
    "Moldova": "Europe",
    "Monaco": "Europe",
    "Montenegro": "Europe",
    "Netherlands": "Europe",
    "North Macedonia": "Europe",
    "Norway": "Europe",
    "Poland": "Europe",
    "Portugal": "Europe",
    "Romania": "Europe",
    "Russia": "Europe",
    "San Marino": "Europe",
    "Serbia": "Europe",
    "Slovakia": "Europe",
    "Slovenia": "Europe",
    "Spain": "Europe",
    "Sweden": "Europe",
    "Switzerland": "Europe",
    "Ukraine": "Europe",
    "United Kingdom": "Europe",
    "UK": "Europe",
    "Vatican City": "Europe",
    # North America
    "Antigua and Barbuda": "North America",
    "Bahamas": "North America",
    "Barbados": "North America",
    "Belize": "North America",
    "Canada": "North America",
    "Costa Rica": "North America",
    "Cuba": "North America",
    "Dominica": "North America",
    "Dominican Republic": "North America",
    "El Salvador": "North America",
    "Grenada": "North America",
    "Guatemala": "North America",
    "Haiti": "North America",
    "Honduras": "North America",
    "Jamaica": "North America",
    "Mexico": "North America",
    "Nicaragua": "North America",
    "Panama": "North America",
    "Saint Kitts and Nevis": "North America",
    "Saint Lucia": "North America",
    "Saint Vincent and the Grenadines": "North America",
    "Trinidad and Tobago": "North America",
    "United States": "North America",
    "USA": "North America",
    "US": "North America",
    # South America
    "Argentina": "South America",
    "Bolivia": "South America",
    "Brazil": "South America",
    "Chile": "South America",
    "Colombia": "South America",
    "Ecuador": "South America",
    "Guyana": "South America",
    "Paraguay": "South America",
    "Peru": "South America",
    "Suriname": "South America",
    "Uruguay": "South America",
    "Venezuela": "South America",
    # Oceania
    "Australia": "Oceania",
    "Fiji": "Oceania",
    "Kiribati": "Oceania",
    "Marshall Islands": "Oceania",
    "Micronesia": "Oceania",
    "Nauru": "Oceania",
    "New Zealand": "Oceania",
    "Palau": "Oceania",
    "Papua New Guinea": "Oceania",
    "Samoa": "Oceania",
    "Solomon Islands": "Oceania",
    "Tonga": "Oceania",
    "Tuvalu": "Oceania",
    "Vanuatu": "Oceania",
 }
 # Valid continent names for direct matching.
 # Used by organize_geography to recognise geographic-focus entries that name a
 # continent rather than a country. "Antarctica" is accepted here even though
 # no key of COUNTRY_TO_CONTINENT maps to it.
 VALID_CONTINENTS = {
    "Africa",
    "Asia",
    "Europe",
    "North America",
    "South America",
    "Oceania",
    "Antarctica",
 }
 def extract_countries_from_geographic_focus(geographic_focus: str) -> set[str]:
    """
    Extract known country names from a geographic_focus string.

    The string is split on ",", "/" and ";". Every non-empty, stripped part is
    resolved against the keys of COUNTRY_TO_CONTINENT:

    1. case-insensitive exact match ("united kingdom" -> "United Kingdom");
    2. otherwise a whole-word match in either direction, trying longer country
       names first ("United States of America" -> "United States",
       "Czech" -> "Czech Republic").

    Whole-word matching replaces a raw substring test that produced false
    positives such as "Austin" -> "US" or "NA" -> "Botswana" and let a shorter
    name shadow a longer one ("nigeria" -> "Niger").

    Args:
        geographic_focus: Free-text geography value. A falsy value (empty
            string, None) yields an empty set.

    Returns:
        The set of COUNTRY_TO_CONTINENT keys recognised in the string. At most
        one country is added per delimited part.
    """
    import re  # function-scope import so the block does not rely on a module-level one

    if not geographic_focus:
        return set()

    # Case-insensitive lookup table: folded name -> key as spelled in the mapping.
    canonical_by_folded = {name.casefold(): name for name in COUNTRY_TO_CONTINENT}
    # Longest names first so "Guinea-Bissau" is tried before "Guinea";
    # sorted() is stable, so equal-length names keep their mapping order.
    names_longest_first = sorted(COUNTRY_TO_CONTINENT, key=len, reverse=True)

    def _has_whole_word(needle: str, haystack: str) -> bool:
        # Lookarounds rather than \b so the test behaves the same for names
        # that start or end with a non-word character.
        pattern = rf"(?<!\w){re.escape(needle)}(?!\w)"
        return re.search(pattern, haystack) is not None

    countries = set()
    # Normalise the alternative delimiters to commas, then split.
    parts = geographic_focus.replace("/", ",").replace(";", ",").split(",")
    for part in parts:
        cleaned = part.strip()
        if not cleaned:
            continue
        folded = cleaned.casefold()
        exact = canonical_by_folded.get(folded)
        if exact is not None:
            countries.add(exact)
            continue
        # Fuzzy fallback: the part contains a country name, or a country name
        # contains the part, both on word boundaries only.
        for country in names_longest_first:
            folded_country = country.casefold()
            if _has_whole_word(folded_country, folded) or _has_whole_word(
                folded, folded_country
            ):
                countries.add(country)
                break
    return countries
 def organize_geography(geographic_data: list[str]) -> dict[str, set[str]]:
    """
    Group the countries found in geographic-focus strings by continent.

    Each entry is passed to extract_countries_from_geographic_focus and every
    returned country is filed under its continent from COUNTRY_TO_CONTINENT.
    Continent names from VALID_CONTINENTS that appear as a delimited part of an
    entry (split on ",", "/" and ";") are registered too, with an empty country
    set if no country of that continent was seen. Previously a continent was
    only recognised when the whole entry was the continent name, so
    "Europe, Asia" registered neither.

    Args:
        geographic_data: Geographic-focus strings; falsy entries are skipped.

    Returns:
        Dict mapping continent name to the set of country names found for it.
    """
    continent_countries: dict[str, set[str]] = {}
    for geo_focus in geographic_data:
        if not geo_focus:
            continue
        # File every recognised country under its continent.
        for country in extract_countries_from_geographic_focus(geo_focus):
            continent = COUNTRY_TO_CONTINENT.get(country)
            if continent:
                continent_countries.setdefault(continent, set()).add(country)
        # An entry may list several regions, so test each delimited part
        # (same delimiters as the country extraction) against the continents.
        for part in geo_focus.replace("/", ",").replace(";", ",").split(","):
            candidate = part.strip()
            if candidate in VALID_CONTINENTS:
                continent_countries.setdefault(candidate, set())
    return continent_countries
@router.get("/sectors", response_model=SectorsResponse)
 def get_unique_sectors(db: Session = Depends(get_db)):
    """
    Get all unique sectors from the database.
    Returns a list of sector names sorted alphabetically.
    """
    sectors = db.query(SectorTable.name).distinct().order_by(SectorTable.name).all()
    sector_names = [s[0] for s in sectors if s[0]]
    return SectorsResponse(sectors=sector_names, total=len(sector_names))
@router.get("/geography", response_model=GeographyResponse)
 def get_arranged_geography(db: Session = Depends(get_db)):
    """
    Get all unique geographic locations arranged by continent and countries.
    Extracts geography from both investors and funds tables.
    Returns continents with their associated countries.
    """
    # Collect all geographic focus data from investors
    investor_geo = (
        db.query(InvestorTable.geographic_focus)
        .filter(InvestorTable.geographic_focus.isnot(None))
        .distinct()
        .all()
    )
    # Collect all geographic focus data from funds
    fund_geo = (
        db.query(FundTable.geographic_focus)
        .filter(FundTable.geographic_focus.isnot(None))
        .distinct()
        .all()
    )
    # Combine all geographic data
    all_geo_data = [g[0] for g in investor_geo] + [g[0] for g in fund_geo]
    # Organize into continents and countries
    continent_countries = organize_geography(all_geo_data)
    # Build response
    continents = []
    total_countries = 0
    for continent_name in sorted(continent_countries.keys()):
        countries = sorted(continent_countries[continent_name])
        total_countries += len(countries)
        continents.append(ContinentInfo(name=continent_name, countries=countries))
    return GeographyResponse(
        continents=continents,
        total_continents=len(continents),
        total_countries=total_countries,
    )
@@ -63,11 +63,13 @@ def read_companies(
    # Transform CompanyTable objects to CompanyData format
    company_data_list = []
    for company in companies:
        # Sort sectors alphabetically
        sorted_sectors = sorted(company.sectors, key=lambda s: s.name) if company.sectors else []
        company_data = CompanyData(
            company=company,
            investors=company.investors,
            members=company.members,
-            sectors=company.sectors,
+            sectors=sorted_sectors,
        )
        company_data_list.append(company_data)
@@ -147,11 +149,13 @@ def filter_companies(
    # Transform to CompanyData format
    company_data_list = []
    for company in companies:
        # Sort sectors alphabetically
        sorted_sectors = sorted(company.sectors, key=lambda s: s.name) if company.sectors else []
        company_data = CompanyData(
            company=company,
            investors=company.investors,
            members=company.members,
-            sectors=company.sectors,
+            sectors=sorted_sectors,
        )
        company_data_list.append(company_data)
@@ -184,12 +188,15 @@ def read_company(company_id: int, db: Session = Depends(get_db)):
    if not company:
        raise HTTPException(status_code=404, detail="Company not found")
    # Sort sectors alphabetically
    sorted_sectors = sorted(company.sectors, key=lambda s: s.name) if company.sectors else []
    # Transform to CompanyData format
    return CompanyData(
        company=company,
        investors=company.investors,
        members=company.members,
-        sectors=company.sectors,
+        sectors=sorted_sectors,
    )
@@ -250,12 +257,15 @@ def update_company(
        .first()
    )
    # Sort sectors alphabetically
    sorted_sectors = sorted(company_with_relations.sectors, key=lambda s: s.name) if company_with_relations.sectors else []
    # Transform to CompanyData format
    return CompanyData(
        company=company_with_relations,
        investors=company_with_relations.investors,
        members=company_with_relations.members,
-        sectors=company_with_relations.sectors,
+        sectors=sorted_sectors,
    )
@@ -1,15 +1,21 @@
 import os
 from typing import List
 from db.db import get_db
 from db.models import InvestorTable
 from fastapi import APIRouter, Depends, HTTPException
 from pydantic import BaseModel
-from services.crm import folk
+from services.crm import FolkAPI
 from sqlalchemy.orm import Session, selectinload
 router = APIRouter(prefix="/folk", tags=["Folk CRM"])
 def get_folk_client():
    """Build a FolkAPI client from the FOLK_API_KEY environment variable.

    An unset variable results in an empty-string API key being passed on.
    """
    api_key = os.getenv("FOLK_API_KEY", "")
    return FolkAPI(api_key=api_key)
 class GroupResponse(BaseModel):
    id: str
    name: str
@@ -44,6 +50,7 @@ def get_folk_groups():
    to sync investors to Folk.
    """
    try:
        folk = get_folk_client()
        groups_data = folk.get_groups()
        items = groups_data.get("data", {}).get("items", [])
@@ -71,6 +78,7 @@ def sync_investors_to_folk(
    Returns:
        Summary of sync operation including successes and errors
    """
    folk = get_folk_client()
    # Fetch investors with their team members
    investors = (
        db.query(InvestorTable)
@@ -128,6 +136,11 @@ def sync_investors_to_folk(
                    if hasattr(member, "source_url") and member.source_url:
                        urls_list = [member.source_url]
                    # Get LinkedIn URL if available
                    linkedin_url = None
                    if hasattr(member, "linkedin") and member.linkedin:
                        linkedin_url = member.linkedin
                    # Build job title from title or role
                    job_title = None
                    if hasattr(member, "title") and member.title:
@@ -141,6 +154,7 @@ def sync_investors_to_folk(
                        email=member.email,
                        company_id=company_id,
                        group_id=request.group_id,
                        linkedin_url=linkedin_url,
                        urls=urls_list,
                        jobTitle=job_title,
                    )
@@ -12,7 +12,10 @@ from schemas.router_schemas import (
    PaginatedResponse,
    SectorMinimal,
 )
-from services.compatibility_score import calculate_project_investor_compatibility
+from services.compatibility_score import (
    _calculate_project_fund_compatibility,
    _calculate_project_investor_direct_compatibility,
 )
 from sqlalchemy.orm import Session, selectinload
 router = APIRouter(tags=["Investor Routes"])
@@ -77,31 +80,46 @@ def read_investors(
        if not project:
            raise HTTPException(status_code=404, detail="Project not found")
-    # Get paginated results
+    # When project_id is provided, we need to get all investors first to sort by compatibility score
-    investors = (
+    # Otherwise, we can paginate at the database level
-        db.query(InvestorTable)
+    if project is not None:
-        .options(
+        # Get all investors (we'll sort by compatibility score, then paginate)
-            selectinload(InvestorTable.portfolio_companies),
+        all_investors = (
-            selectinload(InvestorTable.team_members),
+            db.query(InvestorTable)
-            selectinload(InvestorTable.sectors),
+            .options(
-            selectinload(InvestorTable.funds).selectinload(FundTable.investment_stages),
+                selectinload(InvestorTable.portfolio_companies),
-            selectinload(InvestorTable.funds).selectinload(FundTable.sectors),
+                selectinload(InvestorTable.team_members),
                selectinload(InvestorTable.sectors),
                selectinload(InvestorTable.funds).selectinload(
                    FundTable.investment_stages
                ),
                selectinload(InvestorTable.funds).selectinload(FundTable.sectors),
            )
            .all()
        )
        # We'll paginate after sorting by compatibility score
        investors = all_investors
    else:
        # Get paginated results (no sorting needed)
        investors = (
            db.query(InvestorTable)
            .options(
                selectinload(InvestorTable.portfolio_companies),
                selectinload(InvestorTable.team_members),
                selectinload(InvestorTable.sectors),
                selectinload(InvestorTable.funds).selectinload(
                    FundTable.investment_stages
                ),
                selectinload(InvestorTable.funds).selectinload(FundTable.sectors),
            )
            .offset(offset)
            .limit(page_size)
            .all()
        )
        .offset(offset)
        .limit(page_size)
        .all()
    )
    # Transform to InvestmentResponse format (one row per investor-fund combination)
    investment_responses = []
    for investor in investors:
        # Calculate compatibility score if project provided
        compatibility_score = 1.0
        if project is not None:
            compatibility_score = calculate_project_investor_compatibility(
                project=project, investor=investor, use_funds=True
            )
        # Get top 3 portfolio companies (id and name only)
        portfolio_companies = [
            CompanyMinimal(id=company.id, name=company.name)
@@ -111,6 +129,13 @@ def read_investors(
        # If investor has funds, create one entry per fund
        if investor.funds:
            for fund in investor.funds:
                # Calculate compatibility score for this specific fund
                compatibility_score = 1.0
                if project is not None:
                    compatibility_score = _calculate_project_fund_compatibility(
                        project=project, fund=fund
                    )
                # Get stage focus as comma-separated string
                stage_focus = (
                    ", ".join([stage.name for stage in fund.investment_stages])
@@ -118,10 +143,12 @@ def read_investors(
                    else None
                )
-                # Get top 3 sectors from fund (id and name only)
+                # Get top 3 sectors from fund (id and name only) - sorted alphabetically
                fund_sectors = [
                    SectorMinimal(id=sector.id, name=sector.name)
-                    for sector in (fund.sectors[:3] if fund.sectors else [])
+                    for sector in sorted(
                        fund.sectors[:3] if fund.sectors else [], key=lambda s: s.name
                    )
                ]
                investment_response = InvestmentResponse(
@@ -141,6 +168,13 @@ def read_investors(
                investment_responses.append(investment_response)
        else:
            # If no funds, create one entry with null fund fields
            # Calculate compatibility using investor-level data
            compatibility_score = 1.0
            if project is not None:
                compatibility_score = _calculate_project_investor_direct_compatibility(
                    project=project, investor=investor
                )
            investment_response = InvestmentResponse(
                id=investor.id,
                name=investor.name,
@@ -155,6 +189,12 @@ def read_investors(
            )
            investment_responses.append(investment_response)
    # Sort by compatibility score (descending) when project_id is provided
    if project is not None:
        investment_responses.sort(key=lambda x: x.compatibility_score, reverse=True)
        # Apply pagination after sorting
        investment_responses = investment_responses[offset : offset + page_size]
    # Calculate total pages
    total_pages = (total_count + page_size - 1) // page_size
@@ -246,20 +286,27 @@ def filter_investors(
    # Get total count before pagination
    total_count = query.count()
-    # Calculate offset and apply pagination
+    # When project_id is provided, we need to get all funds first to sort by compatibility score
-    offset = (page - 1) * page_size
+    # Otherwise, we can paginate at the database level
-    funds = query.offset(offset).limit(page_size).all()
+    if project is not None:
        # Get all funds (we'll sort by compatibility score, then paginate)
        all_funds = query.all()
        funds = all_funds
    else:
        # Calculate offset and apply pagination (no sorting needed)
        offset = (page - 1) * page_size
        funds = query.offset(offset).limit(page_size).all()
    # Transform to InvestmentResponse format (one row per fund)
    investment_responses = []
    for fund in funds:
        investor = fund.investor
-        # Calculate compatibility score if project provided
+        # Calculate compatibility score for this specific fund
        compatibility_score = 1.0
        if project is not None:
-            compatibility_score = calculate_project_investor_compatibility(
+            compatibility_score = _calculate_project_fund_compatibility(
-                project=project, investor=investor, use_funds=True
+                project=project, fund=fund
            )
        # Get top 3 portfolio companies (id and name only)
@@ -275,10 +322,12 @@ def filter_investors(
            else None
        )
-        # Get top 3 sectors from fund (id and name only)
+        # Get top 3 sectors from fund (id and name only) - sorted alphabetically
        fund_sectors = [
            SectorMinimal(id=sector.id, name=sector.name)
-            for sector in (fund.sectors[:3] if fund.sectors else [])
+            for sector in sorted(
                fund.sectors[:3] if fund.sectors else [], key=lambda s: s.name
            )
        ]
        investment_response = InvestmentResponse(
@@ -297,6 +346,13 @@ def filter_investors(
        )
        investment_responses.append(investment_response)
    # Sort by compatibility score (descending) when project_id is provided
    if project is not None:
        investment_responses.sort(key=lambda x: x.compatibility_score, reverse=True)
        # Apply pagination after sorting
        offset = (page - 1) * page_size
        investment_responses = investment_responses[offset : offset + page_size]
    # Calculate total pages
    total_pages = (total_count + page_size - 1) // page_size
@@ -24,19 +24,29 @@ router = APIRouter(tags=["Project Routes"])
 def read_projects(
    page: int = Query(1, ge=1, description="Page number (starts at 1)"),
    page_size: int = Query(10, ge=1, le=100, description="Items per page (max 100)"),
    include_archived: bool = Query(False, description="Include archived projects"),
    db: Session = Depends(get_db),
 ):
-    """Get all projects with their related data (paginated)"""
+    """Get all projects with their related data (paginated)
    By default, archived projects are excluded. Set include_archived=True to include them.
    """
    # Calculate offset
    offset = (page - 1) * page_size
    # Start with base query
    query = db.query(ProjectTable)
    # Filter out archived projects by default
    if not include_archived:
        query = query.filter(ProjectTable.is_archived == 0)
    # Get total count
-    total_count = db.query(ProjectTable).count()
+    total_count = query.count()
    # Get paginated results
    projects = (
-        db.query(ProjectTable)
+        query.options(
        .options(
            selectinload(ProjectTable.sector),
            selectinload(ProjectTable.investors),
            selectinload(ProjectTable.companies),
@@ -162,7 +172,7 @@ def update_project(
@router.delete("/projects/{project_id}")
 def delete_project(project_id: int, db: Session = Depends(get_db)):
-    """Delete a project"""
+    """Delete a project permanently"""
    db_project = db.query(ProjectTable).filter(ProjectTable.id == project_id).first()
    if not db_project:
@@ -174,6 +184,87 @@ def delete_project(project_id: int, db: Session = Depends(get_db)):
    return {"message": "Project deleted successfully"}
@router.post("/projects/{project_id}/archive")
 def archive_project(project_id: int, db: Session = Depends(get_db)):
    """Archive a project (soft delete)"""
    db_project = db.query(ProjectTable).filter(ProjectTable.id == project_id).first()
    if not db_project:
        raise HTTPException(status_code=404, detail="Project not found")
    db_project.is_archived = 1
    db.commit()
    db.refresh(db_project)
    return {"message": "Project archived successfully", "project_id": project_id}
@router.post("/projects/{project_id}/unarchive")
 def unarchive_project(project_id: int, db: Session = Depends(get_db)):
    """Unarchive a project (restore from archive)"""
    db_project = db.query(ProjectTable).filter(ProjectTable.id == project_id).first()
    if not db_project:
        raise HTTPException(status_code=404, detail="Project not found")
    db_project.is_archived = 0
    db.commit()
    db.refresh(db_project)
    return {"message": "Project unarchived successfully", "project_id": project_id}
@router.get("/projects/archived", response_model=PaginatedResponse[ProjectData])
 def read_archived_projects(
    page: int = Query(1, ge=1, description="Page number (starts at 1)"),
    page_size: int = Query(10, ge=1, le=100, description="Items per page (max 100)"),
    db: Session = Depends(get_db),
 ):
    """Get all archived projects (paginated)"""
    # Calculate offset
    offset = (page - 1) * page_size
    # Query only archived projects
    query = db.query(ProjectTable).filter(ProjectTable.is_archived == 1)
    # Get total count
    total_count = query.count()
    # Get paginated results
    projects = (
        query.options(
            selectinload(ProjectTable.sector),
            selectinload(ProjectTable.investors),
            selectinload(ProjectTable.companies),
        )
        .offset(offset)
        .limit(page_size)
        .all()
    )
    # Transform ProjectTable objects to ProjectData format
    project_data_list = []
    for project in projects:
        project_data = ProjectData(
            project=project,
            sector=project.sector,
            investors=project.investors,
            companies=project.companies,
        )
        project_data_list.append(project_data)
    # Calculate total pages
    total_pages = (total_count + page_size - 1) // page_size
    return PaginatedResponse(
        items=project_data_list,
        total=total_count,
        page=page,
        page_size=page_size,
        total_pages=total_pages,
    )
@router.get("/projects/filter", response_model=PaginatedResponse[ProjectData])
 def filter_projects(
    stage: Optional[InvestmentStage] = Query(
@@ -182,6 +273,7 @@ def filter_projects(
    min_valuation: Optional[int] = Query(None, description="Minimum valuation"),
    max_valuation: Optional[int] = Query(None, description="Maximum valuation"),
    location: Optional[str] = Query(None, description="Location (partial match)"),
    industry: Optional[str] = Query(None, description="Industry (partial match)"),
    sector: Optional[str] = Query(None, description="Sector name (partial match)"),
    investor_name: Optional[str] = Query(
        None, description="Investor name (partial match)"
@@ -215,6 +307,9 @@ def filter_projects(
    if location:
        query = query.filter(ProjectTable.location.ilike(f"%{location}%"))
    if industry:
        query = query.filter(ProjectTable.industry.ilike(f"%{industry}%"))
    if sector:
        query = query.join(ProjectTable.sector).filter(
            SectorTable.name.ilike(f"%{sector}%")
@@ -52,7 +52,6 @@ async def generate_investor_report(
        "website": investor.website,
        "headquarters": investor.headquarters,
        "aum": investor.aum,
        "geographic_focus": investor.geographic_focus,
        "portfolio_highlights": investor.portfolio_highlights or [],
        "investment_thesis": investor.investment_thesis or [],
        "sectors": [sector.name for sector in investor.sectors],
@@ -65,24 +64,22 @@ async def generate_investor_report(
            }
            for member in investor.team_members
        ],
-        "check_size_lower": None,
+        "funds": [],
        "check_size_upper": None,
        "investment_stages": [],
    }
-    # Get check sizes and stages from funds
+    # Get all funds with their data
    if investor.funds:
        # Use the first fund's data or aggregate
        fund = investor.funds[0]
        investor_data["check_size_lower"] = fund.check_size_lower
        investor_data["check_size_upper"] = fund.check_size_upper
        # Aggregate all investment stages from all funds
        stages = set()
        for fund in investor.funds:
-            for stage in fund.investment_stages:
+            fund_data = {
-                stages.add(stage.name)
+                "fund_name": fund.fund_name,
-        investor_data["investment_stages"] = list(stages)
+                "fund_size": fund.fund_size,
                "check_size_lower": fund.check_size_lower,
                "check_size_upper": fund.check_size_upper,
                "geographic_focus": fund.geographic_focus,
                "investment_stages": [stage.name for stage in fund.investment_stages],
                "sectors": [sector.name for sector in fund.sectors],
            }
            investor_data["funds"].append(fund_data)
    # Fetch project data if project_id is provided
    project_data = None
@@ -109,7 +106,7 @@ async def generate_investor_report(
    # Generate PDF report
    report_generator = ReportGenerator()
    pdf_bytes = await report_generator.generate_investor_report(
-        investor_data, project_data
+        investor_data, project_data, investor_model=investor, project_model=project
    )
    # Return PDF as downloadable file
@@ -60,6 +60,7 @@ class ProjectSchema(BaseModel):
    valuation: int | None
    stage: InvestmentStage | None
    location: str | None
    industry: str | None
    description: Optional[str]
    start_date: Optional[datetime]
    end_date: Optional[datetime]
@@ -75,6 +76,7 @@ class ProjectCreate(BaseModel):
    valuation: Optional[int] = None
    stage: Optional[InvestmentStage] = None
    location: Optional[str] = None
    industry: Optional[str] = None
    description: Optional[str] = None
    start_date: Optional[datetime] = None
    end_date: Optional[datetime] = None
@@ -85,6 +87,7 @@ class ProjectUpdate(BaseModel):
    valuation: Optional[int] = None
    stage: Optional[InvestmentStage] = None
    location: Optional[str] = None
    industry: Optional[str] = None
    description: Optional[str] = None
    start_date: Optional[datetime] = None
    end_date: Optional[datetime] = None
@@ -38,6 +38,7 @@ class InvestorMemberSchema(BaseModel):
    name: str
    role: str | None
    email: str | None
    linkedin: str | None
    class Config:
        from_attributes = True
@@ -168,6 +169,7 @@ class InvestorFundData(BaseModel):
    class Config:
        from_attributes = True
 class InvestorMinimal(BaseModel):
    """Minimal investor info with just id and name"""
@@ -177,6 +179,7 @@ class InvestorMinimal(BaseModel):
    class Config:
        from_attributes = True
 class CompanySchemaMinimal(BaseModel):
    id: int
    name: str
@@ -188,9 +191,12 @@ class CompanySchemaMinimal(BaseModel):
    class Config:
        from_attributes = True
 class CompanyData(BaseModel):  # Renamed from CompaniesData for consistency
    # Bundles one company with its related investors, members and sectors.
    company: CompanySchemaMinimal
    # Required: callers must always supply the investor list.
    investors: List[InvestorMinimal]
    # Optional: both lists default to empty when not supplied.
    members: List[CompanyMemberSchema] = []
    sectors: List[SectorSchema] = []
    class Config:
        # Lets the model be populated from objects exposing these names as
        # attributes rather than only from dicts.
        from_attributes = True
@@ -0,0 +1,228 @@
 import asyncio
 import hashlib
 import logging
 import os
 from typing import List
 from db.db import get_db
 from db.models import CompanyTable
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_openai import ChatOpenAI
 from schemas.router_schemas import CompanyData, PaginatedResponse
 from sqlalchemy import text
 from sqlalchemy.orm import selectinload
 logger = logging.getLogger(__name__)
 class CompanyQueryProcessor:
    def __init__(self):
        self.llm = ChatOpenAI(
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
            model="openai/gpt-4o-mini",
            temperature=0,
        )
        # Query cache for performance
        self.query_cache = {}
        # SQL generation prompt
        self.sql_prompt = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """You are a SQL expert. Generate a SQLite query to find company IDs based on user requirements.
 Database Schema:
 - companies: id, name, industry, location, description, founded_year, website
 - company_sector: company_id, sector_id
 - sectors: id, name
 - investor_companies: investor_id, company_id
 - investors: id, name, aum
 - team_members: id, company_id, name, title
 IMPORTANT RULES:
 1. ALWAYS return ONLY company IDs (companies.id) - use SELECT DISTINCT c.id
 2. For industry: Check BOTH industry field AND sectors table with synonyms
   - Use LEFT JOIN for sectors so companies without sector tags still match
   - Include related terms: 'Fintech' → c.industry LIKE '%Fintech%' OR c.industry LIKE '%Finance%' OR sec.name LIKE '%Fintech%' OR sec.name LIKE '%Financial%'
   - 'AI' → c.industry LIKE '%AI%' OR c.industry LIKE '%Artificial Intelligence%' OR c.industry LIKE '%Machine Learning%' OR sec.name LIKE '%AI%' OR sec.name LIKE '%ML%'
 3. For location: Be FLEXIBLE with variations and abbreviations
   - 'San Francisco' → c.location LIKE '%San Francisco%' OR c.location LIKE '%SF%' OR c.location LIKE '%Bay Area%'
   - 'New York' → c.location LIKE '%New York%' OR c.location LIKE '%NYC%' OR c.location LIKE '%NY%'
   - 'Europe' → c.location LIKE '%Europe%' OR c.location LIKE '%UK%' OR c.location LIKE '%London%' OR c.location LIKE '%Berlin%' OR c.location LIKE '%Paris%'
 4. For sectors: Use LEFT JOIN and include multiple synonyms
   - 'Healthcare' → sec.name LIKE '%Healthcare%' OR sec.name LIKE '%Health%' OR sec.name LIKE '%Medical%' OR sec.name LIKE '%Biotech%' OR c.industry LIKE '%Health%'
 5. For founding year filters (include NULL to be inclusive):
   - "founded after 2020" → WHERE (founded_year >= 2020 OR founded_year IS NULL)
   - "founded before 2018" → WHERE (founded_year <= 2018 OR founded_year IS NULL)
   - "founded in 2020" → WHERE founded_year = 2020
 6. For investor-related queries: Use JOIN investor_companies
 7. Use LEFT JOIN for sectors so companies without tags still match
 8. Use DISTINCT to avoid duplicates from joins
 9. Be INCLUSIVE - use OR conditions with synonyms and variations
 10. Return a single, complete SELECT query
 Example Queries:
 Q: "Fintech companies founded in 2020"
 A: SELECT DISTINCT c.id FROM companies c 
   LEFT JOIN company_sector cs ON c.id = cs.company_id 
   LEFT JOIN sectors sec ON cs.sector_id = sec.id 
   WHERE (c.industry LIKE '%Fintech%' OR c.industry LIKE '%Finance%' OR c.industry LIKE '%Financial%' OR sec.name LIKE '%Fintech%' OR sec.name LIKE '%Financial Services%') 
   AND c.founded_year = 2020
 Q: "AI companies in San Francisco"
 A: SELECT DISTINCT c.id FROM companies c 
   LEFT JOIN company_sector cs ON c.id = cs.company_id 
   LEFT JOIN sectors sec ON cs.sector_id = sec.id 
   WHERE (c.industry LIKE '%AI%' OR c.industry LIKE '%Artificial Intelligence%' OR c.industry LIKE '%Machine Learning%' OR sec.name LIKE '%AI%' OR sec.name LIKE '%Machine Learning%' OR sec.name LIKE '%ML%') 
   AND (c.location LIKE '%San Francisco%' OR c.location LIKE '%SF%' OR c.location LIKE '%Bay Area%')
 Q: "Healthcare companies"
 A: SELECT DISTINCT c.id FROM companies c 
   LEFT JOIN company_sector cs ON c.id = cs.company_id 
   LEFT JOIN sectors sec ON cs.sector_id = sec.id 
   WHERE c.industry LIKE '%Healthcare%' OR c.industry LIKE '%Health%' OR c.industry LIKE '%Medical%' OR sec.name LIKE '%Healthcare%' OR sec.name LIKE '%Medical%' OR sec.name LIKE '%Biotech%' OR sec.name LIKE '%Pharma%'
 Q: "Companies funded by Sequoia"
 A: SELECT DISTINCT c.id FROM companies c 
   JOIN investor_companies ic ON c.id = ic.company_id 
   JOIN investors i ON ic.investor_id = i.id 
   WHERE i.name LIKE '%Sequoia%'
 Q: "European startups founded after 2019"
 A: SELECT DISTINCT c.id FROM companies c 
   WHERE (c.location LIKE '%Europe%' OR c.location LIKE '%UK%' OR c.location LIKE '%London%' OR c.location LIKE '%Germany%' OR c.location LIKE '%Berlin%' OR c.location LIKE '%France%' OR c.location LIKE '%Paris%') 
   AND (c.founded_year > 2019 OR c.founded_year IS NULL)
 Q: "SaaS companies"
 A: SELECT DISTINCT c.id FROM companies c 
   LEFT JOIN company_sector cs ON c.id = cs.company_id 
   LEFT JOIN sectors sec ON cs.sector_id = sec.id 
   WHERE c.industry LIKE '%SaaS%' OR c.industry LIKE '%Software%' OR c.industry LIKE '%Cloud%' OR sec.name LIKE '%SaaS%' OR sec.name LIKE '%Software%'
 IMPORTANT: 
 - Use LEFT JOIN so companies without sector tags still match via industry field
 - Use OR conditions with related keywords/synonyms to cast a wider net
 - Include NULL checks for optional filters to avoid excluding companies with missing data
 Return ONLY the SQL query, no explanations or markdown.""",
                ),
                ("user", "{question}"),
            ]
        )
    def _get_cache_key(self, question: str) -> str:
        """Generate cache key from normalized question."""
        return hashlib.md5(question.lower().strip().encode()).hexdigest()
    # The blocking implementation lives in `_process_query_sync` below; the
    # async wrapper `process_query` runs it in a worker thread. This keeps the
    # FastAPI event loop responsive while reusing the existing synchronous code.
    async def process_query(self, question: str) -> PaginatedResponse[CompanyData]:
        """Async wrapper for process_query. Runs blocking work in a thread to avoid
        blocking the event loop.
        """
        return await asyncio.to_thread(self._process_query_sync, question)
    def _process_query_sync(self, question: str) -> PaginatedResponse[CompanyData]:
        """Turn a natural-language question into SQL, run it and load the matches.

        This is the blocking implementation that `process_query` runs in a
        worker thread. The SQL is taken from ``self.query_cache`` when the
        normalized question has been answered before; otherwise the LLM
        generates it.

        Two safeguards apply to the SQL text, which originates from an LLM fed
        with user input:

        * It must be a single statement starting with ``SELECT``; anything
          else is rejected before a database session is opened.
        * It is cached only after it has executed successfully, so a malformed
          generation is regenerated on the next call instead of failing for
          that question every time.

        Args:
            question: Free-text description of the companies to find.

        Returns:
            The matching companies as a single page, or an empty page when the
            SQL was rejected or any step of execution raised.
        """
        cache_key = self._get_cache_key(question)
        cached_sql = self.query_cache.get(cache_key)
        if cached_sql is not None:
            sql_query = cached_sql
            logger.info(f"Using cached SQL: {sql_query}")
        else:
            # Generate SQL query
            messages = self.sql_prompt.format_messages(question=question)
            response = self.llm.invoke(messages)
            sql_query = response.content.strip()
            # Clean up SQL (remove markdown code blocks if present)
            sql_query = sql_query.replace("```sql", "").replace("```", "").strip()
            logger.info(f"Generated SQL: {sql_query}")

        # Drop one optional trailing terminator, then require a lone SELECT.
        # A remaining ';' means several statements were chained together.
        statement = sql_query.rstrip().rstrip(";").strip()
        if not statement.lower().startswith("select") or ";" in statement:
            logger.error(f"Rejected SQL (not a single SELECT statement): {sql_query}")
            return PaginatedResponse(
                items=[], total=0, page=1, page_size=10, total_pages=0
            )

        # Execute query to get company IDs
        db_session = next(get_db())
        try:
            result = db_session.execute(text(statement))
            company_ids = [row[0] for row in result.fetchall()]
            # The statement ran cleanly, so it is now safe to remember it.
            self.query_cache[cache_key] = statement
            logger.info(
                f"Found {len(company_ids)} company IDs: {company_ids[:10]}{'...' if len(company_ids) > 10 else ''}"
            )
            return self._fetch_companies_by_ids(company_ids)
        except Exception as e:
            # Best-effort endpoint: report the failure and return an empty page.
            logger.error(f"SQL execution error: {e}")
            logger.error(f"Failed SQL: {sql_query}")
            return PaginatedResponse(
                items=[], total=0, page=1, page_size=10, total_pages=0
            )
        finally:
            db_session.close()
    def _fetch_companies_by_ids(
        self, company_ids: List[int]
    ) -> PaginatedResponse[CompanyData]:
        """Load the given companies with their investors, members and sectors.

        Args:
            company_ids: Primary keys of the companies to load.

        Returns:
            A single-page response containing one `CompanyData` per company
            found. An empty ``company_ids`` yields an empty page without
            opening a database session.
        """
        if company_ids:
            session = next(get_db())
            try:
                # Eager-load every relationship that CompanyData reads below.
                eager_loads = (
                    selectinload(CompanyTable.investors),
                    selectinload(CompanyTable.members),
                    selectinload(CompanyTable.sectors),
                )
                query = session.query(CompanyTable).options(*eager_loads)
                rows = query.filter(CompanyTable.id.in_(company_ids)).all()
                # Transform to CompanyData format
                items = [
                    CompanyData(
                        company=row,
                        investors=row.investors,
                        members=row.members,
                        sectors=row.sectors,
                    )
                    for row in rows
                ]
                count = len(items)
                # Everything is returned as one page sized to the result set.
                return PaginatedResponse(
                    items=items,
                    total=count,
                    page=1,
                    page_size=count,
                    total_pages=1 if count else 0,
                )
            finally:
                session.close()
        return PaginatedResponse(
            items=[],
            total=0,
            page=1,
            page_size=10,
            total_pages=0,
        )
@@ -6,6 +6,7 @@ The scoring system evaluates multiple dimensions to determine how well a project
 matches with an investor's investment criteria.
 """
 from difflib import SequenceMatcher
 from typing import List, Optional, Tuple
 from db.models import FundTable, InvestorTable, ProjectTable
@@ -99,12 +100,16 @@ def _calculate_project_fund_compatibility(
            else str(project.stage)
        )
-        if project_stage_name in fund_stage_names:
+        # Normalize both for case-insensitive comparison
        project_stage_normalized = project_stage_name.upper().strip()
        fund_stages_normalized = {name.upper().strip() for name in fund_stage_names}
        if project_stage_normalized in fund_stages_normalized:
            stage_score = 30
        else:
            # Partial credit for adjacent stages
            stage_score = _calculate_stage_proximity(
-                project_stage_name, fund_stage_names
+                project_stage_normalized, fund_stages_normalized
            )
    total_score += stage_score
@@ -112,22 +117,53 @@ def _calculate_project_fund_compatibility(
    # 2. Sector Overlap (30 points)
    sector_score = 0
    if project.sector and fund.sectors:
-        project_sector_ids = {sector.id for sector in project.sector}
+        project_sectors = [s for s in project.sector if hasattr(s, "name")]
-        fund_sector_ids = {sector.id for sector in fund.sectors}
+        fund_sectors = [s for s in fund.sectors if hasattr(s, "name")]
-        if project_sector_ids and fund_sector_ids:
+        if project_sectors and fund_sectors:
-            common_sectors = project_sector_ids.intersection(fund_sector_ids)
+            # Use fuzzy matching to account for similar but not identical sector names
-            # Score based on what percentage of project sectors are covered by fund
+            match_count = 0
-            overlap_ratio = len(common_sectors) / len(project_sector_ids)
+            total_matches = 0
-            sector_score = int(30 * overlap_ratio)
+
            for proj_sector in project_sectors:
                best_match_score = 0
                proj_name = proj_sector.name.lower().strip()
                for fund_sector in fund_sectors:
                    fund_name = fund_sector.name.lower().strip()
                    # Exact match
                    if proj_name == fund_name:
                        best_match_score = 1.0
                        break
                    # Fuzzy match using sequence matcher
                    similarity = SequenceMatcher(None, proj_name, fund_name).ratio()
                    # Also check if one contains the other (substring match)
                    if proj_name in fund_name or fund_name in proj_name:
                        similarity = max(similarity, 0.8)
                    best_match_score = max(best_match_score, similarity)
                # Count matches with threshold
                # Perfect match (1.0), strong match (>0.75), partial match (>0.6)
                if best_match_score >= 0.6:
                    total_matches += best_match_score
                    match_count += 1
            if match_count > 0:
                # Calculate overlap ratio based on fuzzy matches
                overlap_ratio = total_matches / len(project_sectors)
                sector_score = int(30 * overlap_ratio)
    total_score += sector_score
    # 3. Geographic Match (20 points)
    geo_score = 0
    if project.location and fund.geographic_focus:
-        project_location_lower = project.location.lower()
+        project_location_lower = project.location.lower().strip()
-        fund_geo_lower = (fund.geographic_focus or "").lower()
+        fund_geo_lower = (fund.geographic_focus or "").lower().strip()
        # Exact match
        if project_location_lower == fund_geo_lower:
@@ -137,10 +173,11 @@ def _calculate_project_fund_compatibility(
            project_location_lower in fund_geo_lower
            or fund_geo_lower in project_location_lower
        ):
-            geo_score = 10
+            geo_score = 15
-        # Check for common geographic terms
+        # Check for common geographic terms or regional overlap (continent/country matching)
        elif _check_geographic_overlap(project_location_lower, fund_geo_lower):
-            geo_score = 5
+            # Give higher score for continent/country matches (e.g., Germany -> Europe)
            geo_score = 18
    total_score += geo_score
@@ -209,13 +246,44 @@ def _calculate_project_investor_direct_compatibility(
    # 2. Sector Overlap (30 points)
    sector_score = 0
    if project.sector and investor.sectors:
-        project_sector_ids = {sector.id for sector in project.sector}
+        project_sectors = [s for s in project.sector if hasattr(s, "name")]
-        investor_sector_ids = {sector.id for sector in investor.sectors}
+        investor_sectors = [s for s in investor.sectors if hasattr(s, "name")]
-        if project_sector_ids and investor_sector_ids:
+        if project_sectors and investor_sectors:
-            common_sectors = project_sector_ids.intersection(investor_sector_ids)
+            # Use fuzzy matching to account for similar but not identical sector names
-            overlap_ratio = len(common_sectors) / len(project_sector_ids)
+            match_count = 0
-            sector_score = int(30 * overlap_ratio)
+            total_matches = 0
            for proj_sector in project_sectors:
                best_match_score = 0
                proj_name = proj_sector.name.lower().strip()
                for inv_sector in investor_sectors:
                    inv_name = inv_sector.name.lower().strip()
                    # Exact match
                    if proj_name == inv_name:
                        best_match_score = 1.0
                        break
                    # Fuzzy match using sequence matcher
                    similarity = SequenceMatcher(None, proj_name, inv_name).ratio()
                    # Also check if one contains the other (substring match)
                    if proj_name in inv_name or inv_name in proj_name:
                        similarity = max(similarity, 0.8)
                    best_match_score = max(best_match_score, similarity)
                # Count matches with threshold
                if best_match_score >= 0.6:
                    total_matches += best_match_score
                    match_count += 1
            if match_count > 0:
                # Calculate overlap ratio based on fuzzy matches
                overlap_ratio = total_matches / len(project_sectors)
                sector_score = int(30 * overlap_ratio)
    total_score += sector_score
@@ -231,9 +299,10 @@ def _calculate_project_investor_direct_compatibility(
            project_location_lower in investor_geo_lower
            or investor_geo_lower in project_location_lower
        ):
-            geo_score = 10
+            geo_score = 15
        elif _check_geographic_overlap(project_location_lower, investor_geo_lower):
-            geo_score = 5
+            # Give higher score for continent/country matches (e.g., Germany -> Europe)
            geo_score = 18
    total_score += geo_score
@@ -278,8 +347,11 @@ def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int:
    """
    stage_order = ["SEED", "SERIES_A", "SERIES_B", "SERIES_C", "GROWTH", "LATE_STAGE"]
    # Normalize project stage for comparison
    project_stage_normalized = project_stage.upper().strip()
    try:
-        project_idx = stage_order.index(project_stage)
+        project_idx = stage_order.index(project_stage_normalized)
    except ValueError:
        return 0
@@ -290,8 +362,10 @@ def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int:
    if project_idx < len(stage_order) - 1:
        adjacent_stages.append(stage_order[project_idx + 1])
    # Normalize fund stages and check for matches
    for stage in fund_stages:
-        if stage in adjacent_stages:
+        stage_normalized = stage.upper().strip()
        if stage_normalized in adjacent_stages:
            return 15  # Half credit for adjacent stage
    return 0
@@ -305,25 +379,90 @@ def _check_geographic_overlap(location1: str, location2: str) -> bool:
        - "San Francisco, CA" and "California" -> True
        - "New York" and "USA" -> True (if both contain USA/US)
        - "London, UK" and "United Kingdom" -> True
        - "Germany" and "Europe" -> True
    """
-    # Common geographic groupings
+    # Normalize inputs
    loc1 = location1.lower().strip()
    loc2 = location2.lower().strip()
    # Common geographic groupings with broader regional mappings
    geo_groups = [
-        ["usa", "us", "united states", "america"],
+        # North America
-        ["uk", "united kingdom", "britain"],
+        ["usa", "us", "united states", "america", "u.s.", "u.s.a"],
-        ["california", "ca"],
+        ["canada", "canadian"],
-        ["new york", "ny"],
+        ["mexico", "mexican"],
        # Europe and countries
        [
            "europe",
            "european",
            "eu",
            "germany",
            "france",
            "uk",
            "united kingdom",
            "britain",
            "spain",
            "italy",
            "netherlands",
            "belgium",
            "sweden",
            "denmark",
            "norway",
            "finland",
            "poland",
            "portugal",
            "austria",
            "switzerland",
            "ireland",
            "greece",
            "czech",
            "romania",
        ],
        # UK specific
        ["uk", "united kingdom", "britain", "england", "scotland", "wales", "london"],
        # US states
        ["california", "ca", "san francisco", "los angeles", "silicon valley"],
        ["new york", "ny", "nyc"],
        ["texas", "tx"],
-        ["europe", "eu"],
+        ["massachusetts", "ma", "boston"],
-        ["asia", "asian"],
+        ["washington", "seattle"],
-        ["africa", "african"],
+        # Asia
        [
            "asia",
            "asian",
            "china",
            "japan",
            "korea",
            "singapore",
            "hong kong",
            "india",
            "indonesia",
            "thailand",
            "vietnam",
            "malaysia",
            "philippines",
        ],
        # Middle East
        ["middle east", "israel", "uae", "dubai", "saudi arabia"],
        # Latin America
        ["latin america", "brazil", "argentina", "chile", "colombia", "mexico"],
        # Africa
        ["africa", "african", "south africa", "nigeria", "kenya", "egypt"],
        # Oceania
        ["australia", "australian", "new zealand"],
    ]
    # Check if both locations match any group
    for group in geo_groups:
-        found_in_1 = any(term in location1 for term in group)
+        found_in_1 = any(term in loc1 for term in group)
-        found_in_2 = any(term in location2 for term in group)
+        found_in_2 = any(term in loc2 for term in group)
        if found_in_1 and found_in_2:
            return True
    # Check for direct substring match (one contains the other)
    if loc1 in loc2 or loc2 in loc1:
        return True
    return False
@@ -1,14 +1,24 @@
 import logging
 import os
 import sys
 import requests
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
 )
 logger = logging.getLogger(__name__)
 class FolkAPI:
    BASE_URL = "https://api.folk.app/v1"
    def __init__(self, api_key: str):
        """Build the authorization header used for requests to the Folk API.

        Args:
            api_key: Key to use when the ``FOLK_API_KEY`` environment variable
                is not set. When the variable is set it takes precedence over
                this argument.
        """
        api_key = os.environ.get("FOLK_API_KEY", api_key)
        self.headers = {"Authorization": f"Bearer {api_key}"}
        if api_key:
            # Only a short masked prefix is logged, never the whole key.
            logger.info(f"FolkAPI initialized with API key: {api_key[:4]}***")
        else:
            # Slicing a missing key would raise TypeError; warn instead.
            logger.warning("FolkAPI initialized without an API key")
    def get_groups(self):
        """Fetch all groups from Folk."""
@@ -109,6 +119,7 @@ class FolkAPI:
        email: str = None,
        company_id: str = None,
        group_id: str = None,
        linkedin_url: str = None,
        companies=None,
        emails=None,
        phones=None,
@@ -174,7 +185,9 @@ class FolkAPI:
        addresses_list = _to_list(addresses)
        if addresses_list:
            data["addresses"] = addresses_list
-        urls_list = _to_list(urls)
+        urls_list = _to_list(urls) or []
        if linkedin_url:
            urls_list.append(linkedin_url)
        if urls_list:
            data["urls"] = urls_list
@@ -190,71 +203,3 @@ class FolkAPI:
        response.raise_for_status()
        return response.json()
 # Prefer getting the API key from the environment. If not set, fall back to the
 # existing (hard-coded) key so behavior is unchanged for now.
 # NOTE(review): a credential literal is committed to source here. It should be
 # rotated and supplied only through FOLK_API_KEY. FolkAPI.__init__ also reads
 # FOLK_API_KEY itself, so the environment lookup below duplicates that check.
 DEFAULT_API_KEY = "FOLKfIGXuv74ML9EAajxyiUR39ePaNrZ"
 api_key = os.environ.get("FOLK_API_KEY", DEFAULT_API_KEY)
 # Module-level client created at import time and used by example_flow().
 folk = FolkAPI(api_key=api_key)
 def example_flow():
    """Demonstrate the client: list groups, create a company, then a person.

    Uses the module-level ``folk`` client. Exits the process with status 1
    when no group, or no id for the first group, is returned.
    """
    # Step 1: list the available groups and show the raw response.
    groups_response = folk.get_groups()
    print(groups_response)
    # Groups are nested under ['data']['items']; tolerate either level missing.
    data_section = groups_response.get("data", {})
    group_items = data_section.get("items", [])
    if not group_items:
        print("No groups returned by Folk API.")
        sys.exit(1)
    # The first group serves as the target for this example.
    first_group = group_items[0]
    group_id = first_group.get("id")
    if not group_id:
        print("No id found for the first group item.")
        sys.exit(1)
    # Step 2: create a company inside that group.
    company = folk.create_company(
        name="2050 Investment Partners",
        group_id=group_id,
        website="https://2050.com",
        linkedin_url="https://linkedin.com/company/2050-investments",
    )
    # Step 3: create a person linked to the new company and the same group.
    new_company_id = company.get("data", {}).get("id")
    person = folk.create_person(
        first_name="John",
        last_name="Doe",
        email="john@2050.com",
        company_id=new_company_id,
        group_id=group_id,
    )
    print("Created company:", company)
    print("Created person:", person)
 if __name__ == "__main__":
    # Script entry point: run the example and turn any failure into exit code 1.
    try:
        example_flow()
    except requests.HTTPError as http_err:
        print("HTTP error while talking to Folk API:", http_err)
        # Include the response status and body when the error carries one.
        response = getattr(http_err, "response", None)
        if response is not None:
            try:
                response_text = response.text
            except Exception:
                response_text = "<unreadable response body>"
            print("Response status:", response.status_code)
            print("Response body:", response_text)
        sys.exit(1)
    except Exception as err:  # pragma: no cover - top-level safety
        print("Unexpected error:", err)
        sys.exit(1)
@@ -49,7 +49,7 @@ class QueryProcessor:
        """Tool to search the web using google, provide the relevant query to get the information"""
        logger.info(f"\nWeb Search Tool Called with query: {query}")
        if query:
-            result = self.ddg_search.text(query, max_results=10, backend="google")
+            result = self.ddg_search.text(query, max_results=10)
            return result
        return "No query provided."
@@ -87,11 +87,15 @@ class QueryProcessor:
            context_parts.append(f"Location: {investor_headquarters}")
        if investor_description:
            context_parts.append(f"Description: {investor_description}")
-        if investment_thesis:
+        if investment_thesis and isinstance(investment_thesis, list):
-            thesis_str = ", ".join(investment_thesis[:3])  # Limit to first 3
+            thesis_str = ", ".join(
                str(item) for item in investment_thesis[:3]
            )  # Limit to first 3
            context_parts.append(f"Investment Focus: {thesis_str}")
-        if portfolio_highlights:
+        if portfolio_highlights and isinstance(portfolio_highlights, list):
-            portfolio_str = ", ".join(portfolio_highlights[:5])  # Limit to first 5
+            portfolio_str = ", ".join(
                str(item) for item in portfolio_highlights[:5]
            )  # Limit to first 5
            context_parts.append(f"Notable Portfolio Companies: {portfolio_str}")
        context = "\n".join(context_parts)
@@ -145,16 +145,74 @@ Return the lower and upper bounds in USD."""
        """
        Manually parse the JSON profile from the CSV.
        Returns a cleaned dictionary with the investor profile data.
        Handles JSON wrapped in markdown code blocks (```json ... ```).
        Handles trailing quotes and extra data after JSON.
        """
        if not json_str or pd.isna(json_str):
            return None
        try:
            # Clean the JSON string
            cleaned_json = json_str.strip()
            # Check if it's plain text (no JSON structure)
            if not cleaned_json.startswith(("{", "```", "'")):
                print("   ⚠️  No JSON structure found - skipping")
                return None
            # Remove markdown code block markers if present
            if cleaned_json.startswith("```"):
                # Remove opening marker (```json or ```Json or ```)
                lines = cleaned_json.split("\n")
                if lines[0].startswith("```"):
                    lines = lines[1:]  # Remove first line
                # Remove closing marker (``` or ```')
                if lines and lines[-1].strip() in ("```", "```'", '```"'):
                    lines = lines[:-1]  # Remove last line
                cleaned_json = "\n".join(lines).strip()
            # Remove trailing quotes that might be left over
            if cleaned_json.endswith(("'", '"')):
                cleaned_json = cleaned_json[:-1].strip()
            # Try to find JSON boundaries if there's extra data
            # Look for the first { and the last }
            start_idx = cleaned_json.find("{")
            if start_idx == -1:
                print("   ⚠️  No opening brace found - not valid JSON")
                return None
            # Find the matching closing brace
            # We need to count braces to find the actual end
            brace_count = 0
            end_idx = -1
            for i in range(start_idx, len(cleaned_json)):
                if cleaned_json[i] == "{":
                    brace_count += 1
                elif cleaned_json[i] == "}":
                    brace_count -= 1
                    if brace_count == 0:
                        end_idx = i + 1
                        break
            if end_idx == -1:
                print("   ⚠️  No matching closing brace found")
                return None
            # Extract just the JSON part
            cleaned_json = cleaned_json[start_idx:end_idx]
            # Parse JSON string
-            profile = json.loads(json_str)
+            profile = json.loads(cleaned_json)
            return profile
        except json.JSONDecodeError as e:
-            print(f"Error parsing JSON: {e}")
+            print(f"   ❌ JSON parsing error: {e}")
            # Print first 200 chars for debugging
            preview = json_str[:200] if len(json_str) > 200 else json_str
            print(f"   Preview: {preview}...")
            return None
        except Exception as e:
            print(f"   ❌ Unexpected error: {e}")
            return None
    async def process_investor_profile(
@@ -338,34 +396,45 @@ Return the lower and upper bounds in USD."""
            if existing_company:
                # Update only founded_year on existing company
                company = existing_company
                updated_fields = []
                if company_data.get("founded_year"):
                    company.founded_year = company_data["founded_year"]
                    updated_fields.append(
                        f"founded_year: {company_data['founded_year']}"
                    )
                # Add/update company members (key executives)
                # First, remove existing members if updating
                db.query(CompanyMember).filter_by(company_id=company.id).delete()
                exec_count = 0
                for exec_data in company_data.get("key_executives", []):
                    member = CompanyMember(
                        name=exec_data.get("name"),
                        role=exec_data.get("title"),
                        linkedin=exec_data.get(
                            "source_url"
                        ),  # Store source URL in linkedin field
                        company_id=company.id,
                    )
                    db.add(member)
                    exec_count += 1
                if exec_count > 0:
                    updated_fields.append(f"{exec_count} executives")
                if updated_fields:
                    print(f"      📝 Updated: {', '.join(updated_fields)}")
                return company
            else:
-                # Company should already be in base database, but if not found, skip
+                # Company not found in base database, skip
-                print(
+                print("      ⚠️  Not in database - skipping")
                    f"⚠️  Company '{company_data['name']}' not found in base database - skipping"
                )
                return None
            # Add/update company members (key executives)
            # First, remove existing members if updating
            db.query(CompanyMember).filter_by(company_id=company.id).delete()
            for exec_data in company_data.get("key_executives", []):
                member = CompanyMember(
                    name=exec_data.get("name"),
                    role=exec_data.get("title"),
                    linkedin=exec_data.get(
                        "source_url"
                    ),  # Store source URL in linkedin field
                    company_id=company.id,
                )
                db.add(member)
            return company
        except Exception as e:
-            print(f"Error saving company to database: {e}")
+            print(f"      ❌ Error saving: {e}")
            db.rollback()
            return None
@@ -789,8 +858,11 @@ Return the lower and upper bounds in USD."""
                if pd.notna(row.get("Investor"))
                else None
            )
            # Try both column names for flexibility
            profile_json = (
-                row.get("Final Investor Profile", "")
+                row.get("Perplexity Gap Output", "")
                if pd.notna(row.get("Perplexity Gap Output"))
                else row.get("Final Investor Profile", "")
                if pd.notna(row.get("Final Investor Profile"))
                else None
            )
@@ -1,29 +1,25 @@
-import json
+import asyncio
 import hashlib
 import logging
 import os
 from typing import List, Optional
-from db.db import DATABASE_URL, get_db
+from db.db import get_db
 from db.models import FundTable, InvestorTable, ProjectTable
-from langchain import hub
+from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.agent_toolkits import SQLDatabaseToolkit
 from langchain_community.utilities import SQLDatabase
 from langchain_openai import ChatOpenAI
 from langgraph.prebuilt import create_react_agent
 from schemas.router_schemas import (
    CompanyMinimal,
    InvestmentResponse,
    PaginatedResponse,
    SectorMinimal,
 )
 from sqlalchemy import text
 from sqlalchemy.orm import selectinload
 from services.compatibility_score import calculate_project_investor_compatibility
 logger = logging.getLogger(__name__)
 # Connect to SQLite
 prompt_template = hub.pull("langchain-ai/sql-agent-system-prompt")
 db = SQLDatabase.from_uri(DATABASE_URL)
 class QueryProcessor:
@@ -34,78 +30,155 @@ class QueryProcessor:
            model="openai/gpt-4o-mini",
            temperature=0,
        )
-        self.toolkit = SQLDatabaseToolkit(db=db, llm=self.llm)
+
-        # Update system message to specifically request only fund IDs
+        # Query cache for performance
-        system_message_updated = (
+        self.query_cache = {}
-            prompt_template.format(dialect="SQLite", top_k=5)
+
-            + "\n\n=== IMPORTANT TERMINOLOGY ==="
+        # SQL generation prompt
-            + "\n- When users say 'investors' or 'find me investors', they mean FUNDS"
+        self.sql_prompt = ChatPromptTemplate.from_messages(
-            + "\n- Always query the 'funds' table for investment opportunities"
+            [
-            + "\n- The 'investors' table is for parent company information only"
+                (
-            + "\n- Relationship: investors (1) -> (many) funds"
+                    "system",
-            + "\n\n=== YOUR TASK ==="
+                    """You are a SQL expert. Generate a SQLite query to find fund IDs based on user requirements.
-            + "\nReturn ONLY fund IDs (funds.id) that match the user's criteria."
+
-            + "\nFormat: comma-separated numbers only (e.g., 1, 5, 12, 23)"
+Database Schema:
-            + "\nNo explanations, no other data."
+- funds: id, fund_name, investor_id, check_size_lower, check_size_upper, geographic_focus
-            + "\n\n=== QUERY GUIDELINES ==="
+- fund_sectors: fund_id, sector_id
-            + "\n1. For geographic searches: use funds.geographic_focus"
+- fund_investment_stages: fund_id, stage_id
-            + "\n2. For sector searches: JOIN with fund_sectors table"
+- sectors: id, name
-            + "\n3. For stage searches: JOIN with fund_investment_stages table"
+- investment_stages: id, name
-            + "\n4. If no results: respond with 'NO_RESULTS'"
+- investors: id, name, aum
-            + "\n5. Never repeat the same failed query"
+
-        )
+IMPORTANT RULES:
-        self.agent = create_react_agent(
+1. ALWAYS return ONLY fund IDs (funds.id) - use SELECT DISTINCT f.id
-            model=self.llm,
+2. For geography: Be FLEXIBLE - use OR with variations and partial matches
-            tools=self.toolkit.get_tools(),
+   - 'Europe' → WHERE geographic_focus LIKE '%Europe%' OR geographic_focus LIKE '%European%'
-            prompt=system_message_updated,
+   - 'America' → WHERE geographic_focus LIKE '%America%' OR geographic_focus LIKE '%US%' OR geographic_focus LIKE '%United States%'
   - 'Asia' → WHERE geographic_focus LIKE '%Asia%' OR geographic_focus LIKE '%Asian%'
   - If no geography specified, DON'T filter by geography
 3. For stages: Use LEFT JOIN and LIKE for flexible matching with synonyms
   - 'Seed' → s.name LIKE '%Seed%' OR s.name LIKE '%Pre-Seed%' OR s.name LIKE '%Early%'
   - 'Series A' → s.name LIKE '%Series A%' OR s.name LIKE '%A%'
   - 'Growth' → s.name LIKE '%Growth%' OR s.name LIKE '%Late%' OR s.name LIKE '%Expansion%'
   - If stage not specified, include ALL funds
 4. For sectors: Use LEFT JOIN and include related terms with OR
   - 'Fintech' → sec.name LIKE '%Fintech%' OR sec.name LIKE '%Finance%' OR sec.name LIKE '%Financial%'
   - 'AI' → sec.name LIKE '%AI%' OR sec.name LIKE '%Artificial Intelligence%' OR sec.name LIKE '%Machine Learning%' OR sec.name LIKE '%ML%'
   - 'Healthcare' → sec.name LIKE '%Healthcare%' OR sec.name LIKE '%Health%' OR sec.name LIKE '%Medical%' OR sec.name LIKE '%Biotech%'
 5. For check size filters (be flexible with ranges):
   - "under X" → WHERE (check_size_upper <= X OR check_size_upper IS NULL)
   - "over X" → WHERE (check_size_lower >= X OR check_size_lower IS NULL)
   - "between X and Y" → WHERE check_size_lower >= X AND check_size_upper <= Y
 6. Use LEFT JOIN for stages and sectors so funds without tags still match
 7. Use DISTINCT to avoid duplicates from joins
 8. Be INCLUSIVE - use OR conditions to cast a wider net
 9. If query is very simple (e.g., just "seed stage"), don't add unnecessary filters
 10. Return a single, complete SELECT query
 Example Queries:
 Q: "Seed stage investors in Europe"
 A: SELECT DISTINCT f.id FROM funds f 
   LEFT JOIN fund_investment_stages fis ON f.id = fis.fund_id 
   LEFT JOIN investment_stages s ON fis.stage_id = s.id 
   WHERE (s.name LIKE '%Seed%' OR s.name LIKE '%Pre-Seed%' OR s.name LIKE '%Early%' OR s.id IS NULL)
   AND (f.geographic_focus LIKE '%Europe%' OR f.geographic_focus LIKE '%European%')
 Q: "Fintech investors with check size under 5 million"
 A: SELECT DISTINCT f.id FROM funds f 
   LEFT JOIN fund_sectors fs ON f.id = fs.fund_id 
   LEFT JOIN sectors sec ON fs.sector_id = sec.id 
   WHERE (sec.name LIKE '%Fintech%' OR sec.name LIKE '%Finance%' OR sec.name LIKE '%Financial%' OR sec.id IS NULL)
   AND (f.check_size_upper <= 5000000 OR f.check_size_upper IS NULL)
 Q: "Seed stage investors"
 A: SELECT DISTINCT f.id FROM funds f 
   LEFT JOIN fund_investment_stages fis ON f.id = fis.fund_id 
   LEFT JOIN investment_stages s ON fis.stage_id = s.id 
   WHERE s.name LIKE '%Seed%' OR s.name LIKE '%Pre-Seed%' OR s.name LIKE '%Early%'
 Q: "Growth stage investors"
 A: SELECT DISTINCT f.id FROM funds f 
   LEFT JOIN fund_investment_stages fis ON f.id = fis.fund_id 
   LEFT JOIN investment_stages s ON fis.stage_id = s.id 
   WHERE s.name LIKE '%Growth%' OR s.name LIKE '%Late%' OR s.name LIKE '%Expansion%' OR s.name LIKE '%Series C%' OR s.name LIKE '%Series D%'
 Q: "AI investors in America"
 A: SELECT DISTINCT f.id FROM funds f 
   LEFT JOIN fund_sectors fs ON f.id = fs.fund_id 
   LEFT JOIN sectors sec ON fs.sector_id = sec.id 
   WHERE (sec.name LIKE '%AI%' OR sec.name LIKE '%Artificial Intelligence%' OR sec.name LIKE '%Machine Learning%' OR sec.name LIKE '%ML%')
   AND (f.geographic_focus LIKE '%America%' OR f.geographic_focus LIKE '%US%' OR f.geographic_focus LIKE '%United States%' OR f.geographic_focus LIKE '%USA%')
 Q: "Healthcare investors"
 A: SELECT DISTINCT f.id FROM funds f 
   LEFT JOIN fund_sectors fs ON f.id = fs.fund_id 
   LEFT JOIN sectors sec ON fs.sector_id = sec.id 
   WHERE sec.name LIKE '%Healthcare%' OR sec.name LIKE '%Health%' OR sec.name LIKE '%Medical%' OR sec.name LIKE '%Biotech%' OR sec.name LIKE '%Pharma%'
 IMPORTANT: Use LEFT JOIN so funds without sector/stage tags can still match. Include synonym terms with OR for better recall.
 Return ONLY the SQL query, no explanations or markdown.""",
                ),
                ("user", "{question}"),
            ]
        )
-    def process_query(
+    def _get_cache_key(self, question: str) -> str:
        """Generate cache key from normalized question."""
        return hashlib.md5(question.lower().strip().encode()).hexdigest()
    async def process_query(
        self, question: str, project_id: Optional[int] = None
    ) -> PaginatedResponse[InvestmentResponse]:
-        """Process a query using the LLM and return investment response data.
+        """Async wrapper for process_query. Runs blocking work in a thread to avoid
-
+        blocking the event loop.
        Args:
            question: The natural language query to process
            project_id: Optional project ID for compatibility scoring
        """
-        # Let the LLM handle all database interactions and filtering to get fund IDs
+        return await asyncio.to_thread(self._process_query_sync, question, project_id)
        response = self.agent.invoke(
            {"messages": [("user", question)]},
            config={"recursion_limit": 50},
        )
-        # Extract the actual message content
+    def _process_query_sync(
-        logger.info(f"{response}")
+        self, question: str, project_id: Optional[int] = None
-        final_message_content = response["messages"][-1].content
+    ) -> PaginatedResponse[InvestmentResponse]:
-        logger.info(f"AI Response: \n{final_message_content}")
+        """Synchronous implementation of process_query. This is run in a thread by
-        # Extract fund IDs from the AI response
+        the async wrapper above.
-        fund_ids = self._extract_fund_ids_from_response(final_message_content)
+        """
        cache_key = self._get_cache_key(question)
-        # Fetch full fund data with investor relationships using the IDs
+        # Check cache first
-        return self._fetch_funds_by_ids(fund_ids, project_id)
+        if cache_key in self.query_cache:
            sql_query = self.query_cache[cache_key]
            logger.info(f"Using cached SQL: {sql_query}")
        else:
            # Generate SQL query
            messages = self.sql_prompt.format_messages(question=question)
            response = self.llm.invoke(messages)
            sql_query = response.content.strip()
-    def _extract_fund_ids_from_response(self, ai_response: str) -> List[int]:
+            # Clean up SQL (remove markdown code blocks if present)
-        """Extract fund IDs from AI response."""
+            sql_query = sql_query.replace("```sql", "").replace("```", "").strip()
        import re
-        fund_ids = []
+            # Cache the query
            self.query_cache[cache_key] = sql_query
            logger.info(f"Generated SQL: {sql_query}")
        # Execute query to get fund IDs
        db_session = next(get_db())
        try:
-            # Try multiple patterns to extract IDs from the response
+            result = db_session.execute(text(sql_query))
-            # Pattern 1: Simple numbers (assuming they are IDs)
+            fund_ids = [row[0] for row in result.fetchall()]
-            numbers = re.findall(r"\b\d+\b", ai_response)
+            logger.info(
-            fund_ids = [int(num) for num in numbers]
+                f"Found {len(fund_ids)} fund IDs: {fund_ids[:10]}{'...' if len(fund_ids) > 10 else ''}"
-
+            )
            # Pattern 2: If response contains explicit ID references
            id_matches = re.findall(r"\bid[:\s]*(\d+)", ai_response.lower())
            if id_matches:
                fund_ids = [int(id_str) for id_str in id_matches]
            return self._fetch_funds_by_ids(fund_ids, project_id)
        except Exception as e:
-            print(f"Error extracting IDs from response: {e}")
+            logger.error(f"SQL execution error: {e}")
-            return []
+            logger.error(f"Failed SQL: {sql_query}")
-
+            # Return empty result
-        return fund_ids
+            return PaginatedResponse(
                items=[], total=0, page=1, page_size=10, total_pages=0
            )
        finally:
            db_session.close()
    def _fetch_funds_by_ids(
        self, fund_ids: List[int], project_id: Optional[int] = None
@@ -185,10 +258,10 @@ class QueryProcessor:
                    else None
                )
-                # Get top 3 sectors from fund (id and name only)
+                # Take up to the first 3 fund sectors (id and name only), then sort those alphabetically
                fund_sectors = [
                    SectorMinimal(id=sector.id, name=sector.name)
-                    for sector in (fund.sectors[:3] if fund.sectors else [])
+                    for sector in sorted(fund.sectors[:3] if fund.sectors else [], key=lambda s: s.name)
                ]
                investment_response = InvestmentResponse(
@@ -1,9 +1,13 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 # Import database models and compatibility score service
 from db.models import InvestorTable, ProjectTable
 from jinja2 import Environment, FileSystemLoader
 from playwright.async_api import async_playwright
 from services.compatibility_score import calculate_project_investor_compatibility
 class ReportGenerator:
    """Service for generating PDF reports from HTML templates"""
@@ -17,6 +21,8 @@ class ReportGenerator:
        self,
        investor_data: Dict[str, Any],
        project_data: Optional[Dict[str, Any]] = None,
        investor_model: Optional[InvestorTable] = None,
        project_model: Optional[ProjectTable] = None,
    ) -> bytes:
        """
        Generate a PDF report for an investor profile.
@@ -24,12 +30,16 @@ class ReportGenerator:
        Args:
            investor_data: Dictionary containing investor information
            project_data: Optional dictionary containing project information for compatibility analysis
            investor_model: Optional database model for investor (used for compatibility scoring)
            project_model: Optional database model for project (used for compatibility scoring)
        Returns:
            bytes: PDF file content
        """
        # Prepare template context
-        context = self._prepare_context(investor_data, project_data)
+        context = self._prepare_context(
            investor_data, project_data, investor_model, project_model
        )
        # Render HTML from template
        template = self.env.get_template("report.html")
@@ -43,6 +53,8 @@ class ReportGenerator:
        self,
        investor_data: Dict[str, Any],
        project_data: Optional[Dict[str, Any]] = None,
        investor_model: Optional[InvestorTable] = None,
        project_model: Optional[ProjectTable] = None,
    ) -> Dict[str, Any]:
        """Prepare the context dictionary for template rendering"""
        context = {
@@ -55,9 +67,20 @@ class ReportGenerator:
        # If project data is provided, calculate compatibility
        if project_data:
-            context["compatibility_score"] = self._calculate_compatibility_score(
+            # Use the compatibility_score service if models are provided
-                investor_data, project_data
+            if investor_model and project_model:
-            )
+                # Calculate using the standardized compatibility score service
                # Returns score between 0 and 1, convert to percentage (0-100)
                score_decimal = calculate_project_investor_compatibility(
                    project=project_model, investor=investor_model, use_funds=True
                )
                context["compatibility_score"] = int(score_decimal * 100)
            else:
                # Fallback to old calculation method if models not provided
                context["compatibility_score"] = self._calculate_compatibility_score(
                    investor_data, project_data
                )
            context["match_criteria"] = self._generate_match_criteria(
                investor_data, project_data
            )
@@ -76,43 +99,75 @@ class ReportGenerator:
            "sector": 30,
            "stage": 30,
            "geography": 20,
-            "check_size": 15,
+            "check_size": 20,
            "thesis": 5,
        }
        # Aggregate data from all funds
        all_sectors = set(investor_data.get("sectors", []))
        all_stages = set()
        all_geographies = []
        check_ranges = []
        for fund in investor_data.get("funds", []):
            all_sectors.update(fund.get("sectors", []))
            all_stages.update(fund.get("investment_stages", []))
            if fund.get("geographic_focus"):
                all_geographies.append(fund["geographic_focus"])
            if fund.get("check_size_lower") and fund.get("check_size_upper"):
                check_ranges.append(
                    {
                        "lower": fund["check_size_lower"],
                        "upper": fund["check_size_upper"],
                    }
                )
        # Sector match
        investor_sectors = set(investor_data.get("sectors", []))
        project_sectors = set(project_data.get("sectors", []))
-        if investor_sectors and project_sectors:
+        if all_sectors and project_sectors:
-            if investor_sectors & project_sectors:
+            if all_sectors & project_sectors:
                score += weights["sector"]
-        # Stage match
+        # Stage match - case insensitive comparison
        investor_stages = set(investor_data.get("investment_stages", []))
        project_stage = project_data.get("stage")
-        if project_stage and project_stage in investor_stages:
+        if project_stage and all_stages:
-            score += weights["stage"]
+            # Normalize stage names for comparison (case-insensitive)
            normalized_stages = {
                stage.lower().replace("_", " ") for stage in all_stages
            }
            project_stage_normalized = project_stage.lower().replace("_", " ")
            if project_stage_normalized in normalized_stages:
                score += weights["stage"]
-        # Geography match
+        # Geography match - check if any fund matches
        investor_geo = (investor_data.get("geographic_focus") or "").lower()
        project_geo = (project_data.get("location") or "").lower()
-        if investor_geo and project_geo and investor_geo in project_geo:
+        geo_match = False
        if all_geographies:
            for geo in all_geographies:
                if geo:
                    geo_lower = geo.lower()
                    # Match if investor geography is "global" or if there's a location overlap
                    if "global" in geo_lower or "worldwide" in geo_lower:
                        geo_match = True
                        break
                    if project_geo and (
                        geo_lower in project_geo or project_geo in geo_lower
                    ):
                        geo_match = True
                        break
        if geo_match:
            score += weights["geography"]
-        # Check size match
+        # Check size match - check if any fund's range matches
        project_valuation = project_data.get("valuation", 0)
-        check_lower = investor_data.get("check_size_lower") or 0
+        check_match = False
-        check_upper = investor_data.get("check_size_upper") or float("inf")
+        if project_valuation and check_ranges:
-        if (
+            for check_range in check_ranges:
-            check_lower
+                if check_range["lower"] <= project_valuation <= check_range["upper"]:
-            and check_upper
+                    check_match = True
-            and check_lower <= project_valuation <= check_upper
+                    break
-        ):
+        if check_match:
            score += weights["check_size"]
        # Thesis alignment (simplified)
        score += weights["thesis"]
        return min(score, 100)
    def _generate_match_criteria(
@@ -121,86 +176,124 @@ class ReportGenerator:
        """Generate detailed match criteria table"""
        criteria = []
        # Aggregate data from all funds
        all_sectors = set(investor_data.get("sectors", []))
        all_stages = set()
        all_geographies = []
        check_ranges = []
        for fund in investor_data.get("funds", []):
            all_sectors.update(fund.get("sectors", []))
            all_stages.update(fund.get("investment_stages", []))
            if fund.get("geographic_focus"):
                all_geographies.append(fund["geographic_focus"])
            if fund.get("check_size_lower") and fund.get("check_size_upper"):
                check_ranges.append(
                    {
                        "lower": fund["check_size_lower"],
                        "upper": fund["check_size_upper"],
                        "fund_name": fund.get("fund_name", "Unnamed Fund"),
                    }
                )
        # Sector criterion
        investor_sectors = investor_data.get("sectors", [])
        project_sectors = project_data.get("sectors", [])
-        sector_match = (
+        sector_match = "Perfect" if all_sectors & set(project_sectors) else "Mismatch"
            "Perfect" if set(investor_sectors) & set(project_sectors) else "Mismatch"
        )
        criteria.append(
            {
                "name": "Sector",
-                "requirement": "Cybersecurity, B2B SaaS" if project_sectors else "N/A",
+                "requirement": ", ".join(project_sectors) if project_sectors else "N/A",
-                "evidence": ", ".join(investor_sectors[:3])
+                "evidence": ", ".join(list(all_sectors)[:3]) if all_sectors else "N/A",
                if investor_sectors
                else "N/A",
                "match": sector_match,
                "weight": "30%",
            }
        )
-        # Stage criterion
+        # Stage criterion - case insensitive comparison
        investor_stages = investor_data.get("investment_stages", [])
        project_stage = project_data.get("stage", "N/A")
-        stage_match = "Perfect" if project_stage in investor_stages else "Mismatch"
+        stage_match = "Mismatch"
        if project_stage != "N/A" and all_stages:
            # Normalize stage names for comparison
            normalized_stages = {
                stage.lower().replace("_", " ") for stage in all_stages
            }
            project_stage_normalized = project_stage.lower().replace("_", " ")
            stage_match = (
                "Perfect"
                if project_stage_normalized in normalized_stages
                else "Mismatch"
            )
        elif project_stage == "N/A":
            stage_match = "N/A"
        criteria.append(
            {
                "name": "Stage",
                "requirement": str(project_stage),
-                "evidence": ", ".join(investor_stages) if investor_stages else "N/A",
+                "evidence": ", ".join(all_stages) if all_stages else "N/A",
                "match": stage_match,
                "weight": "30%",
            }
        )
        # Geography criterion
        investor_geo = investor_data.get("geographic_focus") or "N/A"
        project_geo = project_data.get("location") or "N/A"
        investor_geo_display = ", ".join(all_geographies) if all_geographies else "N/A"
        # Safe comparison handling None values and "Global" matches
        geo_match = "Mismatch"
        if project_geo != "N/A" and all_geographies:
            for geo in all_geographies:
                if geo:
                    geo_lower = geo.lower()
                    # Match if investor geography is "global" or if there's a location overlap
                    if "global" in geo_lower or "worldwide" in geo_lower:
                        geo_match = "Perfect"
                        break
                    if (
                        geo_lower in project_geo.lower()
                        or project_geo.lower() in geo_lower
                    ):
                        geo_match = "Strong"
                        break
        elif not all_geographies and project_geo == "N/A":
            geo_match = "N/A"
        # Safe comparison handling None values
        if investor_geo == "N/A" or project_geo == "N/A":
            geo_match = (
                "N/A" if investor_geo == "N/A" and project_geo == "N/A" else "Mismatch"
            )
        else:
            investor_geo_lower = investor_geo.lower()
            project_geo_lower = project_geo.lower()
            geo_match = (
                "Strong"
                if investor_geo_lower in project_geo_lower
                or project_geo_lower in investor_geo_lower
                else "Mismatch"
            )
        criteria.append(
            {
                "name": "Geography",
                "requirement": project_geo,
-                "evidence": investor_geo,
+                "evidence": investor_geo_display,
                "match": geo_match,
                "weight": "20%",
            }
        )
        # Check Size criterion
        check_lower = investor_data.get("check_size_lower") or 0
        check_upper = investor_data.get("check_size_upper") or 0
        project_val = project_data.get("valuation", 0)
        # Build evidence string from all fund ranges
        check_evidence = "N/A"
-        if check_lower and check_upper:
+        if check_ranges:
-            check_evidence = (
+            evidence_parts = []
-                f"€{check_lower / 1000000:.0f}M - €{check_upper / 1000000:.0f}M"
+            for cr in check_ranges[:3]:  # Show up to 3 funds
-            )
+                range_str = (
-        elif check_lower:
+                    f"€{cr['lower'] / 1000000:.0f}M - €{cr['upper'] / 1000000:.0f}M"
-            check_evidence = f"€{check_lower / 1000000:.0f}M+"
+                )
                if cr["fund_name"]:
                    evidence_parts.append(f"{cr['fund_name']}: {range_str}")
                else:
                    evidence_parts.append(range_str)
            check_evidence = "; ".join(evidence_parts)
        # Check if project valuation matches any fund
        check_match = "N/A"
        if project_val > 0 and check_ranges:
            match_found = any(
                cr["lower"] <= project_val <= cr["upper"] for cr in check_ranges
            )
            check_match = "Perfect" if match_found else "Mismatch"
        check_match = (
            "Perfect"
            if check_lower and check_upper and check_lower <= project_val <= check_upper
            else "Strong"
            if project_val > 0
            else "N/A"
        )
        criteria.append(
            {
                "name": "Check Size",
@@ -209,19 +302,7 @@ class ReportGenerator:
                else "N/A",
                "evidence": check_evidence,
                "match": check_match,
-                "weight": "15%",
+                "weight": "20%",
            }
        )
        # Thesis criterion
        thesis = investor_data.get("investment_thesis", [])
        criteria.append(
            {
                "name": "Thesis",
                "requirement": "Founder-led, ESG focus",
                "evidence": ", ".join(thesis[:2]) if thesis else "Entrepreneur-led",
                "match": "Strong",
                "weight": "5%",
            }
        )
@@ -161,13 +161,6 @@
                                </p>
                            </div>
                            <div>
                                <p class="text-xs text-gray-600">DACH Region:</p>
                                <p class="font-semibold text-gray-900">
                                    {{ investor.geographic_focus or 'N/A' }}
                                </p>
                            </div>
                            <div>
                                <p class="text-xs text-gray-600">AUM (EUR million):</p>
                                <p class="font-semibold text-gray-900">
@@ -179,33 +172,47 @@
                                </p>
                            </div>
-                    <div class="mb-4">
+                            <div>
-                        <p class="text-xs text-gray-600 mb-1">
+                                <p class="text-xs text-gray-600 mb-1">Number of Funds:</p>
-                            Investment Stage:
+                                <p class="font-semibold text-gray-900">
-                        </p>
+                                    {{ investor.funds | length if investor.funds else 'N/A' }}
                        <p class="text-sm font-semibold text-gray-900">
                            {% if investor.investment_stages %} {{
                            investor.investment_stages | join(', ') }} {% else
                            %} N/A {% endif %}
                        </p>
                    </div>
                    <div class="mb-4">
                        <p class="text-xs text-gray-600 mb-1">
                            Est. Investment Size:
                        </p>
                        <p class="text-sm font-semibold text-gray-900">
                            {% if investor.check_size_lower and
                            investor.check_size_upper %} €{{
                            '{:,.0f}'.format(investor.check_size_lower /
                            1000000) }}M - €{{
                            '{:,.0f}'.format(investor.check_size_upper /
                            1000000) }}M {% elif investor.check_size_lower %}
                            €{{ '{:,.0f}'.format(investor.check_size_lower /
                            1000000) }}M+ {% else %} N/A {% endif %}
                                </p>
                            </div>
                        </div>
                        <div class="mt-4">
                            <h3 class="text-xs font-bold text-gray-900 uppercase mb-2">
                                Fund Details
                            </h3>
                            {% if investor.funds %}
                            {% for fund in investor.funds %}
                            <div class="mb-3 pb-3 border-b border-gray-200">
                                <p class="text-sm font-semibold text-gray-900 mb-1">
                                    {{ fund.fund_name or 'Fund ' + loop.index|string }}
                                </p>
                                <div class="text-xs text-gray-700 space-y-1">
                                    {% if fund.fund_size %}
                                    <p>Fund Size: €{{ '{:,.0f}'.format(fund.fund_size / 1000000) }}M</p>
                                    {% endif %}
                                    {% if fund.check_size_lower and fund.check_size_upper %}
                                    <p>Check Size: €{{ '{:,.0f}'.format(fund.check_size_lower / 1000000) }}M - €{{ '{:,.0f}'.format(fund.check_size_upper / 1000000) }}M</p>
                                    {% endif %}
                                    {% if fund.geographic_focus %}
                                    <p>Geography: {{ fund.geographic_focus }}</p>
                                    {% endif %}
                                    {% if fund.investment_stages %}
                                    <p>Stages: {{ fund.investment_stages | join(', ') }}</p>
                                    {% endif %}
                                    {% if fund.sectors %}
                                    <p>Sectors: {{ fund.sectors[:3] | join(', ') }}</p>
                                    {% endif %}
                                </div>
                            </div>
                            {% endfor %}
                            {% else %}
                            <p class="text-xs text-gray-500">No fund information available</p>
                            {% endif %}
                        </div>
                    </div>
                </div>
@@ -0,0 +1,117 @@
 """
 Migration: Add fields from feedback fixes
 Date: 2025-01-07
 Adds the following fields:
 - projects.is_archived (INTEGER, default 0)
 - companies.product_service (TEXT, nullable)
 - companies.clients (TEXT, nullable - stored as JSON string)
 - investor_members.linkedin (VARCHAR, nullable)
 """
 import sys
 from pathlib import Path
 # Add parent directory to path to import app modules
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from sqlalchemy import text
 from app.db.db import engine
 def check_column_exists(conn, table_name, column_name):
    """Report whether ``table_name`` already has a column named ``column_name``.

    Runs ``PRAGMA table_info(<table_name>)`` on ``conn`` and compares
    ``column_name`` against the second field (index 1) of every returned row.
    ``table_name`` is interpolated directly into the statement, so it must
    come from trusted code and never from user input.
    """
    pragma = text(f"PRAGMA table_info({table_name})")
    existing = {info[1] for info in conn.execute(pragma)}
    return column_name in existing
 def upgrade():
    """Add the feedback-fix columns, skipping any that are already present.

    Every addition runs inside a single ``engine.begin()`` block. Each step
    first asks ``check_column_exists`` whether the column is there, which is
    what makes re-running the migration safe.
    """
    # (table, column, ADD COLUMN statement, follow-up statements run afterwards)
    additions = (
        (
            "projects",
            "is_archived",
            "ALTER TABLE projects ADD COLUMN is_archived INTEGER DEFAULT 0 NOT NULL",
            # Set default value for existing rows
            ("UPDATE projects SET is_archived = 0 WHERE is_archived IS NULL",),
        ),
        (
            "companies",
            "product_service",
            "ALTER TABLE companies ADD COLUMN product_service TEXT",
            (),
        ),
        (
            "companies",
            "clients",
            "ALTER TABLE companies ADD COLUMN clients TEXT",
            (),
        ),
        (
            "investor_members",
            "linkedin",
            "ALTER TABLE investor_members ADD COLUMN linkedin VARCHAR",
            (),
        ),
    )
    divider = "=" * 60

    print("Running migration: Add feedback fixes fields")
    print(divider)
    with engine.begin() as conn:  # Use begin() for transaction management
        for step, (table, column, add_sql, follow_ups) in enumerate(additions, start=1):
            print(f"\n{step}. Adding '{column}' column to {table} table...")
            if check_column_exists(conn, table, column):
                print(f"   ✓ Column '{column}' already exists. Skipping.")
                continue
            conn.execute(text(add_sql))
            for statement in follow_ups:
                conn.execute(text(statement))
            print(f"   ✓ Successfully added '{column}' column to {table} table")
    print("\n" + divider)
    print("Migration completed successfully!")
 def downgrade():
    """Print manual instructions for undoing this migration; executes no SQL.

    NOTE(review): the printed text says SQLite cannot drop columns. SQLite
    3.35.0 and later accept ``ALTER TABLE ... DROP COLUMN``, so the wording
    may be outdated for the deployed SQLite version - confirm before relying
    on it.
    """
    report = (
        "Running downgrade: Remove feedback fixes fields",
        "=" * 60,
        "\nWarning: SQLite doesn't support DROP COLUMN directly.",
        "To remove these columns, you would need to:",
        "1. Create new tables without the columns",
        "2. Copy data from old tables to new tables",
        "3. Drop old tables and rename new tables",
        "\nColumns to remove:",
        "  - projects.is_archived",
        "  - companies.product_service",
        "  - companies.clients",
        "  - investor_members.linkedin",
    )
    for line in report:
        print(line)
 if __name__ == "__main__":
    import argparse

    # Optional positional argument; running the script with no arguments upgrades.
    cli = argparse.ArgumentParser(description="Run database migration")
    cli.add_argument(
        "direction",
        nargs="?",
        default="upgrade",
        choices=["upgrade", "downgrade"],
        help="Migration direction (default: upgrade)",
    )
    requested = cli.parse_args().direction
    action = upgrade if requested == "upgrade" else downgrade
    action()
@@ -0,0 +1,67 @@
 """
 Migration: Add industry column to projects table
 Date: 2025-10-23
 """
 import os
 import sys
 from pathlib import Path
 # Add parent directory to path to import app modules
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from sqlalchemy import create_engine, text
 from app.db.db import DATABASE_URL, engine
 def upgrade():
    """Add the ``industry`` column to ``projects`` unless it already exists.

    The existing columns are read with ``PRAGMA table_info(projects)``; when
    ``industry`` is already listed the function returns without changing
    anything, so the migration can be re-run safely.
    """
    print("Running migration: Add industry column to projects table")
    # engine.begin() commits on a clean exit and rolls back on an exception,
    # so no explicit commit call is needed. This matches the transaction
    # handling used by the other migration script and does not depend on the
    # connection object exposing a commit() method.
    with engine.begin() as conn:
        # Check if column already exists (field 1 of each row is the name)
        table_info = conn.execute(text("PRAGMA table_info(projects)"))
        columns = {row[1] for row in table_info}
        if "industry" in columns:
            print("Column 'industry' already exists in projects table. Skipping migration.")
            return
        # Add the industry column
        conn.execute(text("ALTER TABLE projects ADD COLUMN industry VARCHAR"))
    # Reported only after the transaction block has exited without error.
    print("Successfully added 'industry' column to projects table")
 def downgrade():
    """Print manual steps for removing ``projects.industry``; executes no SQL.

    NOTE(review): the message states that SQLite cannot drop columns. SQLite
    3.35.0 and later accept ``ALTER TABLE ... DROP COLUMN``, so this may be
    outdated for the deployed SQLite version - confirm.
    """
    instructions = [
        "Running downgrade: Remove industry column from projects table",
        "Warning: SQLite doesn't support DROP COLUMN.",
        "To remove the column, you would need to:",
        "1. Create a new table without the industry column",
        "2. Copy data from old table to new table",
        "3. Drop old table and rename new table",
    ]
    for message in instructions:
        print(message)
 if __name__ == "__main__":
    import argparse

    # Optional positional argument; omitting it runs the upgrade.
    cli = argparse.ArgumentParser(description="Run database migration")
    cli.add_argument(
        "direction",
        nargs="?",
        default="upgrade",
        choices=["upgrade", "downgrade"],
        help="Migration direction (default: upgrade)",
    )
    chosen = cli.parse_args().direction
    if chosen != "upgrade":
        downgrade()
    else:
        upgrade()
@@ -0,0 +1,68 @@
 #!/bin/bash
 # Server management script for app/main.py
 # Usage: ./server_manager.sh start|stop|restart
 # Both paths are relative, so they resolve against the directory the script is run from.
 # PID_FILE holds the PID written by start(); LOG_FILE receives the server's stdout and stderr.
 PID_FILE="server.pid"
 LOG_FILE="server.log"
 start() {
    # Launch the server in the background and record its PID in $PID_FILE.
    # Returns 1 without starting anything when the recorded PID is still alive.
    local existing_pid new_pid

    if [ -f "$PID_FILE" ]; then
        # Read the file once and quote the value so an empty or malformed
        # PID file cannot turn into a stray argument list for kill.
        existing_pid=$(cat "$PID_FILE")
        if [ -n "$existing_pid" ] && kill -0 "$existing_pid" 2>/dev/null; then
            echo "Server is already running (PID: $existing_pid)"
            return 1
        fi
    fi

    echo "Starting server..."
    nohup uv run app/main.py > "$LOG_FILE" 2>&1 &
    # $! is the PID of the background job that was just launched.
    new_pid=$!
    echo "$new_pid" > "$PID_FILE"
    echo "Server started (PID: $new_pid)"
 }
 stop() {
    # Stop the process recorded in $PID_FILE: send a plain kill first, poll
    # for up to ten one-second intervals, then fall back to kill -9.
    # Returns 1 when there is no PID file or the recorded PID is not alive.
    if [ ! -f "$PID_FILE" ]; then
        echo "Server is not running (no PID file found)"
        return 1
    fi

    PID=$(cat "$PID_FILE")
    if ! kill -0 "$PID" 2>/dev/null; then
        # Stale PID file: the process is gone, so only clean up.
        echo "Server is not running (PID $PID not found)"
        rm -f "$PID_FILE"
        return 1
    fi

    echo "Stopping server (PID: $PID)..."
    kill "$PID"

    # Wait for process to stop
    local waited=0
    while [ "$waited" -lt 10 ] && kill -0 "$PID" 2>/dev/null; do
        sleep 1
        waited=$((waited + 1))
    done

    # Still alive after the grace period: force it.
    if kill -0 "$PID" 2>/dev/null; then
        echo "Force killing server..."
        kill -9 "$PID"
    fi

    rm -f "$PID_FILE"
    echo "Server stopped"
 }
 # Stop the server, pause two seconds, then start it again.
 # start runs regardless of stop's return status, so restart also works
 # when the server was not running.
 restart() {
    stop
    sleep 2
    start
 }
 # Dispatch on the first command-line argument.
 case "$1" in
    start|stop|restart)
        # Each accepted sub-command has the same name as the function that implements it.
        "$1"
        ;;
    *)
        echo "Usage: $0 {start|stop|restart}"
        exit 1
        ;;
 esac
@@ -0,0 +1,310 @@
 #!/usr/bin/env python3
 """
 Update Investor Members LinkedIn Profiles Script
 This script finds and updates LinkedIn profile URLs for investor members in the database.
 Uses crawl4ai to efficiently scrape team pages and extract LinkedIn URLs.
 Usage:
    python update_linkedin_profiles.py [--test] [--limit N] [--skip-existing]
                                       [--start-from N] [--rate-limit SECONDS]
 Options:
    --test               Test mode: don't update the database; processes only 10
                         records unless --limit is given
    --limit N            Process only N records (default: all)
    --skip-existing      Skip members that already have LinkedIn URLs
    --start-from N       Start from record N (for resuming)
    --rate-limit SECONDS Delay between URL crawls in seconds (default: 0.5)
 """
 import argparse
 import asyncio
 import json
 import os
 import sys
 from datetime import datetime
 # Add app to path
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "app"))
 from db.db import get_db_session
 from db.models import InvestorMember, InvestorTable
 from linkedin_scraper import LinkedInProfileScraper, format_linkedin_url
 def progress_callback(current, total, result):
    """Print a one-line progress update for a processed member.

    Args:
        current: Number of members processed so far.
        total: Total number of members in the batch. A value of 0 is reported
            as 0.0% instead of raising ZeroDivisionError.
        result: Mapping with ``member_name`` and ``linkedin_url``; when
            ``linkedin_url`` is truthy it must also provide ``confidence``
            and ``method``, which are printed on a second line.
    """
    percent = (current / total) * 100 if total else 0.0
    linkedin_url = result["linkedin_url"]
    status = "✓" if linkedin_url else "✗"
    print(f"[{current}/{total} - {percent:.1f}%] {status} {result['member_name']}")
    if linkedin_url:
        print(
            f"  → {linkedin_url} (confidence: {result['confidence']}%, method: {result['method']})"
        )
 def create_db_callback(test_mode=False):
    """Build a callback that saves one LinkedIn URL to the database per call.

    Saving each profile as soon as it is found allows a run to be stopped
    and resumed without losing progress.

    Args:
        test_mode: When true, the callback only prints what it would save
            and never opens a database session.

    Returns:
        A ``(db_callback, saved_count)`` tuple. ``saved_count`` is a dict
        whose ``"count"`` entry is incremented on every successful (or
        simulated) save.
    """
    saved_count = {"count": 0}  # Use dict to allow modification in closure

    def db_callback(member_id: int, linkedin_url: str) -> bool:
        """Save ``linkedin_url`` for ``member_id``; return True when stored."""
        if test_mode:
            print(f"  [TEST] Would save to DB: member {member_id}")
            saved_count["count"] += 1
            return True

        # Bound before the try so the cleanup paths can tell whether a
        # session was actually opened.
        db = None
        try:
            db = get_db_session()
            member = db.query(InvestorMember).filter_by(id=member_id).first()
            if not member:
                return False
            member.linkedin = format_linkedin_url(linkedin_url)
            db.commit()
            saved_count["count"] += 1
            return True
        except Exception as e:
            # Best-effort: one failed save must not abort the whole batch.
            print(f"  ⚠️  DB Error for member {member_id}: {e}")
            if db is not None:
                try:
                    db.rollback()
                except Exception as rollback_error:
                    print(
                        f"  ⚠️  Rollback failed for member {member_id}: {rollback_error}"
                    )
            return False
        finally:
            if db is not None:
                try:
                    db.close()
                except Exception as close_error:
                    print(
                        f"  ⚠️  Could not close DB session for member {member_id}: {close_error}"
                    )

    return db_callback, saved_count
 def update_database(members_data, test_mode=False):
    """Write the found LinkedIn URLs to the database with a single commit.

    Args:
        members_data: Iterable of dicts with ``linkedin_url`` and
            ``member_id`` keys; entries where either value is falsy are
            ignored.
        test_mode: When true, nothing is written; the would-be updates are
            printed instead.

    Returns:
        The number of records updated (or that would have been updated).

    Raises:
        Exception: Any error raised while processing is re-raised after the
            session has been rolled back.
    """
    db = get_db_session()
    try:
        updated_count = 0
        for entry in members_data:
            if not (entry["linkedin_url"] and entry["member_id"]):
                continue

            if test_mode:
                print(
                    f"  [TEST MODE] Would update member {entry['member_id']}: {entry['linkedin_url']}"
                )
                updated_count += 1
                continue

            lookup = db.query(InvestorMember).filter_by(id=entry["member_id"])
            member = lookup.first()
            if member:
                member.linkedin = format_linkedin_url(entry["linkedin_url"])
                updated_count += 1

        if test_mode:
            print(f"\n[TEST MODE] Would have updated {updated_count} records")
        else:
            db.commit()
            print(f"\n✓ Successfully updated {updated_count} records in database")
        return updated_count
    except Exception as e:
        db.rollback()
        print(f"\n✗ Error updating database: {e}")
        raise
    finally:
        db.close()
 def save_results(results, filename="linkedin_scraping_results.json"):
    """Dump the scraping results to ``filename`` as indented JSON.

    The file holds a timestamp, the number of processed entries, how many of
    them have a truthy ``linkedin_url``, and the raw ``results`` list.
    """
    with_profile = [entry for entry in results if entry["linkedin_url"]]
    payload = {
        "timestamp": datetime.now().isoformat(),
        "total_processed": len(results),
        "found_count": len(with_profile),
        "results": results,
    }
    with open(filename, "w") as handle:
        json.dump(payload, handle, indent=2)
    print(f"\n✓ Results saved to {filename}")
 def print_summary(results):
    """Print summary statistics for a batch of scraping results.

    Args:
        results: List of dicts with a ``linkedin_url`` key; entries with a
            truthy URL must also carry ``method`` and ``confidence``. An
            empty list is reported with 0.0% rates instead of raising
            ZeroDivisionError.
    """
    found_results = [r for r in results if r["linkedin_url"]]
    total = len(results)
    found = len(found_results)
    not_found = total - found

    def share(part, whole):
        """Return ``part`` as a percentage of ``whole`` (0.0 when ``whole`` is 0)."""
        return part / whole * 100 if whole else 0.0

    # Count by method
    methods = {}
    for r in found_results:
        methods[r["method"]] = methods.get(r["method"], 0) + 1

    # Average confidence for found profiles
    avg_confidence = (
        sum(r["confidence"] for r in found_results) / found if found > 0 else 0
    )

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total processed:        {total}")
    print(f"LinkedIn found:         {found} ({share(found, total):.1f}%)")
    print(f"Not found:              {not_found} ({share(not_found, total):.1f}%)")
    print(f"\nAverage confidence:     {avg_confidence:.1f}%")
    print("\nMethods used:")
    # Most frequent method first; ties keep first-seen order (stable sort).
    for method, count in sorted(methods.items(), key=lambda x: x[1], reverse=True):
        print(f"  {method:20s} {count:5d} ({share(count, found):.1f}%)")
    print("=" * 60)
 def main():
    """Command-line entry point: find LinkedIn URLs and store them as they are found.

    Steps:
        1. Parse the command-line options.
        2. Load the selected investor members (joined with their investor)
           from the database, then close that session.
        3. Run ``LinkedInProfileScraper.batch_find_profiles`` with a callback
           that saves each found profile immediately.
        4. Print a summary and write the results to a timestamped JSON file.

    Returns early, without scraping, when no members match the selection.
    """
    parser = argparse.ArgumentParser(
        description="Update LinkedIn profiles for investor members"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Test mode: process only 10 records without updating database",
    )
    parser.add_argument("--limit", type=int, help="Limit number of records to process")
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Skip members that already have LinkedIn URLs",
    )
    parser.add_argument(
        "--start-from",
        type=int,
        default=0,
        help="Start from record N (for resuming interrupted runs)",
    )
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=0.5,
        help="Delay between URL crawls in seconds (default: 0.5)",
    )
    args = parser.parse_args()

    # Test mode overrides limit
    if args.test and not args.limit:
        args.limit = 10

    print("=" * 60)
    print("LinkedIn Profile Scraper for Investor Members (crawl4ai)")
    print("=" * 60)
    if args.test:
        print("\n⚠️  TEST MODE - No database changes will be made")

    # Load the members to process; the session is closed before scraping starts.
    db = get_db_session()
    try:
        # Build query. The explicit ordering makes --start-from / --limit
        # select a reproducible window: OFFSET/LIMIT without ORDER BY leaves
        # the row order up to the database.
        query = (
            db.query(InvestorMember, InvestorTable)
            .join(InvestorTable, InvestorMember.investor_id == InvestorTable.id)
            .order_by(InvestorMember.id)
        )

        # Filter existing if requested
        if args.skip_existing:
            query = query.filter(
                (InvestorMember.linkedin.is_(None)) | (InvestorMember.linkedin == "")
            )
            print("\n✓ Filtering to members without LinkedIn profiles")

        # Get total count
        total_available = query.count()
        print(f"\n✓ Found {total_available} members to process")

        # Apply offset and limit
        if args.start_from > 0:
            query = query.offset(args.start_from)
            print(f"✓ Starting from record {args.start_from}")
        if args.limit:
            query = query.limit(args.limit)
            print(f"✓ Processing {args.limit} records")

        # Fetch members as plain dicts so they stay usable after the session closes
        members_data = [
            {
                "id": member.id,
                "name": member.name,
                "company": investor.name,
                "role": member.role,
                "source_url": member.source_url,
            }
            for member, investor in query.all()
        ]
        if not members_data:
            print("\n⚠️  No members to process")
            return

        # Count unique source URLs
        source_urls = [m["source_url"] for m in members_data if m["source_url"]]
        unique_urls = len(set(source_urls))
        with_urls = len(source_urls)
        print(f"\n✓ Loaded {len(members_data)} members")
        print(
            f"✓ {with_urls} members have source URLs ({unique_urls} unique pages to crawl)"
        )
        print(f"✓ {len(members_data) - with_urls} members without source URLs")
        print(f"✓ Rate limit: {args.rate_limit}s between page crawls")
        print("\nStarting LinkedIn profile search using crawl4ai...\n")
    finally:
        db.close()

    # Initialize scraper
    scraper = LinkedInProfileScraper(rate_limit_delay=args.rate_limit, use_cache=True)
    print("ℹ️  Using crawl4ai to scrape team pages and extract LinkedIn URLs")
    print(
        "ℹ️  Profiles are saved to database IMMEDIATELY when found - safe to stop anytime!\n"
    )

    # Create database callback for real-time saving
    db_callback, saved_count = create_db_callback(test_mode=args.test)

    # Process members asynchronously with real-time DB saving
    results = asyncio.run(
        scraper.batch_find_profiles(
            members_data, progress_callback=progress_callback, db_callback=db_callback
        )
    )

    # Print summary
    print_summary(results)

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"linkedin_results_{timestamp}.json"
    save_results(results, results_file)

    # Show database update summary
    if not args.test:
        print(
            f"\n✓ Database updated in real-time: {saved_count['count']} profiles saved"
        )
    else:
        print(
            f"\n[TEST MODE] Would have saved {saved_count['count']} profiles to database"
        )
    print("\n✓ Done! You can resume anytime with --skip-existing")
 # Run the command-line interface only when this file is executed as a script.
 if __name__ == "__main__":
    main()
Author	SHA1	Message	Date
bolade	25d83f24b7	completed investors linkedin	2025-11-28 07:19:58 +01:00
michael	3bc8a24c0c	feat: Add LinkedIn URL support for investor synchronization and update schemas	2025-11-28 06:18:04 +00:00
bolade	495f8a0ff6	added linkedin profiles	2025-11-27 16:44:22 +01:00
michael	100e0b2b0c	made improvements	2025-11-26 08:04:11 +00:00
bolade	b92feaa13a	refactor: Clean up migration script and improve readability by removing unnecessary imports and formatting	2025-11-11 20:28:20 +01:00
bolade	215fec2895	made corrections based on feedback	2025-11-11 20:27:55 +01:00
bolade	5e83734acf	feat: Enhance data models and sorting logic for investors and projects	2025-11-11 13:10:28 +01:00
michael	0e4763bf4f	updated db	2025-11-11 12:07:01 +00:00
michael	8a25e892ad	Merge branch 'version_three' of http://23.29.118.76:3000/bolade/Anton_wireframe into version_three	2025-10-28 23:31:13 +00:00
bolade	6b9fd86ab7	refactor: Improve report generation logic and adjust scoring weights	2025-10-29 00:27:39 +01:00
michael	db2addb835	Merge branch 'version_three' of http://23.29.118.76:3000/bolade/Anton_wireframe into version_three	2025-10-28 22:16:29 +00:00
michael	7048847a42	db update	2025-10-28 22:16:06 +00:00
bolade	45e1f099b8	fixed insight	2025-10-28 23:14:57 +01:00
bolade	e19c8f96eb	feat: Add server management script with start, stop, and restart functionality	2025-10-28 22:03:32 +01:00
bolade	3ab2592c22	Added logging to main	2025-10-28 21:34:35 +01:00
michael	f63672bdac	added db	2025-10-28 20:13:49 +00:00
michael	c53455cc06	feat: Enhance compatibility scoring and report generation with new methods and models	2025-10-28 20:13:45 +00:00
bolade	02c8bb816f	made querying async	2025-10-28 21:09:47 +01:00
bolade	bb03f6ade4	fixed querying	2025-10-28 20:54:15 +01:00
bolade	ff0010019e	feat: Implement company querying functionality with natural language processing and logging	2025-10-27 20:13:24 +01:00
michael	1ac755b2d7	feat: Add industry column to ProjectTable and update related schemas and query filters	2025-10-23 12:52:52 +00:00
bolade	483c2cc114	feat: Update investor report generation and HTML template to include fund details and improve data handling	2025-10-21 10:48:58 +01:00