Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 25d83f24b7 | |||
| 3bc8a24c0c | |||
| 495f8a0ff6 | |||
| 100e0b2b0c | |||
| b92feaa13a | |||
| 215fec2895 | |||
| 5e83734acf | |||
| 0e4763bf4f | |||
| 8a25e892ad | |||
| 6b9fd86ab7 | |||
| db2addb835 | |||
| 7048847a42 | |||
| 45e1f099b8 | |||
| e19c8f96eb | |||
| 3ab2592c22 | |||
| f63672bdac | |||
| c53455cc06 | |||
| 02c8bb816f | |||
| bb03f6ade4 | |||
| ff0010019e | |||
| 1ac755b2d7 | |||
| 483c2cc114 |
+5
-1
@@ -13,4 +13,8 @@
|
|||||||
|
|
||||||
*.cypython
|
*.cypython
|
||||||
|
|
||||||
nohup.out
|
nohup.out
|
||||||
|
|
||||||
|
server.log
|
||||||
|
|
||||||
|
server.pid
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,5 +1,4 @@
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
|
||||||
from typing import Annotated
|
from typing import Annotated
|
||||||
|
|
||||||
from fastapi import Depends
|
from fastapi import Depends
|
||||||
|
|||||||
@@ -162,6 +162,7 @@ class InvestorMember(Base, TimestampMixin):
|
|||||||
role = Column(String, nullable=True)
|
role = Column(String, nullable=True)
|
||||||
title = Column(String, nullable=True) # Alternative to role
|
title = Column(String, nullable=True) # Alternative to role
|
||||||
email = Column(String, nullable=True)
|
email = Column(String, nullable=True)
|
||||||
|
linkedin = Column(String, nullable=True) # LinkedIn profile URL
|
||||||
source_url = Column(String, nullable=True) # URL where member info was found
|
source_url = Column(String, nullable=True) # URL where member info was found
|
||||||
|
|
||||||
investor_id = Column(Integer, ForeignKey("investors.id"))
|
investor_id = Column(Integer, ForeignKey("investors.id"))
|
||||||
@@ -215,6 +216,8 @@ class CompanyTable(Base, TimestampMixin):
|
|||||||
description = Column(String, nullable=True)
|
description = Column(String, nullable=True)
|
||||||
founded_year = Column(Integer, nullable=True)
|
founded_year = Column(Integer, nullable=True)
|
||||||
website = Column(String, nullable=True)
|
website = Column(String, nullable=True)
|
||||||
|
product_service = Column(Text, nullable=True) # Product/service description
|
||||||
|
clients = Column(JSON, nullable=True) # List of client names or client information
|
||||||
|
|
||||||
members = relationship(
|
members = relationship(
|
||||||
"CompanyMember", back_populates="company", cascade="all, delete-orphan"
|
"CompanyMember", back_populates="company", cascade="all, delete-orphan"
|
||||||
@@ -296,9 +299,11 @@ class ProjectTable(Base, TimestampMixin):
|
|||||||
|
|
||||||
stage = Column(Enum(InvestmentStage), nullable=True)
|
stage = Column(Enum(InvestmentStage), nullable=True)
|
||||||
location = Column(String, nullable=True)
|
location = Column(String, nullable=True)
|
||||||
|
industry = Column(String, nullable=True)
|
||||||
description = Column(Text, nullable=True)
|
description = Column(Text, nullable=True)
|
||||||
start_date = Column(DateTime, nullable=True)
|
start_date = Column(DateTime, nullable=True)
|
||||||
end_date = Column(DateTime, nullable=True)
|
end_date = Column(DateTime, nullable=True)
|
||||||
|
is_archived = Column(Integer, default=0, nullable=False) # 0 = active, 1 = archived
|
||||||
|
|
||||||
sector = relationship(
|
sector = relationship(
|
||||||
"SectorTable", secondary=project_sector_association, back_populates="projects"
|
"SectorTable", secondary=project_sector_association, back_populates="projects"
|
||||||
|
|||||||
@@ -0,0 +1,730 @@
|
|||||||
|
"""
|
||||||
|
LinkedIn Profile Scraper for Investor Members
|
||||||
|
|
||||||
|
This module uses crawl4ai to scrape team pages and find LinkedIn profiles.
|
||||||
|
Strategies:
|
||||||
|
1. Crawl the source_url (team pages) to extract LinkedIn profile links
|
||||||
|
2. Use LLM-powered web search to find LinkedIn profiles by name
|
||||||
|
|
||||||
|
Key advantages of crawl4ai:
|
||||||
|
- Handles JavaScript-rendered pages
|
||||||
|
- Better at extracting content from modern websites
|
||||||
|
- More reliable than simple requests
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
from ddgs import DDGS
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from langchain_openai import ChatOpenAI
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
|
||||||
|
)
|
||||||
|
logger = logging.getLogger("linkedin_scraper")
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
|
||||||
|
|
||||||
|
|
||||||
|
class LinkedInProfileScraper:
|
||||||
|
"""
|
||||||
|
LinkedIn profile finder using crawl4ai and LLM-powered web search.
|
||||||
|
|
||||||
|
Strategies:
|
||||||
|
1. Crawl source URLs (team pages) to extract LinkedIn links
|
||||||
|
2. Use LLM-powered web search to find profiles by name
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
rate_limit_delay: float = 0.5,
|
||||||
|
use_cache: bool = True,
|
||||||
|
use_llm_search: bool = True,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize the scraper
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rate_limit_delay: Delay between requests in seconds
|
||||||
|
use_cache: Whether to cache crawled pages
|
||||||
|
use_llm_search: Whether to use LLM-powered web search as fallback
|
||||||
|
"""
|
||||||
|
self.rate_limit_delay = rate_limit_delay
|
||||||
|
self.use_cache = use_cache
|
||||||
|
self.use_llm_search = use_llm_search and OPENROUTER_API_KEY
|
||||||
|
self.page_cache: Dict[str, str] = {} # Cache crawled pages by URL
|
||||||
|
self.html_cache: Dict[str, str] = {} # Cache HTML separately
|
||||||
|
self.profile_cache: Dict[str, Dict] = {} # Cache results by member
|
||||||
|
|
||||||
|
# Initialize LLM agent if API key available
|
||||||
|
if self.use_llm_search:
|
||||||
|
self._init_llm_agent()
|
||||||
|
else:
|
||||||
|
self.llm = None
|
||||||
|
self.agent = None
|
||||||
|
self.ddg_search = None
|
||||||
|
logger.info("LLM search disabled (no OPENROUTER_API_KEY)")
|
||||||
|
|
||||||
|
def _init_llm_agent(self):
|
||||||
|
"""Initialize LLM agent for web search"""
|
||||||
|
try:
|
||||||
|
self.llm = ChatOpenAI(
|
||||||
|
api_key=OPENROUTER_API_KEY,
|
||||||
|
base_url="https://openrouter.ai/api/v1",
|
||||||
|
model="x-ai/grok-4.1-fast:free",
|
||||||
|
temperature=0,
|
||||||
|
)
|
||||||
|
self.ddg_search = DDGS()
|
||||||
|
logger.info("LLM search agent initialized")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to initialize LLM agent: {e}")
|
||||||
|
self.llm = None
|
||||||
|
self.ddg_search = None
|
||||||
|
|
||||||
|
def web_search(self, query: str) -> List[Dict]:
|
||||||
|
"""Tool to search the web using DuckDuckGo"""
|
||||||
|
if not self.ddg_search:
|
||||||
|
return []
|
||||||
|
try:
|
||||||
|
results = list(self.ddg_search.text(query, max_results=10))
|
||||||
|
return results
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Web search error: {e}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
async def crawl_page(self, url: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Crawl a webpage and return its content.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: URL to crawl
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Page content as markdown/text, or None if failed
|
||||||
|
"""
|
||||||
|
if not url:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Check cache first
|
||||||
|
if self.use_cache and url in self.page_cache:
|
||||||
|
logger.debug(f"Using cached page for {url}")
|
||||||
|
return self.page_cache[url]
|
||||||
|
|
||||||
|
try:
|
||||||
|
logger.info(f"Crawling: {url}")
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url)
|
||||||
|
|
||||||
|
if result and result.markdown:
|
||||||
|
content = result.markdown
|
||||||
|
# Also get HTML for better link extraction
|
||||||
|
html_content = result.html if hasattr(result, "html") else ""
|
||||||
|
|
||||||
|
# Cache the results
|
||||||
|
if self.use_cache:
|
||||||
|
self.page_cache[url] = content
|
||||||
|
self.html_cache[url] = html_content
|
||||||
|
|
||||||
|
return content
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error crawling {url}: {e}")
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def extract_linkedin_urls_from_content(self, content: str) -> List[Dict[str, str]]:
|
||||||
|
"""
|
||||||
|
Extract all LinkedIn profile URLs from content (HTML or markdown).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of dicts with 'url', 'context', and 'username'
|
||||||
|
"""
|
||||||
|
linkedin_links = []
|
||||||
|
|
||||||
|
# Pattern for LinkedIn profile URLs (handles country-specific domains)
|
||||||
|
linkedin_pattern = (
|
||||||
|
r"https?://(?:www\.)?(?:[a-z]{2,3}\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)/?"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Find all LinkedIn URLs
|
||||||
|
matches = list(re.finditer(linkedin_pattern, content, re.IGNORECASE))
|
||||||
|
|
||||||
|
for match in matches:
|
||||||
|
url = match.group(0).rstrip("/")
|
||||||
|
# Normalize URL
|
||||||
|
url = self._normalize_linkedin_url(url)
|
||||||
|
|
||||||
|
# Get surrounding context (200 chars before and after)
|
||||||
|
start = max(0, match.start() - 200)
|
||||||
|
end = min(len(content), match.end() + 200)
|
||||||
|
context = content[start:end]
|
||||||
|
|
||||||
|
# Clean up context (remove HTML tags for readability)
|
||||||
|
context = re.sub(r"<[^>]+>", " ", context)
|
||||||
|
context = " ".join(context.split()) # Normalize whitespace
|
||||||
|
|
||||||
|
linkedin_links.append(
|
||||||
|
{"url": url, "context": context, "username": match.group(1)}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Remove duplicates while preserving order
|
||||||
|
seen_urls = set()
|
||||||
|
unique_links = []
|
||||||
|
for link in linkedin_links:
|
||||||
|
if link["url"] not in seen_urls:
|
||||||
|
seen_urls.add(link["url"])
|
||||||
|
unique_links.append(link)
|
||||||
|
|
||||||
|
return unique_links
|
||||||
|
|
||||||
|
def _normalize_linkedin_url(self, url: str) -> str:
|
||||||
|
"""Normalize LinkedIn URL to standard format"""
|
||||||
|
# Remove trailing slashes
|
||||||
|
url = url.rstrip("/")
|
||||||
|
|
||||||
|
# Convert country-specific to www
|
||||||
|
url = re.sub(
|
||||||
|
r"https?://[a-z]{2,3}\.linkedin\.com", "https://www.linkedin.com", url
|
||||||
|
)
|
||||||
|
|
||||||
|
# Ensure https
|
||||||
|
if url.startswith("http://"):
|
||||||
|
url = url.replace("http://", "https://")
|
||||||
|
|
||||||
|
return url
|
||||||
|
|
||||||
|
def _name_matches_context(self, name: str, context: str) -> float:
|
||||||
|
"""
|
||||||
|
Check if a person's name appears in the context around a LinkedIn URL.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Confidence score 0-100
|
||||||
|
"""
|
||||||
|
if not name or not context:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
context_lower = context.lower()
|
||||||
|
name_lower = name.lower()
|
||||||
|
|
||||||
|
# Split name into parts (handle multiple spaces, titles like "Dr.", etc.)
|
||||||
|
name_parts = [p for p in name_lower.replace(".", " ").split() if len(p) > 1]
|
||||||
|
|
||||||
|
# Check for full name match
|
||||||
|
if name_lower in context_lower:
|
||||||
|
return 95
|
||||||
|
|
||||||
|
# Check for name parts in context
|
||||||
|
matches = sum(
|
||||||
|
1 for part in name_parts if part in context_lower and len(part) > 2
|
||||||
|
)
|
||||||
|
|
||||||
|
if len(name_parts) > 0:
|
||||||
|
if matches == len(name_parts):
|
||||||
|
return 90 # All name parts found
|
||||||
|
elif matches >= 2:
|
||||||
|
return 75 # At least 2 parts found (first + last typically)
|
||||||
|
elif matches == 1 and len(name_parts) <= 2:
|
||||||
|
return 50 # Only one part found but name is short
|
||||||
|
elif matches == 1:
|
||||||
|
return 35 # Only one part found
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def _name_matches_username(self, name: str, username: str) -> float:
|
||||||
|
"""
|
||||||
|
Check if LinkedIn username contains parts of the name.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Confidence score 0-100
|
||||||
|
"""
|
||||||
|
if not name or not username:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
name_lower = name.lower()
|
||||||
|
username_lower = username.lower().replace("-", " ").replace("_", " ")
|
||||||
|
|
||||||
|
name_parts = [p for p in name_lower.replace(".", " ").split() if len(p) > 2]
|
||||||
|
|
||||||
|
matches = sum(1 for part in name_parts if part in username_lower)
|
||||||
|
|
||||||
|
if len(name_parts) > 0:
|
||||||
|
if matches == len(name_parts) and len(name_parts) >= 2:
|
||||||
|
return 85 # Full name in username
|
||||||
|
elif matches >= 2:
|
||||||
|
return 70 # Multiple parts match
|
||||||
|
elif matches == 1:
|
||||||
|
return 35 # Only one part matches
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
async def find_linkedin_from_source(
|
||||||
|
self, name: str, source_url: str, role: Optional[str] = None
|
||||||
|
) -> Dict:
|
||||||
|
"""
|
||||||
|
Find LinkedIn profile by crawling the source URL (team page).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: Person's name
|
||||||
|
source_url: URL of the team/about page
|
||||||
|
role: Person's role (for additional context matching)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with linkedin_url, confidence, method, notes
|
||||||
|
"""
|
||||||
|
if not source_url:
|
||||||
|
return {
|
||||||
|
"linkedin_url": None,
|
||||||
|
"confidence": 0,
|
||||||
|
"method": "source_crawl",
|
||||||
|
"notes": "No source URL provided",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Crawl the page
|
||||||
|
content = await self.crawl_page(source_url)
|
||||||
|
|
||||||
|
if not content:
|
||||||
|
return {
|
||||||
|
"linkedin_url": None,
|
||||||
|
"confidence": 0,
|
||||||
|
"method": "source_crawl",
|
||||||
|
"notes": f"Failed to crawl {source_url}",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Get HTML for better link extraction
|
||||||
|
html = self.html_cache.get(source_url, content)
|
||||||
|
|
||||||
|
# Extract all LinkedIn URLs from both HTML and markdown
|
||||||
|
linkedin_links = self.extract_linkedin_urls_from_content(html)
|
||||||
|
if not linkedin_links:
|
||||||
|
linkedin_links = self.extract_linkedin_urls_from_content(content)
|
||||||
|
|
||||||
|
if not linkedin_links:
|
||||||
|
return {
|
||||||
|
"linkedin_url": None,
|
||||||
|
"confidence": 0,
|
||||||
|
"method": "source_crawl",
|
||||||
|
"notes": f"No LinkedIn URLs found on {source_url}",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Score each LinkedIn URL based on name matching
|
||||||
|
best_match = None
|
||||||
|
best_score = 0
|
||||||
|
|
||||||
|
for link in linkedin_links:
|
||||||
|
# Score based on context matching
|
||||||
|
context_score = self._name_matches_context(name, link["context"])
|
||||||
|
|
||||||
|
# Score based on username matching
|
||||||
|
username_score = self._name_matches_username(name, link["username"])
|
||||||
|
|
||||||
|
# Also check if role appears in context
|
||||||
|
role_bonus = 0
|
||||||
|
if role and role.lower() in link["context"].lower():
|
||||||
|
role_bonus = 10
|
||||||
|
|
||||||
|
# Combined score (take best of context or username, plus role bonus)
|
||||||
|
total_score = max(context_score, username_score) + role_bonus
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
f" {name} -> {link['url']}: context={context_score}, username={username_score}, role={role_bonus}, total={total_score}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if total_score > best_score:
|
||||||
|
best_score = total_score
|
||||||
|
best_match = link
|
||||||
|
|
||||||
|
if best_match and best_score >= 30: # Minimum threshold
|
||||||
|
return {
|
||||||
|
"linkedin_url": best_match["url"],
|
||||||
|
"confidence": min(best_score, 100),
|
||||||
|
"method": "source_crawl",
|
||||||
|
"notes": f"Found on {source_url}",
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"linkedin_url": None,
|
||||||
|
"confidence": 0,
|
||||||
|
"method": "source_crawl",
|
||||||
|
"notes": f'No matching LinkedIn profile found for "{name}" on {source_url}',
|
||||||
|
}
|
||||||
|
|
||||||
|
async def find_linkedin_via_search(
|
||||||
|
self, name: str, company: str, role: Optional[str] = None
|
||||||
|
) -> Dict:
|
||||||
|
"""
|
||||||
|
Find LinkedIn profile using web search.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: Person's name
|
||||||
|
company: Company/investor name
|
||||||
|
role: Person's role (optional)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with linkedin_url, confidence, method, notes
|
||||||
|
"""
|
||||||
|
if not self.ddg_search:
|
||||||
|
return {
|
||||||
|
"linkedin_url": None,
|
||||||
|
"confidence": 0,
|
||||||
|
"method": "web_search",
|
||||||
|
"notes": "Web search not available",
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Build search query - search for LinkedIn profile
|
||||||
|
query = f"{name} {company} site:linkedin.com/in"
|
||||||
|
if role:
|
||||||
|
query = f"{name} {role} {company} site:linkedin.com/in"
|
||||||
|
|
||||||
|
logger.debug(f"Searching: {query}")
|
||||||
|
results = self.web_search(query)
|
||||||
|
|
||||||
|
if results:
|
||||||
|
# Look for LinkedIn profile URLs in results
|
||||||
|
linkedin_pattern = r"https?://(?:www\.)?(?:[a-z]{2,3}\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)"
|
||||||
|
|
||||||
|
for result in results:
|
||||||
|
url = result.get("href") or result.get("link") or ""
|
||||||
|
title = result.get("title", "").lower()
|
||||||
|
body = result.get("body", "").lower()
|
||||||
|
|
||||||
|
match = re.search(linkedin_pattern, url, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
linkedin_url = self._normalize_linkedin_url(match.group(0))
|
||||||
|
username = match.group(1)
|
||||||
|
|
||||||
|
# Score based on name matching in title/body and username
|
||||||
|
context = f"{title} {body}"
|
||||||
|
context_score = self._name_matches_context(name, context)
|
||||||
|
username_score = self._name_matches_username(name, username)
|
||||||
|
|
||||||
|
total_score = max(context_score, username_score)
|
||||||
|
|
||||||
|
if total_score >= 30:
|
||||||
|
return {
|
||||||
|
"linkedin_url": linkedin_url,
|
||||||
|
"confidence": min(
|
||||||
|
total_score, 90
|
||||||
|
), # Cap at 90 for search results
|
||||||
|
"method": "web_search",
|
||||||
|
"notes": "Found via web search",
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"linkedin_url": None,
|
||||||
|
"confidence": 0,
|
||||||
|
"method": "web_search",
|
||||||
|
"notes": "No matching profile found in search results",
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Web search error for {name}: {e}")
|
||||||
|
return {
|
||||||
|
"linkedin_url": None,
|
||||||
|
"confidence": 0,
|
||||||
|
"method": "web_search",
|
||||||
|
"notes": f"Search error: {str(e)}",
|
||||||
|
}
|
||||||
|
|
||||||
|
async def find_linkedin_profile(
|
||||||
|
self,
|
||||||
|
name: str,
|
||||||
|
company: str,
|
||||||
|
role: Optional[str] = None,
|
||||||
|
source_url: Optional[str] = None,
|
||||||
|
) -> Dict:
|
||||||
|
"""
|
||||||
|
Find LinkedIn profile for a person.
|
||||||
|
|
||||||
|
Primary strategy: Crawl source URL to find LinkedIn links.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: Person's name
|
||||||
|
company: Company/investor name
|
||||||
|
role: Person's role/title (optional)
|
||||||
|
source_url: URL where person info was found (optional)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with:
|
||||||
|
- linkedin_url: Found LinkedIn URL or None
|
||||||
|
- confidence: Confidence score (0-100)
|
||||||
|
- method: Method used to find the profile
|
||||||
|
- notes: Additional information
|
||||||
|
"""
|
||||||
|
cache_key = f"{name}|{company}"
|
||||||
|
|
||||||
|
# Check cache
|
||||||
|
if self.use_cache and cache_key in self.profile_cache:
|
||||||
|
logger.debug(f"Using cached result for {name}")
|
||||||
|
return self.profile_cache[cache_key]
|
||||||
|
|
||||||
|
result = {"linkedin_url": None, "confidence": 0, "method": "none", "notes": ""}
|
||||||
|
|
||||||
|
# Primary strategy: Crawl source URL
|
||||||
|
if source_url:
|
||||||
|
result = await self.find_linkedin_from_source(name, source_url, role)
|
||||||
|
|
||||||
|
if result["linkedin_url"]:
|
||||||
|
if self.use_cache:
|
||||||
|
self.profile_cache[cache_key] = result
|
||||||
|
return result
|
||||||
|
|
||||||
|
# Fallback strategy: Web search (if enabled and no result from source crawl)
|
||||||
|
if self.use_llm_search and not result.get("linkedin_url"):
|
||||||
|
search_result = await self.find_linkedin_via_search(name, company, role)
|
||||||
|
if search_result["linkedin_url"]:
|
||||||
|
if self.use_cache:
|
||||||
|
self.profile_cache[cache_key] = search_result
|
||||||
|
return search_result
|
||||||
|
|
||||||
|
# If no source URL or no match found
|
||||||
|
if not result["linkedin_url"]:
|
||||||
|
result = {
|
||||||
|
"linkedin_url": None,
|
||||||
|
"confidence": 0,
|
||||||
|
"method": "none",
|
||||||
|
"notes": "No source URL available"
|
||||||
|
if not source_url
|
||||||
|
else result.get("notes", "Not found"),
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.use_cache:
|
||||||
|
self.profile_cache[cache_key] = result
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
async def batch_find_profiles(
|
||||||
|
self, members: List[Dict], progress_callback=None, db_callback=None
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""
|
||||||
|
Find LinkedIn profiles for multiple members efficiently.
|
||||||
|
|
||||||
|
Groups members by source_url to minimize crawling the same page multiple times.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
members: List of dicts with 'name', 'company', 'role', 'source_url', 'id'
|
||||||
|
progress_callback: Optional callback function(current, total, result)
|
||||||
|
db_callback: Optional callback to save to database immediately when profile found
|
||||||
|
Signature: db_callback(member_id, linkedin_url) -> bool
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of results for each member
|
||||||
|
"""
|
||||||
|
results = []
|
||||||
|
total = len(members)
|
||||||
|
|
||||||
|
# Group members by source_url for efficient crawling
|
||||||
|
url_groups: Dict[str, List[Dict]] = {}
|
||||||
|
no_url_members = []
|
||||||
|
|
||||||
|
for member in members:
|
||||||
|
url = member.get("source_url")
|
||||||
|
if url:
|
||||||
|
if url not in url_groups:
|
||||||
|
url_groups[url] = []
|
||||||
|
url_groups[url].append(member)
|
||||||
|
else:
|
||||||
|
no_url_members.append(member)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Processing {len(url_groups)} unique source URLs for {total} members"
|
||||||
|
)
|
||||||
|
logger.info(f"Members with source URLs: {total - len(no_url_members)}")
|
||||||
|
logger.info(f"Members without source URLs: {len(no_url_members)}")
|
||||||
|
if self.use_llm_search:
|
||||||
|
logger.info("Web search fallback: ENABLED")
|
||||||
|
else:
|
||||||
|
logger.info("Web search fallback: DISABLED")
|
||||||
|
|
||||||
|
processed = 0
|
||||||
|
|
||||||
|
# Process members grouped by URL (efficient - one crawl per page)
|
||||||
|
for url, group_members in url_groups.items():
|
||||||
|
# Crawl the page once
|
||||||
|
content = await self.crawl_page(url)
|
||||||
|
html = self.html_cache.get(url, content or "")
|
||||||
|
|
||||||
|
# Extract all LinkedIn URLs from this page
|
||||||
|
linkedin_links = []
|
||||||
|
if content:
|
||||||
|
linkedin_links = self.extract_linkedin_urls_from_content(html)
|
||||||
|
if not linkedin_links:
|
||||||
|
linkedin_links = self.extract_linkedin_urls_from_content(content)
|
||||||
|
|
||||||
|
# Match each member in this group
|
||||||
|
for member in group_members:
|
||||||
|
processed += 1
|
||||||
|
result = None
|
||||||
|
found_linkedin = False
|
||||||
|
|
||||||
|
if linkedin_links:
|
||||||
|
# Find best matching LinkedIn for this member
|
||||||
|
best_match = None
|
||||||
|
best_score = 0
|
||||||
|
|
||||||
|
for link in linkedin_links:
|
||||||
|
context_score = self._name_matches_context(
|
||||||
|
member["name"], link["context"]
|
||||||
|
)
|
||||||
|
username_score = self._name_matches_username(
|
||||||
|
member["name"], link["username"]
|
||||||
|
)
|
||||||
|
role_bonus = (
|
||||||
|
10
|
||||||
|
if member.get("role")
|
||||||
|
and member["role"].lower() in link["context"].lower()
|
||||||
|
else 0
|
||||||
|
)
|
||||||
|
total_score = max(context_score, username_score) + role_bonus
|
||||||
|
|
||||||
|
if total_score > best_score:
|
||||||
|
best_score = total_score
|
||||||
|
best_match = link
|
||||||
|
|
||||||
|
if best_match and best_score >= 30:
|
||||||
|
result = {
|
||||||
|
"linkedin_url": best_match["url"],
|
||||||
|
"confidence": min(best_score, 100),
|
||||||
|
"method": "source_crawl",
|
||||||
|
"notes": f"Found on {url}",
|
||||||
|
"member_id": member.get("id"),
|
||||||
|
"member_name": member["name"],
|
||||||
|
}
|
||||||
|
found_linkedin = True
|
||||||
|
# Save to database immediately if callback provided
|
||||||
|
if db_callback and member.get("id"):
|
||||||
|
db_callback(member["id"], best_match["url"])
|
||||||
|
|
||||||
|
# If no result from source crawl, try web search IMMEDIATELY
|
||||||
|
if not found_linkedin and self.use_llm_search:
|
||||||
|
search_result = await self.find_linkedin_via_search(
|
||||||
|
member["name"], member["company"], member.get("role")
|
||||||
|
)
|
||||||
|
|
||||||
|
if search_result["linkedin_url"]:
|
||||||
|
result = {
|
||||||
|
"linkedin_url": search_result["linkedin_url"],
|
||||||
|
"confidence": search_result["confidence"],
|
||||||
|
"method": "web_search",
|
||||||
|
"notes": search_result.get("notes", "Found via web search"),
|
||||||
|
"member_id": member.get("id"),
|
||||||
|
"member_name": member["name"],
|
||||||
|
}
|
||||||
|
found_linkedin = True
|
||||||
|
# Save to database immediately
|
||||||
|
if db_callback and member.get("id"):
|
||||||
|
db_callback(member["id"], search_result["linkedin_url"])
|
||||||
|
|
||||||
|
# If still no result, record as not found
|
||||||
|
if not found_linkedin:
|
||||||
|
result = {
|
||||||
|
"linkedin_url": None,
|
||||||
|
"confidence": 0,
|
||||||
|
"method": "source_crawl" if content else "none",
|
||||||
|
"notes": f"No match on {url}"
|
||||||
|
if linkedin_links
|
||||||
|
else (
|
||||||
|
f"No LinkedIn URLs on {url}"
|
||||||
|
if content
|
||||||
|
else f"Failed to crawl {url}"
|
||||||
|
),
|
||||||
|
"member_id": member.get("id"),
|
||||||
|
"member_name": member["name"],
|
||||||
|
}
|
||||||
|
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
if progress_callback:
|
||||||
|
progress_callback(processed, total, result)
|
||||||
|
|
||||||
|
# Small delay between different URLs
|
||||||
|
await asyncio.sleep(self.rate_limit_delay)
|
||||||
|
|
||||||
|
# Process members without source URLs - do web search immediately for each
|
||||||
|
for member in no_url_members:
|
||||||
|
processed += 1
|
||||||
|
result = None
|
||||||
|
|
||||||
|
# Try web search immediately
|
||||||
|
if self.use_llm_search:
|
||||||
|
search_result = await self.find_linkedin_via_search(
|
||||||
|
member["name"], member["company"], member.get("role")
|
||||||
|
)
|
||||||
|
|
||||||
|
if search_result["linkedin_url"]:
|
||||||
|
result = {
|
||||||
|
"linkedin_url": search_result["linkedin_url"],
|
||||||
|
"confidence": search_result["confidence"],
|
||||||
|
"method": "web_search",
|
||||||
|
"notes": search_result.get("notes", "Found via web search"),
|
||||||
|
"member_id": member.get("id"),
|
||||||
|
"member_name": member["name"],
|
||||||
|
}
|
||||||
|
# Save to database immediately
|
||||||
|
if db_callback and member.get("id"):
|
||||||
|
db_callback(member["id"], search_result["linkedin_url"])
|
||||||
|
|
||||||
|
# If no result from search
|
||||||
|
if not result:
|
||||||
|
result = {
|
||||||
|
"linkedin_url": None,
|
||||||
|
"confidence": 0,
|
||||||
|
"method": "web_search" if self.use_llm_search else "none",
|
||||||
|
"notes": "No LinkedIn profile found"
|
||||||
|
if self.use_llm_search
|
||||||
|
else "No source URL available",
|
||||||
|
"member_id": member.get("id"),
|
||||||
|
"member_name": member["name"],
|
||||||
|
}
|
||||||
|
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
if progress_callback:
|
||||||
|
progress_callback(processed, total, result)
|
||||||
|
|
||||||
|
# Rate limit between searches
|
||||||
|
await asyncio.sleep(self.rate_limit_delay)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def format_linkedin_url(url: str) -> str:
|
||||||
|
"""Normalize LinkedIn URL format"""
|
||||||
|
if not url:
|
||||||
|
return url
|
||||||
|
|
||||||
|
# Remove trailing slashes
|
||||||
|
url = url.rstrip("/")
|
||||||
|
|
||||||
|
# Ensure https and normalize to www
|
||||||
|
url = re.sub(r"https?://[a-z]{2,3}\.linkedin\.com", "https://www.linkedin.com", url)
|
||||||
|
if url.startswith("http://"):
|
||||||
|
url = url.replace("http://", "https://")
|
||||||
|
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
# Async wrapper for sync contexts
|
||||||
|
def run_batch_scraper(
|
||||||
|
members: List[Dict], rate_limit: float = 0.5, progress_callback=None
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""
|
||||||
|
Synchronous wrapper for batch_find_profiles.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
members: List of member dicts
|
||||||
|
rate_limit: Delay between URL crawls
|
||||||
|
progress_callback: Optional progress callback
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of results
|
||||||
|
"""
|
||||||
|
scraper = LinkedInProfileScraper(rate_limit_delay=rate_limit)
|
||||||
|
return asyncio.run(scraper.batch_find_profiles(members, progress_callback))
|
||||||
+56
-18
@@ -1,11 +1,14 @@
|
|||||||
import io
|
import io
|
||||||
|
import logging
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from db.db import Base, db_dependency, engine
|
from db.db import Base, db_dependency, engine
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from fastapi import FastAPI, File, Form, UploadFile
|
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from routers import (
|
from routers import (
|
||||||
|
addition,
|
||||||
companies,
|
companies,
|
||||||
folk_crm,
|
folk_crm,
|
||||||
insight_route,
|
insight_route,
|
||||||
@@ -13,7 +16,8 @@ from routers import (
|
|||||||
projects,
|
projects,
|
||||||
report_route,
|
report_route,
|
||||||
)
|
)
|
||||||
from schemas.router_schemas import InvestmentResponse, PaginatedResponse
|
from schemas.router_schemas import CompanyData, InvestmentResponse, PaginatedResponse
|
||||||
|
from services.company_querying import CompanyQueryProcessor
|
||||||
from services.llm_parser import InvestorProcessor
|
from services.llm_parser import InvestorProcessor
|
||||||
from services.querying import QueryProcessor
|
from services.querying import QueryProcessor
|
||||||
|
|
||||||
@@ -25,10 +29,21 @@ def init_database():
|
|||||||
Base.metadata.create_all(bind=engine)
|
Base.metadata.create_all(bind=engine)
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
init_database()
|
init_database()
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
|
# Add CORS middleware to allow frontend requests
|
||||||
|
app.add_middleware(
|
||||||
|
CORSMiddleware,
|
||||||
|
allow_origins=["*"], # In production, replace with specific origins
|
||||||
|
allow_credentials=True,
|
||||||
|
allow_methods=["*"],
|
||||||
|
allow_headers=["*"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Request models
|
# Request models
|
||||||
class QueryRequest(BaseModel):
|
class QueryRequest(BaseModel):
|
||||||
@@ -42,6 +57,17 @@ class QueryRequest(BaseModel):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class CompanyQueryRequest(BaseModel):
|
||||||
|
question: str
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
json_schema_extra = {
|
||||||
|
"example": {
|
||||||
|
"question": "Find me companies in the fintech sector located in San Francisco."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
def health():
|
def health():
|
||||||
return {"Hello": "World"}
|
return {"Hello": "World"}
|
||||||
@@ -61,16 +87,18 @@ async def parse_csv(
|
|||||||
- Handles AUM, fund sizes, and check sizes as integers
|
- Handles AUM, fund sizes, and check sizes as integers
|
||||||
|
|
||||||
**For companies:**
|
**For companies:**
|
||||||
- Expected columns: Name, Website, Investor, Final Investor Profile (company profile)
|
- Expected columns: Name, Website, Perplexity Gap Output (or Final Investor Profile)
|
||||||
- 100% manual JSON parsing - no LLM needed
|
- 100% manual JSON parsing - no LLM needed
|
||||||
- Extracts company details, executives, investors, and client categories
|
- **Only extracts:** founded_year and key_executives
|
||||||
- Automatically links companies to investors in database
|
- **Only updates companies already in the database** (syncs with existing records)
|
||||||
|
- Skips companies not found in the database
|
||||||
|
|
||||||
**Benefits:**
|
**Benefits:**
|
||||||
- Fast processing (5-10s per record)
|
- Fast processing (5-10s per record)
|
||||||
- Low cost (minimal or no LLM usage)
|
- Low cost (minimal or no LLM usage)
|
||||||
- Accurate data extraction
|
- Accurate data extraction
|
||||||
- Automatic database persistence
|
- Automatic database persistence
|
||||||
|
- Safe: won't create duplicate companies
|
||||||
"""
|
"""
|
||||||
# Read uploaded CSV with pandas
|
# Read uploaded CSV with pandas
|
||||||
content = await file.read()
|
content = await file.read()
|
||||||
@@ -95,21 +123,30 @@ async def parse_csv(
|
|||||||
"/query", response_model=PaginatedResponse[InvestmentResponse], tags=["Querying"]
|
"/query", response_model=PaginatedResponse[InvestmentResponse], tags=["Querying"]
|
||||||
)
|
)
|
||||||
async def query_investors(request: QueryRequest):
|
async def query_investors(request: QueryRequest):
|
||||||
"""
|
"""Query investors/funds using natural language"""
|
||||||
Query investors using natural language.
|
try:
|
||||||
|
processor = QueryProcessor()
|
||||||
|
result = await processor.process_query(request.question)
|
||||||
|
logger.info(f"Query completed successfully with {result.total} results")
|
||||||
|
return result
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in query_investors: {e}", exc_info=True)
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
Returns fund-level matches (one row per fund) with investor details.
|
|
||||||
This ensures only relevant funds are included in the response.
|
|
||||||
|
|
||||||
Supports queries like:
|
@app.post(
|
||||||
- "Show me seed stage investors"
|
"/query-companies", response_model=PaginatedResponse[CompanyData], tags=["Querying"]
|
||||||
- "Find fintech investors in Silicon Valley"
|
)
|
||||||
- "Growth stage investors with $5M+ check sizes"
|
async def query_companies(request: CompanyQueryRequest):
|
||||||
- "Healthcare investors in Europe"
|
"""Query companies using natural language"""
|
||||||
"""
|
try:
|
||||||
processor = QueryProcessor()
|
processor = CompanyQueryProcessor()
|
||||||
results = processor.process_query(request.question)
|
result = await processor.process_query(request.question)
|
||||||
return results
|
logger.info(f"Company query completed successfully with {result.total} results")
|
||||||
|
return result
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in query_companies: {e}", exc_info=True)
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
app.include_router(investors.router)
|
app.include_router(investors.router)
|
||||||
@@ -118,6 +155,7 @@ app.include_router(projects.router)
|
|||||||
app.include_router(folk_crm.router)
|
app.include_router(folk_crm.router)
|
||||||
app.include_router(insight_route.router)
|
app.include_router(insight_route.router)
|
||||||
app.include_router(report_route.router)
|
app.include_router(report_route.router)
|
||||||
|
app.include_router(addition.router)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import uvicorn
|
import uvicorn
|
||||||
|
|||||||
Binary file not shown.
@@ -0,0 +1,370 @@
|
|||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from db.db import get_db
|
||||||
|
from db.models import FundTable, InvestorTable, SectorTable
|
||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
router = APIRouter(tags=["Additional Routes"])
|
||||||
|
|
||||||
|
|
||||||
|
# Response schemas
|
||||||
|
class SectorsResponse(BaseModel):
|
||||||
|
sectors: list[str]
|
||||||
|
total: int
|
||||||
|
|
||||||
|
|
||||||
|
class CountryInfo(BaseModel):
|
||||||
|
name: str
|
||||||
|
|
||||||
|
|
||||||
|
class ContinentInfo(BaseModel):
|
||||||
|
name: str
|
||||||
|
countries: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
class GeographyResponse(BaseModel):
|
||||||
|
continents: list[ContinentInfo]
|
||||||
|
total_continents: int
|
||||||
|
total_countries: int
|
||||||
|
|
||||||
|
|
||||||
|
# Mapping of countries to continents
|
||||||
|
COUNTRY_TO_CONTINENT = {
|
||||||
|
# Africa
|
||||||
|
"Algeria": "Africa",
|
||||||
|
"Angola": "Africa",
|
||||||
|
"Benin": "Africa",
|
||||||
|
"Botswana": "Africa",
|
||||||
|
"Burkina Faso": "Africa",
|
||||||
|
"Burundi": "Africa",
|
||||||
|
"Cameroon": "Africa",
|
||||||
|
"Cape Verde": "Africa",
|
||||||
|
"Central African Republic": "Africa",
|
||||||
|
"Chad": "Africa",
|
||||||
|
"Comoros": "Africa",
|
||||||
|
"Congo": "Africa",
|
||||||
|
"Democratic Republic of the Congo": "Africa",
|
||||||
|
"Djibouti": "Africa",
|
||||||
|
"Egypt": "Africa",
|
||||||
|
"Equatorial Guinea": "Africa",
|
||||||
|
"Eritrea": "Africa",
|
||||||
|
"Eswatini": "Africa",
|
||||||
|
"Ethiopia": "Africa",
|
||||||
|
"Gabon": "Africa",
|
||||||
|
"Gambia": "Africa",
|
||||||
|
"Ghana": "Africa",
|
||||||
|
"Guinea": "Africa",
|
||||||
|
"Guinea-Bissau": "Africa",
|
||||||
|
"Ivory Coast": "Africa",
|
||||||
|
"Kenya": "Africa",
|
||||||
|
"Lesotho": "Africa",
|
||||||
|
"Liberia": "Africa",
|
||||||
|
"Libya": "Africa",
|
||||||
|
"Madagascar": "Africa",
|
||||||
|
"Malawi": "Africa",
|
||||||
|
"Mali": "Africa",
|
||||||
|
"Mauritania": "Africa",
|
||||||
|
"Mauritius": "Africa",
|
||||||
|
"Morocco": "Africa",
|
||||||
|
"Mozambique": "Africa",
|
||||||
|
"Namibia": "Africa",
|
||||||
|
"Niger": "Africa",
|
||||||
|
"Nigeria": "Africa",
|
||||||
|
"Rwanda": "Africa",
|
||||||
|
"Sao Tome and Principe": "Africa",
|
||||||
|
"Senegal": "Africa",
|
||||||
|
"Seychelles": "Africa",
|
||||||
|
"Sierra Leone": "Africa",
|
||||||
|
"Somalia": "Africa",
|
||||||
|
"South Africa": "Africa",
|
||||||
|
"South Sudan": "Africa",
|
||||||
|
"Sudan": "Africa",
|
||||||
|
"Tanzania": "Africa",
|
||||||
|
"Togo": "Africa",
|
||||||
|
"Tunisia": "Africa",
|
||||||
|
"Uganda": "Africa",
|
||||||
|
"Zambia": "Africa",
|
||||||
|
"Zimbabwe": "Africa",
|
||||||
|
# Asia
|
||||||
|
"Afghanistan": "Asia",
|
||||||
|
"Armenia": "Asia",
|
||||||
|
"Azerbaijan": "Asia",
|
||||||
|
"Bahrain": "Asia",
|
||||||
|
"Bangladesh": "Asia",
|
||||||
|
"Bhutan": "Asia",
|
||||||
|
"Brunei": "Asia",
|
||||||
|
"Cambodia": "Asia",
|
||||||
|
"China": "Asia",
|
||||||
|
"Cyprus": "Asia",
|
||||||
|
"Georgia": "Asia",
|
||||||
|
"Hong Kong": "Asia",
|
||||||
|
"India": "Asia",
|
||||||
|
"Indonesia": "Asia",
|
||||||
|
"Iran": "Asia",
|
||||||
|
"Iraq": "Asia",
|
||||||
|
"Israel": "Asia",
|
||||||
|
"Japan": "Asia",
|
||||||
|
"Jordan": "Asia",
|
||||||
|
"Kazakhstan": "Asia",
|
||||||
|
"Kuwait": "Asia",
|
||||||
|
"Kyrgyzstan": "Asia",
|
||||||
|
"Laos": "Asia",
|
||||||
|
"Lebanon": "Asia",
|
||||||
|
"Malaysia": "Asia",
|
||||||
|
"Maldives": "Asia",
|
||||||
|
"Mongolia": "Asia",
|
||||||
|
"Myanmar": "Asia",
|
||||||
|
"Nepal": "Asia",
|
||||||
|
"North Korea": "Asia",
|
||||||
|
"Oman": "Asia",
|
||||||
|
"Pakistan": "Asia",
|
||||||
|
"Palestine": "Asia",
|
||||||
|
"Philippines": "Asia",
|
||||||
|
"Qatar": "Asia",
|
||||||
|
"Saudi Arabia": "Asia",
|
||||||
|
"Singapore": "Asia",
|
||||||
|
"South Korea": "Asia",
|
||||||
|
"Sri Lanka": "Asia",
|
||||||
|
"Syria": "Asia",
|
||||||
|
"Taiwan": "Asia",
|
||||||
|
"Tajikistan": "Asia",
|
||||||
|
"Thailand": "Asia",
|
||||||
|
"Timor-Leste": "Asia",
|
||||||
|
"Turkey": "Asia",
|
||||||
|
"Turkmenistan": "Asia",
|
||||||
|
"United Arab Emirates": "Asia",
|
||||||
|
"UAE": "Asia",
|
||||||
|
"Uzbekistan": "Asia",
|
||||||
|
"Vietnam": "Asia",
|
||||||
|
"Yemen": "Asia",
|
||||||
|
# Europe
|
||||||
|
"Albania": "Europe",
|
||||||
|
"Andorra": "Europe",
|
||||||
|
"Austria": "Europe",
|
||||||
|
"Belarus": "Europe",
|
||||||
|
"Belgium": "Europe",
|
||||||
|
"Bosnia and Herzegovina": "Europe",
|
||||||
|
"Bulgaria": "Europe",
|
||||||
|
"Croatia": "Europe",
|
||||||
|
"Czech Republic": "Europe",
|
||||||
|
"Czechia": "Europe",
|
||||||
|
"Denmark": "Europe",
|
||||||
|
"Estonia": "Europe",
|
||||||
|
"Finland": "Europe",
|
||||||
|
"France": "Europe",
|
||||||
|
"Germany": "Europe",
|
||||||
|
"Greece": "Europe",
|
||||||
|
"Hungary": "Europe",
|
||||||
|
"Iceland": "Europe",
|
||||||
|
"Ireland": "Europe",
|
||||||
|
"Italy": "Europe",
|
||||||
|
"Kosovo": "Europe",
|
||||||
|
"Latvia": "Europe",
|
||||||
|
"Liechtenstein": "Europe",
|
||||||
|
"Lithuania": "Europe",
|
||||||
|
"Luxembourg": "Europe",
|
||||||
|
"Malta": "Europe",
|
||||||
|
"Moldova": "Europe",
|
||||||
|
"Monaco": "Europe",
|
||||||
|
"Montenegro": "Europe",
|
||||||
|
"Netherlands": "Europe",
|
||||||
|
"North Macedonia": "Europe",
|
||||||
|
"Norway": "Europe",
|
||||||
|
"Poland": "Europe",
|
||||||
|
"Portugal": "Europe",
|
||||||
|
"Romania": "Europe",
|
||||||
|
"Russia": "Europe",
|
||||||
|
"San Marino": "Europe",
|
||||||
|
"Serbia": "Europe",
|
||||||
|
"Slovakia": "Europe",
|
||||||
|
"Slovenia": "Europe",
|
||||||
|
"Spain": "Europe",
|
||||||
|
"Sweden": "Europe",
|
||||||
|
"Switzerland": "Europe",
|
||||||
|
"Ukraine": "Europe",
|
||||||
|
"United Kingdom": "Europe",
|
||||||
|
"UK": "Europe",
|
||||||
|
"Vatican City": "Europe",
|
||||||
|
# North America
|
||||||
|
"Antigua and Barbuda": "North America",
|
||||||
|
"Bahamas": "North America",
|
||||||
|
"Barbados": "North America",
|
||||||
|
"Belize": "North America",
|
||||||
|
"Canada": "North America",
|
||||||
|
"Costa Rica": "North America",
|
||||||
|
"Cuba": "North America",
|
||||||
|
"Dominica": "North America",
|
||||||
|
"Dominican Republic": "North America",
|
||||||
|
"El Salvador": "North America",
|
||||||
|
"Grenada": "North America",
|
||||||
|
"Guatemala": "North America",
|
||||||
|
"Haiti": "North America",
|
||||||
|
"Honduras": "North America",
|
||||||
|
"Jamaica": "North America",
|
||||||
|
"Mexico": "North America",
|
||||||
|
"Nicaragua": "North America",
|
||||||
|
"Panama": "North America",
|
||||||
|
"Saint Kitts and Nevis": "North America",
|
||||||
|
"Saint Lucia": "North America",
|
||||||
|
"Saint Vincent and the Grenadines": "North America",
|
||||||
|
"Trinidad and Tobago": "North America",
|
||||||
|
"United States": "North America",
|
||||||
|
"USA": "North America",
|
||||||
|
"US": "North America",
|
||||||
|
# South America
|
||||||
|
"Argentina": "South America",
|
||||||
|
"Bolivia": "South America",
|
||||||
|
"Brazil": "South America",
|
||||||
|
"Chile": "South America",
|
||||||
|
"Colombia": "South America",
|
||||||
|
"Ecuador": "South America",
|
||||||
|
"Guyana": "South America",
|
||||||
|
"Paraguay": "South America",
|
||||||
|
"Peru": "South America",
|
||||||
|
"Suriname": "South America",
|
||||||
|
"Uruguay": "South America",
|
||||||
|
"Venezuela": "South America",
|
||||||
|
# Oceania
|
||||||
|
"Australia": "Oceania",
|
||||||
|
"Fiji": "Oceania",
|
||||||
|
"Kiribati": "Oceania",
|
||||||
|
"Marshall Islands": "Oceania",
|
||||||
|
"Micronesia": "Oceania",
|
||||||
|
"Nauru": "Oceania",
|
||||||
|
"New Zealand": "Oceania",
|
||||||
|
"Palau": "Oceania",
|
||||||
|
"Papua New Guinea": "Oceania",
|
||||||
|
"Samoa": "Oceania",
|
||||||
|
"Solomon Islands": "Oceania",
|
||||||
|
"Tonga": "Oceania",
|
||||||
|
"Tuvalu": "Oceania",
|
||||||
|
"Vanuatu": "Oceania",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Valid continent names for direct matching
|
||||||
|
VALID_CONTINENTS = {
|
||||||
|
"Africa",
|
||||||
|
"Asia",
|
||||||
|
"Europe",
|
||||||
|
"North America",
|
||||||
|
"South America",
|
||||||
|
"Oceania",
|
||||||
|
"Antarctica",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_countries_from_geographic_focus(geographic_focus: str) -> set[str]:
|
||||||
|
"""
|
||||||
|
Extract country names from a geographic_focus string.
|
||||||
|
Handles comma-separated values, slashes, and various formats.
|
||||||
|
"""
|
||||||
|
if not geographic_focus:
|
||||||
|
return set()
|
||||||
|
|
||||||
|
countries = set()
|
||||||
|
# Split by common delimiters
|
||||||
|
parts = geographic_focus.replace("/", ",").replace(";", ",").split(",")
|
||||||
|
|
||||||
|
for part in parts:
|
||||||
|
cleaned = part.strip()
|
||||||
|
if cleaned:
|
||||||
|
# Check if it's a known country
|
||||||
|
if cleaned in COUNTRY_TO_CONTINENT:
|
||||||
|
countries.add(cleaned)
|
||||||
|
# Check for partial matches (e.g., "United States of America" -> "United States")
|
||||||
|
else:
|
||||||
|
for country in COUNTRY_TO_CONTINENT.keys():
|
||||||
|
if country.lower() in cleaned.lower() or cleaned.lower() in country.lower():
|
||||||
|
countries.add(country)
|
||||||
|
break
|
||||||
|
|
||||||
|
return countries
|
||||||
|
|
||||||
|
|
||||||
|
def organize_geography(geographic_data: list[str]) -> dict[str, set[str]]:
|
||||||
|
"""
|
||||||
|
Organize geographic data into continents and their countries.
|
||||||
|
Returns a dict with continent names as keys and sets of countries as values.
|
||||||
|
"""
|
||||||
|
continent_countries: dict[str, set[str]] = {}
|
||||||
|
|
||||||
|
for geo_focus in geographic_data:
|
||||||
|
if not geo_focus:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Extract countries from the geographic focus string
|
||||||
|
countries = extract_countries_from_geographic_focus(geo_focus)
|
||||||
|
|
||||||
|
for country in countries:
|
||||||
|
continent = COUNTRY_TO_CONTINENT.get(country)
|
||||||
|
if continent:
|
||||||
|
if continent not in continent_countries:
|
||||||
|
continent_countries[continent] = set()
|
||||||
|
continent_countries[continent].add(country)
|
||||||
|
|
||||||
|
# Also check if the geographic focus itself is a continent
|
||||||
|
cleaned_geo = geo_focus.strip()
|
||||||
|
if cleaned_geo in VALID_CONTINENTS:
|
||||||
|
if cleaned_geo not in continent_countries:
|
||||||
|
continent_countries[cleaned_geo] = set()
|
||||||
|
|
||||||
|
return continent_countries
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sectors", response_model=SectorsResponse)
|
||||||
|
def get_unique_sectors(db: Session = Depends(get_db)):
|
||||||
|
"""
|
||||||
|
Get all unique sectors from the database.
|
||||||
|
Returns a list of sector names sorted alphabetically.
|
||||||
|
"""
|
||||||
|
sectors = db.query(SectorTable.name).distinct().order_by(SectorTable.name).all()
|
||||||
|
sector_names = [s[0] for s in sectors if s[0]]
|
||||||
|
|
||||||
|
return SectorsResponse(sectors=sector_names, total=len(sector_names))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/geography", response_model=GeographyResponse)
|
||||||
|
def get_arranged_geography(db: Session = Depends(get_db)):
|
||||||
|
"""
|
||||||
|
Get all unique geographic locations arranged by continent and countries.
|
||||||
|
Extracts geography from both investors and funds tables.
|
||||||
|
Returns continents with their associated countries.
|
||||||
|
"""
|
||||||
|
# Collect all geographic focus data from investors
|
||||||
|
investor_geo = (
|
||||||
|
db.query(InvestorTable.geographic_focus)
|
||||||
|
.filter(InvestorTable.geographic_focus.isnot(None))
|
||||||
|
.distinct()
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Collect all geographic focus data from funds
|
||||||
|
fund_geo = (
|
||||||
|
db.query(FundTable.geographic_focus)
|
||||||
|
.filter(FundTable.geographic_focus.isnot(None))
|
||||||
|
.distinct()
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Combine all geographic data
|
||||||
|
all_geo_data = [g[0] for g in investor_geo] + [g[0] for g in fund_geo]
|
||||||
|
|
||||||
|
# Organize into continents and countries
|
||||||
|
continent_countries = organize_geography(all_geo_data)
|
||||||
|
|
||||||
|
# Build response
|
||||||
|
continents = []
|
||||||
|
total_countries = 0
|
||||||
|
|
||||||
|
for continent_name in sorted(continent_countries.keys()):
|
||||||
|
countries = sorted(continent_countries[continent_name])
|
||||||
|
total_countries += len(countries)
|
||||||
|
continents.append(ContinentInfo(name=continent_name, countries=countries))
|
||||||
|
|
||||||
|
return GeographyResponse(
|
||||||
|
continents=continents,
|
||||||
|
total_continents=len(continents),
|
||||||
|
total_countries=total_countries,
|
||||||
|
)
|
||||||
@@ -63,11 +63,13 @@ def read_companies(
|
|||||||
# Transform CompanyTable objects to CompanyData format
|
# Transform CompanyTable objects to CompanyData format
|
||||||
company_data_list = []
|
company_data_list = []
|
||||||
for company in companies:
|
for company in companies:
|
||||||
|
# Sort sectors alphabetically
|
||||||
|
sorted_sectors = sorted(company.sectors, key=lambda s: s.name) if company.sectors else []
|
||||||
company_data = CompanyData(
|
company_data = CompanyData(
|
||||||
company=company,
|
company=company,
|
||||||
investors=company.investors,
|
investors=company.investors,
|
||||||
members=company.members,
|
members=company.members,
|
||||||
sectors=company.sectors,
|
sectors=sorted_sectors,
|
||||||
)
|
)
|
||||||
company_data_list.append(company_data)
|
company_data_list.append(company_data)
|
||||||
|
|
||||||
@@ -147,11 +149,13 @@ def filter_companies(
|
|||||||
# Transform to CompanyData format
|
# Transform to CompanyData format
|
||||||
company_data_list = []
|
company_data_list = []
|
||||||
for company in companies:
|
for company in companies:
|
||||||
|
# Sort sectors alphabetically
|
||||||
|
sorted_sectors = sorted(company.sectors, key=lambda s: s.name) if company.sectors else []
|
||||||
company_data = CompanyData(
|
company_data = CompanyData(
|
||||||
company=company,
|
company=company,
|
||||||
investors=company.investors,
|
investors=company.investors,
|
||||||
members=company.members,
|
members=company.members,
|
||||||
sectors=company.sectors,
|
sectors=sorted_sectors,
|
||||||
)
|
)
|
||||||
company_data_list.append(company_data)
|
company_data_list.append(company_data)
|
||||||
|
|
||||||
@@ -184,12 +188,15 @@ def read_company(company_id: int, db: Session = Depends(get_db)):
|
|||||||
if not company:
|
if not company:
|
||||||
raise HTTPException(status_code=404, detail="Company not found")
|
raise HTTPException(status_code=404, detail="Company not found")
|
||||||
|
|
||||||
|
# Sort sectors alphabetically
|
||||||
|
sorted_sectors = sorted(company.sectors, key=lambda s: s.name) if company.sectors else []
|
||||||
|
|
||||||
# Transform to CompanyData format
|
# Transform to CompanyData format
|
||||||
return CompanyData(
|
return CompanyData(
|
||||||
company=company,
|
company=company,
|
||||||
investors=company.investors,
|
investors=company.investors,
|
||||||
members=company.members,
|
members=company.members,
|
||||||
sectors=company.sectors,
|
sectors=sorted_sectors,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -250,12 +257,15 @@ def update_company(
|
|||||||
.first()
|
.first()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Sort sectors alphabetically
|
||||||
|
sorted_sectors = sorted(company_with_relations.sectors, key=lambda s: s.name) if company_with_relations.sectors else []
|
||||||
|
|
||||||
# Transform to CompanyData format
|
# Transform to CompanyData format
|
||||||
return CompanyData(
|
return CompanyData(
|
||||||
company=company_with_relations,
|
company=company_with_relations,
|
||||||
investors=company_with_relations.investors,
|
investors=company_with_relations.investors,
|
||||||
members=company_with_relations.members,
|
members=company_with_relations.members,
|
||||||
sectors=company_with_relations.sectors,
|
sectors=sorted_sectors,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
+15
-1
@@ -1,15 +1,21 @@
|
|||||||
|
import os
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from db.db import get_db
|
from db.db import get_db
|
||||||
from db.models import InvestorTable
|
from db.models import InvestorTable
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from services.crm import folk
|
from services.crm import FolkAPI
|
||||||
from sqlalchemy.orm import Session, selectinload
|
from sqlalchemy.orm import Session, selectinload
|
||||||
|
|
||||||
router = APIRouter(prefix="/folk", tags=["Folk CRM"])
|
router = APIRouter(prefix="/folk", tags=["Folk CRM"])
|
||||||
|
|
||||||
|
|
||||||
|
def get_folk_client():
|
||||||
|
"""Get Folk API client with loaded environment variables"""
|
||||||
|
return FolkAPI(api_key=os.environ.get("FOLK_API_KEY", ""))
|
||||||
|
|
||||||
|
|
||||||
class GroupResponse(BaseModel):
|
class GroupResponse(BaseModel):
|
||||||
id: str
|
id: str
|
||||||
name: str
|
name: str
|
||||||
@@ -44,6 +50,7 @@ def get_folk_groups():
|
|||||||
to sync investors to Folk.
|
to sync investors to Folk.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
folk = get_folk_client()
|
||||||
groups_data = folk.get_groups()
|
groups_data = folk.get_groups()
|
||||||
items = groups_data.get("data", {}).get("items", [])
|
items = groups_data.get("data", {}).get("items", [])
|
||||||
|
|
||||||
@@ -71,6 +78,7 @@ def sync_investors_to_folk(
|
|||||||
Returns:
|
Returns:
|
||||||
Summary of sync operation including successes and errors
|
Summary of sync operation including successes and errors
|
||||||
"""
|
"""
|
||||||
|
folk = get_folk_client()
|
||||||
# Fetch investors with their team members
|
# Fetch investors with their team members
|
||||||
investors = (
|
investors = (
|
||||||
db.query(InvestorTable)
|
db.query(InvestorTable)
|
||||||
@@ -128,6 +136,11 @@ def sync_investors_to_folk(
|
|||||||
if hasattr(member, "source_url") and member.source_url:
|
if hasattr(member, "source_url") and member.source_url:
|
||||||
urls_list = [member.source_url]
|
urls_list = [member.source_url]
|
||||||
|
|
||||||
|
# Get LinkedIn URL if available
|
||||||
|
linkedin_url = None
|
||||||
|
if hasattr(member, "linkedin") and member.linkedin:
|
||||||
|
linkedin_url = member.linkedin
|
||||||
|
|
||||||
# Build job title from title or role
|
# Build job title from title or role
|
||||||
job_title = None
|
job_title = None
|
||||||
if hasattr(member, "title") and member.title:
|
if hasattr(member, "title") and member.title:
|
||||||
@@ -141,6 +154,7 @@ def sync_investors_to_folk(
|
|||||||
email=member.email,
|
email=member.email,
|
||||||
company_id=company_id,
|
company_id=company_id,
|
||||||
group_id=request.group_id,
|
group_id=request.group_id,
|
||||||
|
linkedin_url=linkedin_url,
|
||||||
urls=urls_list,
|
urls=urls_list,
|
||||||
jobTitle=job_title,
|
jobTitle=job_title,
|
||||||
)
|
)
|
||||||
|
|||||||
+87
-31
@@ -12,7 +12,10 @@ from schemas.router_schemas import (
|
|||||||
PaginatedResponse,
|
PaginatedResponse,
|
||||||
SectorMinimal,
|
SectorMinimal,
|
||||||
)
|
)
|
||||||
from services.compatibility_score import calculate_project_investor_compatibility
|
from services.compatibility_score import (
|
||||||
|
_calculate_project_fund_compatibility,
|
||||||
|
_calculate_project_investor_direct_compatibility,
|
||||||
|
)
|
||||||
from sqlalchemy.orm import Session, selectinload
|
from sqlalchemy.orm import Session, selectinload
|
||||||
|
|
||||||
router = APIRouter(tags=["Investor Routes"])
|
router = APIRouter(tags=["Investor Routes"])
|
||||||
@@ -77,31 +80,46 @@ def read_investors(
|
|||||||
if not project:
|
if not project:
|
||||||
raise HTTPException(status_code=404, detail="Project not found")
|
raise HTTPException(status_code=404, detail="Project not found")
|
||||||
|
|
||||||
# Get paginated results
|
# When project_id is provided, we need to get all investors first to sort by compatibility score
|
||||||
investors = (
|
# Otherwise, we can paginate at the database level
|
||||||
db.query(InvestorTable)
|
if project is not None:
|
||||||
.options(
|
# Get all investors (we'll sort by compatibility score, then paginate)
|
||||||
selectinload(InvestorTable.portfolio_companies),
|
all_investors = (
|
||||||
selectinload(InvestorTable.team_members),
|
db.query(InvestorTable)
|
||||||
selectinload(InvestorTable.sectors),
|
.options(
|
||||||
selectinload(InvestorTable.funds).selectinload(FundTable.investment_stages),
|
selectinload(InvestorTable.portfolio_companies),
|
||||||
selectinload(InvestorTable.funds).selectinload(FundTable.sectors),
|
selectinload(InvestorTable.team_members),
|
||||||
|
selectinload(InvestorTable.sectors),
|
||||||
|
selectinload(InvestorTable.funds).selectinload(
|
||||||
|
FundTable.investment_stages
|
||||||
|
),
|
||||||
|
selectinload(InvestorTable.funds).selectinload(FundTable.sectors),
|
||||||
|
)
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
# We'll paginate after sorting by compatibility score
|
||||||
|
investors = all_investors
|
||||||
|
else:
|
||||||
|
# Get paginated results (no sorting needed)
|
||||||
|
investors = (
|
||||||
|
db.query(InvestorTable)
|
||||||
|
.options(
|
||||||
|
selectinload(InvestorTable.portfolio_companies),
|
||||||
|
selectinload(InvestorTable.team_members),
|
||||||
|
selectinload(InvestorTable.sectors),
|
||||||
|
selectinload(InvestorTable.funds).selectinload(
|
||||||
|
FundTable.investment_stages
|
||||||
|
),
|
||||||
|
selectinload(InvestorTable.funds).selectinload(FundTable.sectors),
|
||||||
|
)
|
||||||
|
.offset(offset)
|
||||||
|
.limit(page_size)
|
||||||
|
.all()
|
||||||
)
|
)
|
||||||
.offset(offset)
|
|
||||||
.limit(page_size)
|
|
||||||
.all()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Transform to InvestmentResponse format (one row per investor-fund combination)
|
# Transform to InvestmentResponse format (one row per investor-fund combination)
|
||||||
investment_responses = []
|
investment_responses = []
|
||||||
for investor in investors:
|
for investor in investors:
|
||||||
# Calculate compatibility score if project provided
|
|
||||||
compatibility_score = 1.0
|
|
||||||
if project is not None:
|
|
||||||
compatibility_score = calculate_project_investor_compatibility(
|
|
||||||
project=project, investor=investor, use_funds=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get top 3 portfolio companies (id and name only)
|
# Get top 3 portfolio companies (id and name only)
|
||||||
portfolio_companies = [
|
portfolio_companies = [
|
||||||
CompanyMinimal(id=company.id, name=company.name)
|
CompanyMinimal(id=company.id, name=company.name)
|
||||||
@@ -111,6 +129,13 @@ def read_investors(
|
|||||||
# If investor has funds, create one entry per fund
|
# If investor has funds, create one entry per fund
|
||||||
if investor.funds:
|
if investor.funds:
|
||||||
for fund in investor.funds:
|
for fund in investor.funds:
|
||||||
|
# Calculate compatibility score for this specific fund
|
||||||
|
compatibility_score = 1.0
|
||||||
|
if project is not None:
|
||||||
|
compatibility_score = _calculate_project_fund_compatibility(
|
||||||
|
project=project, fund=fund
|
||||||
|
)
|
||||||
|
|
||||||
# Get stage focus as comma-separated string
|
# Get stage focus as comma-separated string
|
||||||
stage_focus = (
|
stage_focus = (
|
||||||
", ".join([stage.name for stage in fund.investment_stages])
|
", ".join([stage.name for stage in fund.investment_stages])
|
||||||
@@ -118,10 +143,12 @@ def read_investors(
|
|||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get top 3 sectors from fund (id and name only)
|
# Get top 3 sectors from fund (id and name only) - sorted alphabetically
|
||||||
fund_sectors = [
|
fund_sectors = [
|
||||||
SectorMinimal(id=sector.id, name=sector.name)
|
SectorMinimal(id=sector.id, name=sector.name)
|
||||||
for sector in (fund.sectors[:3] if fund.sectors else [])
|
for sector in sorted(
|
||||||
|
fund.sectors[:3] if fund.sectors else [], key=lambda s: s.name
|
||||||
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
investment_response = InvestmentResponse(
|
investment_response = InvestmentResponse(
|
||||||
@@ -141,6 +168,13 @@ def read_investors(
|
|||||||
investment_responses.append(investment_response)
|
investment_responses.append(investment_response)
|
||||||
else:
|
else:
|
||||||
# If no funds, create one entry with null fund fields
|
# If no funds, create one entry with null fund fields
|
||||||
|
# Calculate compatibility using investor-level data
|
||||||
|
compatibility_score = 1.0
|
||||||
|
if project is not None:
|
||||||
|
compatibility_score = _calculate_project_investor_direct_compatibility(
|
||||||
|
project=project, investor=investor
|
||||||
|
)
|
||||||
|
|
||||||
investment_response = InvestmentResponse(
|
investment_response = InvestmentResponse(
|
||||||
id=investor.id,
|
id=investor.id,
|
||||||
name=investor.name,
|
name=investor.name,
|
||||||
@@ -155,6 +189,12 @@ def read_investors(
|
|||||||
)
|
)
|
||||||
investment_responses.append(investment_response)
|
investment_responses.append(investment_response)
|
||||||
|
|
||||||
|
# Sort by compatibility score (descending) when project_id is provided
|
||||||
|
if project is not None:
|
||||||
|
investment_responses.sort(key=lambda x: x.compatibility_score, reverse=True)
|
||||||
|
# Apply pagination after sorting
|
||||||
|
investment_responses = investment_responses[offset : offset + page_size]
|
||||||
|
|
||||||
# Calculate total pages
|
# Calculate total pages
|
||||||
total_pages = (total_count + page_size - 1) // page_size
|
total_pages = (total_count + page_size - 1) // page_size
|
||||||
|
|
||||||
@@ -246,20 +286,27 @@ def filter_investors(
|
|||||||
# Get total count before pagination
|
# Get total count before pagination
|
||||||
total_count = query.count()
|
total_count = query.count()
|
||||||
|
|
||||||
# Calculate offset and apply pagination
|
# When project_id is provided, we need to get all funds first to sort by compatibility score
|
||||||
offset = (page - 1) * page_size
|
# Otherwise, we can paginate at the database level
|
||||||
funds = query.offset(offset).limit(page_size).all()
|
if project is not None:
|
||||||
|
# Get all funds (we'll sort by compatibility score, then paginate)
|
||||||
|
all_funds = query.all()
|
||||||
|
funds = all_funds
|
||||||
|
else:
|
||||||
|
# Calculate offset and apply pagination (no sorting needed)
|
||||||
|
offset = (page - 1) * page_size
|
||||||
|
funds = query.offset(offset).limit(page_size).all()
|
||||||
|
|
||||||
# Transform to InvestmentResponse format (one row per fund)
|
# Transform to InvestmentResponse format (one row per fund)
|
||||||
investment_responses = []
|
investment_responses = []
|
||||||
for fund in funds:
|
for fund in funds:
|
||||||
investor = fund.investor
|
investor = fund.investor
|
||||||
|
|
||||||
# Calculate compatibility score if project provided
|
# Calculate compatibility score for this specific fund
|
||||||
compatibility_score = 1.0
|
compatibility_score = 1.0
|
||||||
if project is not None:
|
if project is not None:
|
||||||
compatibility_score = calculate_project_investor_compatibility(
|
compatibility_score = _calculate_project_fund_compatibility(
|
||||||
project=project, investor=investor, use_funds=True
|
project=project, fund=fund
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get top 3 portfolio companies (id and name only)
|
# Get top 3 portfolio companies (id and name only)
|
||||||
@@ -275,10 +322,12 @@ def filter_investors(
|
|||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get top 3 sectors from fund (id and name only)
|
# Get top 3 sectors from fund (id and name only) - sorted alphabetically
|
||||||
fund_sectors = [
|
fund_sectors = [
|
||||||
SectorMinimal(id=sector.id, name=sector.name)
|
SectorMinimal(id=sector.id, name=sector.name)
|
||||||
for sector in (fund.sectors[:3] if fund.sectors else [])
|
for sector in sorted(
|
||||||
|
fund.sectors[:3] if fund.sectors else [], key=lambda s: s.name
|
||||||
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
investment_response = InvestmentResponse(
|
investment_response = InvestmentResponse(
|
||||||
@@ -297,6 +346,13 @@ def filter_investors(
|
|||||||
)
|
)
|
||||||
investment_responses.append(investment_response)
|
investment_responses.append(investment_response)
|
||||||
|
|
||||||
|
# Sort by compatibility score (descending) when project_id is provided
|
||||||
|
if project is not None:
|
||||||
|
investment_responses.sort(key=lambda x: x.compatibility_score, reverse=True)
|
||||||
|
# Apply pagination after sorting
|
||||||
|
offset = (page - 1) * page_size
|
||||||
|
investment_responses = investment_responses[offset : offset + page_size]
|
||||||
|
|
||||||
# Calculate total pages
|
# Calculate total pages
|
||||||
total_pages = (total_count + page_size - 1) // page_size
|
total_pages = (total_count + page_size - 1) // page_size
|
||||||
|
|
||||||
|
|||||||
+100
-5
@@ -24,19 +24,29 @@ router = APIRouter(tags=["Project Routes"])
|
|||||||
def read_projects(
|
def read_projects(
|
||||||
page: int = Query(1, ge=1, description="Page number (starts at 1)"),
|
page: int = Query(1, ge=1, description="Page number (starts at 1)"),
|
||||||
page_size: int = Query(10, ge=1, le=100, description="Items per page (max 100)"),
|
page_size: int = Query(10, ge=1, le=100, description="Items per page (max 100)"),
|
||||||
|
include_archived: bool = Query(False, description="Include archived projects"),
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
):
|
):
|
||||||
"""Get all projects with their related data (paginated)"""
|
"""Get all projects with their related data (paginated)
|
||||||
|
|
||||||
|
By default, archived projects are excluded. Set include_archived=True to include them.
|
||||||
|
"""
|
||||||
# Calculate offset
|
# Calculate offset
|
||||||
offset = (page - 1) * page_size
|
offset = (page - 1) * page_size
|
||||||
|
|
||||||
|
# Start with base query
|
||||||
|
query = db.query(ProjectTable)
|
||||||
|
|
||||||
|
# Filter out archived projects by default
|
||||||
|
if not include_archived:
|
||||||
|
query = query.filter(ProjectTable.is_archived == 0)
|
||||||
|
|
||||||
# Get total count
|
# Get total count
|
||||||
total_count = db.query(ProjectTable).count()
|
total_count = query.count()
|
||||||
|
|
||||||
# Get paginated results
|
# Get paginated results
|
||||||
projects = (
|
projects = (
|
||||||
db.query(ProjectTable)
|
query.options(
|
||||||
.options(
|
|
||||||
selectinload(ProjectTable.sector),
|
selectinload(ProjectTable.sector),
|
||||||
selectinload(ProjectTable.investors),
|
selectinload(ProjectTable.investors),
|
||||||
selectinload(ProjectTable.companies),
|
selectinload(ProjectTable.companies),
|
||||||
@@ -162,7 +172,7 @@ def update_project(
|
|||||||
|
|
||||||
@router.delete("/projects/{project_id}")
|
@router.delete("/projects/{project_id}")
|
||||||
def delete_project(project_id: int, db: Session = Depends(get_db)):
|
def delete_project(project_id: int, db: Session = Depends(get_db)):
|
||||||
"""Delete a project"""
|
"""Delete a project permanently"""
|
||||||
db_project = db.query(ProjectTable).filter(ProjectTable.id == project_id).first()
|
db_project = db.query(ProjectTable).filter(ProjectTable.id == project_id).first()
|
||||||
|
|
||||||
if not db_project:
|
if not db_project:
|
||||||
@@ -174,6 +184,87 @@ def delete_project(project_id: int, db: Session = Depends(get_db)):
|
|||||||
return {"message": "Project deleted successfully"}
|
return {"message": "Project deleted successfully"}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/projects/{project_id}/archive")
|
||||||
|
def archive_project(project_id: int, db: Session = Depends(get_db)):
|
||||||
|
"""Archive a project (soft delete)"""
|
||||||
|
db_project = db.query(ProjectTable).filter(ProjectTable.id == project_id).first()
|
||||||
|
|
||||||
|
if not db_project:
|
||||||
|
raise HTTPException(status_code=404, detail="Project not found")
|
||||||
|
|
||||||
|
db_project.is_archived = 1
|
||||||
|
db.commit()
|
||||||
|
db.refresh(db_project)
|
||||||
|
|
||||||
|
return {"message": "Project archived successfully", "project_id": project_id}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/projects/{project_id}/unarchive")
|
||||||
|
def unarchive_project(project_id: int, db: Session = Depends(get_db)):
|
||||||
|
"""Unarchive a project (restore from archive)"""
|
||||||
|
db_project = db.query(ProjectTable).filter(ProjectTable.id == project_id).first()
|
||||||
|
|
||||||
|
if not db_project:
|
||||||
|
raise HTTPException(status_code=404, detail="Project not found")
|
||||||
|
|
||||||
|
db_project.is_archived = 0
|
||||||
|
db.commit()
|
||||||
|
db.refresh(db_project)
|
||||||
|
|
||||||
|
return {"message": "Project unarchived successfully", "project_id": project_id}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/projects/archived", response_model=PaginatedResponse[ProjectData])
|
||||||
|
def read_archived_projects(
|
||||||
|
page: int = Query(1, ge=1, description="Page number (starts at 1)"),
|
||||||
|
page_size: int = Query(10, ge=1, le=100, description="Items per page (max 100)"),
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
):
|
||||||
|
"""Get all archived projects (paginated)"""
|
||||||
|
# Calculate offset
|
||||||
|
offset = (page - 1) * page_size
|
||||||
|
|
||||||
|
# Query only archived projects
|
||||||
|
query = db.query(ProjectTable).filter(ProjectTable.is_archived == 1)
|
||||||
|
|
||||||
|
# Get total count
|
||||||
|
total_count = query.count()
|
||||||
|
|
||||||
|
# Get paginated results
|
||||||
|
projects = (
|
||||||
|
query.options(
|
||||||
|
selectinload(ProjectTable.sector),
|
||||||
|
selectinload(ProjectTable.investors),
|
||||||
|
selectinload(ProjectTable.companies),
|
||||||
|
)
|
||||||
|
.offset(offset)
|
||||||
|
.limit(page_size)
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Transform ProjectTable objects to ProjectData format
|
||||||
|
project_data_list = []
|
||||||
|
for project in projects:
|
||||||
|
project_data = ProjectData(
|
||||||
|
project=project,
|
||||||
|
sector=project.sector,
|
||||||
|
investors=project.investors,
|
||||||
|
companies=project.companies,
|
||||||
|
)
|
||||||
|
project_data_list.append(project_data)
|
||||||
|
|
||||||
|
# Calculate total pages
|
||||||
|
total_pages = (total_count + page_size - 1) // page_size
|
||||||
|
|
||||||
|
return PaginatedResponse(
|
||||||
|
items=project_data_list,
|
||||||
|
total=total_count,
|
||||||
|
page=page,
|
||||||
|
page_size=page_size,
|
||||||
|
total_pages=total_pages,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@router.get("/projects/filter", response_model=PaginatedResponse[ProjectData])
|
@router.get("/projects/filter", response_model=PaginatedResponse[ProjectData])
|
||||||
def filter_projects(
|
def filter_projects(
|
||||||
stage: Optional[InvestmentStage] = Query(
|
stage: Optional[InvestmentStage] = Query(
|
||||||
@@ -182,6 +273,7 @@ def filter_projects(
|
|||||||
min_valuation: Optional[int] = Query(None, description="Minimum valuation"),
|
min_valuation: Optional[int] = Query(None, description="Minimum valuation"),
|
||||||
max_valuation: Optional[int] = Query(None, description="Maximum valuation"),
|
max_valuation: Optional[int] = Query(None, description="Maximum valuation"),
|
||||||
location: Optional[str] = Query(None, description="Location (partial match)"),
|
location: Optional[str] = Query(None, description="Location (partial match)"),
|
||||||
|
industry: Optional[str] = Query(None, description="Industry (partial match)"),
|
||||||
sector: Optional[str] = Query(None, description="Sector name (partial match)"),
|
sector: Optional[str] = Query(None, description="Sector name (partial match)"),
|
||||||
investor_name: Optional[str] = Query(
|
investor_name: Optional[str] = Query(
|
||||||
None, description="Investor name (partial match)"
|
None, description="Investor name (partial match)"
|
||||||
@@ -215,6 +307,9 @@ def filter_projects(
|
|||||||
if location:
|
if location:
|
||||||
query = query.filter(ProjectTable.location.ilike(f"%{location}%"))
|
query = query.filter(ProjectTable.location.ilike(f"%{location}%"))
|
||||||
|
|
||||||
|
if industry:
|
||||||
|
query = query.filter(ProjectTable.industry.ilike(f"%{industry}%"))
|
||||||
|
|
||||||
if sector:
|
if sector:
|
||||||
query = query.join(ProjectTable.sector).filter(
|
query = query.join(ProjectTable.sector).filter(
|
||||||
SectorTable.name.ilike(f"%{sector}%")
|
SectorTable.name.ilike(f"%{sector}%")
|
||||||
|
|||||||
+13
-16
@@ -52,7 +52,6 @@ async def generate_investor_report(
|
|||||||
"website": investor.website,
|
"website": investor.website,
|
||||||
"headquarters": investor.headquarters,
|
"headquarters": investor.headquarters,
|
||||||
"aum": investor.aum,
|
"aum": investor.aum,
|
||||||
"geographic_focus": investor.geographic_focus,
|
|
||||||
"portfolio_highlights": investor.portfolio_highlights or [],
|
"portfolio_highlights": investor.portfolio_highlights or [],
|
||||||
"investment_thesis": investor.investment_thesis or [],
|
"investment_thesis": investor.investment_thesis or [],
|
||||||
"sectors": [sector.name for sector in investor.sectors],
|
"sectors": [sector.name for sector in investor.sectors],
|
||||||
@@ -65,24 +64,22 @@ async def generate_investor_report(
|
|||||||
}
|
}
|
||||||
for member in investor.team_members
|
for member in investor.team_members
|
||||||
],
|
],
|
||||||
"check_size_lower": None,
|
"funds": [],
|
||||||
"check_size_upper": None,
|
|
||||||
"investment_stages": [],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Get check sizes and stages from funds
|
# Get all funds with their data
|
||||||
if investor.funds:
|
if investor.funds:
|
||||||
# Use the first fund's data or aggregate
|
|
||||||
fund = investor.funds[0]
|
|
||||||
investor_data["check_size_lower"] = fund.check_size_lower
|
|
||||||
investor_data["check_size_upper"] = fund.check_size_upper
|
|
||||||
|
|
||||||
# Aggregate all investment stages from all funds
|
|
||||||
stages = set()
|
|
||||||
for fund in investor.funds:
|
for fund in investor.funds:
|
||||||
for stage in fund.investment_stages:
|
fund_data = {
|
||||||
stages.add(stage.name)
|
"fund_name": fund.fund_name,
|
||||||
investor_data["investment_stages"] = list(stages)
|
"fund_size": fund.fund_size,
|
||||||
|
"check_size_lower": fund.check_size_lower,
|
||||||
|
"check_size_upper": fund.check_size_upper,
|
||||||
|
"geographic_focus": fund.geographic_focus,
|
||||||
|
"investment_stages": [stage.name for stage in fund.investment_stages],
|
||||||
|
"sectors": [sector.name for sector in fund.sectors],
|
||||||
|
}
|
||||||
|
investor_data["funds"].append(fund_data)
|
||||||
|
|
||||||
# Fetch project data if project_id is provided
|
# Fetch project data if project_id is provided
|
||||||
project_data = None
|
project_data = None
|
||||||
@@ -109,7 +106,7 @@ async def generate_investor_report(
|
|||||||
# Generate PDF report
|
# Generate PDF report
|
||||||
report_generator = ReportGenerator()
|
report_generator = ReportGenerator()
|
||||||
pdf_bytes = await report_generator.generate_investor_report(
|
pdf_bytes = await report_generator.generate_investor_report(
|
||||||
investor_data, project_data
|
investor_data, project_data, investor_model=investor, project_model=project
|
||||||
)
|
)
|
||||||
|
|
||||||
# Return PDF as downloadable file
|
# Return PDF as downloadable file
|
||||||
|
|||||||
Binary file not shown.
@@ -60,6 +60,7 @@ class ProjectSchema(BaseModel):
|
|||||||
valuation: int | None
|
valuation: int | None
|
||||||
stage: InvestmentStage | None
|
stage: InvestmentStage | None
|
||||||
location: str | None
|
location: str | None
|
||||||
|
industry: str | None
|
||||||
description: Optional[str]
|
description: Optional[str]
|
||||||
start_date: Optional[datetime]
|
start_date: Optional[datetime]
|
||||||
end_date: Optional[datetime]
|
end_date: Optional[datetime]
|
||||||
@@ -75,6 +76,7 @@ class ProjectCreate(BaseModel):
|
|||||||
valuation: Optional[int] = None
|
valuation: Optional[int] = None
|
||||||
stage: Optional[InvestmentStage] = None
|
stage: Optional[InvestmentStage] = None
|
||||||
location: Optional[str] = None
|
location: Optional[str] = None
|
||||||
|
industry: Optional[str] = None
|
||||||
description: Optional[str] = None
|
description: Optional[str] = None
|
||||||
start_date: Optional[datetime] = None
|
start_date: Optional[datetime] = None
|
||||||
end_date: Optional[datetime] = None
|
end_date: Optional[datetime] = None
|
||||||
@@ -85,6 +87,7 @@ class ProjectUpdate(BaseModel):
|
|||||||
valuation: Optional[int] = None
|
valuation: Optional[int] = None
|
||||||
stage: Optional[InvestmentStage] = None
|
stage: Optional[InvestmentStage] = None
|
||||||
location: Optional[str] = None
|
location: Optional[str] = None
|
||||||
|
industry: Optional[str] = None
|
||||||
description: Optional[str] = None
|
description: Optional[str] = None
|
||||||
start_date: Optional[datetime] = None
|
start_date: Optional[datetime] = None
|
||||||
end_date: Optional[datetime] = None
|
end_date: Optional[datetime] = None
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ class InvestorMemberSchema(BaseModel):
|
|||||||
name: str
|
name: str
|
||||||
role: str | None
|
role: str | None
|
||||||
email: str | None
|
email: str | None
|
||||||
|
linkedin: str | None
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
from_attributes = True
|
from_attributes = True
|
||||||
@@ -168,6 +169,7 @@ class InvestorFundData(BaseModel):
|
|||||||
class Config:
|
class Config:
|
||||||
from_attributes = True
|
from_attributes = True
|
||||||
|
|
||||||
|
|
||||||
class InvestorMinimal(BaseModel):
|
class InvestorMinimal(BaseModel):
|
||||||
"""Minimal investor info with just id and name"""
|
"""Minimal investor info with just id and name"""
|
||||||
|
|
||||||
@@ -177,6 +179,7 @@ class InvestorMinimal(BaseModel):
|
|||||||
class Config:
|
class Config:
|
||||||
from_attributes = True
|
from_attributes = True
|
||||||
|
|
||||||
|
|
||||||
class CompanySchemaMinimal(BaseModel):
|
class CompanySchemaMinimal(BaseModel):
|
||||||
id: int
|
id: int
|
||||||
name: str
|
name: str
|
||||||
@@ -188,9 +191,12 @@ class CompanySchemaMinimal(BaseModel):
|
|||||||
class Config:
|
class Config:
|
||||||
from_attributes = True
|
from_attributes = True
|
||||||
|
|
||||||
|
|
||||||
class CompanyData(BaseModel): # Renamed from CompaniesData for consistency
|
class CompanyData(BaseModel): # Renamed from CompaniesData for consistency
|
||||||
company: CompanySchemaMinimal
|
company: CompanySchemaMinimal
|
||||||
investors: List[InvestorMinimal]
|
investors: List[InvestorMinimal]
|
||||||
|
members: List[CompanyMemberSchema] = []
|
||||||
|
sectors: List[SectorSchema] = []
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
from_attributes = True
|
from_attributes = True
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,228 @@
|
|||||||
|
import asyncio
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from db.db import get_db
|
||||||
|
from db.models import CompanyTable
|
||||||
|
from langchain_core.prompts import ChatPromptTemplate
|
||||||
|
from langchain_openai import ChatOpenAI
|
||||||
|
from schemas.router_schemas import CompanyData, PaginatedResponse
|
||||||
|
from sqlalchemy import text
|
||||||
|
from sqlalchemy.orm import selectinload
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CompanyQueryProcessor:
|
||||||
|
def __init__(self):
|
||||||
|
self.llm = ChatOpenAI(
|
||||||
|
api_key=os.getenv("OPENROUTER_API_KEY"),
|
||||||
|
base_url="https://openrouter.ai/api/v1",
|
||||||
|
model="openai/gpt-4o-mini",
|
||||||
|
temperature=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Query cache for performance
|
||||||
|
self.query_cache = {}
|
||||||
|
|
||||||
|
# SQL generation prompt
|
||||||
|
self.sql_prompt = ChatPromptTemplate.from_messages(
|
||||||
|
[
|
||||||
|
(
|
||||||
|
"system",
|
||||||
|
"""You are a SQL expert. Generate a SQLite query to find company IDs based on user requirements.
|
||||||
|
|
||||||
|
Database Schema:
|
||||||
|
- companies: id, name, industry, location, description, founded_year, website
|
||||||
|
- company_sector: company_id, sector_id
|
||||||
|
- sectors: id, name
|
||||||
|
- investor_companies: investor_id, company_id
|
||||||
|
- investors: id, name, aum
|
||||||
|
- team_members: id, company_id, name, title
|
||||||
|
|
||||||
|
IMPORTANT RULES:
|
||||||
|
1. ALWAYS return ONLY company IDs (companies.id) - use SELECT DISTINCT c.id
|
||||||
|
2. For industry: Check BOTH industry field AND sectors table with synonyms
|
||||||
|
- Use LEFT JOIN for sectors so companies without sector tags still match
|
||||||
|
- Include related terms: 'Fintech' → c.industry LIKE '%Fintech%' OR c.industry LIKE '%Finance%' OR sec.name LIKE '%Fintech%' OR sec.name LIKE '%Financial%'
|
||||||
|
- 'AI' → c.industry LIKE '%AI%' OR c.industry LIKE '%Artificial Intelligence%' OR c.industry LIKE '%Machine Learning%' OR sec.name LIKE '%AI%' OR sec.name LIKE '%ML%'
|
||||||
|
3. For location: Be FLEXIBLE with variations and abbreviations
|
||||||
|
- 'San Francisco' → c.location LIKE '%San Francisco%' OR c.location LIKE '%SF%' OR c.location LIKE '%Bay Area%'
|
||||||
|
- 'New York' → c.location LIKE '%New York%' OR c.location LIKE '%NYC%' OR c.location LIKE '%NY%'
|
||||||
|
- 'Europe' → c.location LIKE '%Europe%' OR c.location LIKE '%UK%' OR c.location LIKE '%London%' OR c.location LIKE '%Berlin%' OR c.location LIKE '%Paris%'
|
||||||
|
4. For sectors: Use LEFT JOIN and include multiple synonyms
|
||||||
|
- 'Healthcare' → sec.name LIKE '%Healthcare%' OR sec.name LIKE '%Health%' OR sec.name LIKE '%Medical%' OR sec.name LIKE '%Biotech%' OR c.industry LIKE '%Health%'
|
||||||
|
5. For founding year filters (include NULL to be inclusive):
|
||||||
|
- "founded after 2020" → WHERE (founded_year >= 2020 OR founded_year IS NULL)
|
||||||
|
- "founded before 2018" → WHERE (founded_year <= 2018 OR founded_year IS NULL)
|
||||||
|
- "founded in 2020" → WHERE founded_year = 2020
|
||||||
|
6. For investor-related queries: Use JOIN investor_companies
|
||||||
|
7. Use LEFT JOIN for sectors so companies without tags still match
|
||||||
|
8. Use DISTINCT to avoid duplicates from joins
|
||||||
|
9. Be INCLUSIVE - use OR conditions with synonyms and variations
|
||||||
|
10. Return a single, complete SELECT query
|
||||||
|
|
||||||
|
Example Queries:
|
||||||
|
Q: "Fintech companies founded in 2020"
|
||||||
|
A: SELECT DISTINCT c.id FROM companies c
|
||||||
|
LEFT JOIN company_sector cs ON c.id = cs.company_id
|
||||||
|
LEFT JOIN sectors sec ON cs.sector_id = sec.id
|
||||||
|
WHERE (c.industry LIKE '%Fintech%' OR c.industry LIKE '%Finance%' OR c.industry LIKE '%Financial%' OR sec.name LIKE '%Fintech%' OR sec.name LIKE '%Financial Services%')
|
||||||
|
AND c.founded_year = 2020
|
||||||
|
|
||||||
|
Q: "AI companies in San Francisco"
|
||||||
|
A: SELECT DISTINCT c.id FROM companies c
|
||||||
|
LEFT JOIN company_sector cs ON c.id = cs.company_id
|
||||||
|
LEFT JOIN sectors sec ON cs.sector_id = sec.id
|
||||||
|
WHERE (c.industry LIKE '%AI%' OR c.industry LIKE '%Artificial Intelligence%' OR c.industry LIKE '%Machine Learning%' OR sec.name LIKE '%AI%' OR sec.name LIKE '%Machine Learning%' OR sec.name LIKE '%ML%')
|
||||||
|
AND (c.location LIKE '%San Francisco%' OR c.location LIKE '%SF%' OR c.location LIKE '%Bay Area%')
|
||||||
|
|
||||||
|
Q: "Healthcare companies"
|
||||||
|
A: SELECT DISTINCT c.id FROM companies c
|
||||||
|
LEFT JOIN company_sector cs ON c.id = cs.company_id
|
||||||
|
LEFT JOIN sectors sec ON cs.sector_id = sec.id
|
||||||
|
WHERE c.industry LIKE '%Healthcare%' OR c.industry LIKE '%Health%' OR c.industry LIKE '%Medical%' OR sec.name LIKE '%Healthcare%' OR sec.name LIKE '%Medical%' OR sec.name LIKE '%Biotech%' OR sec.name LIKE '%Pharma%'
|
||||||
|
|
||||||
|
Q: "Companies funded by Sequoia"
|
||||||
|
A: SELECT DISTINCT c.id FROM companies c
|
||||||
|
JOIN investor_companies ic ON c.id = ic.company_id
|
||||||
|
JOIN investors i ON ic.investor_id = i.id
|
||||||
|
WHERE i.name LIKE '%Sequoia%'
|
||||||
|
|
||||||
|
Q: "European startups founded after 2019"
|
||||||
|
A: SELECT DISTINCT c.id FROM companies c
|
||||||
|
WHERE (c.location LIKE '%Europe%' OR c.location LIKE '%UK%' OR c.location LIKE '%London%' OR c.location LIKE '%Germany%' OR c.location LIKE '%Berlin%' OR c.location LIKE '%France%' OR c.location LIKE '%Paris%')
|
||||||
|
AND (c.founded_year > 2019 OR c.founded_year IS NULL)
|
||||||
|
|
||||||
|
Q: "SaaS companies"
|
||||||
|
A: SELECT DISTINCT c.id FROM companies c
|
||||||
|
LEFT JOIN company_sector cs ON c.id = cs.company_id
|
||||||
|
LEFT JOIN sectors sec ON cs.sector_id = sec.id
|
||||||
|
WHERE c.industry LIKE '%SaaS%' OR c.industry LIKE '%Software%' OR c.industry LIKE '%Cloud%' OR sec.name LIKE '%SaaS%' OR sec.name LIKE '%Software%'
|
||||||
|
|
||||||
|
IMPORTANT:
|
||||||
|
- Use LEFT JOIN so companies without sector tags still match via industry field
|
||||||
|
- Use OR conditions with related keywords/synonyms to cast a wider net
|
||||||
|
- Include NULL checks for optional filters to avoid excluding companies with missing data
|
||||||
|
|
||||||
|
Return ONLY the SQL query, no explanations or markdown.""",
|
||||||
|
),
|
||||||
|
("user", "{question}"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
def _get_cache_key(self, question: str) -> str:
|
||||||
|
"""Generate cache key from normalized question."""
|
||||||
|
return hashlib.md5(question.lower().strip().encode()).hexdigest()
|
||||||
|
|
||||||
|
# synchronous helper is provided below as `_process_query_sync` and an
|
||||||
|
# async wrapper `process_query` runs it in a thread. This keeps the
|
||||||
|
# FastAPI event loop non-blocking while reusing the existing sync code.
|
||||||
|
async def process_query(self, question: str) -> PaginatedResponse[CompanyData]:
|
||||||
|
"""Async wrapper for process_query. Runs blocking work in a thread to avoid
|
||||||
|
blocking the event loop.
|
||||||
|
"""
|
||||||
|
return await asyncio.to_thread(self._process_query_sync, question)
|
||||||
|
|
||||||
|
def _process_query_sync(self, question: str) -> PaginatedResponse[CompanyData]:
|
||||||
|
"""Synchronous implementation of process_query. This is run in a thread by
|
||||||
|
the async wrapper above.
|
||||||
|
"""
|
||||||
|
cache_key = self._get_cache_key(question)
|
||||||
|
|
||||||
|
# Check cache first
|
||||||
|
if cache_key in self.query_cache:
|
||||||
|
sql_query = self.query_cache[cache_key]
|
||||||
|
logger.info(f"Using cached SQL: {sql_query}")
|
||||||
|
else:
|
||||||
|
# Generate SQL query
|
||||||
|
messages = self.sql_prompt.format_messages(question=question)
|
||||||
|
response = self.llm.invoke(messages)
|
||||||
|
sql_query = response.content.strip()
|
||||||
|
|
||||||
|
# Clean up SQL (remove markdown code blocks if present)
|
||||||
|
sql_query = sql_query.replace("```sql", "").replace("```", "").strip()
|
||||||
|
|
||||||
|
# Cache the query
|
||||||
|
self.query_cache[cache_key] = sql_query
|
||||||
|
logger.info(f"Generated SQL: {sql_query}")
|
||||||
|
|
||||||
|
# Execute query to get company IDs
|
||||||
|
db_session = next(get_db())
|
||||||
|
try:
|
||||||
|
result = db_session.execute(text(sql_query))
|
||||||
|
company_ids = [row[0] for row in result.fetchall()]
|
||||||
|
logger.info(
|
||||||
|
f"Found {len(company_ids)} company IDs: {company_ids[:10]}{'...' if len(company_ids) > 10 else ''}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return self._fetch_companies_by_ids(company_ids)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"SQL execution error: {e}")
|
||||||
|
logger.error(f"Failed SQL: {sql_query}")
|
||||||
|
# Return empty result
|
||||||
|
return PaginatedResponse(
|
||||||
|
items=[], total=0, page=1, page_size=10, total_pages=0
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
db_session.close()
|
||||||
|
|
||||||
|
def _fetch_companies_by_ids(
|
||||||
|
self, company_ids: List[int]
|
||||||
|
) -> PaginatedResponse[CompanyData]:
|
||||||
|
"""Fetch companies with all their relationships from the database using company IDs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
company_ids: List of company IDs to fetch
|
||||||
|
"""
|
||||||
|
if not company_ids:
|
||||||
|
return PaginatedResponse(
|
||||||
|
items=[],
|
||||||
|
total=0,
|
||||||
|
page=1,
|
||||||
|
page_size=10,
|
||||||
|
total_pages=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get database session
|
||||||
|
db_session = next(get_db())
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Query companies with all necessary relationships loaded
|
||||||
|
companies = (
|
||||||
|
db_session.query(CompanyTable)
|
||||||
|
.options(
|
||||||
|
selectinload(CompanyTable.investors),
|
||||||
|
selectinload(CompanyTable.members),
|
||||||
|
selectinload(CompanyTable.sectors),
|
||||||
|
)
|
||||||
|
.filter(CompanyTable.id.in_(company_ids))
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Transform to CompanyData format
|
||||||
|
company_data_list = []
|
||||||
|
for company in companies:
|
||||||
|
company_data = CompanyData(
|
||||||
|
company=company,
|
||||||
|
investors=company.investors,
|
||||||
|
members=company.members,
|
||||||
|
sectors=company.sectors,
|
||||||
|
)
|
||||||
|
company_data_list.append(company_data)
|
||||||
|
|
||||||
|
total_count = len(company_data_list)
|
||||||
|
total_pages = 1 if total_count > 0 else 0
|
||||||
|
|
||||||
|
return PaginatedResponse(
|
||||||
|
items=company_data_list,
|
||||||
|
total=total_count,
|
||||||
|
page=1,
|
||||||
|
page_size=total_count,
|
||||||
|
total_pages=total_pages,
|
||||||
|
)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
db_session.close()
|
||||||
@@ -6,6 +6,7 @@ The scoring system evaluates multiple dimensions to determine how well a project
|
|||||||
matches with an investor's investment criteria.
|
matches with an investor's investment criteria.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from difflib import SequenceMatcher
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from db.models import FundTable, InvestorTable, ProjectTable
|
from db.models import FundTable, InvestorTable, ProjectTable
|
||||||
@@ -99,12 +100,16 @@ def _calculate_project_fund_compatibility(
|
|||||||
else str(project.stage)
|
else str(project.stage)
|
||||||
)
|
)
|
||||||
|
|
||||||
if project_stage_name in fund_stage_names:
|
# Normalize both for case-insensitive comparison
|
||||||
|
project_stage_normalized = project_stage_name.upper().strip()
|
||||||
|
fund_stages_normalized = {name.upper().strip() for name in fund_stage_names}
|
||||||
|
|
||||||
|
if project_stage_normalized in fund_stages_normalized:
|
||||||
stage_score = 30
|
stage_score = 30
|
||||||
else:
|
else:
|
||||||
# Partial credit for adjacent stages
|
# Partial credit for adjacent stages
|
||||||
stage_score = _calculate_stage_proximity(
|
stage_score = _calculate_stage_proximity(
|
||||||
project_stage_name, fund_stage_names
|
project_stage_normalized, fund_stages_normalized
|
||||||
)
|
)
|
||||||
|
|
||||||
total_score += stage_score
|
total_score += stage_score
|
||||||
@@ -112,22 +117,53 @@ def _calculate_project_fund_compatibility(
|
|||||||
# 2. Sector Overlap (30 points)
|
# 2. Sector Overlap (30 points)
|
||||||
sector_score = 0
|
sector_score = 0
|
||||||
if project.sector and fund.sectors:
|
if project.sector and fund.sectors:
|
||||||
project_sector_ids = {sector.id for sector in project.sector}
|
project_sectors = [s for s in project.sector if hasattr(s, "name")]
|
||||||
fund_sector_ids = {sector.id for sector in fund.sectors}
|
fund_sectors = [s for s in fund.sectors if hasattr(s, "name")]
|
||||||
|
|
||||||
if project_sector_ids and fund_sector_ids:
|
if project_sectors and fund_sectors:
|
||||||
common_sectors = project_sector_ids.intersection(fund_sector_ids)
|
# Use fuzzy matching to account for similar but not identical sector names
|
||||||
# Score based on what percentage of project sectors are covered by fund
|
match_count = 0
|
||||||
overlap_ratio = len(common_sectors) / len(project_sector_ids)
|
total_matches = 0
|
||||||
sector_score = int(30 * overlap_ratio)
|
|
||||||
|
for proj_sector in project_sectors:
|
||||||
|
best_match_score = 0
|
||||||
|
proj_name = proj_sector.name.lower().strip()
|
||||||
|
|
||||||
|
for fund_sector in fund_sectors:
|
||||||
|
fund_name = fund_sector.name.lower().strip()
|
||||||
|
|
||||||
|
# Exact match
|
||||||
|
if proj_name == fund_name:
|
||||||
|
best_match_score = 1.0
|
||||||
|
break
|
||||||
|
|
||||||
|
# Fuzzy match using sequence matcher
|
||||||
|
similarity = SequenceMatcher(None, proj_name, fund_name).ratio()
|
||||||
|
|
||||||
|
# Also check if one contains the other (substring match)
|
||||||
|
if proj_name in fund_name or fund_name in proj_name:
|
||||||
|
similarity = max(similarity, 0.8)
|
||||||
|
|
||||||
|
best_match_score = max(best_match_score, similarity)
|
||||||
|
|
||||||
|
# Count matches with threshold
|
||||||
|
# Perfect match (1.0), strong match (>0.75), partial match (>0.6)
|
||||||
|
if best_match_score >= 0.6:
|
||||||
|
total_matches += best_match_score
|
||||||
|
match_count += 1
|
||||||
|
|
||||||
|
if match_count > 0:
|
||||||
|
# Calculate overlap ratio based on fuzzy matches
|
||||||
|
overlap_ratio = total_matches / len(project_sectors)
|
||||||
|
sector_score = int(30 * overlap_ratio)
|
||||||
|
|
||||||
total_score += sector_score
|
total_score += sector_score
|
||||||
|
|
||||||
# 3. Geographic Match (20 points)
|
# 3. Geographic Match (20 points)
|
||||||
geo_score = 0
|
geo_score = 0
|
||||||
if project.location and fund.geographic_focus:
|
if project.location and fund.geographic_focus:
|
||||||
project_location_lower = project.location.lower()
|
project_location_lower = project.location.lower().strip()
|
||||||
fund_geo_lower = (fund.geographic_focus or "").lower()
|
fund_geo_lower = (fund.geographic_focus or "").lower().strip()
|
||||||
|
|
||||||
# Exact match
|
# Exact match
|
||||||
if project_location_lower == fund_geo_lower:
|
if project_location_lower == fund_geo_lower:
|
||||||
@@ -137,10 +173,11 @@ def _calculate_project_fund_compatibility(
|
|||||||
project_location_lower in fund_geo_lower
|
project_location_lower in fund_geo_lower
|
||||||
or fund_geo_lower in project_location_lower
|
or fund_geo_lower in project_location_lower
|
||||||
):
|
):
|
||||||
geo_score = 10
|
geo_score = 15
|
||||||
# Check for common geographic terms
|
# Check for common geographic terms or regional overlap (continent/country matching)
|
||||||
elif _check_geographic_overlap(project_location_lower, fund_geo_lower):
|
elif _check_geographic_overlap(project_location_lower, fund_geo_lower):
|
||||||
geo_score = 5
|
# Give higher score for continent/country matches (e.g., Germany -> Europe)
|
||||||
|
geo_score = 18
|
||||||
|
|
||||||
total_score += geo_score
|
total_score += geo_score
|
||||||
|
|
||||||
@@ -209,13 +246,44 @@ def _calculate_project_investor_direct_compatibility(
|
|||||||
# 2. Sector Overlap (30 points)
|
# 2. Sector Overlap (30 points)
|
||||||
sector_score = 0
|
sector_score = 0
|
||||||
if project.sector and investor.sectors:
|
if project.sector and investor.sectors:
|
||||||
project_sector_ids = {sector.id for sector in project.sector}
|
project_sectors = [s for s in project.sector if hasattr(s, "name")]
|
||||||
investor_sector_ids = {sector.id for sector in investor.sectors}
|
investor_sectors = [s for s in investor.sectors if hasattr(s, "name")]
|
||||||
|
|
||||||
if project_sector_ids and investor_sector_ids:
|
if project_sectors and investor_sectors:
|
||||||
common_sectors = project_sector_ids.intersection(investor_sector_ids)
|
# Use fuzzy matching to account for similar but not identical sector names
|
||||||
overlap_ratio = len(common_sectors) / len(project_sector_ids)
|
match_count = 0
|
||||||
sector_score = int(30 * overlap_ratio)
|
total_matches = 0
|
||||||
|
|
||||||
|
for proj_sector in project_sectors:
|
||||||
|
best_match_score = 0
|
||||||
|
proj_name = proj_sector.name.lower().strip()
|
||||||
|
|
||||||
|
for inv_sector in investor_sectors:
|
||||||
|
inv_name = inv_sector.name.lower().strip()
|
||||||
|
|
||||||
|
# Exact match
|
||||||
|
if proj_name == inv_name:
|
||||||
|
best_match_score = 1.0
|
||||||
|
break
|
||||||
|
|
||||||
|
# Fuzzy match using sequence matcher
|
||||||
|
similarity = SequenceMatcher(None, proj_name, inv_name).ratio()
|
||||||
|
|
||||||
|
# Also check if one contains the other (substring match)
|
||||||
|
if proj_name in inv_name or inv_name in proj_name:
|
||||||
|
similarity = max(similarity, 0.8)
|
||||||
|
|
||||||
|
best_match_score = max(best_match_score, similarity)
|
||||||
|
|
||||||
|
# Count matches with threshold
|
||||||
|
if best_match_score >= 0.6:
|
||||||
|
total_matches += best_match_score
|
||||||
|
match_count += 1
|
||||||
|
|
||||||
|
if match_count > 0:
|
||||||
|
# Calculate overlap ratio based on fuzzy matches
|
||||||
|
overlap_ratio = total_matches / len(project_sectors)
|
||||||
|
sector_score = int(30 * overlap_ratio)
|
||||||
|
|
||||||
total_score += sector_score
|
total_score += sector_score
|
||||||
|
|
||||||
@@ -231,9 +299,10 @@ def _calculate_project_investor_direct_compatibility(
|
|||||||
project_location_lower in investor_geo_lower
|
project_location_lower in investor_geo_lower
|
||||||
or investor_geo_lower in project_location_lower
|
or investor_geo_lower in project_location_lower
|
||||||
):
|
):
|
||||||
geo_score = 10
|
geo_score = 15
|
||||||
elif _check_geographic_overlap(project_location_lower, investor_geo_lower):
|
elif _check_geographic_overlap(project_location_lower, investor_geo_lower):
|
||||||
geo_score = 5
|
# Give higher score for continent/country matches (e.g., Germany -> Europe)
|
||||||
|
geo_score = 18
|
||||||
|
|
||||||
total_score += geo_score
|
total_score += geo_score
|
||||||
|
|
||||||
@@ -278,8 +347,11 @@ def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int:
|
|||||||
"""
|
"""
|
||||||
stage_order = ["SEED", "SERIES_A", "SERIES_B", "SERIES_C", "GROWTH", "LATE_STAGE"]
|
stage_order = ["SEED", "SERIES_A", "SERIES_B", "SERIES_C", "GROWTH", "LATE_STAGE"]
|
||||||
|
|
||||||
|
# Normalize project stage for comparison
|
||||||
|
project_stage_normalized = project_stage.upper().strip()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
project_idx = stage_order.index(project_stage)
|
project_idx = stage_order.index(project_stage_normalized)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
@@ -290,8 +362,10 @@ def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int:
|
|||||||
if project_idx < len(stage_order) - 1:
|
if project_idx < len(stage_order) - 1:
|
||||||
adjacent_stages.append(stage_order[project_idx + 1])
|
adjacent_stages.append(stage_order[project_idx + 1])
|
||||||
|
|
||||||
|
# Normalize fund stages and check for matches
|
||||||
for stage in fund_stages:
|
for stage in fund_stages:
|
||||||
if stage in adjacent_stages:
|
stage_normalized = stage.upper().strip()
|
||||||
|
if stage_normalized in adjacent_stages:
|
||||||
return 15 # Half credit for adjacent stage
|
return 15 # Half credit for adjacent stage
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
@@ -305,25 +379,90 @@ def _check_geographic_overlap(location1: str, location2: str) -> bool:
|
|||||||
- "San Francisco, CA" and "California" -> True
|
- "San Francisco, CA" and "California" -> True
|
||||||
- "New York" and "USA" -> True (if both contain USA/US)
|
- "New York" and "USA" -> True (if both contain USA/US)
|
||||||
- "London, UK" and "United Kingdom" -> True
|
- "London, UK" and "United Kingdom" -> True
|
||||||
|
- "Germany" and "Europe" -> True
|
||||||
"""
|
"""
|
||||||
# Common geographic groupings
|
# Normalize inputs
|
||||||
|
loc1 = location1.lower().strip()
|
||||||
|
loc2 = location2.lower().strip()
|
||||||
|
|
||||||
|
# Common geographic groupings with broader regional mappings
|
||||||
geo_groups = [
|
geo_groups = [
|
||||||
["usa", "us", "united states", "america"],
|
# North America
|
||||||
["uk", "united kingdom", "britain"],
|
["usa", "us", "united states", "america", "u.s.", "u.s.a"],
|
||||||
["california", "ca"],
|
["canada", "canadian"],
|
||||||
["new york", "ny"],
|
["mexico", "mexican"],
|
||||||
|
# Europe and countries
|
||||||
|
[
|
||||||
|
"europe",
|
||||||
|
"european",
|
||||||
|
"eu",
|
||||||
|
"germany",
|
||||||
|
"france",
|
||||||
|
"uk",
|
||||||
|
"united kingdom",
|
||||||
|
"britain",
|
||||||
|
"spain",
|
||||||
|
"italy",
|
||||||
|
"netherlands",
|
||||||
|
"belgium",
|
||||||
|
"sweden",
|
||||||
|
"denmark",
|
||||||
|
"norway",
|
||||||
|
"finland",
|
||||||
|
"poland",
|
||||||
|
"portugal",
|
||||||
|
"austria",
|
||||||
|
"switzerland",
|
||||||
|
"ireland",
|
||||||
|
"greece",
|
||||||
|
"czech",
|
||||||
|
"romania",
|
||||||
|
],
|
||||||
|
# UK specific
|
||||||
|
["uk", "united kingdom", "britain", "england", "scotland", "wales", "london"],
|
||||||
|
# US states
|
||||||
|
["california", "ca", "san francisco", "los angeles", "silicon valley"],
|
||||||
|
["new york", "ny", "nyc"],
|
||||||
["texas", "tx"],
|
["texas", "tx"],
|
||||||
["europe", "eu"],
|
["massachusetts", "ma", "boston"],
|
||||||
["asia", "asian"],
|
["washington", "seattle"],
|
||||||
["africa", "african"],
|
# Asia
|
||||||
|
[
|
||||||
|
"asia",
|
||||||
|
"asian",
|
||||||
|
"china",
|
||||||
|
"japan",
|
||||||
|
"korea",
|
||||||
|
"singapore",
|
||||||
|
"hong kong",
|
||||||
|
"india",
|
||||||
|
"indonesia",
|
||||||
|
"thailand",
|
||||||
|
"vietnam",
|
||||||
|
"malaysia",
|
||||||
|
"philippines",
|
||||||
|
],
|
||||||
|
# Middle East
|
||||||
|
["middle east", "israel", "uae", "dubai", "saudi arabia"],
|
||||||
|
# Latin America
|
||||||
|
["latin america", "brazil", "argentina", "chile", "colombia", "mexico"],
|
||||||
|
# Africa
|
||||||
|
["africa", "african", "south africa", "nigeria", "kenya", "egypt"],
|
||||||
|
# Oceania
|
||||||
|
["australia", "australian", "new zealand"],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Check if both locations match any group
|
||||||
for group in geo_groups:
|
for group in geo_groups:
|
||||||
found_in_1 = any(term in location1 for term in group)
|
found_in_1 = any(term in loc1 for term in group)
|
||||||
found_in_2 = any(term in location2 for term in group)
|
found_in_2 = any(term in loc2 for term in group)
|
||||||
if found_in_1 and found_in_2:
|
if found_in_1 and found_in_2:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
# Check for direct substring match (one contains the other)
|
||||||
|
if loc1 in loc2 or loc2 in loc1:
|
||||||
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
+14
-69
@@ -1,14 +1,24 @@
|
|||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||||
|
handlers=[logging.StreamHandler()],
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class FolkAPI:
|
class FolkAPI:
|
||||||
BASE_URL = "https://api.folk.app/v1"
|
BASE_URL = "https://api.folk.app/v1"
|
||||||
|
|
||||||
def __init__(self, api_key: str):
|
def __init__(self, api_key: str):
|
||||||
|
api_key = os.environ.get("FOLK_API_KEY", api_key)
|
||||||
self.headers = {"Authorization": f"Bearer {api_key}"}
|
self.headers = {"Authorization": f"Bearer {api_key}"}
|
||||||
|
logger.info(f"FolkAPI initialized with API key: {api_key[:4]}***")
|
||||||
|
|
||||||
def get_groups(self):
|
def get_groups(self):
|
||||||
"""Fetch all groups from Folk."""
|
"""Fetch all groups from Folk."""
|
||||||
@@ -109,6 +119,7 @@ class FolkAPI:
|
|||||||
email: str = None,
|
email: str = None,
|
||||||
company_id: str = None,
|
company_id: str = None,
|
||||||
group_id: str = None,
|
group_id: str = None,
|
||||||
|
linkedin_url: str = None,
|
||||||
companies=None,
|
companies=None,
|
||||||
emails=None,
|
emails=None,
|
||||||
phones=None,
|
phones=None,
|
||||||
@@ -174,7 +185,9 @@ class FolkAPI:
|
|||||||
addresses_list = _to_list(addresses)
|
addresses_list = _to_list(addresses)
|
||||||
if addresses_list:
|
if addresses_list:
|
||||||
data["addresses"] = addresses_list
|
data["addresses"] = addresses_list
|
||||||
urls_list = _to_list(urls)
|
urls_list = _to_list(urls) or []
|
||||||
|
if linkedin_url:
|
||||||
|
urls_list.append(linkedin_url)
|
||||||
if urls_list:
|
if urls_list:
|
||||||
data["urls"] = urls_list
|
data["urls"] = urls_list
|
||||||
|
|
||||||
@@ -190,71 +203,3 @@ class FolkAPI:
|
|||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
||||||
|
|
||||||
# Prefer getting the API key from the environment. If not set, fall back to the
|
|
||||||
# existing (hard-coded) key so behavior is unchanged for now.
|
|
||||||
DEFAULT_API_KEY = "FOLKfIGXuv74ML9EAajxyiUR39ePaNrZ"
|
|
||||||
api_key = os.environ.get("FOLK_API_KEY", DEFAULT_API_KEY)
|
|
||||||
|
|
||||||
folk = FolkAPI(api_key=api_key)
|
|
||||||
|
|
||||||
|
|
||||||
def example_flow():
|
|
||||||
# Step 1: Get groups
|
|
||||||
groups = folk.get_groups()
|
|
||||||
print(groups)
|
|
||||||
|
|
||||||
# Safely dig into the returned structure. The API returns groups under
|
|
||||||
# groups['data']['items'] (not groups['data'][0]). Handle missing/empty.
|
|
||||||
items = groups.get("data", {}).get("items", [])
|
|
||||||
if not items:
|
|
||||||
print("No groups returned by Folk API.")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# Choose the first group as an example
|
|
||||||
group_id = items[0].get("id")
|
|
||||||
if not group_id:
|
|
||||||
print("No id found for the first group item.")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# Step 2: Choose a group_id and create a company
|
|
||||||
company = folk.create_company(
|
|
||||||
name="2050 Investment Partners",
|
|
||||||
group_id=group_id,
|
|
||||||
website="https://2050.com",
|
|
||||||
linkedin_url="https://linkedin.com/company/2050-investments",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Step 3: Add a person to the same group or company
|
|
||||||
person = folk.create_person(
|
|
||||||
first_name="John",
|
|
||||||
last_name="Doe",
|
|
||||||
email="john@2050.com",
|
|
||||||
company_id=company.get("data", {}).get("id"),
|
|
||||||
group_id=group_id,
|
|
||||||
)
|
|
||||||
|
|
||||||
print("Created company:", company)
|
|
||||||
print("Created person:", person)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
try:
|
|
||||||
example_flow()
|
|
||||||
except requests.HTTPError as e:
|
|
||||||
# Try to include response body for easier debugging if available
|
|
||||||
resp = getattr(e, "response", None)
|
|
||||||
if resp is not None:
|
|
||||||
try:
|
|
||||||
body = resp.text
|
|
||||||
except Exception:
|
|
||||||
body = "<unreadable response body>"
|
|
||||||
print("HTTP error while talking to Folk API:", e)
|
|
||||||
print("Response status:", resp.status_code)
|
|
||||||
print("Response body:", body)
|
|
||||||
else:
|
|
||||||
print("HTTP error while talking to Folk API:", e)
|
|
||||||
sys.exit(1)
|
|
||||||
except Exception as e: # pragma: no cover - top-level safety
|
|
||||||
print("Unexpected error:", e)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class QueryProcessor:
|
|||||||
"""Tool to search the web using google, provide the relevant query to get the information"""
|
"""Tool to search the web using google, provide the relevant query to get the information"""
|
||||||
logger.info(f"\nWeb Search Tool Called with query: {query}")
|
logger.info(f"\nWeb Search Tool Called with query: {query}")
|
||||||
if query:
|
if query:
|
||||||
result = self.ddg_search.text(query, max_results=10, backend="google")
|
result = self.ddg_search.text(query, max_results=10)
|
||||||
return result
|
return result
|
||||||
return "No query provided."
|
return "No query provided."
|
||||||
|
|
||||||
@@ -87,11 +87,15 @@ class QueryProcessor:
|
|||||||
context_parts.append(f"Location: {investor_headquarters}")
|
context_parts.append(f"Location: {investor_headquarters}")
|
||||||
if investor_description:
|
if investor_description:
|
||||||
context_parts.append(f"Description: {investor_description}")
|
context_parts.append(f"Description: {investor_description}")
|
||||||
if investment_thesis:
|
if investment_thesis and isinstance(investment_thesis, list):
|
||||||
thesis_str = ", ".join(investment_thesis[:3]) # Limit to first 3
|
thesis_str = ", ".join(
|
||||||
|
str(item) for item in investment_thesis[:3]
|
||||||
|
) # Limit to first 3
|
||||||
context_parts.append(f"Investment Focus: {thesis_str}")
|
context_parts.append(f"Investment Focus: {thesis_str}")
|
||||||
if portfolio_highlights:
|
if portfolio_highlights and isinstance(portfolio_highlights, list):
|
||||||
portfolio_str = ", ".join(portfolio_highlights[:5]) # Limit to first 5
|
portfolio_str = ", ".join(
|
||||||
|
str(item) for item in portfolio_highlights[:5]
|
||||||
|
) # Limit to first 5
|
||||||
context_parts.append(f"Notable Portfolio Companies: {portfolio_str}")
|
context_parts.append(f"Notable Portfolio Companies: {portfolio_str}")
|
||||||
|
|
||||||
context = "\n".join(context_parts)
|
context = "\n".join(context_parts)
|
||||||
|
|||||||
+97
-25
@@ -145,16 +145,74 @@ Return the lower and upper bounds in USD."""
|
|||||||
"""
|
"""
|
||||||
Manually parse the JSON profile from the CSV.
|
Manually parse the JSON profile from the CSV.
|
||||||
Returns a cleaned dictionary with the investor profile data.
|
Returns a cleaned dictionary with the investor profile data.
|
||||||
|
Handles JSON wrapped in markdown code blocks (```json ... ```).
|
||||||
|
Handles trailing quotes and extra data after JSON.
|
||||||
"""
|
"""
|
||||||
if not json_str or pd.isna(json_str):
|
if not json_str or pd.isna(json_str):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Clean the JSON string
|
||||||
|
cleaned_json = json_str.strip()
|
||||||
|
|
||||||
|
# Check if it's plain text (no JSON structure)
|
||||||
|
if not cleaned_json.startswith(("{", "```", "'")):
|
||||||
|
print(" ⚠️ No JSON structure found - skipping")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Remove markdown code block markers if present
|
||||||
|
if cleaned_json.startswith("```"):
|
||||||
|
# Remove opening marker (```json or ```Json or ```)
|
||||||
|
lines = cleaned_json.split("\n")
|
||||||
|
if lines[0].startswith("```"):
|
||||||
|
lines = lines[1:] # Remove first line
|
||||||
|
# Remove closing marker (``` or ```')
|
||||||
|
if lines and lines[-1].strip() in ("```", "```'", '```"'):
|
||||||
|
lines = lines[:-1] # Remove last line
|
||||||
|
cleaned_json = "\n".join(lines).strip()
|
||||||
|
|
||||||
|
# Remove trailing quotes that might be left over
|
||||||
|
if cleaned_json.endswith(("'", '"')):
|
||||||
|
cleaned_json = cleaned_json[:-1].strip()
|
||||||
|
|
||||||
|
# Try to find JSON boundaries if there's extra data
|
||||||
|
# Look for the first { and the last }
|
||||||
|
start_idx = cleaned_json.find("{")
|
||||||
|
if start_idx == -1:
|
||||||
|
print(" ⚠️ No opening brace found - not valid JSON")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Find the matching closing brace
|
||||||
|
# We need to count braces to find the actual end
|
||||||
|
brace_count = 0
|
||||||
|
end_idx = -1
|
||||||
|
for i in range(start_idx, len(cleaned_json)):
|
||||||
|
if cleaned_json[i] == "{":
|
||||||
|
brace_count += 1
|
||||||
|
elif cleaned_json[i] == "}":
|
||||||
|
brace_count -= 1
|
||||||
|
if brace_count == 0:
|
||||||
|
end_idx = i + 1
|
||||||
|
break
|
||||||
|
|
||||||
|
if end_idx == -1:
|
||||||
|
print(" ⚠️ No matching closing brace found")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Extract just the JSON part
|
||||||
|
cleaned_json = cleaned_json[start_idx:end_idx]
|
||||||
|
|
||||||
# Parse JSON string
|
# Parse JSON string
|
||||||
profile = json.loads(json_str)
|
profile = json.loads(cleaned_json)
|
||||||
return profile
|
return profile
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
print(f"Error parsing JSON: {e}")
|
print(f" ❌ JSON parsing error: {e}")
|
||||||
|
# Print first 200 chars for debugging
|
||||||
|
preview = json_str[:200] if len(json_str) > 200 else json_str
|
||||||
|
print(f" Preview: {preview}...")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ❌ Unexpected error: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def process_investor_profile(
|
async def process_investor_profile(
|
||||||
@@ -338,34 +396,45 @@ Return the lower and upper bounds in USD."""
|
|||||||
if existing_company:
|
if existing_company:
|
||||||
# Update only founded_year on existing company
|
# Update only founded_year on existing company
|
||||||
company = existing_company
|
company = existing_company
|
||||||
|
updated_fields = []
|
||||||
|
|
||||||
if company_data.get("founded_year"):
|
if company_data.get("founded_year"):
|
||||||
company.founded_year = company_data["founded_year"]
|
company.founded_year = company_data["founded_year"]
|
||||||
|
updated_fields.append(
|
||||||
|
f"founded_year: {company_data['founded_year']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add/update company members (key executives)
|
||||||
|
# First, remove existing members if updating
|
||||||
|
db.query(CompanyMember).filter_by(company_id=company.id).delete()
|
||||||
|
|
||||||
|
exec_count = 0
|
||||||
|
for exec_data in company_data.get("key_executives", []):
|
||||||
|
member = CompanyMember(
|
||||||
|
name=exec_data.get("name"),
|
||||||
|
role=exec_data.get("title"),
|
||||||
|
linkedin=exec_data.get(
|
||||||
|
"source_url"
|
||||||
|
), # Store source URL in linkedin field
|
||||||
|
company_id=company.id,
|
||||||
|
)
|
||||||
|
db.add(member)
|
||||||
|
exec_count += 1
|
||||||
|
|
||||||
|
if exec_count > 0:
|
||||||
|
updated_fields.append(f"{exec_count} executives")
|
||||||
|
|
||||||
|
if updated_fields:
|
||||||
|
print(f" 📝 Updated: {', '.join(updated_fields)}")
|
||||||
|
|
||||||
|
return company
|
||||||
else:
|
else:
|
||||||
# Company should already be in base database, but if not found, skip
|
# Company not found in base database, skip
|
||||||
print(
|
print(" ⚠️ Not in database - skipping")
|
||||||
f"⚠️ Company '{company_data['name']}' not found in base database - skipping"
|
|
||||||
)
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Add/update company members (key executives)
|
|
||||||
# First, remove existing members if updating
|
|
||||||
db.query(CompanyMember).filter_by(company_id=company.id).delete()
|
|
||||||
|
|
||||||
for exec_data in company_data.get("key_executives", []):
|
|
||||||
member = CompanyMember(
|
|
||||||
name=exec_data.get("name"),
|
|
||||||
role=exec_data.get("title"),
|
|
||||||
linkedin=exec_data.get(
|
|
||||||
"source_url"
|
|
||||||
), # Store source URL in linkedin field
|
|
||||||
company_id=company.id,
|
|
||||||
)
|
|
||||||
db.add(member)
|
|
||||||
|
|
||||||
return company
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error saving company to database: {e}")
|
print(f" ❌ Error saving: {e}")
|
||||||
db.rollback()
|
db.rollback()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -789,8 +858,11 @@ Return the lower and upper bounds in USD."""
|
|||||||
if pd.notna(row.get("Investor"))
|
if pd.notna(row.get("Investor"))
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
|
# Try both column names for flexibility
|
||||||
profile_json = (
|
profile_json = (
|
||||||
row.get("Final Investor Profile", "")
|
row.get("Perplexity Gap Output", "")
|
||||||
|
if pd.notna(row.get("Perplexity Gap Output"))
|
||||||
|
else row.get("Final Investor Profile", "")
|
||||||
if pd.notna(row.get("Final Investor Profile"))
|
if pd.notna(row.get("Final Investor Profile"))
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
|
|||||||
+144
-71
@@ -1,29 +1,25 @@
|
|||||||
import json
|
import asyncio
|
||||||
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from db.db import DATABASE_URL, get_db
|
from db.db import get_db
|
||||||
from db.models import FundTable, InvestorTable, ProjectTable
|
from db.models import FundTable, InvestorTable, ProjectTable
|
||||||
from langchain import hub
|
from langchain_core.prompts import ChatPromptTemplate
|
||||||
from langchain_community.agent_toolkits import SQLDatabaseToolkit
|
|
||||||
from langchain_community.utilities import SQLDatabase
|
|
||||||
from langchain_openai import ChatOpenAI
|
from langchain_openai import ChatOpenAI
|
||||||
from langgraph.prebuilt import create_react_agent
|
|
||||||
from schemas.router_schemas import (
|
from schemas.router_schemas import (
|
||||||
CompanyMinimal,
|
CompanyMinimal,
|
||||||
InvestmentResponse,
|
InvestmentResponse,
|
||||||
PaginatedResponse,
|
PaginatedResponse,
|
||||||
SectorMinimal,
|
SectorMinimal,
|
||||||
)
|
)
|
||||||
|
from sqlalchemy import text
|
||||||
from sqlalchemy.orm import selectinload
|
from sqlalchemy.orm import selectinload
|
||||||
|
|
||||||
from services.compatibility_score import calculate_project_investor_compatibility
|
from services.compatibility_score import calculate_project_investor_compatibility
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
# Connect to SQLite
|
|
||||||
prompt_template = hub.pull("langchain-ai/sql-agent-system-prompt")
|
|
||||||
db = SQLDatabase.from_uri(DATABASE_URL)
|
|
||||||
|
|
||||||
|
|
||||||
class QueryProcessor:
|
class QueryProcessor:
|
||||||
@@ -34,78 +30,155 @@ class QueryProcessor:
|
|||||||
model="openai/gpt-4o-mini",
|
model="openai/gpt-4o-mini",
|
||||||
temperature=0,
|
temperature=0,
|
||||||
)
|
)
|
||||||
self.toolkit = SQLDatabaseToolkit(db=db, llm=self.llm)
|
|
||||||
# Update system message to specifically request only fund IDs
|
# Query cache for performance
|
||||||
system_message_updated = (
|
self.query_cache = {}
|
||||||
prompt_template.format(dialect="SQLite", top_k=5)
|
|
||||||
+ "\n\n=== IMPORTANT TERMINOLOGY ==="
|
# SQL generation prompt
|
||||||
+ "\n- When users say 'investors' or 'find me investors', they mean FUNDS"
|
self.sql_prompt = ChatPromptTemplate.from_messages(
|
||||||
+ "\n- Always query the 'funds' table for investment opportunities"
|
[
|
||||||
+ "\n- The 'investors' table is for parent company information only"
|
(
|
||||||
+ "\n- Relationship: investors (1) -> (many) funds"
|
"system",
|
||||||
+ "\n\n=== YOUR TASK ==="
|
"""You are a SQL expert. Generate a SQLite query to find fund IDs based on user requirements.
|
||||||
+ "\nReturn ONLY fund IDs (funds.id) that match the user's criteria."
|
|
||||||
+ "\nFormat: comma-separated numbers only (e.g., 1, 5, 12, 23)"
|
Database Schema:
|
||||||
+ "\nNo explanations, no other data."
|
- funds: id, fund_name, investor_id, check_size_lower, check_size_upper, geographic_focus
|
||||||
+ "\n\n=== QUERY GUIDELINES ==="
|
- fund_sectors: fund_id, sector_id
|
||||||
+ "\n1. For geographic searches: use funds.geographic_focus"
|
- fund_investment_stages: fund_id, stage_id
|
||||||
+ "\n2. For sector searches: JOIN with fund_sectors table"
|
- sectors: id, name
|
||||||
+ "\n3. For stage searches: JOIN with fund_investment_stages table"
|
- investment_stages: id, name
|
||||||
+ "\n4. If no results: respond with 'NO_RESULTS'"
|
- investors: id, name, aum
|
||||||
+ "\n5. Never repeat the same failed query"
|
|
||||||
)
|
IMPORTANT RULES:
|
||||||
self.agent = create_react_agent(
|
1. ALWAYS return ONLY fund IDs (funds.id) - use SELECT DISTINCT f.id
|
||||||
model=self.llm,
|
2. For geography: Be FLEXIBLE - use OR with variations and partial matches
|
||||||
tools=self.toolkit.get_tools(),
|
- 'Europe' → WHERE geographic_focus LIKE '%Europe%' OR geographic_focus LIKE '%European%'
|
||||||
prompt=system_message_updated,
|
- 'America' → WHERE geographic_focus LIKE '%America%' OR geographic_focus LIKE '%US%' OR geographic_focus LIKE '%United States%'
|
||||||
|
- 'Asia' → WHERE geographic_focus LIKE '%Asia%' OR geographic_focus LIKE '%Asian%'
|
||||||
|
- If no geography specified, DON'T filter by geography
|
||||||
|
3. For stages: Use LEFT JOIN and LIKE for flexible matching with synonyms
|
||||||
|
- 'Seed' → s.name LIKE '%Seed%' OR s.name LIKE '%Pre-Seed%' OR s.name LIKE '%Early%'
|
||||||
|
- 'Series A' → s.name LIKE '%Series A%' OR s.name LIKE '%A%'
|
||||||
|
- 'Growth' → s.name LIKE '%Growth%' OR s.name LIKE '%Late%' OR s.name LIKE '%Expansion%'
|
||||||
|
- If stage not specified, include ALL funds
|
||||||
|
4. For sectors: Use LEFT JOIN and include related terms with OR
|
||||||
|
- 'Fintech' → sec.name LIKE '%Fintech%' OR sec.name LIKE '%Finance%' OR sec.name LIKE '%Financial%'
|
||||||
|
- 'AI' → sec.name LIKE '%AI%' OR sec.name LIKE '%Artificial Intelligence%' OR sec.name LIKE '%Machine Learning%' OR sec.name LIKE '%ML%'
|
||||||
|
- 'Healthcare' → sec.name LIKE '%Healthcare%' OR sec.name LIKE '%Health%' OR sec.name LIKE '%Medical%' OR sec.name LIKE '%Biotech%'
|
||||||
|
5. For check size filters (be flexible with ranges):
|
||||||
|
- "under X" → WHERE (check_size_upper <= X OR check_size_upper IS NULL)
|
||||||
|
- "over X" → WHERE (check_size_lower >= X OR check_size_lower IS NULL)
|
||||||
|
- "between X and Y" → WHERE check_size_lower >= X AND check_size_upper <= Y
|
||||||
|
6. Use LEFT JOIN for stages and sectors so funds without tags still match
|
||||||
|
7. Use DISTINCT to avoid duplicates from joins
|
||||||
|
8. Be INCLUSIVE - use OR conditions to cast a wider net
|
||||||
|
9. If query is very simple (e.g., just "seed stage"), don't add unnecessary filters
|
||||||
|
10. Return a single, complete SELECT query
|
||||||
|
|
||||||
|
Example Queries:
|
||||||
|
Q: "Seed stage investors in Europe"
|
||||||
|
A: SELECT DISTINCT f.id FROM funds f
|
||||||
|
LEFT JOIN fund_investment_stages fis ON f.id = fis.fund_id
|
||||||
|
LEFT JOIN investment_stages s ON fis.stage_id = s.id
|
||||||
|
WHERE (s.name LIKE '%Seed%' OR s.name LIKE '%Pre-Seed%' OR s.name LIKE '%Early%' OR s.id IS NULL)
|
||||||
|
AND (f.geographic_focus LIKE '%Europe%' OR f.geographic_focus LIKE '%European%')
|
||||||
|
|
||||||
|
Q: "Fintech investors with check size under 5 million"
|
||||||
|
A: SELECT DISTINCT f.id FROM funds f
|
||||||
|
LEFT JOIN fund_sectors fs ON f.id = fs.fund_id
|
||||||
|
LEFT JOIN sectors sec ON fs.sector_id = sec.id
|
||||||
|
WHERE (sec.name LIKE '%Fintech%' OR sec.name LIKE '%Finance%' OR sec.name LIKE '%Financial%' OR sec.id IS NULL)
|
||||||
|
AND (f.check_size_upper <= 5000000 OR f.check_size_upper IS NULL)
|
||||||
|
|
||||||
|
Q: "Seed stage investors"
|
||||||
|
A: SELECT DISTINCT f.id FROM funds f
|
||||||
|
LEFT JOIN fund_investment_stages fis ON f.id = fis.fund_id
|
||||||
|
LEFT JOIN investment_stages s ON fis.stage_id = s.id
|
||||||
|
WHERE s.name LIKE '%Seed%' OR s.name LIKE '%Pre-Seed%' OR s.name LIKE '%Early%'
|
||||||
|
|
||||||
|
Q: "Growth stage investors"
|
||||||
|
A: SELECT DISTINCT f.id FROM funds f
|
||||||
|
LEFT JOIN fund_investment_stages fis ON f.id = fis.fund_id
|
||||||
|
LEFT JOIN investment_stages s ON fis.stage_id = s.id
|
||||||
|
WHERE s.name LIKE '%Growth%' OR s.name LIKE '%Late%' OR s.name LIKE '%Expansion%' OR s.name LIKE '%Series C%' OR s.name LIKE '%Series D%'
|
||||||
|
|
||||||
|
Q: "AI investors in America"
|
||||||
|
A: SELECT DISTINCT f.id FROM funds f
|
||||||
|
LEFT JOIN fund_sectors fs ON f.id = fs.fund_id
|
||||||
|
LEFT JOIN sectors sec ON fs.sector_id = sec.id
|
||||||
|
WHERE (sec.name LIKE '%AI%' OR sec.name LIKE '%Artificial Intelligence%' OR sec.name LIKE '%Machine Learning%' OR sec.name LIKE '%ML%')
|
||||||
|
AND (f.geographic_focus LIKE '%America%' OR f.geographic_focus LIKE '%US%' OR f.geographic_focus LIKE '%United States%' OR f.geographic_focus LIKE '%USA%')
|
||||||
|
|
||||||
|
Q: "Healthcare investors"
|
||||||
|
A: SELECT DISTINCT f.id FROM funds f
|
||||||
|
LEFT JOIN fund_sectors fs ON f.id = fs.fund_id
|
||||||
|
LEFT JOIN sectors sec ON fs.sector_id = sec.id
|
||||||
|
WHERE sec.name LIKE '%Healthcare%' OR sec.name LIKE '%Health%' OR sec.name LIKE '%Medical%' OR sec.name LIKE '%Biotech%' OR sec.name LIKE '%Pharma%'
|
||||||
|
|
||||||
|
IMPORTANT: Use LEFT JOIN so funds without sector/stage tags can still match. Include synonym terms with OR for better recall.
|
||||||
|
|
||||||
|
Return ONLY the SQL query, no explanations or markdown.""",
|
||||||
|
),
|
||||||
|
("user", "{question}"),
|
||||||
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
def process_query(
|
def _get_cache_key(self, question: str) -> str:
|
||||||
|
"""Generate cache key from normalized question."""
|
||||||
|
return hashlib.md5(question.lower().strip().encode()).hexdigest()
|
||||||
|
|
||||||
|
async def process_query(
|
||||||
self, question: str, project_id: Optional[int] = None
|
self, question: str, project_id: Optional[int] = None
|
||||||
) -> PaginatedResponse[InvestmentResponse]:
|
) -> PaginatedResponse[InvestmentResponse]:
|
||||||
"""Process a query using the LLM and return investment response data.
|
"""Async wrapper for process_query. Runs blocking work in a thread to avoid
|
||||||
|
blocking the event loop.
|
||||||
Args:
|
|
||||||
question: The natural language query to process
|
|
||||||
project_id: Optional project ID for compatibility scoring
|
|
||||||
"""
|
"""
|
||||||
# Let the LLM handle all database interactions and filtering to get fund IDs
|
return await asyncio.to_thread(self._process_query_sync, question, project_id)
|
||||||
response = self.agent.invoke(
|
|
||||||
{"messages": [("user", question)]},
|
|
||||||
config={"recursion_limit": 50},
|
|
||||||
)
|
|
||||||
|
|
||||||
# Extract the actual message content
|
def _process_query_sync(
|
||||||
logger.info(f"{response}")
|
self, question: str, project_id: Optional[int] = None
|
||||||
final_message_content = response["messages"][-1].content
|
) -> PaginatedResponse[InvestmentResponse]:
|
||||||
logger.info(f"AI Response: \n{final_message_content}")
|
"""Synchronous implementation of process_query. This is run in a thread by
|
||||||
# Extract fund IDs from the AI response
|
the async wrapper above.
|
||||||
fund_ids = self._extract_fund_ids_from_response(final_message_content)
|
"""
|
||||||
|
cache_key = self._get_cache_key(question)
|
||||||
|
|
||||||
# Fetch full fund data with investor relationships using the IDs
|
# Check cache first
|
||||||
return self._fetch_funds_by_ids(fund_ids, project_id)
|
if cache_key in self.query_cache:
|
||||||
|
sql_query = self.query_cache[cache_key]
|
||||||
|
logger.info(f"Using cached SQL: {sql_query}")
|
||||||
|
else:
|
||||||
|
# Generate SQL query
|
||||||
|
messages = self.sql_prompt.format_messages(question=question)
|
||||||
|
response = self.llm.invoke(messages)
|
||||||
|
sql_query = response.content.strip()
|
||||||
|
|
||||||
def _extract_fund_ids_from_response(self, ai_response: str) -> List[int]:
|
# Clean up SQL (remove markdown code blocks if present)
|
||||||
"""Extract fund IDs from AI response."""
|
sql_query = sql_query.replace("```sql", "").replace("```", "").strip()
|
||||||
import re
|
|
||||||
|
|
||||||
fund_ids = []
|
# Cache the query
|
||||||
|
self.query_cache[cache_key] = sql_query
|
||||||
|
logger.info(f"Generated SQL: {sql_query}")
|
||||||
|
|
||||||
|
# Execute query to get fund IDs
|
||||||
|
db_session = next(get_db())
|
||||||
try:
|
try:
|
||||||
# Try multiple patterns to extract IDs from the response
|
result = db_session.execute(text(sql_query))
|
||||||
# Pattern 1: Simple numbers (assuming they are IDs)
|
fund_ids = [row[0] for row in result.fetchall()]
|
||||||
numbers = re.findall(r"\b\d+\b", ai_response)
|
logger.info(
|
||||||
fund_ids = [int(num) for num in numbers]
|
f"Found {len(fund_ids)} fund IDs: {fund_ids[:10]}{'...' if len(fund_ids) > 10 else ''}"
|
||||||
|
)
|
||||||
# Pattern 2: If response contains explicit ID references
|
|
||||||
id_matches = re.findall(r"\bid[:\s]*(\d+)", ai_response.lower())
|
|
||||||
if id_matches:
|
|
||||||
fund_ids = [int(id_str) for id_str in id_matches]
|
|
||||||
|
|
||||||
|
return self._fetch_funds_by_ids(fund_ids, project_id)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error extracting IDs from response: {e}")
|
logger.error(f"SQL execution error: {e}")
|
||||||
return []
|
logger.error(f"Failed SQL: {sql_query}")
|
||||||
|
# Return empty result
|
||||||
return fund_ids
|
return PaginatedResponse(
|
||||||
|
items=[], total=0, page=1, page_size=10, total_pages=0
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
db_session.close()
|
||||||
|
|
||||||
def _fetch_funds_by_ids(
|
def _fetch_funds_by_ids(
|
||||||
self, fund_ids: List[int], project_id: Optional[int] = None
|
self, fund_ids: List[int], project_id: Optional[int] = None
|
||||||
@@ -185,10 +258,10 @@ class QueryProcessor:
|
|||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get top 3 sectors from fund (id and name only)
|
# Get top 3 sectors from fund (id and name only) - sorted alphabetically
|
||||||
fund_sectors = [
|
fund_sectors = [
|
||||||
SectorMinimal(id=sector.id, name=sector.name)
|
SectorMinimal(id=sector.id, name=sector.name)
|
||||||
for sector in (fund.sectors[:3] if fund.sectors else [])
|
for sector in sorted(fund.sectors[:3] if fund.sectors else [], key=lambda s: s.name)
|
||||||
]
|
]
|
||||||
|
|
||||||
investment_response = InvestmentResponse(
|
investment_response = InvestmentResponse(
|
||||||
|
|||||||
+164
-83
@@ -1,9 +1,13 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
# Import database models and compatibility score service
|
||||||
|
from db.models import InvestorTable, ProjectTable
|
||||||
from jinja2 import Environment, FileSystemLoader
|
from jinja2 import Environment, FileSystemLoader
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
|
from services.compatibility_score import calculate_project_investor_compatibility
|
||||||
|
|
||||||
|
|
||||||
class ReportGenerator:
|
class ReportGenerator:
|
||||||
"""Service for generating PDF reports from HTML templates"""
|
"""Service for generating PDF reports from HTML templates"""
|
||||||
@@ -17,6 +21,8 @@ class ReportGenerator:
|
|||||||
self,
|
self,
|
||||||
investor_data: Dict[str, Any],
|
investor_data: Dict[str, Any],
|
||||||
project_data: Optional[Dict[str, Any]] = None,
|
project_data: Optional[Dict[str, Any]] = None,
|
||||||
|
investor_model: Optional[InvestorTable] = None,
|
||||||
|
project_model: Optional[ProjectTable] = None,
|
||||||
) -> bytes:
|
) -> bytes:
|
||||||
"""
|
"""
|
||||||
Generate a PDF report for an investor profile.
|
Generate a PDF report for an investor profile.
|
||||||
@@ -24,12 +30,16 @@ class ReportGenerator:
|
|||||||
Args:
|
Args:
|
||||||
investor_data: Dictionary containing investor information
|
investor_data: Dictionary containing investor information
|
||||||
project_data: Optional dictionary containing project information for compatibility analysis
|
project_data: Optional dictionary containing project information for compatibility analysis
|
||||||
|
investor_model: Optional database model for investor (used for compatibility scoring)
|
||||||
|
project_model: Optional database model for project (used for compatibility scoring)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
bytes: PDF file content
|
bytes: PDF file content
|
||||||
"""
|
"""
|
||||||
# Prepare template context
|
# Prepare template context
|
||||||
context = self._prepare_context(investor_data, project_data)
|
context = self._prepare_context(
|
||||||
|
investor_data, project_data, investor_model, project_model
|
||||||
|
)
|
||||||
|
|
||||||
# Render HTML from template
|
# Render HTML from template
|
||||||
template = self.env.get_template("report.html")
|
template = self.env.get_template("report.html")
|
||||||
@@ -43,6 +53,8 @@ class ReportGenerator:
|
|||||||
self,
|
self,
|
||||||
investor_data: Dict[str, Any],
|
investor_data: Dict[str, Any],
|
||||||
project_data: Optional[Dict[str, Any]] = None,
|
project_data: Optional[Dict[str, Any]] = None,
|
||||||
|
investor_model: Optional[InvestorTable] = None,
|
||||||
|
project_model: Optional[ProjectTable] = None,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Prepare the context dictionary for template rendering"""
|
"""Prepare the context dictionary for template rendering"""
|
||||||
context = {
|
context = {
|
||||||
@@ -55,9 +67,20 @@ class ReportGenerator:
|
|||||||
|
|
||||||
# If project data is provided, calculate compatibility
|
# If project data is provided, calculate compatibility
|
||||||
if project_data:
|
if project_data:
|
||||||
context["compatibility_score"] = self._calculate_compatibility_score(
|
# Use the compatibility_score service if models are provided
|
||||||
investor_data, project_data
|
if investor_model and project_model:
|
||||||
)
|
# Calculate using the standardized compatibility score service
|
||||||
|
# Returns score between 0 and 1, convert to percentage (0-100)
|
||||||
|
score_decimal = calculate_project_investor_compatibility(
|
||||||
|
project=project_model, investor=investor_model, use_funds=True
|
||||||
|
)
|
||||||
|
context["compatibility_score"] = int(score_decimal * 100)
|
||||||
|
else:
|
||||||
|
# Fallback to old calculation method if models not provided
|
||||||
|
context["compatibility_score"] = self._calculate_compatibility_score(
|
||||||
|
investor_data, project_data
|
||||||
|
)
|
||||||
|
|
||||||
context["match_criteria"] = self._generate_match_criteria(
|
context["match_criteria"] = self._generate_match_criteria(
|
||||||
investor_data, project_data
|
investor_data, project_data
|
||||||
)
|
)
|
||||||
@@ -76,43 +99,75 @@ class ReportGenerator:
|
|||||||
"sector": 30,
|
"sector": 30,
|
||||||
"stage": 30,
|
"stage": 30,
|
||||||
"geography": 20,
|
"geography": 20,
|
||||||
"check_size": 15,
|
"check_size": 20,
|
||||||
"thesis": 5,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Aggregate data from all funds
|
||||||
|
all_sectors = set(investor_data.get("sectors", []))
|
||||||
|
all_stages = set()
|
||||||
|
all_geographies = []
|
||||||
|
check_ranges = []
|
||||||
|
|
||||||
|
for fund in investor_data.get("funds", []):
|
||||||
|
all_sectors.update(fund.get("sectors", []))
|
||||||
|
all_stages.update(fund.get("investment_stages", []))
|
||||||
|
if fund.get("geographic_focus"):
|
||||||
|
all_geographies.append(fund["geographic_focus"])
|
||||||
|
if fund.get("check_size_lower") and fund.get("check_size_upper"):
|
||||||
|
check_ranges.append(
|
||||||
|
{
|
||||||
|
"lower": fund["check_size_lower"],
|
||||||
|
"upper": fund["check_size_upper"],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Sector match
|
# Sector match
|
||||||
investor_sectors = set(investor_data.get("sectors", []))
|
|
||||||
project_sectors = set(project_data.get("sectors", []))
|
project_sectors = set(project_data.get("sectors", []))
|
||||||
if investor_sectors and project_sectors:
|
if all_sectors and project_sectors:
|
||||||
if investor_sectors & project_sectors:
|
if all_sectors & project_sectors:
|
||||||
score += weights["sector"]
|
score += weights["sector"]
|
||||||
|
|
||||||
# Stage match
|
# Stage match - case insensitive comparison
|
||||||
investor_stages = set(investor_data.get("investment_stages", []))
|
|
||||||
project_stage = project_data.get("stage")
|
project_stage = project_data.get("stage")
|
||||||
if project_stage and project_stage in investor_stages:
|
if project_stage and all_stages:
|
||||||
score += weights["stage"]
|
# Normalize stage names for comparison (case-insensitive)
|
||||||
|
normalized_stages = {
|
||||||
|
stage.lower().replace("_", " ") for stage in all_stages
|
||||||
|
}
|
||||||
|
project_stage_normalized = project_stage.lower().replace("_", " ")
|
||||||
|
if project_stage_normalized in normalized_stages:
|
||||||
|
score += weights["stage"]
|
||||||
|
|
||||||
# Geography match
|
# Geography match - check if any fund matches
|
||||||
investor_geo = (investor_data.get("geographic_focus") or "").lower()
|
|
||||||
project_geo = (project_data.get("location") or "").lower()
|
project_geo = (project_data.get("location") or "").lower()
|
||||||
if investor_geo and project_geo and investor_geo in project_geo:
|
geo_match = False
|
||||||
|
if all_geographies:
|
||||||
|
for geo in all_geographies:
|
||||||
|
if geo:
|
||||||
|
geo_lower = geo.lower()
|
||||||
|
# Match if investor geography is "global" or if there's a location overlap
|
||||||
|
if "global" in geo_lower or "worldwide" in geo_lower:
|
||||||
|
geo_match = True
|
||||||
|
break
|
||||||
|
if project_geo and (
|
||||||
|
geo_lower in project_geo or project_geo in geo_lower
|
||||||
|
):
|
||||||
|
geo_match = True
|
||||||
|
break
|
||||||
|
if geo_match:
|
||||||
score += weights["geography"]
|
score += weights["geography"]
|
||||||
|
|
||||||
# Check size match
|
# Check size match - check if any fund's range matches
|
||||||
project_valuation = project_data.get("valuation", 0)
|
project_valuation = project_data.get("valuation", 0)
|
||||||
check_lower = investor_data.get("check_size_lower") or 0
|
check_match = False
|
||||||
check_upper = investor_data.get("check_size_upper") or float("inf")
|
if project_valuation and check_ranges:
|
||||||
if (
|
for check_range in check_ranges:
|
||||||
check_lower
|
if check_range["lower"] <= project_valuation <= check_range["upper"]:
|
||||||
and check_upper
|
check_match = True
|
||||||
and check_lower <= project_valuation <= check_upper
|
break
|
||||||
):
|
if check_match:
|
||||||
score += weights["check_size"]
|
score += weights["check_size"]
|
||||||
|
|
||||||
# Thesis alignment (simplified)
|
|
||||||
score += weights["thesis"]
|
|
||||||
|
|
||||||
return min(score, 100)
|
return min(score, 100)
|
||||||
|
|
||||||
def _generate_match_criteria(
|
def _generate_match_criteria(
|
||||||
@@ -121,86 +176,124 @@ class ReportGenerator:
|
|||||||
"""Generate detailed match criteria table"""
|
"""Generate detailed match criteria table"""
|
||||||
criteria = []
|
criteria = []
|
||||||
|
|
||||||
|
# Aggregate data from all funds
|
||||||
|
all_sectors = set(investor_data.get("sectors", []))
|
||||||
|
all_stages = set()
|
||||||
|
all_geographies = []
|
||||||
|
check_ranges = []
|
||||||
|
|
||||||
|
for fund in investor_data.get("funds", []):
|
||||||
|
all_sectors.update(fund.get("sectors", []))
|
||||||
|
all_stages.update(fund.get("investment_stages", []))
|
||||||
|
if fund.get("geographic_focus"):
|
||||||
|
all_geographies.append(fund["geographic_focus"])
|
||||||
|
if fund.get("check_size_lower") and fund.get("check_size_upper"):
|
||||||
|
check_ranges.append(
|
||||||
|
{
|
||||||
|
"lower": fund["check_size_lower"],
|
||||||
|
"upper": fund["check_size_upper"],
|
||||||
|
"fund_name": fund.get("fund_name", "Unnamed Fund"),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Sector criterion
|
# Sector criterion
|
||||||
investor_sectors = investor_data.get("sectors", [])
|
|
||||||
project_sectors = project_data.get("sectors", [])
|
project_sectors = project_data.get("sectors", [])
|
||||||
sector_match = (
|
sector_match = "Perfect" if all_sectors & set(project_sectors) else "Mismatch"
|
||||||
"Perfect" if set(investor_sectors) & set(project_sectors) else "Mismatch"
|
|
||||||
)
|
|
||||||
criteria.append(
|
criteria.append(
|
||||||
{
|
{
|
||||||
"name": "Sector",
|
"name": "Sector",
|
||||||
"requirement": "Cybersecurity, B2B SaaS" if project_sectors else "N/A",
|
"requirement": ", ".join(project_sectors) if project_sectors else "N/A",
|
||||||
"evidence": ", ".join(investor_sectors[:3])
|
"evidence": ", ".join(list(all_sectors)[:3]) if all_sectors else "N/A",
|
||||||
if investor_sectors
|
|
||||||
else "N/A",
|
|
||||||
"match": sector_match,
|
"match": sector_match,
|
||||||
"weight": "30%",
|
"weight": "30%",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Stage criterion
|
# Stage criterion - case insensitive comparison
|
||||||
investor_stages = investor_data.get("investment_stages", [])
|
|
||||||
project_stage = project_data.get("stage", "N/A")
|
project_stage = project_data.get("stage", "N/A")
|
||||||
stage_match = "Perfect" if project_stage in investor_stages else "Mismatch"
|
stage_match = "Mismatch"
|
||||||
|
if project_stage != "N/A" and all_stages:
|
||||||
|
# Normalize stage names for comparison
|
||||||
|
normalized_stages = {
|
||||||
|
stage.lower().replace("_", " ") for stage in all_stages
|
||||||
|
}
|
||||||
|
project_stage_normalized = project_stage.lower().replace("_", " ")
|
||||||
|
stage_match = (
|
||||||
|
"Perfect"
|
||||||
|
if project_stage_normalized in normalized_stages
|
||||||
|
else "Mismatch"
|
||||||
|
)
|
||||||
|
elif project_stage == "N/A":
|
||||||
|
stage_match = "N/A"
|
||||||
|
|
||||||
criteria.append(
|
criteria.append(
|
||||||
{
|
{
|
||||||
"name": "Stage",
|
"name": "Stage",
|
||||||
"requirement": str(project_stage),
|
"requirement": str(project_stage),
|
||||||
"evidence": ", ".join(investor_stages) if investor_stages else "N/A",
|
"evidence": ", ".join(all_stages) if all_stages else "N/A",
|
||||||
"match": stage_match,
|
"match": stage_match,
|
||||||
"weight": "30%",
|
"weight": "30%",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Geography criterion
|
# Geography criterion
|
||||||
investor_geo = investor_data.get("geographic_focus") or "N/A"
|
|
||||||
project_geo = project_data.get("location") or "N/A"
|
project_geo = project_data.get("location") or "N/A"
|
||||||
|
investor_geo_display = ", ".join(all_geographies) if all_geographies else "N/A"
|
||||||
|
|
||||||
|
# Safe comparison handling None values and "Global" matches
|
||||||
|
geo_match = "Mismatch"
|
||||||
|
if project_geo != "N/A" and all_geographies:
|
||||||
|
for geo in all_geographies:
|
||||||
|
if geo:
|
||||||
|
geo_lower = geo.lower()
|
||||||
|
# Match if investor geography is "global" or if there's a location overlap
|
||||||
|
if "global" in geo_lower or "worldwide" in geo_lower:
|
||||||
|
geo_match = "Perfect"
|
||||||
|
break
|
||||||
|
if (
|
||||||
|
geo_lower in project_geo.lower()
|
||||||
|
or project_geo.lower() in geo_lower
|
||||||
|
):
|
||||||
|
geo_match = "Strong"
|
||||||
|
break
|
||||||
|
elif not all_geographies and project_geo == "N/A":
|
||||||
|
geo_match = "N/A"
|
||||||
|
|
||||||
# Safe comparison handling None values
|
|
||||||
if investor_geo == "N/A" or project_geo == "N/A":
|
|
||||||
geo_match = (
|
|
||||||
"N/A" if investor_geo == "N/A" and project_geo == "N/A" else "Mismatch"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
investor_geo_lower = investor_geo.lower()
|
|
||||||
project_geo_lower = project_geo.lower()
|
|
||||||
geo_match = (
|
|
||||||
"Strong"
|
|
||||||
if investor_geo_lower in project_geo_lower
|
|
||||||
or project_geo_lower in investor_geo_lower
|
|
||||||
else "Mismatch"
|
|
||||||
)
|
|
||||||
criteria.append(
|
criteria.append(
|
||||||
{
|
{
|
||||||
"name": "Geography",
|
"name": "Geography",
|
||||||
"requirement": project_geo,
|
"requirement": project_geo,
|
||||||
"evidence": investor_geo,
|
"evidence": investor_geo_display,
|
||||||
"match": geo_match,
|
"match": geo_match,
|
||||||
"weight": "20%",
|
"weight": "20%",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check Size criterion
|
# Check Size criterion
|
||||||
check_lower = investor_data.get("check_size_lower") or 0
|
|
||||||
check_upper = investor_data.get("check_size_upper") or 0
|
|
||||||
project_val = project_data.get("valuation", 0)
|
project_val = project_data.get("valuation", 0)
|
||||||
|
|
||||||
|
# Build evidence string from all fund ranges
|
||||||
check_evidence = "N/A"
|
check_evidence = "N/A"
|
||||||
if check_lower and check_upper:
|
if check_ranges:
|
||||||
check_evidence = (
|
evidence_parts = []
|
||||||
f"€{check_lower / 1000000:.0f}M - €{check_upper / 1000000:.0f}M"
|
for cr in check_ranges[:3]: # Show up to 3 funds
|
||||||
)
|
range_str = (
|
||||||
elif check_lower:
|
f"€{cr['lower'] / 1000000:.0f}M - €{cr['upper'] / 1000000:.0f}M"
|
||||||
check_evidence = f"€{check_lower / 1000000:.0f}M+"
|
)
|
||||||
|
if cr["fund_name"]:
|
||||||
|
evidence_parts.append(f"{cr['fund_name']}: {range_str}")
|
||||||
|
else:
|
||||||
|
evidence_parts.append(range_str)
|
||||||
|
check_evidence = "; ".join(evidence_parts)
|
||||||
|
|
||||||
|
# Check if project valuation matches any fund
|
||||||
|
check_match = "N/A"
|
||||||
|
if project_val > 0 and check_ranges:
|
||||||
|
match_found = any(
|
||||||
|
cr["lower"] <= project_val <= cr["upper"] for cr in check_ranges
|
||||||
|
)
|
||||||
|
check_match = "Perfect" if match_found else "Mismatch"
|
||||||
|
|
||||||
check_match = (
|
|
||||||
"Perfect"
|
|
||||||
if check_lower and check_upper and check_lower <= project_val <= check_upper
|
|
||||||
else "Strong"
|
|
||||||
if project_val > 0
|
|
||||||
else "N/A"
|
|
||||||
)
|
|
||||||
criteria.append(
|
criteria.append(
|
||||||
{
|
{
|
||||||
"name": "Check Size",
|
"name": "Check Size",
|
||||||
@@ -209,19 +302,7 @@ class ReportGenerator:
|
|||||||
else "N/A",
|
else "N/A",
|
||||||
"evidence": check_evidence,
|
"evidence": check_evidence,
|
||||||
"match": check_match,
|
"match": check_match,
|
||||||
"weight": "15%",
|
"weight": "20%",
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Thesis criterion
|
|
||||||
thesis = investor_data.get("investment_thesis", [])
|
|
||||||
criteria.append(
|
|
||||||
{
|
|
||||||
"name": "Thesis",
|
|
||||||
"requirement": "Founder-led, ESG focus",
|
|
||||||
"evidence": ", ".join(thesis[:2]) if thesis else "Entrepreneur-led",
|
|
||||||
"match": "Strong",
|
|
||||||
"weight": "5%",
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
+38
-31
@@ -161,13 +161,6 @@
|
|||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div>
|
|
||||||
<p class="text-xs text-gray-600">DACH Region:</p>
|
|
||||||
<p class="font-semibold text-gray-900">
|
|
||||||
{{ investor.geographic_focus or 'N/A' }}
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
<p class="text-xs text-gray-600">AUM (EUR million):</p>
|
<p class="text-xs text-gray-600">AUM (EUR million):</p>
|
||||||
<p class="font-semibold text-gray-900">
|
<p class="font-semibold text-gray-900">
|
||||||
@@ -179,33 +172,47 @@
|
|||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="mb-4">
|
<div>
|
||||||
<p class="text-xs text-gray-600 mb-1">
|
<p class="text-xs text-gray-600 mb-1">Number of Funds:</p>
|
||||||
Investment Stage:
|
<p class="font-semibold text-gray-900">
|
||||||
</p>
|
{{ investor.funds | length if investor.funds else 'N/A' }}
|
||||||
<p class="text-sm font-semibold text-gray-900">
|
|
||||||
{% if investor.investment_stages %} {{
|
|
||||||
investor.investment_stages | join(', ') }} {% else
|
|
||||||
%} N/A {% endif %}
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="mb-4">
|
|
||||||
<p class="text-xs text-gray-600 mb-1">
|
|
||||||
Est. Investment Size:
|
|
||||||
</p>
|
|
||||||
<p class="text-sm font-semibold text-gray-900">
|
|
||||||
{% if investor.check_size_lower and
|
|
||||||
investor.check_size_upper %} €{{
|
|
||||||
'{:,.0f}'.format(investor.check_size_lower /
|
|
||||||
1000000) }}M - €{{
|
|
||||||
'{:,.0f}'.format(investor.check_size_upper /
|
|
||||||
1000000) }}M {% elif investor.check_size_lower %}
|
|
||||||
€{{ '{:,.0f}'.format(investor.check_size_lower /
|
|
||||||
1000000) }}M+ {% else %} N/A {% endif %}
|
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div class="mt-4">
|
||||||
|
<h3 class="text-xs font-bold text-gray-900 uppercase mb-2">
|
||||||
|
Fund Details
|
||||||
|
</h3>
|
||||||
|
{% if investor.funds %}
|
||||||
|
{% for fund in investor.funds %}
|
||||||
|
<div class="mb-3 pb-3 border-b border-gray-200">
|
||||||
|
<p class="text-sm font-semibold text-gray-900 mb-1">
|
||||||
|
{{ fund.fund_name or 'Fund ' + loop.index|string }}
|
||||||
|
</p>
|
||||||
|
<div class="text-xs text-gray-700 space-y-1">
|
||||||
|
{% if fund.fund_size %}
|
||||||
|
<p>Fund Size: €{{ '{:,.0f}'.format(fund.fund_size / 1000000) }}M</p>
|
||||||
|
{% endif %}
|
||||||
|
{% if fund.check_size_lower and fund.check_size_upper %}
|
||||||
|
<p>Check Size: €{{ '{:,.0f}'.format(fund.check_size_lower / 1000000) }}M - €{{ '{:,.0f}'.format(fund.check_size_upper / 1000000) }}M</p>
|
||||||
|
{% endif %}
|
||||||
|
{% if fund.geographic_focus %}
|
||||||
|
<p>Geography: {{ fund.geographic_focus }}</p>
|
||||||
|
{% endif %}
|
||||||
|
{% if fund.investment_stages %}
|
||||||
|
<p>Stages: {{ fund.investment_stages | join(', ') }}</p>
|
||||||
|
{% endif %}
|
||||||
|
{% if fund.sectors %}
|
||||||
|
<p>Sectors: {{ fund.sectors[:3] | join(', ') }}</p>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
{% else %}
|
||||||
|
<p class="text-xs text-gray-500">No fund information available</p>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
@@ -0,0 +1,117 @@
|
|||||||
|
"""
|
||||||
|
Migration: Add fields from feedback fixes
|
||||||
|
Date: 2025-01-07
|
||||||
|
|
||||||
|
Adds the following fields:
|
||||||
|
- projects.is_archived (INTEGER, default 0)
|
||||||
|
- companies.product_service (TEXT, nullable)
|
||||||
|
- companies.clients (TEXT, nullable - stored as JSON string)
|
||||||
|
- investor_members.linkedin (VARCHAR, nullable)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add parent directory to path to import app modules
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
from app.db.db import engine
|
||||||
|
|
||||||
|
|
||||||
|
def check_column_exists(conn, table_name, column_name):
|
||||||
|
"""Check if a column exists in a table"""
|
||||||
|
result = conn.execute(text(f"PRAGMA table_info({table_name})"))
|
||||||
|
columns = [row[1] for row in result]
|
||||||
|
return column_name in columns
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade():
|
||||||
|
"""Add new columns to tables"""
|
||||||
|
print("Running migration: Add feedback fixes fields")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
with engine.begin() as conn: # Use begin() for transaction management
|
||||||
|
# 1. Add is_archived to projects table
|
||||||
|
print("\n1. Adding 'is_archived' column to projects table...")
|
||||||
|
if check_column_exists(conn, "projects", "is_archived"):
|
||||||
|
print(" ✓ Column 'is_archived' already exists. Skipping.")
|
||||||
|
else:
|
||||||
|
conn.execute(
|
||||||
|
text(
|
||||||
|
"ALTER TABLE projects ADD COLUMN is_archived INTEGER DEFAULT 0 NOT NULL"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# Set default value for existing rows
|
||||||
|
conn.execute(
|
||||||
|
text("UPDATE projects SET is_archived = 0 WHERE is_archived IS NULL")
|
||||||
|
)
|
||||||
|
print(" ✓ Successfully added 'is_archived' column to projects table")
|
||||||
|
|
||||||
|
# 2. Add product_service to companies table
|
||||||
|
print("\n2. Adding 'product_service' column to companies table...")
|
||||||
|
if check_column_exists(conn, "companies", "product_service"):
|
||||||
|
print(" ✓ Column 'product_service' already exists. Skipping.")
|
||||||
|
else:
|
||||||
|
conn.execute(text("ALTER TABLE companies ADD COLUMN product_service TEXT"))
|
||||||
|
print(" ✓ Successfully added 'product_service' column to companies table")
|
||||||
|
|
||||||
|
# 3. Add clients to companies table
|
||||||
|
print("\n3. Adding 'clients' column to companies table...")
|
||||||
|
if check_column_exists(conn, "companies", "clients"):
|
||||||
|
print(" ✓ Column 'clients' already exists. Skipping.")
|
||||||
|
else:
|
||||||
|
conn.execute(text("ALTER TABLE companies ADD COLUMN clients TEXT"))
|
||||||
|
print(" ✓ Successfully added 'clients' column to companies table")
|
||||||
|
|
||||||
|
# 4. Add linkedin to investor_members table
|
||||||
|
print("\n4. Adding 'linkedin' column to investor_members table...")
|
||||||
|
if check_column_exists(conn, "investor_members", "linkedin"):
|
||||||
|
print(" ✓ Column 'linkedin' already exists. Skipping.")
|
||||||
|
else:
|
||||||
|
conn.execute(
|
||||||
|
text("ALTER TABLE investor_members ADD COLUMN linkedin VARCHAR")
|
||||||
|
)
|
||||||
|
print(" ✓ Successfully added 'linkedin' column to investor_members table")
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("Migration completed successfully!")
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade():
|
||||||
|
"""Remove added columns from tables"""
|
||||||
|
print("Running downgrade: Remove feedback fixes fields")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Note: SQLite doesn't support DROP COLUMN directly
|
||||||
|
print("\nWarning: SQLite doesn't support DROP COLUMN directly.")
|
||||||
|
print("To remove these columns, you would need to:")
|
||||||
|
print("1. Create new tables without the columns")
|
||||||
|
print("2. Copy data from old tables to new tables")
|
||||||
|
print("3. Drop old tables and rename new tables")
|
||||||
|
print("\nColumns to remove:")
|
||||||
|
print(" - projects.is_archived")
|
||||||
|
print(" - companies.product_service")
|
||||||
|
print(" - companies.clients")
|
||||||
|
print(" - investor_members.linkedin")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description="Run database migration")
|
||||||
|
parser.add_argument(
|
||||||
|
"direction",
|
||||||
|
choices=["upgrade", "downgrade"],
|
||||||
|
default="upgrade",
|
||||||
|
nargs="?",
|
||||||
|
help="Migration direction (default: upgrade)",
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.direction == "upgrade":
|
||||||
|
upgrade()
|
||||||
|
else:
|
||||||
|
downgrade()
|
||||||
@@ -0,0 +1,67 @@
|
|||||||
|
"""
|
||||||
|
Migration: Add industry column to projects table
|
||||||
|
Date: 2025-10-23
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add parent directory to path to import app modules
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from sqlalchemy import create_engine, text
|
||||||
|
from app.db.db import DATABASE_URL, engine
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade():
|
||||||
|
"""Add industry column to projects table"""
|
||||||
|
print("Running migration: Add industry column to projects table")
|
||||||
|
|
||||||
|
with engine.connect() as conn:
|
||||||
|
# Check if column already exists
|
||||||
|
result = conn.execute(text("PRAGMA table_info(projects)"))
|
||||||
|
columns = [row[1] for row in result]
|
||||||
|
|
||||||
|
if 'industry' in columns:
|
||||||
|
print("Column 'industry' already exists in projects table. Skipping migration.")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Add the industry column
|
||||||
|
conn.execute(text("ALTER TABLE projects ADD COLUMN industry VARCHAR"))
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
print("Successfully added 'industry' column to projects table")
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade():
|
||||||
|
"""Remove industry column from projects table"""
|
||||||
|
print("Running downgrade: Remove industry column from projects table")
|
||||||
|
|
||||||
|
# Note: SQLite doesn't support DROP COLUMN directly
|
||||||
|
# This is a simplified version - in production you'd need to recreate the table
|
||||||
|
print("Warning: SQLite doesn't support DROP COLUMN.")
|
||||||
|
print("To remove the column, you would need to:")
|
||||||
|
print("1. Create a new table without the industry column")
|
||||||
|
print("2. Copy data from old table to new table")
|
||||||
|
print("3. Drop old table and rename new table")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description="Run database migration")
|
||||||
|
parser.add_argument(
|
||||||
|
"direction",
|
||||||
|
choices=["upgrade", "downgrade"],
|
||||||
|
default="upgrade",
|
||||||
|
nargs="?",
|
||||||
|
help="Migration direction (default: upgrade)"
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.direction == "upgrade":
|
||||||
|
upgrade()
|
||||||
|
else:
|
||||||
|
downgrade()
|
||||||
Executable
+68
@@ -0,0 +1,68 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Server management script for app/main.py
|
||||||
|
# Usage: ./server_manager.sh start|stop|restart
|
||||||
|
|
||||||
|
PID_FILE="server.pid"
|
||||||
|
LOG_FILE="server.log"
|
||||||
|
|
||||||
|
start() {
|
||||||
|
if [ -f "$PID_FILE" ] && kill -0 $(cat "$PID_FILE") 2>/dev/null; then
|
||||||
|
echo "Server is already running (PID: $(cat "$PID_FILE"))"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
echo "Starting server..."
|
||||||
|
nohup uv run app/main.py > "$LOG_FILE" 2>&1 &
|
||||||
|
echo $! > "$PID_FILE"
|
||||||
|
echo "Server started (PID: $(cat "$PID_FILE"))"
|
||||||
|
}
|
||||||
|
|
||||||
|
stop() {
|
||||||
|
if [ ! -f "$PID_FILE" ]; then
|
||||||
|
echo "Server is not running (no PID file found)"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
PID=$(cat "$PID_FILE")
|
||||||
|
if ! kill -0 "$PID" 2>/dev/null; then
|
||||||
|
echo "Server is not running (PID $PID not found)"
|
||||||
|
rm -f "$PID_FILE"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
echo "Stopping server (PID: $PID)..."
|
||||||
|
kill "$PID"
|
||||||
|
# Wait for process to stop
|
||||||
|
for i in {1..10}; do
|
||||||
|
if ! kill -0 "$PID" 2>/dev/null; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
if kill -0 "$PID" 2>/dev/null; then
|
||||||
|
echo "Force killing server..."
|
||||||
|
kill -9 "$PID"
|
||||||
|
fi
|
||||||
|
rm -f "$PID_FILE"
|
||||||
|
echo "Server stopped"
|
||||||
|
}
|
||||||
|
|
||||||
|
restart() {
|
||||||
|
stop
|
||||||
|
sleep 2
|
||||||
|
start
|
||||||
|
}
|
||||||
|
|
||||||
|
case "$1" in
|
||||||
|
start)
|
||||||
|
start
|
||||||
|
;;
|
||||||
|
stop)
|
||||||
|
stop
|
||||||
|
;;
|
||||||
|
restart)
|
||||||
|
restart
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "Usage: $0 {start|stop|restart}"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
@@ -0,0 +1,310 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Update Investor Members LinkedIn Profiles Script
|
||||||
|
|
||||||
|
This script finds and updates LinkedIn profile URLs for investor members in the database.
|
||||||
|
Uses crawl4ai to efficiently scrape team pages and extract LinkedIn URLs.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python update_linkedin_profiles.py [--test] [--limit N] [--skip-existing]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--test Test mode: process only 10 records and don't update database
|
||||||
|
--limit N Process only N records (default: all)
|
||||||
|
--skip-existing Skip members that already have LinkedIn URLs
|
||||||
|
--start-from N Start from record N (for resuming)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Add app to path
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "app"))
|
||||||
|
|
||||||
|
from db.db import get_db_session
|
||||||
|
from db.models import InvestorMember, InvestorTable
|
||||||
|
from linkedin_scraper import LinkedInProfileScraper, format_linkedin_url
|
||||||
|
|
||||||
|
|
||||||
|
def progress_callback(current, total, result):
|
||||||
|
"""Print progress updates"""
|
||||||
|
percent = (current / total) * 100
|
||||||
|
status = "✓" if result["linkedin_url"] else "✗"
|
||||||
|
print(f"[{current}/{total} - {percent:.1f}%] {status} {result['member_name']}")
|
||||||
|
if result["linkedin_url"]:
|
||||||
|
print(
|
||||||
|
f" → {result['linkedin_url']} (confidence: {result['confidence']}%, method: {result['method']})"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def create_db_callback(test_mode=False):
|
||||||
|
"""
|
||||||
|
Create a callback function that saves LinkedIn profiles to the database immediately.
|
||||||
|
This allows stopping and resuming without losing progress.
|
||||||
|
"""
|
||||||
|
saved_count = {"count": 0} # Use dict to allow modification in closure
|
||||||
|
|
||||||
|
def db_callback(member_id: int, linkedin_url: str) -> bool:
|
||||||
|
"""Save LinkedIn URL to database immediately"""
|
||||||
|
if test_mode:
|
||||||
|
print(f" [TEST] Would save to DB: member {member_id}")
|
||||||
|
saved_count["count"] += 1
|
||||||
|
return True
|
||||||
|
|
||||||
|
try:
|
||||||
|
db = get_db_session()
|
||||||
|
member = db.query(InvestorMember).filter_by(id=member_id).first()
|
||||||
|
if member:
|
||||||
|
member.linkedin = format_linkedin_url(linkedin_url)
|
||||||
|
db.commit()
|
||||||
|
saved_count["count"] += 1
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ⚠️ DB Error for member {member_id}: {e}")
|
||||||
|
try:
|
||||||
|
db.rollback()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return False
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
db.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return False
|
||||||
|
|
||||||
|
return db_callback, saved_count
|
||||||
|
|
||||||
|
|
||||||
|
def update_database(members_data, test_mode=False):
|
||||||
|
"""Update database with found LinkedIn profiles"""
|
||||||
|
db = get_db_session()
|
||||||
|
|
||||||
|
try:
|
||||||
|
updated_count = 0
|
||||||
|
for data in members_data:
|
||||||
|
if data["linkedin_url"] and data["member_id"]:
|
||||||
|
if not test_mode:
|
||||||
|
member = (
|
||||||
|
db.query(InvestorMember).filter_by(id=data["member_id"]).first()
|
||||||
|
)
|
||||||
|
if member:
|
||||||
|
member.linkedin = format_linkedin_url(data["linkedin_url"])
|
||||||
|
updated_count += 1
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
f" [TEST MODE] Would update member {data['member_id']}: {data['linkedin_url']}"
|
||||||
|
)
|
||||||
|
updated_count += 1
|
||||||
|
|
||||||
|
if not test_mode:
|
||||||
|
db.commit()
|
||||||
|
print(f"\n✓ Successfully updated {updated_count} records in database")
|
||||||
|
else:
|
||||||
|
print(f"\n[TEST MODE] Would have updated {updated_count} records")
|
||||||
|
|
||||||
|
return updated_count
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
db.rollback()
|
||||||
|
print(f"\n✗ Error updating database: {e}")
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
db.close()
|
||||||
|
|
||||||
|
|
||||||
|
def save_results(results, filename="linkedin_scraping_results.json"):
|
||||||
|
"""Save results to JSON file for backup/analysis"""
|
||||||
|
output = {
|
||||||
|
"timestamp": datetime.now().isoformat(),
|
||||||
|
"total_processed": len(results),
|
||||||
|
"found_count": sum(1 for r in results if r["linkedin_url"]),
|
||||||
|
"results": results,
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(filename, "w") as f:
|
||||||
|
json.dump(output, f, indent=2)
|
||||||
|
|
||||||
|
print(f"\n✓ Results saved to {filename}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_summary(results):
|
||||||
|
"""Print summary statistics"""
|
||||||
|
total = len(results)
|
||||||
|
found = sum(1 for r in results if r["linkedin_url"])
|
||||||
|
not_found = total - found
|
||||||
|
|
||||||
|
# Count by method
|
||||||
|
methods = {}
|
||||||
|
for r in results:
|
||||||
|
if r["linkedin_url"]:
|
||||||
|
method = r["method"]
|
||||||
|
methods[method] = methods.get(method, 0) + 1
|
||||||
|
|
||||||
|
# Average confidence for found profiles
|
||||||
|
avg_confidence = (
|
||||||
|
sum(r["confidence"] for r in results if r["linkedin_url"]) / found
|
||||||
|
if found > 0
|
||||||
|
else 0
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("SUMMARY")
|
||||||
|
print("=" * 60)
|
||||||
|
print(f"Total processed: {total}")
|
||||||
|
print(f"LinkedIn found: {found} ({found / total * 100:.1f}%)")
|
||||||
|
print(f"Not found: {not_found} ({not_found / total * 100:.1f}%)")
|
||||||
|
print(f"\nAverage confidence: {avg_confidence:.1f}%")
|
||||||
|
print("\nMethods used:")
|
||||||
|
for method, count in sorted(methods.items(), key=lambda x: x[1], reverse=True):
|
||||||
|
print(f" {method:20s} {count:5d} ({count / found * 100:.1f}%)")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Update LinkedIn profiles for investor members"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--test",
|
||||||
|
action="store_true",
|
||||||
|
help="Test mode: process only 10 records without updating database",
|
||||||
|
)
|
||||||
|
parser.add_argument("--limit", type=int, help="Limit number of records to process")
|
||||||
|
parser.add_argument(
|
||||||
|
"--skip-existing",
|
||||||
|
action="store_true",
|
||||||
|
help="Skip members that already have LinkedIn URLs",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--start-from",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="Start from record N (for resuming interrupted runs)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--rate-limit",
|
||||||
|
type=float,
|
||||||
|
default=0.5,
|
||||||
|
help="Delay between URL crawls in seconds (default: 0.5)",
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Test mode overrides limit
|
||||||
|
if args.test and not args.limit:
|
||||||
|
args.limit = 10
|
||||||
|
|
||||||
|
print("=" * 60)
|
||||||
|
print("LinkedIn Profile Scraper for Investor Members (crawl4ai)")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
if args.test:
|
||||||
|
print("\n⚠️ TEST MODE - No database changes will be made")
|
||||||
|
|
||||||
|
# Initialize database and scraper
|
||||||
|
db = get_db_session()
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Build query
|
||||||
|
query = db.query(InvestorMember, InvestorTable).join(
|
||||||
|
InvestorTable, InvestorMember.investor_id == InvestorTable.id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Filter existing if requested
|
||||||
|
if args.skip_existing:
|
||||||
|
query = query.filter(
|
||||||
|
(InvestorMember.linkedin.is_(None)) | (InvestorMember.linkedin == "")
|
||||||
|
)
|
||||||
|
print("\n✓ Filtering to members without LinkedIn profiles")
|
||||||
|
|
||||||
|
# Get total count
|
||||||
|
total_available = query.count()
|
||||||
|
print(f"\n✓ Found {total_available} members to process")
|
||||||
|
|
||||||
|
# Apply offset and limit
|
||||||
|
if args.start_from > 0:
|
||||||
|
query = query.offset(args.start_from)
|
||||||
|
print(f"✓ Starting from record {args.start_from}")
|
||||||
|
|
||||||
|
if args.limit:
|
||||||
|
query = query.limit(args.limit)
|
||||||
|
print(f"✓ Processing {args.limit} records")
|
||||||
|
|
||||||
|
# Fetch members
|
||||||
|
members_data = []
|
||||||
|
for member, investor in query.all():
|
||||||
|
members_data.append(
|
||||||
|
{
|
||||||
|
"id": member.id,
|
||||||
|
"name": member.name,
|
||||||
|
"company": investor.name,
|
||||||
|
"role": member.role,
|
||||||
|
"source_url": member.source_url,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
if not members_data:
|
||||||
|
print("\n⚠️ No members to process")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Count unique source URLs
|
||||||
|
unique_urls = len(set(m["source_url"] for m in members_data if m["source_url"]))
|
||||||
|
with_urls = sum(1 for m in members_data if m["source_url"])
|
||||||
|
|
||||||
|
print(f"\n✓ Loaded {len(members_data)} members")
|
||||||
|
print(
|
||||||
|
f"✓ {with_urls} members have source URLs ({unique_urls} unique pages to crawl)"
|
||||||
|
)
|
||||||
|
print(f"✓ {len(members_data) - with_urls} members without source URLs")
|
||||||
|
print(f"✓ Rate limit: {args.rate_limit}s between page crawls")
|
||||||
|
print("\nStarting LinkedIn profile search using crawl4ai...\n")
|
||||||
|
|
||||||
|
finally:
|
||||||
|
db.close()
|
||||||
|
|
||||||
|
# Initialize scraper
|
||||||
|
scraper = LinkedInProfileScraper(rate_limit_delay=args.rate_limit, use_cache=True)
|
||||||
|
|
||||||
|
print("ℹ️ Using crawl4ai to scrape team pages and extract LinkedIn URLs")
|
||||||
|
print(
|
||||||
|
"ℹ️ Profiles are saved to database IMMEDIATELY when found - safe to stop anytime!\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create database callback for real-time saving
|
||||||
|
db_callback, saved_count = create_db_callback(test_mode=args.test)
|
||||||
|
|
||||||
|
# Process members asynchronously with real-time DB saving
|
||||||
|
results = asyncio.run(
|
||||||
|
scraper.batch_find_profiles(
|
||||||
|
members_data, progress_callback=progress_callback, db_callback=db_callback
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
print_summary(results)
|
||||||
|
|
||||||
|
# Save results
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
results_file = f"linkedin_results_{timestamp}.json"
|
||||||
|
save_results(results, results_file)
|
||||||
|
|
||||||
|
# Show database update summary
|
||||||
|
if not args.test:
|
||||||
|
print(
|
||||||
|
f"\n✓ Database updated in real-time: {saved_count['count']} profiles saved"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
f"\n[TEST MODE] Would have saved {saved_count['count']} profiles to database"
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n✓ Done! You can resume anytime with --skip-existing")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user