feat: Implement company querying functionality with natural language processing and logging

2025-10-27 20:12:30 +01:00
parent 1ac755b2d7
commit ff0010019e
7 changed files with 225 additions and 70 deletions
@@ -1,4 +1,6 @@
 import io
 import logging
 import os
 import pandas as pd
 from db.db import Base, db_dependency, engine
@@ -13,7 +15,8 @@ from routers import (
    projects,
    report_route,
 )
-from schemas.router_schemas import InvestmentResponse, PaginatedResponse
+from schemas.router_schemas import CompanyData, InvestmentResponse, PaginatedResponse
 from services.company_querying import CompanyQueryProcessor
 from services.llm_parser import InvestorProcessor
 from services.querying import QueryProcessor
@@ -114,6 +117,27 @@ async def query_investors(request: QueryRequest):
    return results
@app.post(
    "/query-companies", response_model=PaginatedResponse[CompanyData], tags=["Querying"]
)
async def query_companies(request: QueryRequest):
    """
    Query companies using natural language.
    Returns company matches with their investor relationships and sectors.
    Supports queries like:
    - "Show me fintech companies founded in 2020"
    - "Find healthcare companies in San Francisco"
    - "Companies in the AI sector"
    - "Companies that received funding from Sequoia"
    - "European startups founded after 2019"
    """
    import asyncio

    def _run_query():
        # Building the agent and running it are both synchronous.
        return CompanyQueryProcessor().process_query(request.question)

    # The processor makes blocking LLM and database calls. Run it in a worker
    # thread so this coroutine does not stall the event loop for other requests.
    return await asyncio.to_thread(_run_query)
 # Register the investors, companies and projects routers on the application.
 app.include_router(investors.router)
 app.include_router(companies.router)
 app.include_router(projects.router)
@@ -1,15 +1,21 @@
 import os
 from typing import List
 from db.db import get_db
 from db.models import InvestorTable
 from fastapi import APIRouter, Depends, HTTPException
 from pydantic import BaseModel
-from services.crm import folk
+from services.crm import FolkAPI
 from sqlalchemy.orm import Session, selectinload
 # Router for the Folk CRM endpoints: every route is prefixed with /folk
 # and tagged "Folk CRM".
 router = APIRouter(prefix="/folk", tags=["Folk CRM"])
 def get_folk_client():
    """Build a FolkAPI client keyed by the FOLK_API_KEY environment variable.

    An empty key is passed when the variable is not set.
    """
    folk_api_key = os.getenv("FOLK_API_KEY", "")
    return FolkAPI(api_key=folk_api_key)
 class GroupResponse(BaseModel):
    id: str
    name: str
@@ -44,6 +50,7 @@ def get_folk_groups():
    to sync investors to Folk.
    """
    try:
        folk = get_folk_client()
        groups_data = folk.get_groups()
        items = groups_data.get("data", {}).get("items", [])
@@ -71,6 +78,7 @@ def sync_investors_to_folk(
    Returns:
        Summary of sync operation including successes and errors
    """
    folk = get_folk_client()
    # Fetch investors with their team members
    investors = (
        db.query(InvestorTable)
@@ -168,6 +168,7 @@ class InvestorFundData(BaseModel):
    class Config:
        from_attributes = True
 class InvestorMinimal(BaseModel):
    """Minimal investor info with just id and name"""
@@ -177,6 +178,7 @@ class InvestorMinimal(BaseModel):
    class Config:
        from_attributes = True
 class CompanySchemaMinimal(BaseModel):
    id: int
    name: str
@@ -188,9 +190,12 @@ class CompanySchemaMinimal(BaseModel):
    class Config:
        from_attributes = True
 class CompanyData(BaseModel):  # Renamed from CompaniesData for consistency
    # Response item pairing one company with its investors and sectors.
    company: CompanySchemaMinimal
    investors: List[InvestorMinimal]
    # Team members are currently not part of the response (field disabled).
    # members: List[CompanyMemberSchema] = []
    # Defaults to an empty list when no sectors are supplied.
    sectors: List[SectorSchema] = []
    class Config:
        # Allow populating the model from object attributes, not only dicts.
        from_attributes = True
@@ -0,0 +1,176 @@
 import logging
 import os
 from typing import List
 from db.db import DATABASE_URL, get_db
 from db.models import CompanyTable
 from langchain import hub
 from langchain_community.agent_toolkits import SQLDatabaseToolkit
 from langchain_community.utilities import SQLDatabase
 from langchain_openai import ChatOpenAI
 from langgraph.prebuilt import create_react_agent
 from schemas.router_schemas import CompanyData, PaginatedResponse
 from sqlalchemy.orm import selectinload
 logger = logging.getLogger(__name__)
 # Both objects below are created once, when this module is imported.
 # Fetch the shared SQL-agent system prompt from the LangChain hub.
 prompt_template = hub.pull("langchain-ai/sql-agent-system-prompt")
 # Open the database named by DATABASE_URL for the agent's SQL tools.
 # NOTE(review): CompanyQueryProcessor formats the prompt with a hard-coded
 # "SQLite" dialect — confirm DATABASE_URL always points at a SQLite database.
 db = SQLDatabase.from_uri(DATABASE_URL)
 class CompanyQueryProcessor:
    """Answer natural-language company questions through an LLM-driven SQL agent.

    The agent is only asked to find matching company IDs. The full records are
    then loaded through the ORM, so the API payload never depends on how the
    LLM chose to format its answer.
    """

    # Name of the toolkit tool that executes SQL and returns the result rows.
    _QUERY_TOOL_NAME = "sql_db_query"

    # Upper bound used by the free-text fallback to discard numbers that are
    # unlikely to be IDs (token counts, timestamps, ...).
    _MAX_FALLBACK_ID = 100000

    def __init__(self):
        """Create the chat model, the SQL toolkit and the ReAct agent."""
        self.llm = ChatOpenAI(
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
            model="openai/gpt-4o-mini",
            temperature=0,
        )
        self.toolkit = SQLDatabaseToolkit(db=db, llm=self.llm)
        # Extend the stock system prompt so the agent returns only company IDs.
        system_message_updated = (
            prompt_template.format(dialect="SQLite", top_k=5)
            + "\n\n=== CRITICAL INSTRUCTIONS ==="
            + "\n- Your ONLY task is to run SQL queries and extract company IDs"
            + "\n- When you get SQL results with company IDs, return them EXACTLY as shown"
            + "\n- If the SQL query returns rows with company IDs like [(1,), (5,), (9,)], return all those IDs"
            + "\n- Do NOT add any explanations, just list the IDs"
            + "\n- If a query returns NO ROWS (empty result), then respond with 'NO_RESULTS'"
            + "\n\n=== QUERY GUIDELINES ==="
            + "\n1. For sector searches: SELECT companies.id FROM companies JOIN company_sector ON companies.id = company_sector.company_id JOIN sectors ON company_sector.sector_id = sectors.id WHERE sectors.name LIKE '%sector_name%'"
            + "\n2. For industry searches: WHERE companies.industry LIKE '%search_term%'"
            + "\n3. For location searches: WHERE companies.location LIKE '%location%'"
            + "\n4. For founding year searches: WHERE companies.founded_year >= year"
            + "\n5. For investor-related: JOIN investor_companies table"
        )
        self.agent = create_react_agent(
            model=self.llm,
            tools=self.toolkit.get_tools(),
            prompt=system_message_updated,
        )

    def process_query(self, question: str) -> PaginatedResponse[CompanyData]:
        """Run the agent on ``question`` and return the matching companies.

        Args:
            question: The natural language query to process.

        Returns:
            A single-page ``PaginatedResponse`` holding every matched company.
        """
        response = self.agent.invoke(
            {"messages": [("user", question)]},
            config={"recursion_limit": 50},
        )
        logger.info("%s", response)
        messages = response["messages"]

        # Prefer the rows returned by the SQL tool itself. The user question,
        # schema dumps and intermediate reasoning also contain parentheses and
        # numbers (e.g. "VARCHAR(255)") and must not be mistaken for IDs.
        company_ids = self._ids_from_query_results(messages)
        if company_ids is None:
            # No usable tool output: fall back to the agent's final answer.
            final_message_content = messages[-1].content
            logger.info("AI Response: \n%s", final_message_content)
            company_ids = self._extract_company_ids_from_response(
                final_message_content
            )
        else:
            logger.info("Extracted %d company IDs from results", len(company_ids))

        # Fetch full company data with relationships using the IDs.
        return self._fetch_companies_by_ids(company_ids)

    def _ids_from_query_results(self, messages) -> "List[int] | None":
        """Return the IDs found in the most recent SQL query tool result.

        Args:
            messages: The agent's message history, oldest first.

        Returns:
            ``None`` when there is no usable query result (no query was run,
            the last one failed, or its rows do not start with an integer), so
            the caller can fall back to the final answer. Otherwise the list
            of IDs, which is empty when the last query produced no output.
        """
        for message in reversed(messages):
            if getattr(message, "type", None) != "tool":
                continue
            if getattr(message, "name", None) != self._QUERY_TOOL_NAME:
                continue
            raw = getattr(message, "content", "")
            text = (raw if isinstance(raw, str) else str(raw or "")).strip()
            if not text:
                # Empty tool output is taken to mean the query matched no rows
                # (presumably how the SQL tool reports an empty result set).
                return []
            if text.startswith("Error"):
                # The last query failed; its text holds no trustworthy IDs.
                return None
            return self._leading_tuple_ids(text) or None
        return None

    @staticmethod
    def _leading_tuple_ids(text: str) -> List[int]:
        """Return the leading integer of every ``(id, ...)`` tuple in ``text``."""
        import re

        # Matches "(1,)", "(1)" and the first column of "(1, 'Acme')".
        return [int(found) for found in re.findall(r"\(\s*(\d+)\s*[,)]", text)]

    def _extract_company_ids_from_response(self, ai_response: str) -> List[int]:
        """Extract company IDs from a free-text agent answer.

        Args:
            ai_response: The message content. Non-string content (such as a
                list of content blocks) is converted to text first.

        Returns:
            The IDs in order of appearance, or an empty list when the answer
            contains ``NO_RESULTS`` or no numbers at all.
        """
        import re

        text = ai_response if isinstance(ai_response, str) else str(ai_response)
        if "NO_RESULTS" in text.upper():
            return []

        # Preferred form: SQL-style tuples such as "(1,), (5,)".
        tuple_ids = self._leading_tuple_ids(text)
        if tuple_ids:
            return tuple_ids

        # Fallback: every standalone number small enough to be an ID.
        numbers = (int(found) for found in re.findall(r"\b\d+\b", text))
        return [number for number in numbers if number < self._MAX_FALLBACK_ID]

    def _fetch_companies_by_ids(
        self, company_ids: List[int]
    ) -> PaginatedResponse[CompanyData]:
        """Fetch companies and their relationships from the database by ID.

        Args:
            company_ids: List of company IDs to fetch.

        Returns:
            A one-page response with every company found. IDs that do not
            exist are silently absent. An empty ID list yields an empty page
            with the default page size of 10.
        """
        if not company_ids:
            return PaginatedResponse(
                items=[],
                total=0,
                page=1,
                page_size=10,
                total_pages=0,
            )

        db_session = next(get_db())
        try:
            # Eager-load the relationships so building the response does not
            # trigger one lazy query per company.
            companies = (
                db_session.query(CompanyTable)
                .options(
                    selectinload(CompanyTable.investors),
                    selectinload(CompanyTable.members),
                    selectinload(CompanyTable.sectors),
                )
                .filter(CompanyTable.id.in_(company_ids))
                .all()
            )
            company_data_list = [
                CompanyData(
                    company=company,
                    investors=company.investors,
                    members=company.members,
                    sectors=company.sectors,
                )
                for company in companies
            ]
            total_count = len(company_data_list)
            return PaginatedResponse(
                items=company_data_list,
                total=total_count,
                page=1,
                page_size=total_count,
                total_pages=1 if total_count > 0 else 0,
            )
        finally:
            db_session.close()
@@ -1,14 +1,24 @@
 import logging
 import os
 import sys
 import requests
 # Configure the root logger: INFO level, timestamped records, one stream
 # handler (StreamHandler() with no argument writes to stderr).
 # NOTE(review): this runs when the module is imported, so it sets up logging
 # for the whole process unless the root logger already has handlers —
 # confirm that is intended rather than configuring logging in the app entry point.
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
 )
 # Module-level logger used by FolkAPI.
 logger = logging.getLogger(__name__)
 class FolkAPI:
    BASE_URL = "https://api.folk.app/v1"
    def __init__(self, api_key: str):
        """Create a client that authenticates with a Folk bearer token.

        Args:
            api_key: Fallback key. It is used only when the FOLK_API_KEY
                environment variable is not set; the variable takes precedence.
        """
        # A missing key (None) is normalised to "" so the header stays a
        # well-formed string and nothing below fails on a non-string value.
        api_key = os.environ.get("FOLK_API_KEY", api_key) or ""
        self.headers = {"Authorization": f"Bearer {api_key}"}
        # Never write key material to the logs, not even a prefix.
        if api_key:
            logger.info("FolkAPI initialized with an API key")
        else:
            logger.warning("FolkAPI initialized without an API key")
    def get_groups(self):
        """Fetch all groups from Folk."""
@@ -190,71 +200,3 @@ class FolkAPI:
        response.raise_for_status()
        return response.json()
 # The API key must come from the FOLK_API_KEY environment variable.
 # SECURITY: a real key used to be hard-coded here as the fallback. Secrets do
 # not belong in source control; that key should be treated as compromised and
 # rotated. With the variable unset the client is created with an empty key.
 DEFAULT_API_KEY = ""
 api_key = os.environ.get("FOLK_API_KEY", DEFAULT_API_KEY)
 folk = FolkAPI(api_key=api_key)
 def example_flow():
    """Demonstrate the Folk client: list groups, then create a company and a person.

    Prints each API result. Exits the process with status 1 when no group,
    or no id on the first group, is available to attach the new records to.
    """

    def _abort(reason):
        # Report the problem and stop the script with a failure status.
        print(reason)
        sys.exit(1)

    # Step 1: list the groups and show the raw response.
    groups_response = folk.get_groups()
    print(groups_response)

    # Groups are read from groups_response["data"]["items"]; missing keys
    # simply yield an empty list.
    group_items = groups_response.get("data", {}).get("items", [])
    if not group_items:
        _abort("No groups returned by Folk API.")

    # The first group serves as the example target.
    target_group_id = group_items[0].get("id")
    if not target_group_id:
        _abort("No id found for the first group item.")

    # Step 2: create a company inside that group.
    company_response = folk.create_company(
        name="2050 Investment Partners",
        group_id=target_group_id,
        website="https://2050.com",
        linkedin_url="https://linkedin.com/company/2050-investments",
    )

    # Step 3: create a person linked to the new company and the same group.
    new_company_id = company_response.get("data", {}).get("id")
    person_response = folk.create_person(
        first_name="John",
        last_name="Doe",
        email="john@2050.com",
        company_id=new_company_id,
        group_id=target_group_id,
    )

    print("Created company:", company_response)
    print("Created person:", person_response)
 if __name__ == "__main__":
    # Script entry point: run the demo and turn any failure into exit status 1.
    try:
        example_flow()
    except requests.HTTPError as http_err:
        # Include the response status and body, when present, to ease debugging.
        failed_response = getattr(http_err, "response", None)
        if failed_response is None:
            print("HTTP error while talking to Folk API:", http_err)
        else:
            try:
                payload = failed_response.text
            except Exception:
                payload = "<unreadable response body>"
            print("HTTP error while talking to Folk API:", http_err)
            print("Response status:", failed_response.status_code)
            print("Response body:", payload)
        sys.exit(1)
    except Exception as unexpected:  # pragma: no cover - top-level safety
        print("Unexpected error:", unexpected)
        sys.exit(1)