Initial commit

2025-08-04 14:50:33 +01:00
commit 40b28a7ee3
30 changed files with 3410 additions and 0 deletions
@@ -0,0 +1,84 @@
 # Python
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 # Virtual Environment
 venv/
 env/
 ENV/
 .venv/
 # IDE
 .vscode/
 .idea/
 *.swp
 *.swo
 *~
 # OS
 .DS_Store
 .DS_Store?
 ._*
 .Spotlight-V100
 .Trashes
 ehthumbs.db
 Thumbs.db
 # Environment variables
 .env
 .env.local
 .env.*.local
 # Data directories
 data/
 logs/
 *.log
 # Vector embeddings and databases
 *.db
 *.sqlite
 *.sqlite3
 embeddings/
 chroma/
 # Uploads and temporary files
 uploads/
 temp/
 tmp/
 # Test file
 tests/
 # Jupyter Notebooks
 .ipynb_checkpoints/
 *.ipynb
 # Model files
 *.model
 *.pkl
 *.joblib
 # Coverage reports
 htmlcov/
 .coverage
 .coverage.*
 coverage.xml
@@ -0,0 +1,155 @@
 # Semantic Search Engine POC
 A proof-of-concept semantic search engine for archival documents. It demonstrates how advanced search can work across different types of files, such as PDFs, XML files, and more.
 ## Project Overview
 This POC addresses the requirements for a future full-scale semantic search system capable of:
 - **Entity-centric search** across persons, places, events, buildings, and organizations
 - **Multi-modal document processing** (PDFs, XML, text, images, audio, video)
 - **Semantic similarity search** using modern embedding techniques
 - **Relationship discovery** between entities across documents
 - **Access control** for public vs. restricted documents
 - **Scalable architecture** for production deployment
 ## Architecture
 ```
 ┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
 │   Document      │    │   Entity        │    │   Vector        │
 │   Processor     │───▶│   Extractor     │───▶│   Store         │
 └─────────────────┘    └─────────────────┘    └─────────────────┘
         │                       │                       │
         ▼                       ▼                       ▼
 ┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
 │   Text          │    │   Named Entity  │    │   Embeddings    │
 │   Extraction    │    │   Recognition   │    │   (ChromaDB)    │
 └─────────────────┘    └─────────────────┘    └─────────────────┘
                                │
                                ▼
                      ┌─────────────────┐
                      │   Search        │
                      │   Service       │
                      └─────────────────┘
 ```
 ### Prerequisites
 - Python 3.8+
 - pip
 - Git
 ### Installation
 1. **Clone the repository**
 ```bash
 git clone <repository-url>
 cd semantic_search_poc
 ```
 2. **Create virtual environment**
 ```bash
 python -m venv venv
 source venv/bin/activate  # On Windows: venv\Scripts\activate
 ```
 3. **Install dependencies**
 ```bash
 pip install -r requirements.txt
 python -m spacy download en_core_web_sm
 ```
 4. **Initialize the environment**
 ```bash
 python scripts/setup_data.py
 ```
 5. **Run the POC**
 ```bash
 python -m src.main
 ```
 ### Expected Output
 The POC will demonstrate:
 - Document processing and indexing
 - Semantic search across sample documents
 - Entity extraction and relationship discovery
 - Performance metrics and statistics
 ## Features
 ### Document Processing
 - **PDF text extraction** using PyPDF2
 - **XML parsing** for finding aids
 - **DOCX support** for modern documents
 - **Metadata extraction** (title, author, creation date, keywords)
 - **Multi-language support** (currently optimized for English)
 ### Entity Recognition
 - **Named Entity Recognition** using spaCy
 - **Custom entity types**: Person, Place, Event, Organization, Building, Date
 - **Relationship extraction** between entities
 - **Confidence scoring** for entity matches
 ### Semantic Search
 - **Vector embeddings** using Sentence-BERT (`all-MiniLM-L6-v2`)
 - **Similarity search** with configurable thresholds
 - **Hybrid search** combining semantic and keyword matching
 - **Entity-filtered search** results
 ### Vector Storage
 - **ChromaDB integration** for persistent vector storage
 - **Scalable indexing** for large document collections
 - **Metadata filtering** and search optimization
 ## Configuration
 Key settings in `config/settings.py`:
 ```python
 # Embedding Model
 EMBEDDING_MODEL = "all-MiniLM-L6-v2"
 EMBEDDING_DIMENSION = 384
 # Search Parameters
 MAX_SEARCH_RESULTS = 50
 SIMILARITY_THRESHOLD = 0.2
 # File Processing
 MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
 ALLOWED_EXTENSIONS = [".pdf", ".txt", ".docx", ".xml"]
 ```
 ## Project Structure
 ```
 semantic_search_poc/
 ├── README.md
 ├── requirements.txt
 ├── .env.example
 ├── config/
 │   └── settings.py              # Configuration settings
 ├── src/
 │   ├── main.py                  # Main application entry point
 │   ├── models/
 │   │   ├── document.py          # Document data models
 │   │   └── search_result.py     # Search result models
 │   ├── services/
 │   │   ├── document_processor.py # Document processing pipeline
 │   │   ├── embedding_service.py  # Embedding generation
 │   │   ├── entity_extractor.py   # Named entity recognition
 │   │   ├── search_service.py     # Main search functionality
 │   │   └── vector_store.py       # Vector database operations
 │   └── utils/
 │       ├── file_handlers.py      # File processing utilities
 │       └── logger.py             # Logging configuration
 ├── data/
 │   ├── raw/                     # Input documents
 │   ├── processed/               # Processed document metadata
 │   └── embeddings/              # Vector embeddings storage
 ├── tests/                       # Unit tests
 ├── notebooks/                   # Jupyter notebooks for analysis
 └── scripts/                     # Utility scripts
 ```
@@ -0,0 +1,61 @@
 """Configuration settings for the semantic search POC."""
 import os
 from typing import List, Optional
 from pydantic_settings import BaseSettings
 class Settings(BaseSettings):
    """Application settings.

    Every attribute is a typed field with an in-code default. The nested
    ``Config`` names ``.env`` as the env file and enables case-sensitive
    matching of variable names.
    """
    # Application identity and debug switch (DEBUG also drives uvicorn reload)
    APP_NAME: str = "Semantic Search POC"
    VERSION: str = "0.1.0"
    DEBUG: bool = True
    # API bind address and route prefix
    API_HOST: str = "localhost"
    API_PORT: int = 8000
    API_PREFIX: str = "/api/v1"
    # Database connection URL (SQLite file under ./data by default)
    DATABASE_URL: str = "sqlite:///./data/semantic_search.db"
    # Vector store backend selection and its on-disk locations
    VECTOR_STORE_TYPE: str = "chroma"  # chroma, faiss
    CHROMA_PERSIST_DIR: str = "./data/embeddings/chroma"
    FAISS_INDEX_PATH: str = "./data/embeddings/faiss"
    # Embedding model name and the vector dimension expected from it
    EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"
    EMBEDDING_DIMENSION: int = 384
    # Entity extraction: spaCy model name, plus an optional custom NER model
    SPACY_MODEL: str = "en_core_web_sm"
    CUSTOM_NER_MODEL: Optional[str] = None
    # Document processing limits: max upload size in bytes, accepted suffixes
    MAX_FILE_SIZE: int = 50 * 1024 * 1024  # 50MB
    ALLOWED_EXTENSIONS: List[str] = [".pdf", ".txt", ".docx", ".xml"]
    # Search defaults
    MAX_SEARCH_RESULTS: int = 50
    SIMILARITY_THRESHOLD: float = 0.2
    # Directories (relative paths, resolved against the working directory)
    DATA_DIR: str = "./data"
    RAW_DATA_DIR: str = "./data/raw"
    PROCESSED_DATA_DIR: str = "./data/processed"
    UPLOAD_DIR: str = "./data/uploads"
    # Logging: LOG_LEVEL is looked up by name on the ``logging`` module by callers
    LOG_LEVEL: str = "INFO"
    LOG_FILE: str = "./logs/app.log"
    class Config:
        # Read overrides from a local .env file; names must match case exactly
        env_file = ".env"
        case_sensitive = True
 # Global settings instance, created once at import time and shared by importers
 settings = Settings()
@@ -0,0 +1,56 @@
 import asyncio
 import sys
 import uvicorn
 from pathlib import Path
 import logging
 # Add src to path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from config.settings import settings
 def main():
    """Configure logging, create runtime directories and launch uvicorn.

    The server is bound to ``settings.API_HOST``/``settings.API_PORT``;
    auto-reload follows ``settings.DEBUG`` and uvicorn's log level is the
    lower-cased ``settings.LOG_LEVEL``.
    """
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=getattr(logging, settings.LOG_LEVEL),
    )
    log = logging.getLogger(__name__)
    log.info("Starting Semantic Search API Server...")
    # Ensure every directory the server relies on exists before start-up
    required_dirs = (
        settings.DATA_DIR,
        settings.UPLOAD_DIR,
        Path(settings.LOG_FILE).parent,
        "templates",
    )
    for required in required_dirs:
        Path(required).mkdir(parents=True, exist_ok=True)
    # Banner with the URLs a developer is most likely to need
    base_url = f"http://{settings.API_HOST}:{settings.API_PORT}"
    print(f"""
          Starting Semantic Search API Server
          Server URL: {base_url}
          API Docs: {base_url}/docs
          Frontend: {base_url}/
          Health Check: {base_url}/health
 Press Ctrl+C to stop the server
    """)
    # Hand control to uvicorn; the app is given as an import string
    uvicorn.run(
        "src.api.routes:app",
        host=settings.API_HOST,
        port=settings.API_PORT,
        reload=settings.DEBUG,
        log_level=settings.LOG_LEVEL.lower(),
    )
 if __name__ == "__main__":
    main()
@@ -0,0 +1,233 @@
 """Setup script to initialize the POC environment."""
 import asyncio
 import logging
 import sys
 from pathlib import Path
 # Add src to path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from src.services.vector_store import VectorStore
 from src.services.document_processor import DocumentProcessor
 from config.settings import settings
 async def setup_environment():
    """Set up the POC environment.

    Steps, in order:
      1. create the data, upload, raw-data sub-directories and the log directory;
      2. initialize the vector store;
      3. write three sample text documents (skipped individually if present);
      4. write a default ``.env`` file if none exists in the working directory.

    All text files are written explicitly as UTF-8 so the accented characters
    in the sample documents do not depend on the platform's default encoding.
    Any exception is logged with its traceback and reported on stdout; it is
    not re-raised.
    """
    print("🚀 Setting up Semantic Search POC...")
    # Configure logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    try:
        # Create directories
        directories = [
            settings.DATA_DIR,
            settings.RAW_DATA_DIR,
            settings.PROCESSED_DATA_DIR,
            settings.UPLOAD_DIR,
            f"{settings.RAW_DATA_DIR}/sample_documents",
            f"{settings.RAW_DATA_DIR}/pdfs",
            f"{settings.RAW_DATA_DIR}/xml",
            Path(settings.LOG_FILE).parent
        ]
        for directory in directories:
            Path(directory).mkdir(parents=True, exist_ok=True)
            print(f"✅ Created directory: {directory}")
        # Initialize vector store
        print("\n📚 Initializing vector store...")
        vector_store = VectorStore()
        await vector_store.initialize()
        print("✅ Vector store initialized")
        # Create sample documents
        sample_docs_dir = Path(f"{settings.RAW_DATA_DIR}/sample_documents")
        # Sample document 1: Napoleon biography
        sample1 = sample_docs_dir / "napoleon_biography.txt"
        if not sample1.exists():
            sample1.write_text("""
 Napoleon Bonaparte: A Brief Biography
 Napoleon Bonaparte (1769-1821) was a French military general and political leader who rose to prominence during the French Revolution. Born in Corsica, Napoleon became Emperor of the French in 1804.
 Early Life and Rise to Power
 Napoleon was born in Ajaccio, Corsica, to Charles Buonaparte and Letizia Ramolino Bonaparte. He attended military school in France and quickly distinguished himself as a brilliant strategist.
 Military Campaigns
 Napoleon led numerous military campaigns across Europe, including:
 - The Italian Campaign (1796-1797)
 - The Egyptian Campaign (1798-1801)
 - The Austerlitz Campaign (1805)
 - The Russian Campaign (1812)
 Napoleon's forces occupied much of continental Europe at the height of his power. He established the Continental System to weaken Britain economically.
 Political Reforms
 As Emperor, Napoleon implemented significant reforms:
 - The Napoleonic Code (Civil Code)
 - Educational reforms
 - Infrastructure development
 - Administrative reorganization
 Exile and Death
 After defeat at the Battle of Leipzig in 1813 and subsequent abdication, Napoleon was exiled to Elba. He returned for the Hundred Days but was defeated at Waterloo in 1815. He was then exiled to Saint Helena, where he died in 1821.
 Legacy
 Napoleon's influence on European law, politics, and military strategy continues to this day. His reforms and conquests shaped the modern European state system.
            """.strip(), encoding="utf-8")
            print(f"✅ Created sample document: {sample1.name}")
        # Sample document 2: French Revolution overview
        sample2 = sample_docs_dir / "french_revolution.txt"
        if not sample2.exists():
            sample2.write_text("""
 The French Revolution (1789-1799): An Overview
 The French Revolution was a period of radical political and societal change in France that began with the Estates-General of 1789 and ended with the formation of the French Consulate in November 1799.
 Causes of the Revolution
 - Economic crisis and debt
 - Social inequality under the Ancien Régime
 - Influence of Enlightenment ideas
 - Weak leadership under Louis XVI
 Key Events and Phases
 The Moderate Phase (1789-1792)
 - Storming of the Bastille (July 14, 1789)
 - Declaration of the Rights of Man and of the Citizen
 - Abolition of feudalism
 - Civil Constitution of the Clergy
 The Radical Phase (1792-1794)
 - Execution of Louis XVI (January 21, 1793)
 - Reign of Terror under Maximilien Robespierre
 - Committee of Public Safety
 - Revolutionary Wars against European coalitions
 The Thermidorian Reaction (1794-1799)
 - Fall of Robespierre (July 27, 1794)
 - Directory period
 - Rise of Napoleon Bonaparte
 Important Figures
 - Louis XVI - King of France
 - Marie Antoinette - Queen of France
 - Maximilien Robespierre - Jacobin leader
 - Georges Danton - Revolutionary leader
 - Jean-Paul Marat - Radical journalist
 - Jacques Necker - Finance Minister
 Geographic Centers
 The revolution centered around Paris, with key locations including:
 - Palace of Versailles
 - Tuileries Palace
 - Place de la Concorde (formerly Place Louis XV)
 - Conciergerie prison
 Impact and Legacy
 The French Revolution fundamentally changed French society and had lasting effects on European politics, inspiring democratic movements worldwide and establishing principles of popular sovereignty and individual rights.
            """.strip(), encoding="utf-8")
            print(f"✅ Created sample document: {sample2.name}")
        # Sample document 3: Architecture of Paris
        sample3 = sample_docs_dir / "paris_architecture.txt"
        if not sample3.exists():
            sample3.write_text("""
 Architectural Marvels of Paris
 Paris, the City of Light, is renowned for its stunning architecture spanning centuries of French history and culture.
 Medieval Architecture
 - Notre-Dame Cathedral: Gothic masterpiece on Île de la Cité
 - Sainte-Chapelle: Royal chapel with magnificent stained glass
 - Saint-Germain-des-Prés: Ancient abbey church
 Renaissance and Classical Period
 - Louvre Palace: Royal residence turned world's largest museum
 - Luxembourg Palace: Baroque palace and gardens
 - Place des Vosges: Oldest planned square in Paris
 Haussmann's Paris (19th Century)
 Baron Georges-Eugène Haussmann transformed Paris under Napoleon III:
 - Wide boulevards and avenues
 - Standardized building heights and facades
 - Parks and squares system
 - Sewerage and water systems
 Notable Haussmannian Buildings:
 - Opéra Garnier: Neo-baroque opera house
 - Grands Boulevards: Commercial and social centers
 - Residential buildings with characteristic iron balconies
 Modern and Contemporary Architecture
 - Eiffel Tower (1889): Iron lattice tower by Gustave Eiffel
 - Centre Pompidou (1977): High-tech architecture
 - Louvre Pyramid (1989): I.M. Pei's glass pyramid
 - Institut du Monde Arabe: Jean Nouvel's modern interpretation
 Architectural Districts
 - Marais: Medieval and Renaissance architecture
 - Saint-Germain-des-Prés: Literary and artistic quarter
 - Montmartre: Village atmosphere with Sacré-Cœur Basilica
 - La Défense: Modern business district with Grande Arche
 Building Materials and Techniques
 Traditional Parisian architecture features:
 - Lutetian limestone (Pierre de Paris)
 - Mansard roofs with zinc coverings
 - Iron work and balconies
 - Large windows and shutters
 Conservation Efforts
 Paris maintains strict building codes to preserve its architectural heritage while allowing for contemporary additions that complement the historic urban fabric.
            """.strip(), encoding="utf-8")
            print(f"✅ Created sample document: {sample3.name}")
        print(f"\n📝 Created {len(list(sample_docs_dir.glob('*.txt')))} sample documents")
        # Create .env file if it doesn't exist
        env_file = Path(".env")
        if not env_file.exists():
            env_content = """# Semantic Search POC Configuration
 DEBUG=True
 LOG_LEVEL=INFO
 # Database
 DATABASE_URL=sqlite:///./data/semantic_search.db
 # Vector Store
 VECTOR_STORE_TYPE=chroma
 CHROMA_PERSIST_DIR=./data/embeddings/chroma
 # Embedding Model
 EMBEDDING_MODEL=all-MiniLM-L6-v2
 EMBEDDING_DIMENSION=384
 # Search Settings
 MAX_SEARCH_RESULTS=50
 SIMILARITY_THRESHOLD=0.2
 # File Upload
 MAX_FILE_SIZE=52428800
 """
            env_file.write_text(env_content, encoding="utf-8")
            print("Created .env configuration file")
        print("\nSetup complete! You can now:")
        print("1. Run: python -m src.main")
        print("2. Or process documents: python scripts/process_documents.py")
        print("3. Or start the API server: python scripts/run_server.py")
    except Exception as e:
        # logger.exception keeps the traceback so a failed setup can be diagnosed
        logger.exception("Setup failed: %s", e)
        print(f"❌ Setup failed: {str(e)}")
 if __name__ == "__main__":
    asyncio.run(setup_environment())
@@ -0,0 +1,390 @@
 import time
 import logging
 from typing import List, Optional
 from fastapi import FastAPI, HTTPException, UploadFile, File, Query, Depends
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 from fastapi.responses import HTMLResponse, FileResponse
 from pathlib import Path
 from ..services.search_service import SearchService
 from ..services.document_processor import DocumentProcessor
 from ..models.document import EntityType, DocumentType, AccessLevel
 from ..models.search_result import SearchResponse, EntitySearchResult
 from .schemas import SearchRequest, DocumentUploadResponse, StatsResponse
 from config.settings import settings
 # Initialize FastAPI app
 app = FastAPI(
    title="Semantic Search API",
    description="Intelligent semantic search engine for archival documents",
    version="0.1.0"
 )
 # Add CORS middleware.
 # Credentials are disabled because origins are wildcarded: combining
 # allow_origins=["*"] with allow_credentials=True would let any site make
 # credentialed cross-origin requests. To allow cookies/auth headers, list
 # explicit origins instead of "*".
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
 )
 # Mount static files for frontend (only when the directory exists, so the
 # API can still start without frontend assets)
 static_dir = Path(__file__).parent.parent.parent / "templates" / "static"
 if static_dir.exists():
    app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
 # Initialize services once per process; handlers receive them via Depends
 search_service = SearchService()
 document_processor = DocumentProcessor()
 logger = logging.getLogger(__name__)
 # Dependency to get search service
 async def get_search_service():
    """Return the module-level ``search_service`` instance for use with ``Depends``."""
    return search_service
 # Dependency to get document processor
 async def get_document_processor():
    """Return the module-level ``document_processor`` instance for use with ``Depends``."""
    return document_processor
@app.on_event("startup")
 async def startup_event():
    """Initialize services on startup.

    Awaits ``initialize()`` on the search service's vector store and logs
    before and after.
    """
    logger.info("Initializing API services...")
    await search_service.vector_store.initialize()
    logger.info("API services initialized successfully")
@app.get("/", response_class=HTMLResponse)
 async def read_root():
    """Serve the main frontend page."""
    template_path = Path(__file__).parent.parent.parent / "templates" / "index.html"
    if template_path.exists():
        return FileResponse(str(template_path))
    return HTMLResponse("<h1>Semantic Search API</h1><p>Frontend template not found</p>")
@app.get("/health")
 async def health_check():
    """Health check endpoint."""
    return {"status": "healthy", "timestamp": time.time()}
@app.post("/api/v1/search", response_model=SearchResponse)
 async def search_documents(
    request: SearchRequest,
    search_service: SearchService = Depends(get_search_service)
 ):
    """
    Perform semantic search across documents.
    - **query**: Search query string
    - **limit**: Maximum number of results (default: 10)
    - **entity_types**: Filter by entity types (optional)
    - **similarity_threshold**: Minimum similarity score (optional)
    """
    start_time = time.time()
    try:
        results = await search_service.semantic_search(
            query=request.query,
            limit=request.limit,
            entity_types=request.entity_types,
            similarity_threshold=request.similarity_threshold
        )
        search_time = (time.time() - start_time) * 1000  # Convert to milliseconds
        return SearchResponse(
            query=request.query,
            results=results,
            total_results=len(results),
            search_time_ms=search_time,
            filters_applied={
                "entity_types": [et.value for et in request.entity_types] if request.entity_types else [],
                "similarity_threshold": request.similarity_threshold
            }
        )
    except Exception as e:
        logger.error(f"Search error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")
@app.get("/api/v1/search", response_model=SearchResponse)
 async def search_documents_get(
    q: str = Query(..., description="Search query"),
    limit: int = Query(10, ge=1, le=100, description="Maximum number of results"),
    entity_types: Optional[str] = Query(None, description="Comma-separated entity types"),
    threshold: Optional[float] = Query(None, ge=0.0, le=1.0, description="Similarity threshold"),
    search_service: SearchService = Depends(get_search_service)
 ):
    """GET version of search endpoint for simple queries."""
    # Parse entity types
    parsed_entity_types = None
    if entity_types:
        try:
            parsed_entity_types = [EntityType(et.strip()) for et in entity_types.split(",")]
        except ValueError as e:
            raise HTTPException(status_code=400, detail=f"Invalid entity type: {str(e)}")
    # Create search request
    request = SearchRequest(
        query=q,
        limit=limit,
        entity_types=parsed_entity_types,
        similarity_threshold=threshold
    )
    return await search_documents(request, search_service)
@app.get("/api/v1/entities/{entity_type}", response_model=EntitySearchResult)
 async def search_by_entity(
    entity_type: EntityType,
    text: str = Query(..., description="Entity text to search for"),
    limit: int = Query(10, ge=1, le=100, description="Maximum number of results"),
    search_service: SearchService = Depends(get_search_service)
 ):
    """
    Search for documents containing specific entities.
    - **entity_type**: Type of entity (person, place, event, etc.)
    - **text**: Entity text to search for
    - **limit**: Maximum number of results
    """
    try:
        results = await search_service.search_by_entity(
            entity_text=text,
            entity_type=entity_type,
            limit=limit
        )
        return EntitySearchResult(
            entity_text=text,
            entity_type=entity_type.value,
            documents=results,
            total_occurrences=len(results)
        )
    except Exception as e:
        logger.error(f"Entity search error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Entity search failed: {str(e)}")
@app.get("/api/v1/entities/{entity_text}/relationships")
 async def get_entity_relationships(
    entity_text: str,
    search_service: SearchService = Depends(get_search_service)
 ):
    """
    Get relationships for a specific entity.
    - **entity_text**: Entity to find relationships for
    """
    try:
        relationships = await search_service.get_entity_relationships(entity_text)
        return {
            "entity": entity_text,
            "relationships": relationships,
            "total_relationships": sum(len(entities) for entities in relationships.values())
        }
    except Exception as e:
        logger.error(f"Relationship search error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Relationship search failed: {str(e)}")
@app.post("/api/v1/documents/upload", response_model=DocumentUploadResponse)
 async def upload_document(
    file: UploadFile = File(...),
    access_level: AccessLevel = AccessLevel.PUBLIC,
    document_processor: DocumentProcessor = Depends(get_document_processor)
 ):
    """
    Upload and process a new document.
    - **file**: Document file to upload
    - **access_level**: Access level for the document (public, restricted, private)
    """
    try:
        # Validate file type
        if not file.filename:
            raise HTTPException(status_code=400, detail="No filename provided")
        file_extension = Path(file.filename).suffix.lower()
        if file_extension not in settings.ALLOWED_EXTENSIONS:
            raise HTTPException(
                status_code=400, 
                detail=f"File type {file_extension} not supported. Allowed: {settings.ALLOWED_EXTENSIONS}"
            )
        # Validate file size
        content = await file.read()
        if len(content) > settings.MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Maximum size: {settings.MAX_FILE_SIZE / (1024*1024):.1f}MB"
            )
        # Save uploaded file
        upload_dir = Path(settings.UPLOAD_DIR)
        upload_dir.mkdir(parents=True, exist_ok=True)
        file_path = upload_dir / file.filename
        with open(file_path, 'wb') as f:
            f.write(content)
        # Process document
        document = await document_processor.process_document(str(file_path), access_level)
        if not document:
            raise HTTPException(status_code=500, detail="Failed to process document")
        return DocumentUploadResponse(
            document_id=document.id,
            filename=document.filename,
            status="processed",
            entities_found=len(document.entities),
            access_level=document.access_level.value,
            message="Document uploaded and processed successfully"
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Upload error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}")
@app.get("/api/v1/documents/{document_id}")
 async def get_document(
    document_id: str,
    document_processor: DocumentProcessor = Depends(get_document_processor)
 ):
    """
    Get document details by ID.
    - **document_id**: Unique document identifier
    """
    try:
        document = await document_processor.get_document_by_id(document_id)
        if not document:
            raise HTTPException(status_code=404, detail="Document not found")
        # Return document without full content for performance
        return {
            "id": document.id,
            "filename": document.filename,
            "document_type": document.document_type.value,
            "access_level": document.access_level.value,
            "metadata": document.metadata,
            "entities": [{
                "text": e.text,
                "entity_type": e.entity_type.value,
                "confidence": e.confidence
            } for e in document.entities],
            "created_at": document.created_at,
            "updated_at": document.updated_at
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Document retrieval error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Failed to retrieve document: {str(e)}")
@app.delete("/api/v1/documents/{document_id}")
 async def delete_document(
    document_id: str,
    document_processor: DocumentProcessor = Depends(get_document_processor)
 ):
    """
    Delete a document by ID.
    - **document_id**: Unique document identifier
    """
    try:
        success = await document_processor.delete_document(document_id)
        if not success:
            raise HTTPException(status_code=404, detail="Document not found or deletion failed")
        return {"message": f"Document {document_id} deleted successfully"}
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Document deletion error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Failed to delete document: {str(e)}")
@app.get("/api/v1/stats", response_model=StatsResponse)
 async def get_stats(
    search_service: SearchService = Depends(get_search_service),
    document_processor: DocumentProcessor = Depends(get_document_processor)
 ):
    """
    Get system statistics and metrics.
    """
    try:
        # Get processing stats
        processing_stats = await document_processor.get_processing_stats()
        # Get vector store stats
        vector_stats = await search_service.vector_store.get_collection_stats()
        return StatsResponse(
            total_documents=processing_stats.get("total_documents", 0),
            document_types=processing_stats.get("document_types", {}),
            access_levels=processing_stats.get("access_levels", {}),
            languages=processing_stats.get("languages", {}),
            total_entities=processing_stats.get("total_entities", 0),
            average_entities_per_doc=processing_stats.get("average_entities_per_doc", 0.0),
            vector_store_stats=vector_stats,
            system_status="operational"
        )
    except Exception as e:
        logger.error(f"Stats error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Failed to retrieve stats: {str(e)}")
@app.get("/api/v1/entity-types")
 async def get_entity_types():
    """Get available entity types."""
    return {
        "entity_types": [
            {"value": et.value, "label": et.value.title()} 
            for et in EntityType
        ]
    }
@app.get("/api/v1/document-types")
 async def get_document_types():
    """Get supported document types."""
    return {
        "document_types": [
            {"value": dt.value, "label": dt.value.upper()} 
            for dt in DocumentType
        ]
    }
 # Allow running this module directly; the app is passed as an import string
 # and host, port and reload come from settings.
 if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "src.api.routes:app",
        host=settings.API_HOST,
        port=settings.API_PORT,
        reload=settings.DEBUG
    )
@@ -0,0 +1,40 @@
 from typing import List, Optional, Dict, Any
 from pydantic import BaseModel, Field
 from ..models.document import EntityType, AccessLevel
 class SearchRequest(BaseModel):
    """Search request schema."""
    # Required, non-empty query text
    query: str = Field(..., description="Search query string", min_length=1)
    # Result cap, constrained to 1..100 (default 10)
    limit: int = Field(10, ge=1, le=100, description="Maximum number of results")
    # Optional entity-type filter; None means no filter was supplied
    entity_types: Optional[List[EntityType]] = Field(None, description="Filter by entity types")
    # Optional minimum similarity in [0.0, 1.0]; None means not supplied
    similarity_threshold: Optional[float] = Field(None, ge=0.0, le=1.0, description="Minimum similarity score")
 class DocumentUploadResponse(BaseModel):
    """Document upload response schema."""
    # Identifier of the processed document
    document_id: str
    # Filename as recorded on the processed document
    filename: str
    # Processing status string (the upload route sets "processed")
    status: str
    # Number of entities found in the document
    entities_found: int
    # Access level value as a string
    access_level: str
    # Human-readable outcome message
    message: str
 class StatsResponse(BaseModel):
    """System statistics response schema."""
    # Total number of documents
    total_documents: int
    # Counts keyed by document type
    document_types: Dict[str, int]
    # Counts keyed by access level
    access_levels: Dict[str, int]
    # Counts keyed by language
    languages: Dict[str, int]
    # Total number of entities
    total_entities: int
    # Average number of entities per document
    average_entities_per_doc: float
    # Free-form statistics reported by the vector store
    vector_store_stats: Dict[str, Any]
    # Overall status string (the stats route sets "operational")
    system_status: str
 class ErrorResponse(BaseModel):
    """Error response schema."""
    # Short error label
    error: str
    # Longer explanation of the error
    detail: str
    # Time of the error as a float (presumably Unix seconds; confirm with producers)
    timestamp: float
@@ -0,0 +1,27 @@
 # app.py
 from fastapi import FastAPI, Request, Form
 from fastapi.templating import Jinja2Templates
 from fastapi.responses import HTMLResponse
 from src.services.vector_store import VectorStore
 import uvicorn
 import asyncio
 # Standalone form-based search app: templates are loaded from ./templates and
 # a single VectorStore instance is shared by all requests.
 app = FastAPI()
 templates = Jinja2Templates(directory="templates")
 vector_store = VectorStore()
@app.on_event("startup")
 async def startup_event():
    """Initialize the shared vector store when the application starts."""
    await vector_store.initialize()
@app.get("/", response_class=HTMLResponse)
 async def form_get(request: Request):
    return templates.TemplateResponse("search.html", {"request": request, "results": None})
@app.post("/", response_class=HTMLResponse)
 async def form_post(request: Request, query: str = Form(...)):
    results = await vector_store.search(query, threshold=0.2)
    return templates.TemplateResponse("search.html", {"request": request, "results": results})
 # Script entry point: serve this module's ``app`` with auto-reload enabled.
 if __name__ == "__main__":
    uvicorn.run("app:app", reload=True)
@@ -0,0 +1,172 @@
 """Main application entry point."""
 import asyncio
 import logging
 from pathlib import Path
 from .services.search_service import SearchService
 from .services.document_processor import DocumentProcessor
 from .models.document import EntityType
 from config.settings import settings
 # Configure logging
 # The level is looked up by name on the logging module, so
 # settings.LOG_LEVEL must be an attribute name such as "INFO" or "DEBUG";
 # an unknown name raises AttributeError when this module is imported.
 logging.basicConfig(
    level=getattr(logging, settings.LOG_LEVEL),
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 # Module-level logger used by the class and functions in this file.
 logger = logging.getLogger(__name__)
 class SemanticSearchPOC:
    """Main POC application class."""
    def __init__(self):
        """Initialize the POC application.

        Constructs the two services the demo methods use. No asynchronous
        setup happens here; call ``initialize()`` afterwards.
        """
        self.search_service = SearchService()
        self.document_processor = DocumentProcessor()
    async def initialize(self):
        """Initialize the application.

        Creates the data/log directories and initializes the search
        service's vector store.
        """
        logger.info("Initializing Semantic Search POC...")
        # Create necessary directories
        self._create_directories()
        # Initialize services
        # NOTE(review): DocumentProcessor builds its own VectorStore instance,
        # which is not initialized here - confirm whether it needs the same
        # initialize() call before documents are indexed.
        await self.search_service.vector_store.initialize()
        logger.info("Initialization complete!")
    def _create_directories(self):
        """Ensure every directory the POC writes to exists.

        Covers the data, raw-data, processed-data and upload directories
        from settings, plus the parent directory of the configured log file.
        Missing parents are created; existing directories are left alone.
        """
        required = (
            settings.DATA_DIR,
            settings.RAW_DATA_DIR,
            settings.PROCESSED_DATA_DIR,
            settings.UPLOAD_DIR,
            Path(settings.LOG_FILE).parent,
        )
        for target in map(Path, required):
            target.mkdir(parents=True, exist_ok=True)
    async def process_sample_documents(self):
        """Process sample documents for testing.

        Feeds every regular file in ``<RAW_DATA_DIR>/sample_documents`` to
        the document processor. A missing sample directory is logged as a
        warning and is not an error. Failures of individual files are logged
        and do not stop the remaining files from being processed.
        """
        logger.info("Processing sample documents...")
        sample_docs_dir = Path(settings.RAW_DATA_DIR) / "sample_documents"
        # is_dir() also rejects a regular file at that path, which would make
        # iterdir() raise.
        if not sample_docs_dir.is_dir():
            logger.warning(f"Sample documents directory not found: {sample_docs_dir}")
            return
        # Process all documents in the sample directory
        for file_path in sample_docs_dir.iterdir():
            if not file_path.is_file():
                continue
            try:
                document = await self.document_processor.process_document(str(file_path))
            except Exception as e:
                logger.error(f"Error processing {file_path.name}: {str(e)}")
                continue
            # process_document reports failure by returning None rather than
            # raising, so success must be decided from the return value.
            if document is None:
                logger.warning(f"Failed to process: {file_path.name}")
            else:
                logger.info(f"Processed: {file_path.name}")
    async def demo_search(self):
        """Run a fixed set of semantic queries and log the top hits.

        Each query asks for at most five results. Errors are logged per
        query so one failing search does not abort the demo.
        """
        logger.info("Running search demo...")
        # Example searches
        demo_queries = (
            "Napoleon Bonaparte biography",
            "French Revolution events",
            "Paris buildings and architecture",
            "Military campaigns in Europe",
        )
        for query in demo_queries:
            logger.info(f"\nSearching for: '{query}'")
            try:
                hits = await self.search_service.semantic_search(
                    query=query,
                    limit=5
                )
                if not hits:
                    logger.info("  No results found.")
                    continue
                for rank, hit in enumerate(hits, 1):
                    logger.info(f"  {rank}. {hit.title} (Score: {hit.similarity_score:.3f})")
                    # Only the first 100 characters of the preview are shown.
                    logger.info(f"     Preview: {hit.content_preview[:100]}...")
            except Exception as e:
                logger.error(f"Error searching for '{query}': {str(e)}")
    async def demo_entity_search(self):
        """Run a fixed set of entity lookups and log the matching documents.

        Each lookup asks for at most three results. Errors are logged per
        lookup so one failure does not abort the demo.
        """
        logger.info("Running entity search demo...")
        # Example entity searches
        demo_entities = (
            ("Napoleon", EntityType.PERSON),
            ("Paris", EntityType.PLACE),
            ("Revolution", EntityType.EVENT),
        )
        for entity_text, entity_type in demo_entities:
            logger.info(f"\nSearching for {entity_type.value}: '{entity_text}'")
            try:
                hits = await self.search_service.search_by_entity(
                    entity_text=entity_text,
                    entity_type=entity_type,
                    limit=3
                )
                if not hits:
                    logger.info("  No results found.")
                    continue
                for rank, hit in enumerate(hits, 1):
                    logger.info(f"  {rank}. {hit.title}")
                    logger.info(f"     Entities: {len(hit.entities)} found")
            except Exception as e:
                logger.error(f"Error searching for entity '{entity_text}': {str(e)}")
    async def demo_relationships(self):
        """Log the relationships found for a fixed example entity.

        For every non-empty relationship group, the group name is logged in
        upper case followed by its first three entries.
        """
        logger.info("Running relationship demo...")
        entity = "Napoleon"
        logger.info(f"\nFinding relationships for: '{entity}'")
        try:
            relationships = await self.search_service.get_entity_relationships(entity)
            for rel_type, related in relationships.items():
                if not related:
                    continue
                logger.info(f"  {rel_type.upper()}:")
                # Show top 3
                for item in related[:3]:
                    logger.info(f"    - {item['entity']} (freq: {item['frequency']})")
        except Exception as e:
            logger.error(f"Error finding relationships for '{entity}': {str(e)}")
 async def main():
    """Run the full POC: initialize, index sample documents, run the demos."""
    logger.info("Starting Semantic Search POC")
    # Initialize application
    poc = SemanticSearchPOC()
    await poc.initialize()
    # Process sample documents (if any)
    await poc.process_sample_documents()
    # Run demonstrations, in this fixed order
    demos = (poc.demo_search, poc.demo_entity_search, poc.demo_relationships)
    for run_demo in demos:
        await run_demo()
    logger.info("POC demonstration complete!")
 # Script entry point: run the asynchronous demo on a fresh event loop.
 if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,102 @@
 """Document data models."""
 from datetime import datetime
 from typing import List, Optional, Dict, Any
 from pydantic import BaseModel, Field
 from enum import Enum
 class DocumentType(str, Enum):
    """Document type enumeration.

    Members also subclass ``str``, so each compares equal to its lowercase
    string value.
    """
    PDF = "pdf"
    XML = "xml"
    TEXT = "text"
    DOCX = "docx"
    IMAGE = "image"
    AUDIO = "audio"
    VIDEO = "video"
 class AccessLevel(str, Enum):
    """Document access level.

    Members also subclass ``str``, so each compares equal to its lowercase
    string value.
    """
    PUBLIC = "public"
    RESTRICTED = "restricted"
    PRIVATE = "private"
 class EntityType(str, Enum):
    """Entity type enumeration.

    Members also subclass ``str``, so each compares equal to its lowercase
    string value.
    """
    PERSON = "person"
    PLACE = "place"
    EVENT = "event"
    ORGANIZATION = "organization"
    BUILDING = "building"
    DATE = "date"
 class Entity(BaseModel):
    """Entity extracted from document."""
    # Surface text of the entity.
    text: str
    # Raw label string as given by the extractor.
    label: str
    # The label mapped onto this project's EntityType enum.
    entity_type: EntityType
    # Start and end positions of the entity; presumably character offsets
    # into the document text - TODO confirm against the extractor.
    start_pos: int
    end_pos: int
    # Defaults to 0.0 when the extractor supplies no score.
    confidence: float = 0.0
    # Free-form extra data; each instance gets its own dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
 class DocumentMetadata(BaseModel):
    """Document metadata.

    Every field has a default, so ``DocumentMetadata()`` is a valid empty
    record.
    """
    title: Optional[str] = None
    author: Optional[str] = None
    creation_date: Optional[datetime] = None
    # Language code; defaults to "en".
    language: str = "en"
    subject: Optional[str] = None
    description: Optional[str] = None
    # Each instance gets its own list.
    keywords: List[str] = Field(default_factory=list)
    # Presumably the size in bytes - TODO confirm against the producer.
    file_size: Optional[int] = None
    page_count: Optional[int] = None
 class Document(BaseModel):
    """Main document model.

    Holds the extracted text, metadata, entities and optional embedding of
    one indexed file.
    """
    id: str
    filename: str
    file_path: str
    document_type: DocumentType
    access_level: AccessLevel
    # Extracted text content.
    content: str
    metadata: DocumentMetadata
    entities: List[Entity] = Field(default_factory=list)
    # Embedding vector; None until one has been computed.
    embedding: Optional[List[float]] = None
    # datetime.utcnow returns naive (tz-unaware) UTC datetimes. Each field
    # calls it separately, so the two defaults can differ slightly.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
    class Config:
        """Pydantic config."""
        # Serialize datetimes as ISO-8601 strings in JSON output.
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }
 class DocumentChunk(BaseModel):
    """Document chunk for processing."""
    id: str
    # Identifier of the document this chunk belongs to.
    document_id: str
    content: str
    # Position of the chunk within its document; presumably zero-based -
    # TODO confirm against the chunking code.
    chunk_index: int
    # Start and end positions of the chunk; presumably character offsets
    # into the parent document - TODO confirm.
    start_pos: int
    end_pos: int
    # Embedding vector; None until one has been computed.
    embedding: Optional[List[float]] = None
    entities: List[Entity] = Field(default_factory=list)
 class SearchQuery(BaseModel):
    """Search query model.

    Unlike the API-level ``SearchRequest`` schema, no range validation is
    declared on ``limit``, ``offset`` or ``similarity_threshold`` here.
    """
    query: str
    # Optional filters; None means no filtering on that dimension.
    entity_types: Optional[List[EntityType]] = None
    document_types: Optional[List[DocumentType]] = None
    access_levels: Optional[List[AccessLevel]] = None
    # Paging: maximum results (default 10) and number skipped (default 0).
    limit: int = 10
    offset: int = 0
    # Minimum similarity score; defaults to 0.2.
    similarity_threshold: float = 0.2
@@ -0,0 +1,53 @@
 """Search result data models."""
 from typing import List, Dict, Any, Optional
 from pydantic import BaseModel, Field
 class SearchResult(BaseModel):
    """Search result model.

    One document hit returned by a search.
    """
    document_id: str
    title: str
    # Excerpt of the document content.
    content_preview: str
    similarity_score: float
    # Entities as plain dictionaries rather than Entity models.
    entities: List[Dict[str, Any]] = Field(default_factory=list)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    # Presumably set only by entity-based searches - TODO confirm.
    matched_entity: Optional[str] = None
    highlights: List[str] = Field(default_factory=list)
    class Config:
        """Pydantic config."""
        # Round floats to four decimal places in JSON output.
        json_encoders = {
            float: lambda v: round(v, 4)
        }
 class EntitySearchResult(BaseModel):
    """Entity-specific search result."""
    entity_text: str
    # Plain string here rather than the EntityType enum.
    entity_type: str
    # Search results for the documents associated with this entity.
    documents: List[SearchResult]
    total_occurrences: int
    related_entities: List[Dict[str, Any]] = Field(default_factory=list)
 class SearchResponse(BaseModel):
    """Complete search response."""
    # The query text this response answers.
    query: str
    results: List[SearchResult]
    # NOTE(review): may differ from len(results) if paging applies - confirm.
    total_results: int
    # Search duration in milliseconds.
    search_time_ms: float
    filters_applied: Dict[str, Any] = Field(default_factory=dict)
    suggestions: List[str] = Field(default_factory=list)
 class EntityRelationship(BaseModel):
    """Entity relationship model.

    A typed link between a source entity and a target entity.
    """
    source_entity: str
    # Entity types are plain strings here rather than the EntityType enum.
    source_type: str
    target_entity: str
    target_type: str
    relationship_type: str
    confidence: float
    # Document IDs associated with the relationship.
    document_ids: List[str]
    # Occurrence count; defaults to 1.
    frequency: int = 1
@@ -0,0 +1,545 @@
 import logging
 import os
 from pathlib import Path
 from typing import List, Dict, Any, Optional
 import hashlib
 from datetime import datetime
 import json
 # Document processing libraries
 import PyPDF2
 from docx import Document as DocxDocument
 import xml.etree.ElementTree as ET
 from bs4 import BeautifulSoup
 # ML libraries
 from sentence_transformers import SentenceTransformer
 from ..models.document import Document, DocumentType, AccessLevel, DocumentMetadata
 from ..services.entity_extractor import EntityExtractor
 from ..services.vector_store import VectorStore
 from config.settings import settings
 class DocumentProcessor:
    """Service for processing and indexing documents."""
    def __init__(self):
        """Initialize document processor.

        Builds the embedding model named by ``settings.EMBEDDING_MODEL``, an
        entity extractor, and this processor's own ``VectorStore`` instance.
        """
        self.logger = logging.getLogger(__name__)
        self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL)
        self.entity_extractor = EntityExtractor()
        # NOTE(review): this is a separate VectorStore object from any other
        # service's; nothing in this class calls initialize() on it - confirm
        # whether that is required before add_document().
        self.vector_store = VectorStore()
    async def process_document(self, file_path: str, access_level: AccessLevel = AccessLevel.PUBLIC) -> Optional[Document]:
        """
        Process a document and add it to the search index.

        Pipeline: detect the type from the extension, extract text, derive an
        ID, collect metadata, extract entities, embed the text, store the
        document in the vector store, then write a JSON metadata file.

        Args:
            file_path: Path to the document file
            access_level: Access level for the document

        Returns:
            Processed Document object or None if processing failed. Failures
            (missing file, unsupported extension, empty text, or any
            exception) are logged and never raised to the caller.
        """
        try:
            # Rebinds the parameter to a Path; the except clause below logs
            # whichever form is bound when the error occurs.
            file_path = Path(file_path)
            if not file_path.exists():
                self.logger.error(f"File not found: {file_path}")
                return None
            # Determine document type
            doc_type = self._get_document_type(file_path)
            if not doc_type:
                self.logger.error(f"Unsupported file type: {file_path.suffix}")
                return None
            # Extract text content
            # An empty result also covers recognized types with no text
            # extractor (image, audio, video).
            content = self._extract_text(file_path, doc_type)
            if not content:
                self.logger.error(f"Failed to extract text from: {file_path}")
                return None
            # Generate document ID
            doc_id = self._generate_document_id(file_path, content)
            # Extract metadata
            metadata = self._extract_metadata(file_path, doc_type, content)
            # Extract entities
            entities = self.entity_extractor.extract_entities(content)
            # Generate embedding
            # NOTE(review): encode() and the extractors above are synchronous
            # calls inside an async method.
            embedding = self.embedding_model.encode(content).tolist()
            # Create document object
            document = Document(
                id=doc_id,
                filename=file_path.name,
                file_path=str(file_path),
                document_type=doc_type,
                access_level=access_level,
                content=content,
                metadata=metadata,
                entities=entities,
                embedding=embedding
            )
            # Add to vector store
            # This helper re-raises on failure, which lands in the except
            # clause below.
            await self._add_to_vector_store(document)
            # Save processed document metadata
            # This helper only logs its own errors, so a failed metadata write
            # does not turn the result into None.
            await self._save_document_metadata(document)
            self.logger.info(f"Successfully processed document: {file_path.name}")
            return document
        except Exception as e:
            self.logger.error(f"Error processing document {file_path}: {str(e)}")
            return None
    def _get_document_type(self, file_path: Path) -> Optional[DocumentType]:
        """Map a file's extension to a DocumentType.

        The extension is compared case-insensitively.

        Args:
            file_path: Path whose suffix is inspected.

        Returns:
            The matching DocumentType, or None for an unknown extension.
        """
        extensions_by_type = (
            (DocumentType.PDF, ('.pdf',)),
            (DocumentType.TEXT, ('.txt',)),
            (DocumentType.DOCX, ('.docx',)),
            (DocumentType.XML, ('.xml',)),
            (DocumentType.IMAGE, ('.jpg', '.jpeg', '.png', '.gif')),
            (DocumentType.AUDIO, ('.mp3', '.wav')),
            (DocumentType.VIDEO, ('.mp4', '.avi')),
        )
        suffix = file_path.suffix.lower()
        for kind, extensions in extensions_by_type:
            if suffix in extensions:
                return kind
        return None
    def _extract_text(self, file_path: Path, doc_type: DocumentType) -> str:
        """Dispatch to the text extractor for ``doc_type``.

        Args:
            file_path: File to read.
            doc_type: Type used to select the extractor.

        Returns:
            The extracted text. Returns "" when the type has no extractor
            (logged as a warning) or when the extractor raises (logged as an
            error).
        """
        try:
            extractors = (
                (DocumentType.PDF, self._extract_text_from_pdf),
                (DocumentType.TEXT, self._extract_text_from_txt),
                (DocumentType.DOCX, self._extract_text_from_docx),
                (DocumentType.XML, self._extract_text_from_xml),
            )
            for supported_type, extract in extractors:
                if doc_type == supported_type:
                    return extract(file_path)
            self.logger.warning(f"Text extraction not implemented for type: {doc_type}")
            return ""
        except Exception as e:
            self.logger.error(f"Error extracting text from {file_path}: {str(e)}")
            return ""
    def _extract_text_from_pdf(self, file_path: Path) -> str:
        """Extract text from PDF file."""
        text = ""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
        except Exception as e:
            self.logger.error(f"Error reading PDF {file_path}: {str(e)}")
        return text.strip()
    def _extract_text_from_txt(self, file_path: Path) -> str:
        """Extract text from plain text file."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            # Try with different encodings
            for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                try:
                    with open(file_path, 'r', encoding=encoding) as file:
                        return file.read()
                except UnicodeDecodeError:
                    continue
            self.logger.error(f"Could not decode text file: {file_path}")
            return ""
    def _extract_text_from_docx(self, file_path: Path) -> str:
        """Extract text from DOCX file."""
        try:
            doc = DocxDocument(str(file_path))
            text = []
            for paragraph in doc.paragraphs:
                text.append(paragraph.text)
            return '\n'.join(text)
        except Exception as e:
            self.logger.error(f"Error reading DOCX {file_path}: {str(e)}")
            return ""
    def _extract_text_from_xml(self, file_path: Path) -> str:
        """Extract text from XML file."""
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()
            # Extract all text content from XML
            text_content = []
            for elem in root.iter():
                if elem.text and elem.text.strip():
                    text_content.append(elem.text.strip())
            return '\n'.join(text_content)
        except Exception as e:
            self.logger.error(f"Error reading XML {file_path}: {str(e)}")
            return ""
    def _extract_metadata(self, file_path: Path, doc_type: DocumentType, content: str) -> DocumentMetadata:
        """Extract metadata from document.

        Args:
            file_path: File whose filesystem attributes are read.
            doc_type: Document type; PDFs additionally get a page count.
            content: Extracted text, used to derive keywords.

        Returns:
            A populated DocumentMetadata, or an empty ``DocumentMetadata()``
            if anything fails (the error is logged).
        """
        try:
            stat = file_path.stat()
            # st_ctime is the creation time only on Windows; on Unix it is the
            # last metadata change. Prefer st_birthtime where the platform
            # provides it and fall back to st_ctime otherwise.
            created_ts = getattr(stat, 'st_birthtime', None)
            if created_ts is None:
                created_ts = stat.st_ctime
            # Basic metadata
            metadata = DocumentMetadata(
                title=file_path.stem,  # Filename without extension
                creation_date=datetime.fromtimestamp(created_ts),
                language="en",  # Default to English for POC
                file_size=stat.st_size,
                keywords=self._extract_keywords(content)
            )
            # Document type specific metadata
            if doc_type == DocumentType.PDF:
                metadata.page_count = self._get_pdf_page_count(file_path)
            return metadata
        except Exception as e:
            self.logger.error(f"Error extracting metadata from {file_path}: {str(e)}")
            return DocumentMetadata()
    def _get_pdf_page_count(self, file_path: Path) -> Optional[int]:
        """Get page count for PDF files."""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                return len(pdf_reader.pages)
        except Exception as e:
            self.logger.error(f"Error getting PDF page count: {str(e)}")
            return None
    def _extract_keywords(self, content: str, max_keywords: int = 10) -> List[str]:
        """Extract keywords from document content."""
        words = content.lower().split()
        # Filter out common stop words
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 
            'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during',
            'before', 'after', 'above', 'below', 'between', 'among', 'is', 'are',
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does',
            'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can'
        }
        # Count word frequencies
        word_freq = {}
        for word in words:
            word = word.strip('.,!?;:"()[]{}')
            if len(word) > 3 and word not in stop_words:
                word_freq[word] = word_freq.get(word, 0) + 1
        # Get top keywords
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        return [word for word, freq in sorted_words[:max_keywords]]
    def _generate_document_id(self, file_path: Path, content: str) -> str:
        """Generate unique document ID."""
        # Create hash from file path and content
        hasher = hashlib.md5()
        hasher.update(str(file_path).encode('utf-8'))
        hasher.update(content[:1000].encode('utf-8'))  # First 1000 chars
        return hasher.hexdigest()
    async def _add_to_vector_store(self, document: Document):
        """Store a document's embedding, text and metadata in the vector store.

        Keywords and entities are passed as JSON strings. Entities are
        reduced to text, type and confidence.

        Args:
            document: Fully populated document to index.

        Raises:
            Exception: Any failure is logged and then re-raised.
        """
        try:
            doc_meta = document.metadata
            created = doc_meta.creation_date
            entity_records = [
                {
                    'text': entity.text,
                    'entity_type': entity.entity_type.value,
                    'confidence': entity.confidence
                }
                for entity in document.entities
            ]
            # Prepare metadata for vector store
            store_metadata = {
                'title': doc_meta.title or document.filename,
                'filename': document.filename,
                'document_type': document.document_type.value,
                'access_level': document.access_level.value,
                'creation_date': created.isoformat() if created else None,
                'language': doc_meta.language,
                'file_size': doc_meta.file_size,
                'page_count': doc_meta.page_count,
                'keywords': json.dumps(doc_meta.keywords),
                'entities': json.dumps(entity_records)
            }
            await self.vector_store.add_document(
                document_id=document.id,
                embedding=document.embedding,
                content=document.content,
                metadata=store_metadata
            )
        except Exception as e:
            self.logger.error(f"Error adding document to vector store: {str(e)}")
            raise
    async def _save_document_metadata(self, document: Document):
        """Write a document's metadata to ``<PROCESSED_DATA_DIR>/metadata/<id>.json``.

        The file records the content length rather than the content itself,
        and does not include the embedding. Failures are logged and not
        raised.

        Args:
            document: Document whose metadata is persisted.
        """
        try:
            metadata_dir = Path(settings.PROCESSED_DATA_DIR) / "metadata"
            metadata_dir.mkdir(parents=True, exist_ok=True)
            target = metadata_dir / f"{document.id}.json"
            doc_meta = document.metadata
            created = doc_meta.creation_date
            entity_records = [
                {
                    'text': entity.text,
                    'label': entity.label,
                    'entity_type': entity.entity_type.value,
                    'start_pos': entity.start_pos,
                    'end_pos': entity.end_pos,
                    'confidence': entity.confidence,
                    'metadata': entity.metadata
                }
                for entity in document.entities
            ]
            # Convert document to dictionary for JSON serialization
            record = {
                'id': document.id,
                'filename': document.filename,
                'file_path': document.file_path,
                'document_type': document.document_type.value,
                'access_level': document.access_level.value,
                'content_length': len(document.content),
                'metadata': {
                    'title': doc_meta.title,
                    'author': doc_meta.author,
                    'creation_date': created.isoformat() if created else None,
                    'language': doc_meta.language,
                    'subject': doc_meta.subject,
                    'description': doc_meta.description,
                    'keywords': doc_meta.keywords,
                    'file_size': doc_meta.file_size,
                    'page_count': doc_meta.page_count
                },
                'entities': entity_records,
                'created_at': document.created_at.isoformat(),
                'updated_at': document.updated_at.isoformat()
            }
            with open(target, 'w', encoding='utf-8') as out:
                json.dump(record, out, indent=2, ensure_ascii=False)
            self.logger.debug(f"Saved metadata for document: {document.id}")
        except Exception as e:
            self.logger.error(f"Error saving document metadata: {str(e)}")
    async def process_batch(self, directory_path: str, file_pattern: str = "*") -> List[Document]:
        """
        Process every matching file in a directory, one after another.

        Args:
            directory_path: Path to directory containing documents
            file_pattern: Glob pattern to match (e.g., "*.pdf")

        Returns:
            The documents that were processed successfully. Files that fail
            are logged and skipped. If the directory is missing or an
            unexpected error occurs, the documents collected so far are
            returned.
        """
        collected = []
        try:
            root = Path(directory_path)
            if root.exists():
                # Find matching files
                candidates = list(root.glob(file_pattern))
                self.logger.info(f"Found {len(candidates)} files to process in {root}")
                for candidate in candidates:
                    if not candidate.is_file():
                        continue
                    document = await self.process_document(str(candidate))
                    if document:
                        collected.append(document)
                    else:
                        self.logger.warning(f"Failed to process: {candidate}")
                self.logger.info(f"Successfully processed {len(collected)} documents")
            else:
                self.logger.error(f"Directory not found: {root}")
        except Exception as e:
            self.logger.error(f"Error processing batch: {str(e)}")
        return collected
    async def get_document_by_id(self, document_id: str) -> Optional[Document]:
        """
        Retrieve a document by its ID.

        Rebuilds the Document from the JSON metadata file written when it was
        processed. The text is re-extracted from the original file if that
        file still exists; otherwise ``content`` is "". The embedding is not
        stored in the metadata file and is therefore not restored.

        Args:
            document_id: Document ID

        Returns:
            Document object, or None if the metadata file is missing or
            cannot be read (the problem is logged).
        """
        try:
            # Entity and EntityType are needed to rebuild stored entities but
            # are not among this module's top-level imports, so import them
            # here. Without this, any document with entities raised NameError
            # and came back as None.
            from ..models.document import Entity, EntityType
            metadata_file = Path(settings.PROCESSED_DATA_DIR) / "metadata" / f"{document_id}.json"
            if not metadata_file.exists():
                self.logger.warning(f"Document metadata not found: {document_id}")
                return None
            with open(metadata_file, 'r', encoding='utf-8') as f:
                doc_dict = json.load(f)
            # Reconstruct document object
            stored_meta = doc_dict['metadata']
            creation_date = stored_meta.get('creation_date')
            metadata = DocumentMetadata(
                title=stored_meta.get('title'),
                author=stored_meta.get('author'),
                creation_date=datetime.fromisoformat(creation_date) if creation_date else None,
                language=stored_meta.get('language', 'en'),
                subject=stored_meta.get('subject'),
                description=stored_meta.get('description'),
                keywords=stored_meta.get('keywords', []),
                file_size=stored_meta.get('file_size'),
                page_count=stored_meta.get('page_count')
            )
            # Reconstruct entities
            entities = [
                Entity(
                    text=entity_dict['text'],
                    label=entity_dict['label'],
                    entity_type=EntityType(entity_dict['entity_type']),
                    start_pos=entity_dict['start_pos'],
                    end_pos=entity_dict['end_pos'],
                    confidence=entity_dict['confidence'],
                    metadata=entity_dict.get('metadata', {})
                )
                for entity_dict in doc_dict.get('entities', [])
            ]
            doc_type = DocumentType(doc_dict['document_type'])
            # Only the content length is persisted, so re-extract the text
            # from the original file when it is still present.
            content = ""
            source_path = Path(doc_dict['file_path'])
            if source_path.exists():
                content = self._extract_text(source_path, doc_type)
            document = Document(
                id=doc_dict['id'],
                filename=doc_dict['filename'],
                file_path=doc_dict['file_path'],
                document_type=doc_type,
                access_level=AccessLevel(doc_dict['access_level']),
                content=content,
                metadata=metadata,
                entities=entities,
                created_at=datetime.fromisoformat(doc_dict['created_at']),
                updated_at=datetime.fromisoformat(doc_dict['updated_at'])
            )
            return document
        except Exception as e:
            self.logger.error(f"Error retrieving document {document_id}: {str(e)}")
            return None
    async def delete_document(self, document_id: str) -> bool:
        """
        Remove a document from the vector store and delete its metadata file.

        Args:
            document_id: Document ID to delete

        Returns:
            True if both steps completed (a missing metadata file is not an
            error), False if any step raised; the error is logged.
        """
        try:
            # Delete from vector store
            await self.vector_store.delete_document(document_id)
            # Delete metadata file
            stored = Path(settings.PROCESSED_DATA_DIR, "metadata", f"{document_id}.json")
            if stored.exists():
                stored.unlink()
            self.logger.info(f"Deleted document: {document_id}")
            return True
        except Exception as e:
            self.logger.error(f"Error deleting document {document_id}: {str(e)}")
            return False
    async def get_processing_stats(self) -> Dict[str, Any]:
        """
        Summarize the processed documents from their metadata files.

        Returns:
            A dictionary with the total document count, histograms by
            document type, access level and language, the total and average
            entity counts, and the vector store's own statistics under
            "vector_store". Returns ``{"total_documents": 0}`` when no
            metadata directory exists and ``{"error": ...}`` on an unexpected
            failure. Unreadable metadata files are logged and skipped but
            still count towards the total.
        """
        def bump(histogram, key):
            # Increment the counter for ``key``, starting from zero.
            histogram[key] = histogram.get(key, 0) + 1
        try:
            metadata_dir = Path(settings.PROCESSED_DATA_DIR) / "metadata"
            if not metadata_dir.exists():
                return {"total_documents": 0}
            # Count metadata files
            metadata_files = list(metadata_dir.glob("*.json"))
            total_docs = len(metadata_files)
            # Analyze document types and other stats
            doc_types, access_levels, languages = {}, {}, {}
            total_entities = 0
            for metadata_file in metadata_files:
                try:
                    with open(metadata_file, 'r', encoding='utf-8') as handle:
                        record = json.load(handle)
                    bump(doc_types, record.get('document_type', 'unknown'))
                    bump(access_levels, record.get('access_level', 'unknown'))
                    bump(languages, record.get('metadata', {}).get('language', 'unknown'))
                    total_entities += len(record.get('entities', []))
                except Exception as e:
                    self.logger.warning(f"Error reading metadata file {metadata_file}: {str(e)}")
                    continue
            # Get vector store stats
            vector_stats = await self.vector_store.get_collection_stats()
            average_entities = total_entities / total_docs if total_docs > 0 else 0
            return {
                "total_documents": total_docs,
                "document_types": doc_types,
                "access_levels": access_levels,
                "languages": languages,
                "total_entities": total_entities,
                "average_entities_per_doc": average_entities,
                "vector_store": vector_stats
            }
        except Exception as e:
            self.logger.error(f"Error getting processing stats: {str(e)}")
            return {"error": str(e)}
@@ -0,0 +1,239 @@
 import logging
 from typing import List, Dict, Any, Optional
 import spacy
 from spacy import displacy
 from ..models.document import Entity, EntityType
 from config.settings import settings
 class EntityExtractor:
    """Named-entity recognition service backed by a spaCy pipeline."""

    def __init__(self):
        """Create the logger and eagerly load the configured spaCy model."""
        self.nlp = None
        self.logger = logging.getLogger(__name__)
        self._load_model()

    def _load_model(self):
        """
        Load the spaCy model named by settings.SPACY_MODEL into self.nlp.

        Raises:
            OSError: Re-raised, after logging an install hint, when spaCy
                cannot load the model
        """
        try:
            self.nlp = spacy.load(settings.SPACY_MODEL)
            self.logger.info(f"Loaded spaCy model: {settings.SPACY_MODEL}")
        except OSError:
            self.logger.error(f"spaCy model {settings.SPACY_MODEL} not found. Please install it with: python -m spacy download {settings.SPACY_MODEL}")
            raise

    def extract_entities(self, text: str) -> List[Entity]:
        """
        Run NER over the text and wrap the recognised spans as Entity objects.

        Spans whose spaCy label has no EntityType mapping are dropped.

        Args:
            text: Input text to process

        Returns:
            Entities found in the text; an empty list when processing fails
            (the error is logged instead of raised)
        """
        if not self.nlp:
            self._load_model()

        try:
            entities = []
            for span in self.nlp(text).ents:
                mapped_type = self._map_spacy_label(span.label_)
                if not mapped_type:
                    # Label outside our EntityType vocabulary: ignore the span
                    continue
                entities.append(
                    Entity(
                        text=span.text,
                        label=span.label_,
                        entity_type=mapped_type,
                        start_pos=span.start_char,
                        end_pos=span.end_char,
                        confidence=self._calculate_confidence(span),
                        metadata={
                            "spacy_label": span.label_,
                            "spacy_explanation": spacy.explain(span.label_)
                        }
                    )
                )
            self.logger.debug(f"Extracted {len(entities)} entities from text")
            return entities
        except Exception as e:
            self.logger.error(f"Error extracting entities: {str(e)}")
            return []

    def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
        """
        Derive entity-to-entity relationships from the dependency parse.

        A relationship is emitted for each token carrying an nsubj, dobj or
        pobj dependency when the token and its syntactic head sit inside two
        different named entities.

        Args:
            text: Input text to process

        Returns:
            List of relationship dictionaries; an empty list when processing
            fails (the error is logged instead of raised)
        """
        if not self.nlp:
            self._load_model()

        # Subject, direct object, prepositional object
        linking_deps = ['nsubj', 'dobj', 'pobj']
        try:
            doc = self.nlp(text)
            relationships = []
            for token in doc:
                if token.dep_ not in linking_deps:
                    continue
                head = token.head
                # Both ends of the dependency arc must belong to named entities
                token_ent = self._get_entity_for_token(token, doc.ents)
                head_ent = self._get_entity_for_token(head, doc.ents)
                if not (token_ent and head_ent and token_ent != head_ent):
                    continue
                relationships.append({
                    "subject": token_ent.text,
                    "subject_type": self._map_spacy_label(token_ent.label_),
                    "predicate": head.text,
                    "object": head_ent.text,
                    "object_type": self._map_spacy_label(head_ent.label_),
                    "relation_type": token.dep_,
                    "confidence": 0.7  # Fixed baseline score
                })
            self.logger.debug(f"Extracted {len(relationships)} relationships from text")
            return relationships
        except Exception as e:
            self.logger.error(f"Error extracting relationships: {str(e)}")
            return []

    def _map_spacy_label(self, spacy_label: str) -> Optional[EntityType]:
        """
        Translate a spaCy entity label into the matching EntityType member.

        Args:
            spacy_label: spaCy entity label

        Returns:
            Corresponding EntityType, or None for labels we do not track
        """
        place = EntityType.PLACE
        date = EntityType.DATE
        label_to_type = {
            'PERSON': EntityType.PERSON,
            'ORG': EntityType.ORGANIZATION,
            'EVENT': EntityType.EVENT,
            'FAC': EntityType.BUILDING,  # Facility/Building
            'GPE': place,  # Geopolitical entity
            'LOC': place,  # Location
            'DATE': date,
            'TIME': date,
        }
        return label_to_type.get(spacy_label)

    def _calculate_confidence(self, entity) -> float:
        """
        Score an entity heuristically from its text and label.

        Starts at 0.5 and adds a bonus for text longer than three characters,
        for title-cased text, and for PERSON/GPE/ORG labels.

        Args:
            entity: spaCy entity object (only .text and .label_ are read)

        Returns:
            Confidence score, capped at 1.0
        """
        # (bonus, earned?) pairs, applied in this order
        bonuses = (
            (0.1, len(entity.text) > 3),
            (0.1, entity.text.istitle()),
            (0.2, entity.label_ in ['PERSON', 'GPE', 'ORG']),
        )
        score = 0.5  # Base confidence
        for bonus, earned in bonuses:
            if earned:
                score += bonus
        return min(score, 1.0)

    def _get_entity_for_token(self, token, entities):
        """
        Find the first entity whose token range covers the given token.

        Args:
            token: spaCy token (its index .i is compared)
            entities: Iterable of spaCy entities exposing .start and .end

        Returns:
            Entity containing the token or None
        """
        covering = (ent for ent in entities if ent.start <= token.i < ent.end)
        return next(covering, None)

    def get_entity_summary(self, entities: List[Entity]) -> Dict[str, int]:
        """
        Count the extracted entities per entity type.

        Args:
            entities: List of entities

        Returns:
            Dictionary mapping each entity type value to its number of occurrences
        """
        counts = {}
        for item in entities:
            type_name = item.entity_type.value
            if type_name in counts:
                counts[type_name] += 1
            else:
                counts[type_name] = 1
        return counts

    def visualize_entities(self, text: str, output_path: Optional[str] = None) -> str:
        """
        Render the entities found in the text as displaCy HTML.

        Args:
            text: Input text
            output_path: Optional file path; when given, the HTML is also
                written there as UTF-8

        Returns:
            HTML string with entity visualization, or an empty string when
            rendering or writing fails (the error is logged)
        """
        if not self.nlp:
            self._load_model()

        try:
            html = displacy.render(self.nlp(text), style="ent", jupyter=False)
            if not output_path:
                return html
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(html)
            self.logger.info(f"Entity visualization saved to: {output_path}")
            return html
        except Exception as e:
            self.logger.error(f"Error creating entity visualization: {str(e)}")
            return ""
@@ -0,0 +1,216 @@
 """Main search service implementation."""
 from typing import List, Dict, Any, Optional
 import logging
 from sentence_transformers import SentenceTransformer
 from ..models.document import Document, SearchQuery, EntityType
 from ..models.search_result import SearchResult
 from .vector_store import VectorStore
 from .entity_extractor import EntityExtractor
 from config.settings import settings
 class SearchService:
    """Main search service for semantic search functionality.

    Combines a sentence-transformer embedding model, the vector store and the
    entity extractor to answer semantic queries, entity look-ups and
    entity co-occurrence ("relationship") queries.
    """

    # Maximum number of content characters copied into a result preview.
    PREVIEW_LENGTH = 200
    # Maximum number of related entities reported per relationship category.
    MAX_RELATED_ENTITIES = 10

    def __init__(self):
        """Initialize search service."""
        self.logger = logging.getLogger(__name__)
        self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL)
        self.vector_store = VectorStore()
        self.entity_extractor = EntityExtractor()

    async def semantic_search(
        self,
        query: str,
        limit: int = 10,
        entity_types: Optional[List[EntityType]] = None,
        similarity_threshold: Optional[float] = None
    ) -> List[SearchResult]:
        """
        Perform semantic search across documents.

        Args:
            query: Search query string
            limit: Maximum number of results requested from the vector store
            entity_types: Keep only documents containing at least one entity
                of these types. The filter runs after the vector search, so
                fewer than ``limit`` results may be returned.
            similarity_threshold: Minimum similarity score; defaults to
                settings.SIMILARITY_THRESHOLD when None

        Returns:
            List of search results

        Raises:
            Exception: Any error from embedding, searching or result
                construction is logged and re-raised
        """
        if similarity_threshold is None:
            similarity_threshold = settings.SIMILARITY_THRESHOLD

        try:
            # Generate query embedding
            query_embedding = self.embedding_model.encode(query).tolist()

            # Search vector store
            results = await self.vector_store.similarity_search(
                query_embedding=query_embedding,
                limit=limit,
                threshold=similarity_threshold
            )

            # Filter by entity types if specified
            if entity_types:
                results = self._filter_by_entities(results, entity_types)

            return [
                self._to_search_result(result, result["similarity_score"])
                for result in results
            ]
        except Exception as e:
            self.logger.error(f"Error in semantic search: {str(e)}")
            raise

    async def search_by_entity(
        self,
        entity_text: str,
        entity_type: EntityType,
        limit: int = 10
    ) -> List[SearchResult]:
        """
        Search for documents containing specific entities.

        Args:
            entity_text: Entity text to search for
            entity_type: Type of entity (its ``.value`` is forwarded to the
                vector store)
            limit: Maximum number of results

        Returns:
            List of search results, each scored 1.0 and tagged with the
            matched entity text

        Raises:
            Exception: Any error is logged and re-raised
        """
        try:
            results = await self.vector_store.search_by_entity(
                entity_text=entity_text,
                entity_type=entity_type.value,
                limit=limit
            )
            return [
                # 1.0 marks an exact entity match rather than a similarity
                self._to_search_result(result, 1.0, matched_entity=entity_text)
                for result in results
            ]
        except Exception as e:
            self.logger.error(f"Error in entity search: {str(e)}")
            raise

    async def get_entity_relationships(
        self,
        entity_text: str
    ) -> Dict[str, List[Dict[str, Any]]]:
        """
        Get relationships for a specific entity.

        Relationships are entities that co-occur with ``entity_text`` in the
        same documents, grouped by category and ranked by how often they
        co-occur.

        Args:
            entity_text: Entity to find relationships for

        Returns:
            Dictionary with the keys "persons", "places", "events" and
            "organizations"; each value lists at most
            MAX_RELATED_ENTITIES related entities, most frequent first

        Raises:
            Exception: Any error is logged and re-raised
        """
        try:
            # Find documents containing the entity
            entity_docs = await self.search_by_entity(
                entity_text=entity_text,
                entity_type=EntityType.PERSON,  # Default to person
                limit=100
            )
            return self._collect_relationships(entity_docs, entity_text)
        except Exception as e:
            self.logger.error(f"Error getting entity relationships: {str(e)}")
            raise

    def _collect_relationships(self, entity_docs, entity_text: str) -> Dict[str, List[Dict[str, Any]]]:
        """Group, de-duplicate and rank the entities co-occurring with entity_text.

        Args:
            entity_docs: Objects exposing ``document_id`` and ``entities``
                (an iterable of dicts with "text" and "entity_type" keys)
            entity_text: The entity itself, excluded case-insensitively

        Returns:
            Relationship dictionary as described in get_entity_relationships
        """
        relationships = {
            "persons": [],
            "places": [],
            "events": [],
            "organizations": []
        }
        target = entity_text.lower()

        for doc in entity_docs:
            for entity in doc.entities:
                if entity["text"].lower() == target:
                    continue
                rel_type = self._map_entity_to_relationship(entity["entity_type"])
                if rel_type not in relationships:
                    # e.g. dates map to "other" and are not reported
                    continue
                relationships[rel_type].append({
                    "entity": entity["text"],
                    "type": entity["entity_type"],
                    "document_id": doc.document_id,
                    "confidence": entity.get("confidence", 0.0)
                })

        ranked = {}
        for rel_type, mentions in relationships.items():
            # Keep the first mention of every entity and count the repeats
            unique_entities = {}
            for mention in mentions:
                key = mention["entity"]
                if key in unique_entities:
                    unique_entities[key]["frequency"] += 1
                else:
                    mention["frequency"] = 1
                    unique_entities[key] = mention
            # sorted() is stable, so ties keep first-seen order
            ranked[rel_type] = sorted(
                unique_entities.values(),
                key=lambda x: x["frequency"],
                reverse=True
            )[:self.MAX_RELATED_ENTITIES]
        return ranked

    def _to_search_result(self, result: Dict[str, Any], similarity_score: float, **extra_fields) -> SearchResult:
        """Convert one raw vector-store hit into a SearchResult.

        NOTE(review): the vector store in this codebase returns no top-level
        "title" key, so the title falls back to "Untitled"; confirm whether
        the title should be read from the metadata instead.
        """
        return SearchResult(
            document_id=result["document_id"],
            title=result.get("title", "Untitled"),
            content_preview=self._build_preview(result.get("content", "")),
            similarity_score=similarity_score,
            entities=result.get("entities", []),
            metadata=result.get("metadata", {}),
            **extra_fields
        )

    @staticmethod
    def _build_preview(content: Optional[str], max_length: int = PREVIEW_LENGTH) -> str:
        """Return the leading part of content, adding "..." only when text was cut off."""
        text = content or ""
        if len(text) <= max_length:
            return text
        return text[:max_length] + "..."

    def _filter_by_entities(
        self,
        results: List[Dict[str, Any]],
        entity_types: List[EntityType]
    ) -> List[Dict[str, Any]]:
        """Keep the results that contain at least one entity of the given types."""
        wanted = {et.value for et in entity_types}
        return [
            result
            for result in results
            if any(
                entity.get("entity_type") in wanted
                for entity in result.get("entities", [])
            )
        ]

    def _map_entity_to_relationship(self, entity_type: str) -> str:
        """Map entity type to relationship category.

        Accepts spaCy labels ("GPE", "ORG", ...) as well as EntityType
        names/values ("place", "organization", ...), case-insensitively, and
        EntityType members themselves.

        Returns:
            "persons", "places", "events", "organizations", or "other" for
            anything unrecognised
        """
        raw = getattr(entity_type, "value", entity_type)
        key = str(raw).strip().upper()
        mapping = {
            "PERSON": "persons",
            "GPE": "places",  # Geopolitical entity
            "LOC": "places",  # Location
            "PLACE": "places",
            "FAC": "places",  # Facility
            "BUILDING": "places",
            "EVENT": "events",
            "ORG": "organizations",
            "ORGANIZATION": "organizations",
        }
        return mapping.get(key, "other")
@@ -0,0 +1,227 @@
 import logging
 from typing import List, Dict, Any, Optional
 from pathlib import Path
 import chromadb
 from chromadb.config import Settings
 from chromadb.errors import NotFoundError
 import numpy as np
 from config.settings import settings
 class VectorStore:
    """Vector store for document embeddings using ChromaDB.

    The Chroma client and collection are created lazily: every public
    coroutine calls initialize() on first use when the collection (or, for
    reset_collection, the client) has not been set up yet.
    """

    def __init__(self):
        """Initialize vector store."""
        self.logger = logging.getLogger(__name__)
        self.client = None
        self.collection = None
        self.collection_name = "documents"

    async def initialize(self):
        """Open the persistent Chroma client and load or create the collection.

        The storage directory settings.CHROMA_PERSIST_DIR is created if it
        does not exist.

        Raises:
            Exception: Any failure is logged and re-raised
        """
        try:
            persist_dir = Path(settings.CHROMA_PERSIST_DIR)
            persist_dir.mkdir(parents=True, exist_ok=True)
            self.client = chromadb.PersistentClient(
                path=str(persist_dir),
                settings=Settings(
                    anonymized_telemetry=False,
                    allow_reset=True
                )
            )
            try:
                self.collection = self.client.get_collection(name=self.collection_name)
                self.logger.info(f"Loaded existing collection: {self.collection_name}")
            except NotFoundError:
                self.collection = self._create_collection()
                self.logger.info(f"Created new collection: {self.collection_name}")
        except Exception as e:
            self.logger.error(f"Error initializing vector store: {str(e)}")
            raise

    def _create_collection(self):
        """Create the collection on the current client and return it."""
        return self.client.create_collection(
            name=self.collection_name,
            metadata={"description": "Document embeddings for semantic search"}
        )

    async def _ensure_collection(self):
        """Run initialize() once if no collection has been loaded yet."""
        if self.collection is None:
            await self.initialize()

    @staticmethod
    def _serialize_metadata(metadata: Dict[str, Any]) -> Dict[str, str]:
        """Flatten metadata into the string-valued form stored in Chroma.

        Containers are written as JSON so that the readers in this class,
        which use json.loads, can decode them again; str() would produce a
        Python repr that is not valid JSON. Scalars keep their str() form.
        """
        import json

        flattened = {}
        for key, value in metadata.items():
            if isinstance(value, (dict, list, tuple)):
                # default=str keeps this as forgiving as the previous str()
                # call for nested values JSON cannot encode natively.
                flattened[key] = json.dumps(value, default=str)
            else:
                flattened[key] = str(value)
        return flattened

    @staticmethod
    def _parse_entities(metadata: Optional[Dict[str, Any]]) -> List[Any]:
        """Decode the JSON entity list kept under metadata['entities'].

        Returns an empty list when the key is missing, the value is not valid
        JSON, or the decoded value is not a list.
        """
        import json

        if not metadata or 'entities' not in metadata:
            return []
        try:
            entities = json.loads(metadata['entities'])
        except (TypeError, ValueError):  # JSONDecodeError is a ValueError
            return []
        return entities if isinstance(entities, list) else []

    async def add_document(
        self,
        document_id: str,
        embedding: List[float],
        content: str,
        metadata: Dict[str, Any]
    ) -> bool:
        """Store one document with its embedding and metadata.

        Args:
            document_id: Identifier used as the Chroma record id
            embedding: Embedding vector of the document
            content: Document text stored alongside the embedding
            metadata: Arbitrary metadata; values are converted to strings
                (containers as JSON)

        Returns:
            True on success, False if anything failed (the error is logged)
        """
        try:
            await self._ensure_collection()
            self.collection.add(
                embeddings=[embedding],
                documents=[content],
                metadatas=[self._serialize_metadata(metadata)],
                ids=[document_id]
            )
            self.logger.info(f"Added document {document_id} to vector store")
            return True
        except Exception as e:
            self.logger.error(f"Error adding document {document_id}: {str(e)}")
            return False

    async def similarity_search(
        self,
        query_embedding: List[float],
        limit: int = 10,
        threshold: float = 0.3,
        where: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """Return the documents closest to the query embedding.

        Args:
            query_embedding: Embedding vector of the query
            limit: Number of nearest neighbours requested from Chroma
            threshold: Minimum similarity score (1 - distance) to keep a hit
            where: Optional Chroma metadata filter

        Returns:
            Dicts with 'document_id', 'content', 'metadata',
            'similarity_score' and 'entities', best match first; an empty
            list if the query failed (the error is logged)
        """
        try:
            await self._ensure_collection()
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=limit,
                where=where
            )

            formatted_results = []
            # Chroma nests results per query; only one query is sent here.
            ids = results['ids'][0] if results['ids'] else []
            documents = results['documents'][0] if results['documents'] else None
            metadatas = results['metadatas'][0] if results['metadatas'] else None

            for i, document_id in enumerate(ids):
                # NOTE(review): 1 - distance is only a cosine similarity when
                # the collection uses cosine distance; the collection is
                # created here without a distance setting - confirm the metric.
                similarity_score = 1 - results['distances'][0][i]
                if similarity_score >= threshold:
                    # A missing document/metadata entry must not abort the search
                    metadata = (metadatas[i] if metadatas else None) or {}
                    formatted_results.append({
                        'document_id': document_id,
                        'content': (documents[i] if documents else '') or '',
                        'metadata': metadata,
                        'similarity_score': similarity_score,
                        'entities': self._parse_entities(metadata)
                    })

            formatted_results.sort(key=lambda x: x['similarity_score'], reverse=True)
            self.logger.info(f"Found {len(formatted_results)} results above threshold {threshold}")
            return formatted_results
        except Exception as e:
            self.logger.error(f"Error in similarity search: {str(e)}")
            return []

    async def search_by_entity(
        self,
        entity_text: str,
        entity_type: str,
        limit: int = 10
    ) -> List[Dict[str, Any]]:
        """Find documents whose text or metadata mentions an entity.

        The match is a case-insensitive substring test against the document
        content and the string form of its metadata. The whole collection is
        fetched and scanned in order until ``limit`` matches are found.

        Args:
            entity_text: Entity text to look for; blank text matches nothing
            entity_type: Accepted for interface compatibility; not used for
                filtering
            limit: Maximum number of results; values <= 0 yield no results

        Returns:
            Dicts shaped like similarity_search results with a score of 1.0;
            an empty list if the lookup failed (the error is logged)
        """
        try:
            # "" is a substring of everything, so blank input would match all
            if limit <= 0 or not entity_text or not entity_text.strip():
                return []

            await self._ensure_collection()
            results = self.collection.get()
            documents = results['documents']
            metadatas = results['metadatas']
            entity_lower = entity_text.lower()

            filtered_results = []
            for i, document_id in enumerate(results['ids'] or []):
                document_content = (documents[i] if documents else '') or ''
                metadata = (metadatas[i] if metadatas else None) or {}
                if (entity_lower not in document_content.lower() and
                        entity_lower not in str(metadata).lower()):
                    continue
                filtered_results.append({
                    'document_id': document_id,
                    'content': document_content,
                    'metadata': metadata,
                    'similarity_score': 1.0,
                    'entities': self._parse_entities(metadata)
                })
                if len(filtered_results) >= limit:
                    break

            self.logger.info(f"Found {len(filtered_results)} documents containing entity '{entity_text}'")
            return filtered_results
        except Exception as e:
            self.logger.error(f"Error searching for entity '{entity_text}': {str(e)}")
            return []

    async def delete_document(self, document_id: str) -> bool:
        """Delete one document by id.

        Returns:
            True on success, False if anything failed (the error is logged)
        """
        try:
            await self._ensure_collection()
            self.collection.delete(ids=[document_id])
            self.logger.info(f"Deleted document {document_id} from vector store")
            return True
        except Exception as e:
            self.logger.error(f"Error deleting document {document_id}: {str(e)}")
            return False

    async def get_collection_stats(self) -> Dict[str, Any]:
        """Report the document count, collection name and embedding dimension.

        Returns:
            Stats dictionary, or an empty dict if the lookup failed (the
            error is logged)
        """
        try:
            await self._ensure_collection()
            return {
                'total_documents': self.collection.count(),
                'collection_name': self.collection_name,
                'embedding_dimension': settings.EMBEDDING_DIMENSION
            }
        except Exception as e:
            self.logger.error(f"Error getting collection stats: {str(e)}")
            return {}

    async def reset_collection(self) -> bool:
        """Drop the collection (if it exists) and create it again, empty.

        Returns:
            True on success, False if anything failed (the error is logged)
        """
        try:
            if self.client is None:
                await self.initialize()
            try:
                self.client.delete_collection(self.collection_name)
            except NotFoundError:
                pass  # nothing to delete; just create it below
            self.collection = self._create_collection()
            self.logger.info(f"Reset collection: {self.collection_name}")
            return True
        except Exception as e:
            self.logger.error(f"Error resetting collection: {str(e)}")
            return False
@@ -0,0 +1,788 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Semantic Search Engine - POC</title>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/axios/1.6.0/axios.min.js"></script>
    <style>
        /* ---- Global reset: zero default spacing, border-box sizing everywhere ---- */
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }
        /* ---- Page base: system font stack over a full-height purple gradient ---- */
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            line-height: 1.6;
            color: #333;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
        }
        /* Centered content column, capped at 1200px */
        .container {
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
        }
        /* ---- Page header (title and tagline) ---- */
        .header {
            text-align: center;
            color: white;
            margin-bottom: 40px;
        }
        .header h1 {
            font-size: 3rem;
            font-weight: 300;
            margin-bottom: 10px;
            text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
        }
        .header p {
            font-size: 1.2rem;
            opacity: 0.9;
        }
        /* ---- Search card: query input, submit button ---- */
        .search-container {
            background: white;
            border-radius: 20px;
            padding: 40px;
            box-shadow: 0 20px 40px rgba(0,0,0,0.1);
            margin-bottom: 30px;
            backdrop-filter: blur(10px);
        }
        /* Input and button sit in one row and wrap when space runs out */
        .search-form {
            display: flex;
            gap: 15px;
            margin-bottom: 20px;
            flex-wrap: wrap;
        }
        .search-input {
            flex: 1;
            min-width: 300px;
            padding: 15px 20px;
            border: 2px solid #e1e5e9;
            border-radius: 12px;
            font-size: 16px;
            transition: all 0.3s ease;
        }
        /* Replace the browser focus outline with a tinted border and glow */
        .search-input:focus {
            outline: none;
            border-color: #667eea;
            box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
        }
        .search-btn {
            padding: 15px 30px;
            background: linear-gradient(135deg, #667eea, #764ba2);
            color: white;
            border: none;
            border-radius: 12px;
            font-size: 16px;
            font-weight: 600;
            cursor: pointer;
            transition: all 0.3s ease;
            min-width: 120px;
        }
        /* Lift effect on hover */
        .search-btn:hover {
            transform: translateY(-2px);
            box-shadow: 0 8px 20px rgba(102, 126, 234, 0.3);
        }
        /* Disabled state: dimmed, and the hover lift is cancelled */
        .search-btn:disabled {
            opacity: 0.6;
            cursor: not-allowed;
            transform: none;
        }
        /* ---- Search filters (label + select pairs) ---- */
        .filters {
            display: flex;
            gap: 15px;
            flex-wrap: wrap;
            align-items: center;
        }
        .filter-group {
            display: flex;
            align-items: center;
            gap: 8px;
        }
        .filter-group label {
            font-weight: 500;
            color: #555;
        }
        .filter-select {
            padding: 8px 12px;
            border: 1px solid #ddd;
            border-radius: 8px;
            font-size: 14px;
        }
        /* ---- Statistics card and its responsive grid of stat tiles ---- */
        .stats-container {
            background: white;
            border-radius: 20px;
            padding: 30px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
            margin-bottom: 30px;
        }
        /* As many columns as fit, each at least 200px wide */
        .stats-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 20px;
        }
        .stat-card {
            text-align: center;
            padding: 20px;
            background: linear-gradient(135deg, #f8f9fa, #e9ecef);
            border-radius: 12px;
        }
        .stat-number {
            font-size: 2rem;
            font-weight: 700;
            color: #667eea;
            margin-bottom: 5px;
        }
        .stat-label {
            color: #666;
            font-size: 0.9rem;
            text-transform: uppercase;
            letter-spacing: 0.5px;
        }
        /* ---- Results card: header row plus one card per hit ---- */
        .results-container {
            background: white;
            border-radius: 20px;
            padding: 30px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
            margin-bottom: 30px;
        }
        .results-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 25px;
            padding-bottom: 15px;
            border-bottom: 2px solid #f0f0f0;
        }
        .results-title {
            font-size: 1.5rem;
            font-weight: 600;
            color: #333;
        }
        .results-meta {
            color: #666;
            font-size: 0.9rem;
        }
        .result-card {
            border: 1px solid #e9ecef;
            border-radius: 12px;
            padding: 20px;
            margin-bottom: 15px;
            transition: all 0.3s ease;
            cursor: pointer;
        }
        /* Same lift effect as the search button, with an accent border */
        .result-card:hover {
            transform: translateY(-2px);
            box-shadow: 0 8px 25px rgba(0,0,0,0.1);
            border-color: #667eea;
        }
        .result-title {
            font-size: 1.2rem;
            font-weight: 600;
            color: #333;
            margin-bottom: 10px;
        }
        .result-preview {
            color: #666;
            line-height: 1.6;
            margin-bottom: 15px;
        }
        /* Footer row: entities on one side, score badge on the other */
        .result-footer {
            display: flex;
            justify-content: space-between;
            align-items: center;
            font-size: 0.9rem;
        }
        /* Pill-shaped score badge */
        .result-score {
            background: linear-gradient(135deg, #667eea, #764ba2);
            color: white;
            padding: 4px 12px;
            border-radius: 20px;
            font-weight: 500;
        }
        .result-entities {
            display: flex;
            gap: 8px;
            flex-wrap: wrap;
        }
        /* Small pill for a single entity */
        .entity-tag {
            background: #f8f9fa;
            color: #495057;
            padding: 3px 8px;
            border-radius: 15px;
            font-size: 0.8rem;
            border: 1px solid #dee2e6;
        }
        /* ---- Loading state: centered text with a spinning ring ---- */
        .loading {
            text-align: center;
            padding: 40px;
            color: #666;
        }
        /* Ring with one coloured edge, rotated by the spin animation */
        .spinner {
            width: 40px;
            height: 40px;
            border: 4px solid #f3f3f3;
            border-top: 4px solid #667eea;
            border-radius: 50%;
            animation: spin 1s linear infinite;
            margin: 0 auto 20px;
        }
        @keyframes spin {
            0% { transform: rotate(0deg); }
            100% { transform: rotate(360deg); }
        }
        /* ---- Error banner and empty-result message ---- */
        .error {
            background: #f8d7da;
            color: #721c24;
            padding: 15px;
            border-radius: 8px;
            margin: 20px 0;
            border: 1px solid #f5c6cb;
        }
        .no-results {
            text-align: center;
            padding: 40px;
            color: #666;
        }
        /* ---- Upload card and its dashed drop area ---- */
        .upload-container {
            background: white;
            border-radius: 20px;
            padding: 30px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
            margin-bottom: 30px;
        }
        .upload-area {
            border: 2px dashed #ddd;
            border-radius: 12px;
            padding: 40px;
            text-align: center;
            transition: all 0.3s ease;
            cursor: pointer;
        }
        .upload-area:hover {
            border-color: #667eea;
            background: rgba(102, 126, 234, 0.05);
        }
        /* Stronger tint than hover; the .dragover class is not set by any CSS rule here */
        .upload-area.dragover {
            border-color: #667eea;
            background: rgba(102, 126, 234, 0.1);
        }
        /* ---- Narrow screens (<= 768px): stack the form and filters, two stat columns ---- */
        @media (max-width: 768px) {
            .header h1 {
                font-size: 2rem;
            }
            .search-form {
                flex-direction: column;
            }
            .search-input {
                min-width: auto;
            }
            .stats-grid {
                grid-template-columns: repeat(2, 1fr);
            }
            .filters {
                flex-direction: column;
                align-items: stretch;
            }
        }
    </style>
 </head>
 <body>
    <div class="container">
        <!-- Header -->
        <div class="header">
            <h1>🔍 Semantic Search Engine</h1>
            <p>Intelligent search across archival documents with entity recognition</p>
        </div>
        <!-- Stats Container -->
        <div class="stats-container">
            <h2 style="margin-bottom: 20px; text-align: center;">System Statistics</h2>
            <div class="stats-grid" id="statsGrid">
                <div class="stat-card">
                    <div class="stat-number" id="totalDocs">-</div>
                    <div class="stat-label">Total Documents</div>
                </div>
                <div class="stat-card">
                    <div class="stat-number" id="totalEntities">-</div>
                    <div class="stat-label">Total Entities</div>
                </div>
                <div class="stat-card">
                    <div class="stat-number" id="avgEntities">-</div>
                    <div class="stat-label">Avg Entities/Doc</div>
                </div>
                <div class="stat-card">
                    <div class="stat-number" id="systemStatus">-</div>
                    <div class="stat-label">System Status</div>
                </div>
            </div>
        </div>
        <!-- Search Container -->
        <div class="search-container">
            <h2 style="margin-bottom: 20px;">Search Documents</h2>
            <form class="search-form" id="searchForm">
                <input 
                    type="text" 
                    class="search-input" 
                    id="searchInput" 
                    placeholder="Enter your search query... (e.g., 'Napoleon Bonaparte biography')"
                    required
                >
                <button type="submit" class="search-btn" id="searchBtn">
                    Search
                </button>
            </form>
            <div class="filters">
                <div class="filter-group">
                    <label for="entityTypeFilter">Entity Type:</label>
                    <select class="filter-select" id="entityTypeFilter">
                        <option value="">All Types</option>
                        <option value="person">Person</option>
                        <option value="place">Place</option>
                        <option value="event">Event</option>
                        <option value="organization">Organization</option>
                        <option value="building">Building</option>
                        <option value="date">Date</option>
                    </select>
                </div>
                <div class="filter-group">
                    <label for="limitFilter">Results:</label>
                    <select class="filter-select" id="limitFilter">
                        <option value="5">5 results</option>
                        <option value="10" selected>10 results</option>
                        <option value="20">20 results</option>
                        <option value="50">50 results</option>
                    </select>
                </div>
                <div class="filter-group">
                    <label for="thresholdFilter">Min Score:</label>
                    <select class="filter-select" id="thresholdFilter">
                        <option value="">Any score</option>
                        <option value="0.1">0.1+</option>
                        <option value="0.3" selected>0.3+</option>
                        <option value="0.5">0.5+</option>
                        <option value="0.7">0.7+</option>
                    </select>
                </div>
            </div>
        </div>
        <!-- Upload Container -->
        <div class="upload-container">
            <h2 style="margin-bottom: 20px;">Upload Document</h2>
            <div class="upload-area" id="uploadArea">
                <p>📄 Drag and drop a document here, or click to select</p>
                <p style="color: #666; font-size: 0.9rem; margin-top: 10px;">
                    Supported formats: PDF, TXT, DOCX, XML (Max: 50MB)
                </p>
                <input type="file" id="fileInput" style="display: none;" accept=".pdf,.txt,.docx,.xml">
            </div>
        </div>
        <!-- Results Container -->
        <div class="results-container" id="resultsContainer" style="display: none;">
            <div class="results-header">
                <h2 class="results-title">Search Results</h2>
                <div class="results-meta" id="resultsMeta"></div>
            </div>
            <div id="resultsContent"></div>
        </div>
    </div>
    <script>
        // Prefix shared by every REST call made from this page.
        const API_BASE = '/api/v1';
        // Element handles, looked up once and reused by the functions in this script.
        const searchForm = document.querySelector('#searchForm');
        const searchInput = document.querySelector('#searchInput');
        const searchBtn = document.querySelector('#searchBtn');
        const resultsContainer = document.querySelector('#resultsContainer');
        const resultsContent = document.querySelector('#resultsContent');
        const resultsMeta = document.querySelector('#resultsMeta');
        const uploadArea = document.querySelector('#uploadArea');
        const fileInput = document.querySelector('#fileInput');
        // Bootstrap: once the DOM is parsed, fill the stat cards and attach listeners.
        document.addEventListener('DOMContentLoaded', () => {
            loadStats();
            setupEventListeners();
        });
        /**
         * Attach the page's listeners: search form submit, upload-area click and
         * drag events, and the hidden file input's change event.
         */
        function setupEventListeners() {
            searchForm.addEventListener('submit', handleSearch);
            // A click anywhere on the upload area is forwarded to the file input.
            uploadArea.addEventListener('click', () => fileInput.click());
            const dragHandlers = {
                dragover: handleDragOver,
                dragleave: handleDragLeave,
                drop: handleFileDrop,
            };
            Object.entries(dragHandlers).forEach(([eventName, handler]) => {
                uploadArea.addEventListener(eventName, handler);
            });
            fileInput.addEventListener('change', handleFileSelect);
        }
        /**
         * Request `${API_BASE}/stats` and write the returned figures into the
         * four stat cards. If the request or the formatting throws, the error is
         * logged and the status card is set to "ERROR".
         *
         * @returns {Promise<void>}
         */
        async function loadStats() {
            const setCard = (id, value) => {
                document.getElementById(id).textContent = value;
            };
            try {
                const { data: stats } = await axios.get(`${API_BASE}/stats`);
                setCard('totalDocs', stats.total_documents);
                setCard('totalEntities', stats.total_entities);
                setCard('avgEntities', stats.average_entities_per_doc.toFixed(1));
                setCard('systemStatus', stats.system_status.toUpperCase());
            } catch (error) {
                console.error('Failed to load stats:', error);
                setCard('systemStatus', 'ERROR');
            }
        }
        /**
         * Submit handler for the search form.
         *
         * Reads the query and the three filter selects, calls
         * `${API_BASE}/search` and hands the response to displayResults.
         * A blank query is ignored. While the request is in flight the search
         * button is disabled and relabelled; it is restored whether the request
         * succeeds or fails.
         *
         * @param {Event} event - The submit event (its default action is cancelled).
         * @returns {Promise<void>}
         */
        async function handleSearch(event) {
            event.preventDefault();
            const query = searchInput.value.trim();
            if (query === '') {
                return;
            }
            // Current value of a filter <select>, looked up by element id.
            const selected = (id) => document.getElementById(id).value;
            const entityType = selected('entityTypeFilter');
            const limit = parseInt(selected('limitFilter'));
            const threshold = selected('thresholdFilter');
            // Mandatory parameters first, then the optional filters that are set.
            const params = new URLSearchParams();
            params.set('q', query);
            params.set('limit', String(limit));
            if (entityType) {
                params.append('entity_types', entityType);
            }
            if (threshold) {
                params.append('threshold', threshold);
            }
            // Switch the UI into its busy state before issuing the request.
            showLoading();
            searchBtn.disabled = true;
            searchBtn.textContent = 'Searching...';
            try {
                const { data } = await axios.get(`${API_BASE}/search?${params.toString()}`);
                displayResults(data);
            } catch (error) {
                // Prefer the API's own error detail when the response carries one.
                const reason = error.response?.data?.detail || error.message;
                showError('Search failed: ' + reason);
            } finally {
                searchBtn.disabled = false;
                searchBtn.textContent = 'Search';
            }
        }
        /**
         * Render a search response into the results panel.
         *
         * @param {Object} data - Search response; reads `total_results`,
         *     `search_time_ms` and the `results` array.
         */
        function displayResults(data) {
            const hits = data.results;
            resultsContainer.style.display = 'block';
            // Summary line: result count and server-reported search time.
            resultsMeta.innerHTML = `
                <div>
                    <strong>${data.total_results}</strong> results found in 
                    <strong>${data.search_time_ms.toFixed(0)}ms</strong>
                </div>
            `;
            // Drop whatever the previous search (or the spinner) left behind.
            resultsContent.innerHTML = '';
            if (hits.length === 0) {
                resultsContent.innerHTML = `
                    <div class="no-results">
                        <h3>No results found</h3>
                        <p>Try adjusting your search query or filters</p>
                    </div>
                `;
                return;
            }
            // One card per hit, appended in response order.
            for (const hit of hits) {
                resultsContent.appendChild(createResultCard(hit));
            }
        }
        /**
         * Build the clickable card element for a single search hit.
         *
         * Every piece of response text interpolated into innerHTML (title,
         * preview, entity text and entity type) goes through escapeHtml, so text
         * coming from the API response cannot inject markup.
         *
         * @param {Object} result - One entry of the search response's `results`
         *     array; reads `title`, `content_preview`, `entities`,
         *     `similarity_score` and `document_id`.
         * @returns {HTMLDivElement} A `.result-card` element whose click handler
         *     opens the document details.
         */
        function createResultCard(result) {
            // Number of entity tags shown before collapsing the rest into "+N more".
            const MAX_TAGS = 5;
            const card = document.createElement('div');
            card.className = 'result-card';
            // Tolerate a hit without an entity list instead of throwing.
            const entities = Array.isArray(result.entities) ? result.entities : [];
            // Create entities tags (escaped: entity values are response text too).
            const entitiesTags = entities.slice(0, MAX_TAGS).map(entity =>
                `<span class="entity-tag">${escapeHtml(entity.text)} (${escapeHtml(entity.entity_type)})</span>`
            ).join('');
            const hiddenCount = entities.length - MAX_TAGS;
            const overflowTag = hiddenCount > 0
                ? `<span class="entity-tag">+${hiddenCount} more</span>`
                : '';
            card.innerHTML = `
                <div class="result-title">${escapeHtml(result.title)}</div>
                <div class="result-preview">${escapeHtml(result.content_preview)}</div>
                <div class="result-footer">
                    <div class="result-entities">
                        ${entitiesTags}
                        ${overflowTag}
                    </div>
                    <div class="result-score">Score: ${result.similarity_score.toFixed(3)}</div>
                </div>
            `;
            // Add click handler to show document details
            card.addEventListener('click', () => showDocumentDetails(result.document_id));
            return card;
        }
        /**
         * Fetch one document's metadata and show a short summary.
         *
         * The id is percent-encoded before being placed in the URL path so that
         * characters such as `/`, `?` or `#` cannot change which endpoint is hit.
         * On failure the error is logged and a generic alert is shown.
         *
         * @param {string} documentId - Identifier taken from a search result.
         * @returns {Promise<void>}
         */
        async function showDocumentDetails(documentId) {
            try {
                const response = await axios.get(`${API_BASE}/documents/${encodeURIComponent(documentId)}`);
                // Named `doc` so the global `document` is not shadowed in this scope.
                const doc = response.data;
                // Summary is shown with a blocking alert; a modal or detail view could replace it.
                alert(`Document: ${doc.filename}\nType: ${doc.document_type}\nEntities: ${doc.entities.length}`);
            } catch (error) {
                console.error('Failed to load document details:', error);
                alert('Failed to load document details');
            }
        }
        // File upload handlers
        /**
         * dragover handler: cancel the browser's default action for the event and
         * mark the upload area as an active drop target.
         *
         * @param {DragEvent} event
         */
        function handleDragOver(event) {
            uploadArea.classList.toggle('dragover', true);
            event.preventDefault();
        }
        /**
         * dragleave handler: cancel the default action and remove the drop-target
         * highlight from the upload area.
         *
         * @param {DragEvent} event
         */
        function handleDragLeave(event) {
            uploadArea.classList.toggle('dragover', false);
            event.preventDefault();
        }
        /**
         * drop handler: cancel the default action, clear the highlight and upload
         * the first dropped file. Any additional files in the drop are ignored.
         *
         * @param {DragEvent} event
         */
        function handleFileDrop(event) {
            event.preventDefault();
            uploadArea.classList.toggle('dragover', false);
            const droppedFile = event.dataTransfer.files[0];
            if (droppedFile) {
                uploadFile(droppedFile);
            }
        }
        /**
         * change handler for the file input: upload the first selected file, if any.
         *
         * @param {Event} event - Change event whose target is the file input.
         */
        function handleFileSelect(event) {
            const chosenFile = event.target.files[0];
            if (!chosenFile) {
                return;
            }
            uploadFile(chosenFile);
        }
        /**
         * Validate a file client-side, POST it to `${API_BASE}/documents/upload`
         * and render progress, success or failure inside the upload area.
         *
         * The file name and every value taken from the server response or error
         * are passed through escapeHtml before being written with innerHTML: a
         * file name or error detail may contain `<`, `>` or `&`, which would
         * otherwise be parsed as markup.
         *
         * @param {File} file - File picked through the file input or dropped on
         *     the upload area.
         * @returns {Promise<void>}
         */
        async function uploadFile(file) {
            // Validate file type (by extension; a name without a dot is rejected too)
            const allowedTypes = ['.pdf', '.txt', '.docx', '.xml'];
            const fileExtension = '.' + file.name.split('.').pop().toLowerCase();
            if (!allowedTypes.includes(fileExtension)) {
                alert(`File type ${fileExtension} not supported. Allowed: ${allowedTypes.join(', ')}`);
                return;
            }
            // Validate file size (50MB)
            if (file.size > 50 * 1024 * 1024) {
                alert('File too large. Maximum size: 50MB');
                return;
            }
            const formData = new FormData();
            formData.append('file', file);
            formData.append('access_level', 'public');
            // Show upload progress
            uploadArea.innerHTML = `
                <div class="loading">
                    <div class="spinner"></div>
                    <p>Uploading and processing "${escapeHtml(file.name)}"...</p>
                </div>
            `;
            try {
                const response = await axios.post(`${API_BASE}/documents/upload`, formData, {
                    headers: {
                        'Content-Type': 'multipart/form-data'
                    }
                });
                const result = response.data;
                // Show success message
                uploadArea.innerHTML = `
                    <div style="color: #28a745; text-align: center;">
                        <h3>✅ Upload Successful!</h3>
                        <p><strong>${escapeHtml(result.filename)}</strong></p>
                        <p>Document ID: ${escapeHtml(result.document_id)}</p>
                        <p>Entities found: ${escapeHtml(result.entities_found)}</p>
                        <button onclick="resetUploadArea()" style="margin-top: 15px; padding: 10px 20px; background: #667eea; color: white; border: none; border-radius: 8px; cursor: pointer;">
                            Upload Another
                        </button>
                    </div>
                `;
                // Refresh stats
                loadStats();
            } catch (error) {
                console.error('Upload failed:', error);
                // Prefer the API's own error detail when the response carries one.
                const reason = error.response?.data?.detail || error.message;
                uploadArea.innerHTML = `
                    <div style="color: #dc3545; text-align: center;">
                        <h3>❌ Upload Failed</h3>
                        <p>${escapeHtml(reason)}</p>
                        <button onclick="resetUploadArea()" style="margin-top: 15px; padding: 10px 20px; background: #dc3545; color: white; border: none; border-radius: 8px; cursor: pointer;">
                            Try Again
                        </button>
                    </div>
                `;
            }
        }
        /**
         * Restore the upload area to its initial prompt and clear the file input.
         *
         * Invoked from the inline `onclick` of the "Upload Another" / "Try Again"
         * buttons rendered by uploadFile.
         */
        function resetUploadArea() {
            uploadArea.innerHTML = `
                <p>📄 Drag and drop a document here, or click to select</p>
                <p style="color: #666; font-size: 0.9rem; margin-top: 10px;">
                    Supported formats: PDF, TXT, DOCX, XML (Max: 50MB)
                </p>
            `;
            // Re-attach the hidden file input: assigning innerHTML discards the
            // area's previous children, and the page's initial markup keeps the
            // input inside this area.
            uploadArea.appendChild(fileInput);
            // Clearing the value lets the same file be selected again and still
            // produce a change event.
            fileInput.value = '';
        }
        /**
         * Put the results panel into its busy state: clear the summary line,
         * show a spinner in the content area and make the panel visible.
         */
        function showLoading() {
            resultsMeta.textContent = '';
            resultsContent.innerHTML = `
                <div class="loading">
                    <div class="spinner"></div>
                    <p>Searching documents...</p>
                </div>
            `;
            resultsContainer.style.display = 'block';
        }
        /**
         * Replace the results panel content with an error box.
         *
         * @param {string} message - Text to display; it is HTML-escaped first.
         */
        function showError(message) {
            const safeMessage = escapeHtml(message);
            resultsMeta.textContent = '';
            resultsContent.innerHTML = `
                <div class="error">
                    <strong>Error:</strong> ${safeMessage}
                </div>
            `;
            resultsContainer.style.display = 'block';
        }
        /**
         * Escape a value for safe interpolation into HTML.
         *
         * Replaces `&`, `<`, `>`, `"` and `'` with character references, so the
         * result is safe both as element content and inside a quoted attribute
         * value. Works on plain strings and needs no DOM element.
         *
         * @param {*} text - Value to escape. `null` and `undefined` become an
         *     empty string; any other value is converted with String().
         * @returns {string} The escaped text.
         */
        function escapeHtml(text) {
            const HTML_ESCAPES = {
                '&': '&amp;',
                '<': '&lt;',
                '>': '&gt;',
                '"': '&quot;',
                "'": '&#39;',
            };
            const source = text === null || text === undefined ? '' : String(text);
            // Single pass; `&` is handled by the same pass, so nothing is double-escaped.
            return source.replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
        }
        // Example queries rendered as one-click buttons by addSampleQueries().
        const sampleQueries = [
            'Napoleon Bonaparte biography',
            'French Revolution events',
            'Paris architecture buildings',
            'Military campaigns Europe',
            'Historical documents France',
        ];
        /**
         * Append a row of buttons, one per entry in sampleQueries, to the
         * `.search-container` element. Clicking a button copies its query into
         * the search input and runs handleSearch with a synthetic submit event.
         */
        function addSampleQueries() {
            const wrapper = document.createElement('div');
            wrapper.style.marginTop = '15px';
            wrapper.innerHTML = '<p style="margin-bottom: 10px; color: #666;">Try these sample queries:</p>';
            for (const sample of sampleQueries) {
                const pill = document.createElement('button');
                pill.textContent = sample;
                pill.style.cssText = `
                    margin: 5px;
                    padding: 8px 15px;
                    background: #f8f9fa;
                    border: 1px solid #dee2e6;
                    border-radius: 20px;
                    cursor: pointer;
                    font-size: 0.9rem;
                `;
                pill.addEventListener('click', () => {
                    searchInput.value = sample;
                    handleSearch(new Event('submit'));
                });
                wrapper.appendChild(pill);
            }
            document.querySelector('.search-container').appendChild(wrapper);
        }
        // Sample query buttons are currently disabled; uncomment the next line to enable them.
        // setTimeout(addSampleQueries, 100);
    </script>
 </body>
 </html>
@@ -0,0 +1,22 @@
 <!DOCTYPE html>
 <html>
 <head>
    <title>Vector Search</title>
 </head>
 <body>
    <h1>Semantic Search</h1>
    <form method="post">
        <input type="text" name="query" placeholder="Enter your query">
        <button type="submit">Search</button>
    </form>
    {% if results %}
        <h2>Results</h2>
        <ul>
        {% for item in results %}
            <li><strong>{{ item.document_id }}</strong>: {{ item.content }}</li>
        {% endfor %}
        </ul>
    {% endif %}
 </body>
 </html>