Initial commit

This commit is contained in:
Ayomide
2025-08-04 14:50:33 +01:00
commit 40b28a7ee3
30 changed files with 3410 additions and 0 deletions
View File
+84
View File
@@ -0,0 +1,84 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Virtual Environment
venv/
env/
ENV/
.venv/
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# Environment variables
.env
.env.local
.env.*.local
# Data directories
data/
logs/
*.log
# Vector embeddings and databases
*.db
*.sqlite
*.sqlite3
embeddings/
chroma/
# Uploads and temporary files
uploads/
temp/
tmp/
# Tests directory (ignored, so files under tests/ are not tracked)
tests/
# Jupyter Notebooks
.ipynb_checkpoints/
*.ipynb
# Model files
*.model
*.pkl
*.joblib
# Coverage reports
htmlcov/
.coverage
.coverage.*
coverage.xml
+155
View File
@@ -0,0 +1,155 @@
# Semantic Search Engine POC
A proof-of-concept intelligent semantic search engine for archival documents, built to demonstrate how advanced search can work across different file types such as PDF, XML, DOCX, and plain text.
## Project Overview
This POC addresses the requirements for a future full-scale semantic search system capable of:
- **Entity-centric search** across persons, places, events, buildings, and organizations
- **Multi-modal document processing** (PDFs, XML, text, images, audio, video)
- **Semantic similarity search** using modern embedding techniques
- **Relationship discovery** between entities across documents
- **Access control** for public vs. restricted documents
- **Scalable architecture** for production deployment
## Architecture
```
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ Document │ │ Entity │ │ Vector │
│ Processor │───▶│ Extractor │───▶│ Store │
└─────────────────┘ └─────────────────┘ └─────────────────┘
│ │ │
▼ ▼ ▼
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ Text │ │ Named Entity │ │ Embeddings │
│ Extraction │ │ Recognition │ │ (ChromaDB) │
└─────────────────┘ └─────────────────┘ └─────────────────┘
┌─────────────────┐
│ Search │
│ Service │
└─────────────────┘
```
### Prerequisites
- Python 3.8+
- pip
- Git
### Installation
1. **Clone the repository**
```bash
git clone <repository-url>
cd semantic_search_poc
```
2. **Create virtual environment**
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
3. **Install dependencies**
```bash
pip install -r requirements.txt
python -m spacy download en_core_web_sm
```
4. **Initialize the environment**
```bash
python scripts/setup_data.py
```
5. **Run the POC**
```bash
python -m src.main
```
### Expected Output
The POC will demonstrate:
- Document processing and indexing
- Semantic search across sample documents
- Entity extraction and relationship discovery
- Performance metrics and statistics
## Features
### Document Processing
- **PDF text extraction** using PyPDF2
- **XML parsing** for finding aids
- **DOCX support** for modern documents
- **Metadata extraction** (title, author, creation date, keywords)
- **Multi-language support** (currently optimized for English)
### Entity Recognition
- **Named Entity Recognition** using spaCy
- **Custom entity types**: Person, Place, Event, Organization, Building, Date
- **Relationship extraction** between entities
- **Confidence scoring** for entity matches
### Semantic Search
- **Vector embeddings** using Sentence-BERT (`all-MiniLM-L6-v2`)
- **Similarity search** with configurable thresholds
- **Hybrid search** combining semantic and keyword matching
- **Entity-filtered search** results
### Vector Storage
- **ChromaDB integration** for persistent vector storage
- **Scalable indexing** for large document collections
- **Metadata filtering** and search optimization
## Configuration
Key settings in `config/settings.py`:
```python
# Embedding Model
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
EMBEDDING_DIMENSION = 384
# Search Parameters
MAX_SEARCH_RESULTS = 50
SIMILARITY_THRESHOLD = 0.2
# File Processing
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
ALLOWED_EXTENSIONS = [".pdf", ".txt", ".docx", ".xml"]
```
## Project Structure
```
semantic_search_poc/
├── README.md
├── requirements.txt
├── .env.example
├── config/
│ └── settings.py # Configuration settings
├── src/
│ ├── main.py # Main application entry point
│ ├── models/
│ │ ├── document.py # Document data models
│ │ └── search_result.py # Search result models
│ ├── services/
│ │ ├── document_processor.py # Document processing pipeline
│ │ ├── embedding_service.py # Embedding generation
│ │ ├── entity_extractor.py # Named entity recognition
│ │ ├── search_service.py # Main search functionality
│ │ └── vector_store.py # Vector database operations
│ └── utils/
│ ├── file_handlers.py # File processing utilities
│ └── logger.py # Logging configuration
├── data/
│ ├── raw/ # Input documents
│ ├── processed/ # Processed document metadata
│ └── embeddings/ # Vector embeddings storage
├── tests/ # Unit tests
├── notebooks/ # Jupyter notebooks for analysis
└── scripts/ # Utility scripts
```
View File
+61
View File
@@ -0,0 +1,61 @@
"""Configuration settings for the semantic search POC."""
import os
from typing import List, Optional
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Application settings.

    Typed configuration fields with in-code defaults. The nested ``Config``
    points pydantic-settings at a ``.env`` file and turns on case-sensitive
    name matching, so overrides have to use these exact upper-case names.
    """

    # Application
    APP_NAME: str = "Semantic Search POC"
    VERSION: str = "0.1.0"
    # Also passed as uvicorn's ``reload`` flag by the server launchers.
    DEBUG: bool = True

    # API
    API_HOST: str = "localhost"
    API_PORT: int = 8000
    API_PREFIX: str = "/api/v1"

    # Database
    DATABASE_URL: str = "sqlite:///./data/semantic_search.db"

    # Vector Store
    VECTOR_STORE_TYPE: str = "chroma"  # chroma, faiss
    CHROMA_PERSIST_DIR: str = "./data/embeddings/chroma"
    # Presumably only consulted when VECTOR_STORE_TYPE is "faiss" -- TODO confirm.
    FAISS_INDEX_PATH: str = "./data/embeddings/faiss"

    # Embedding Model
    EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"
    # Presumably must equal the output size of EMBEDDING_MODEL -- TODO confirm.
    EMBEDDING_DIMENSION: int = 384

    # Entity Extraction
    SPACY_MODEL: str = "en_core_web_sm"
    CUSTOM_NER_MODEL: Optional[str] = None

    # Document Processing
    MAX_FILE_SIZE: int = 50 * 1024 * 1024  # 50MB
    # Compared against lower-cased file suffixes, so entries keep the leading dot.
    ALLOWED_EXTENSIONS: List[str] = [".pdf", ".txt", ".docx", ".xml"]

    # Search
    MAX_SEARCH_RESULTS: int = 50
    SIMILARITY_THRESHOLD: float = 0.2

    # Directories
    DATA_DIR: str = "./data"
    RAW_DATA_DIR: str = "./data/raw"
    PROCESSED_DATA_DIR: str = "./data/processed"
    UPLOAD_DIR: str = "./data/uploads"

    # Logging
    # Resolved with getattr(logging, LOG_LEVEL) by the entry points, so it
    # must be an upper-case level name such as "INFO".
    LOG_LEVEL: str = "INFO"
    LOG_FILE: str = "./logs/app.log"

    # NOTE(review): pydantic v2 prefers ``model_config = SettingsConfigDict(...)``
    # over a nested ``Config`` class -- confirm against the installed version.
    class Config:
        env_file = ".env"
        case_sensitive = True


# Global settings instance
settings = Settings()
BIN
View File
Binary file not shown.
View File
+56
View File
@@ -0,0 +1,56 @@
import asyncio
import sys
import uvicorn
from pathlib import Path
import logging
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from config.settings import settings
def main():
    """Launch the FastAPI application under uvicorn.

    Configures root logging from ``settings.LOG_LEVEL``, makes sure the
    data, upload, log and template directories exist, prints a banner with
    the server URLs and then blocks inside ``uvicorn.run``.
    """
    log_level = getattr(logging, settings.LOG_LEVEL)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=log_level,
    )
    logging.getLogger(__name__).info("Starting Semantic Search API Server...")

    # Directories the server expects to find at runtime.
    for required_dir in (
        settings.DATA_DIR,
        settings.UPLOAD_DIR,
        Path(settings.LOG_FILE).parent,
        "templates",
    ):
        Path(required_dir).mkdir(parents=True, exist_ok=True)

    base_url = f"http://{settings.API_HOST}:{settings.API_PORT}"
    print(f"""
Starting Semantic Search API Server
Server URL: {base_url}
API Docs: {base_url}/docs
Frontend: {base_url}/
Health Check: {base_url}/health
Press Ctrl+C to stop the server
""")

    # The import string (rather than the app object) is what lets uvicorn
    # re-import the application when reload is enabled.
    server_options = {
        "host": settings.API_HOST,
        "port": settings.API_PORT,
        "reload": settings.DEBUG,
        "log_level": settings.LOG_LEVEL.lower(),
    }
    uvicorn.run("src.api.routes:app", **server_options)


if __name__ == "__main__":
    main()
+233
View File
@@ -0,0 +1,233 @@
"""Setup script to initialize the POC environment."""
import asyncio
import logging
import sys
from pathlib import Path
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.services.vector_store import VectorStore
from src.services.document_processor import DocumentProcessor
from config.settings import settings
# Sample corpus written by setup_environment(): file name -> file contents.
_SAMPLE_DOCUMENTS = {
    "napoleon_biography.txt": """
Napoleon Bonaparte: A Brief Biography
Napoleon Bonaparte (1769-1821) was a French military general and political leader who rose to prominence during the French Revolution. Born in Corsica, Napoleon became Emperor of the French in 1804.
Early Life and Rise to Power
Napoleon was born in Ajaccio, Corsica, to Charles Buonaparte and Letizia Ramolino Bonaparte. He attended military school in France and quickly distinguished himself as a brilliant strategist.
Military Campaigns
Napoleon led numerous military campaigns across Europe, including:
- The Italian Campaign (1796-1797)
- The Egyptian Campaign (1798-1801)
- The Austerlitz Campaign (1805)
- The Russian Campaign (1812)
Napoleon's forces occupied much of continental Europe at the height of his power. He established the Continental System to weaken Britain economically.
Political Reforms
As Emperor, Napoleon implemented significant reforms:
- The Napoleonic Code (Civil Code)
- Educational reforms
- Infrastructure development
- Administrative reorganization
Exile and Death
After defeat at the Battle of Leipzig in 1813 and subsequent abdication, Napoleon was exiled to Elba. He returned for the Hundred Days but was defeated at Waterloo in 1815. He was then exiled to Saint Helena, where he died in 1821.
Legacy
Napoleon's influence on European law, politics, and military strategy continues to this day. His reforms and conquests shaped the modern European state system.
""".strip(),
    "french_revolution.txt": """
The French Revolution (1789-1799): An Overview
The French Revolution was a period of radical political and societal change in France that began with the Estates-General of 1789 and ended with the formation of the French Consulate in November 1799.
Causes of the Revolution
- Economic crisis and debt
- Social inequality under the Ancien Régime
- Influence of Enlightenment ideas
- Weak leadership under Louis XVI
Key Events and Phases
The Moderate Phase (1789-1792)
- Storming of the Bastille (July 14, 1789)
- Declaration of the Rights of Man and of the Citizen
- Abolition of feudalism
- Civil Constitution of the Clergy
The Radical Phase (1792-1794)
- Execution of Louis XVI (January 21, 1793)
- Reign of Terror under Maximilien Robespierre
- Committee of Public Safety
- Revolutionary Wars against European coalitions
The Thermidorian Reaction (1794-1799)
- Fall of Robespierre (July 27, 1794)
- Directory period
- Rise of Napoleon Bonaparte
Important Figures
- Louis XVI - King of France
- Marie Antoinette - Queen of France
- Maximilien Robespierre - Jacobin leader
- Georges Danton - Revolutionary leader
- Jean-Paul Marat - Radical journalist
- Jacques Necker - Finance Minister
Geographic Centers
The revolution centered around Paris, with key locations including:
- Palace of Versailles
- Tuileries Palace
- Place de la Concorde (formerly Place Louis XV)
- Conciergerie prison
Impact and Legacy
The French Revolution fundamentally changed French society and had lasting effects on European politics, inspiring democratic movements worldwide and establishing principles of popular sovereignty and individual rights.
""".strip(),
    "paris_architecture.txt": """
Architectural Marvels of Paris
Paris, the City of Light, is renowned for its stunning architecture spanning centuries of French history and culture.
Medieval Architecture
- Notre-Dame Cathedral: Gothic masterpiece on Île de la Cité
- Sainte-Chapelle: Royal chapel with magnificent stained glass
- Saint-Germain-des-Prés: Ancient abbey church
Renaissance and Classical Period
- Louvre Palace: Royal residence turned world's largest museum
- Luxembourg Palace: Baroque palace and gardens
- Place des Vosges: Oldest planned square in Paris
Haussmann's Paris (19th Century)
Baron Georges-Eugène Haussmann transformed Paris under Napoleon III:
- Wide boulevards and avenues
- Standardized building heights and facades
- Parks and squares system
- Sewerage and water systems
Notable Haussmannian Buildings:
- Opéra Garnier: Neo-baroque opera house
- Grands Boulevards: Commercial and social centers
- Residential buildings with characteristic iron balconies
Modern and Contemporary Architecture
- Eiffel Tower (1889): Iron lattice tower by Gustave Eiffel
- Centre Pompidou (1977): High-tech architecture
- Louvre Pyramid (1989): I.M. Pei's glass pyramid
- Institut du Monde Arabe: Jean Nouvel's modern interpretation
Architectural Districts
- Marais: Medieval and Renaissance architecture
- Saint-Germain-des-Prés: Literary and artistic quarter
- Montmartre: Village atmosphere with Sacré-Cœur Basilica
- La Défense: Modern business district with Grande Arche
Building Materials and Techniques
Traditional Parisian architecture features:
- Lutetian limestone (Pierre de Paris)
- Mansard roofs with zinc coverings
- Iron work and balconies
- Large windows and shutters
Conservation Efforts
Paris maintains strict building codes to preserve its architectural heritage while allowing for contemporary additions that complement the historic urban fabric.
""".strip(),
}

# Contents of the .env file created when none exists yet.
_DEFAULT_ENV_CONTENT = """# Semantic Search POC Configuration
DEBUG=True
LOG_LEVEL=INFO
# Database
DATABASE_URL=sqlite:///./data/semantic_search.db
# Vector Store
VECTOR_STORE_TYPE=chroma
CHROMA_PERSIST_DIR=./data/embeddings/chroma
# Embedding Model
EMBEDDING_MODEL=all-MiniLM-L6-v2
EMBEDDING_DIMENSION=384
# Search Settings
MAX_SEARCH_RESULTS=50
SIMILARITY_THRESHOLD=0.2
# File Upload
MAX_FILE_SIZE=52428800
"""


async def setup_environment():
    """Set up the POC environment.

    Creates the data/log directories, initializes the vector store, writes
    the bundled sample documents and a default ``.env`` file (each only if
    it does not exist yet) and prints progress along the way.

    Returns:
        ``True`` when every step succeeded, ``False`` when a step raised.
        Failures are logged with a traceback and reported on stdout rather
        than propagated.
    """
    print("🚀 Setting up Semantic Search POC...")
    # Configure logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    try:
        # Create directories
        directories = [
            settings.DATA_DIR,
            settings.RAW_DATA_DIR,
            settings.PROCESSED_DATA_DIR,
            settings.UPLOAD_DIR,
            f"{settings.RAW_DATA_DIR}/sample_documents",
            f"{settings.RAW_DATA_DIR}/pdfs",
            f"{settings.RAW_DATA_DIR}/xml",
            Path(settings.LOG_FILE).parent,
        ]
        for directory in directories:
            Path(directory).mkdir(parents=True, exist_ok=True)
            print(f"✅ Created directory: {directory}")

        # Initialize vector store
        print("\n📚 Initializing vector store...")
        vector_store = VectorStore()
        await vector_store.initialize()
        print("✅ Vector store initialized")

        # Create sample documents. The texts contain non-ASCII characters
        # (e.g. "Régime", "Sacré-Cœur"), so the encoding is pinned to UTF-8
        # instead of depending on the platform default.
        sample_docs_dir = Path(f"{settings.RAW_DATA_DIR}/sample_documents")
        for filename, content in _SAMPLE_DOCUMENTS.items():
            sample_path = sample_docs_dir / filename
            if not sample_path.exists():
                sample_path.write_text(content, encoding="utf-8")
                print(f"✅ Created sample document: {sample_path.name}")
        print(f"\n📝 Created {len(list(sample_docs_dir.glob('*.txt')))} sample documents")

        # Create .env file if it doesn't exist
        env_file = Path(".env")
        if not env_file.exists():
            env_file.write_text(_DEFAULT_ENV_CONTENT, encoding="utf-8")
            print("Created .env configuration file")

        print("\nSetup complete! You can now:")
        print("1. Run: python -m src.main")
        print("2. Or process documents: python scripts/process_documents.py")
        print("3. Or start the API server: python scripts/run_server.py")
    except Exception as e:
        # Top-level boundary of the script: report the failure instead of
        # crashing, but keep the traceback in the log.
        logger.exception("Setup failed: %s", e)
        print(f"❌ Setup failed: {str(e)}")
        return False
    return True


if __name__ == "__main__":
    # Propagate failure through the exit status so shells and CI notice it.
    sys.exit(0 if asyncio.run(setup_environment()) else 1)
View File
View File
+390
View File
@@ -0,0 +1,390 @@
import time
import logging
from typing import List, Optional
from fastapi import FastAPI, HTTPException, UploadFile, File, Query, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import HTMLResponse, FileResponse
from pathlib import Path
from ..services.search_service import SearchService
from ..services.document_processor import DocumentProcessor
from ..models.document import EntityType, DocumentType, AccessLevel
from ..models.search_result import SearchResponse, EntitySearchResult
from .schemas import SearchRequest, DocumentUploadResponse, StatsResponse
from config.settings import settings
# Initialize FastAPI app
app = FastAPI(
    title="Semantic Search API",
    description="Intelligent semantic search engine for archival documents",
    version="0.1.0"
)

# Add CORS middleware.
# A wildcard origin must not be combined with credentialed requests: with
# allow_credentials=True the middleware would let any website call the API
# with the user's cookies. Nothing in this module relies on cookies, so
# credentials are disabled while every origin stays allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount static files for frontend (only when the directory is present, so
# the API still starts without any frontend assets).
static_dir = Path(__file__).parent.parent.parent / "templates" / "static"
if static_dir.exists():
    app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")

# Initialize services: one shared instance of each per process, handed to
# the route handlers through the dependency providers in this module.
search_service = SearchService()
document_processor = DocumentProcessor()
logger = logging.getLogger(__name__)
# Dependency to get search service
async def get_search_service() -> SearchService:
    """Return the module-level ``search_service`` instance for ``Depends``."""
    return search_service
# Dependency to get document processor
async def get_document_processor() -> DocumentProcessor:
    """Return the module-level ``document_processor`` instance for ``Depends``."""
    return document_processor
@app.on_event("startup")
async def startup_event():
    """Prepare the shared vector store before the first request is served."""
    logger.info("Initializing API services...")
    store = search_service.vector_store
    await store.initialize()
    logger.info("API services initialized successfully")
@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Serve the main frontend page."""
    # templates/index.html lives three directory levels above this module.
    project_root = Path(__file__).parent.parent.parent
    index_page = project_root / "templates" / "index.html"
    if not index_page.exists():
        # No frontend available: answer with a minimal placeholder page.
        return HTMLResponse("<h1>Semantic Search API</h1><p>Frontend template not found</p>")
    return FileResponse(str(index_page))
@app.get("/health")
async def health_check():
    """Health check endpoint."""
    # Epoch seconds at the moment the probe was answered.
    checked_at = time.time()
    return {"status": "healthy", "timestamp": checked_at}
@app.post("/api/v1/search", response_model=SearchResponse)
async def search_documents(
    request: SearchRequest,
    search_service: SearchService = Depends(get_search_service)
):
    """
    Perform semantic search across documents.
    - **query**: Search query string
    - **limit**: Maximum number of results (default: 10)
    - **entity_types**: Filter by entity types (optional)
    - **similarity_threshold**: Minimum similarity score (optional)
    """
    # Returns a SearchResponse echoing the query, the hits, their count, the
    # elapsed time in milliseconds and the filters that were requested.
    # Any failure inside the search service is turned into an HTTP 500.
    #
    # perf_counter() is monotonic; the wall clock used previously can jump
    # (NTP, DST) and yield a wrong or even negative duration.
    start_time = time.perf_counter()
    try:
        results = await search_service.semantic_search(
            query=request.query,
            limit=request.limit,
            entity_types=request.entity_types,
            similarity_threshold=request.similarity_threshold
        )
        search_time = (time.perf_counter() - start_time) * 1000  # Convert to milliseconds
        return SearchResponse(
            query=request.query,
            results=results,
            total_results=len(results),
            search_time_ms=search_time,
            filters_applied={
                "entity_types": [et.value for et in request.entity_types] if request.entity_types else [],
                "similarity_threshold": request.similarity_threshold
            }
        )
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Search error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}") from e
@app.get("/api/v1/search", response_model=SearchResponse)
async def search_documents_get(
    # min_length mirrors SearchRequest.query: without it an empty ``q``
    # passed request validation and then failed while building SearchRequest
    # inside the handler, surfacing as a 500 instead of a 422.
    q: str = Query(..., min_length=1, description="Search query"),
    limit: int = Query(10, ge=1, le=100, description="Maximum number of results"),
    entity_types: Optional[str] = Query(None, description="Comma-separated entity types"),
    threshold: Optional[float] = Query(None, ge=0.0, le=1.0, description="Similarity threshold"),
    search_service: SearchService = Depends(get_search_service)
):
    """GET version of search endpoint for simple queries."""
    # Parse entity types: each comma-separated token must be a valid
    # EntityType value, otherwise the request is rejected with 400.
    parsed_entity_types = None
    if entity_types:
        try:
            parsed_entity_types = [EntityType(et.strip()) for et in entity_types.split(",")]
        except ValueError as e:
            raise HTTPException(status_code=400, detail=f"Invalid entity type: {str(e)}") from e

    # Create search request and delegate to the POST handler so both
    # endpoints share one implementation.
    request = SearchRequest(
        query=q,
        limit=limit,
        entity_types=parsed_entity_types,
        similarity_threshold=threshold
    )
    return await search_documents(request, search_service)
@app.get("/api/v1/entities/{entity_type}", response_model=EntitySearchResult)
async def search_by_entity(
    entity_type: EntityType,
    text: str = Query(..., description="Entity text to search for"),
    limit: int = Query(10, ge=1, le=100, description="Maximum number of results"),
    search_service: SearchService = Depends(get_search_service)
):
    """
    Search for documents containing specific entities.
    - **entity_type**: Type of entity (person, place, event, etc.)
    - **text**: Entity text to search for
    - **limit**: Maximum number of results
    """
    try:
        matching_docs = await search_service.search_by_entity(
            entity_text=text, entity_type=entity_type, limit=limit
        )
        # total_occurrences is the number of documents returned.
        response_fields = {
            "entity_text": text,
            "entity_type": entity_type.value,
            "documents": matching_docs,
            "total_occurrences": len(matching_docs),
        }
        return EntitySearchResult(**response_fields)
    except Exception as e:
        logger.error(f"Entity search error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Entity search failed: {str(e)}")
@app.get("/api/v1/entities/{entity_text}/relationships")
async def get_entity_relationships(
    entity_text: str,
    search_service: SearchService = Depends(get_search_service)
):
    """
    Get relationships for a specific entity.
    - **entity_text**: Entity to find relationships for
    """
    try:
        related = await search_service.get_entity_relationships(entity_text)
        # Total across every relationship group returned by the service.
        relationship_count = 0
        for linked_entities in related.values():
            relationship_count += len(linked_entities)
        return {
            "entity": entity_text,
            "relationships": related,
            "total_relationships": relationship_count,
        }
    except Exception as e:
        logger.error(f"Relationship search error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Relationship search failed: {str(e)}")
@app.post("/api/v1/documents/upload", response_model=DocumentUploadResponse)
async def upload_document(
    file: UploadFile = File(...),
    access_level: AccessLevel = AccessLevel.PUBLIC,
    document_processor: DocumentProcessor = Depends(get_document_processor)
):
    """
    Upload and process a new document.
    - **file**: Document file to upload
    - **access_level**: Access level for the document (public, restricted, private)
    """
    # Responds 400 for a missing/invalid filename, an extension outside
    # settings.ALLOWED_EXTENSIONS or a payload above settings.MAX_FILE_SIZE,
    # and 500 when saving or processing fails.
    try:
        # Validate file name
        if not file.filename:
            raise HTTPException(status_code=400, detail="No filename provided")

        # The filename is client-controlled. Keep only its last path
        # component (treating "\" like "/") so names such as
        # "../../etc/passwd" or an absolute path cannot place the upload
        # outside UPLOAD_DIR.
        safe_name = Path(file.filename.replace("\\", "/")).name
        if safe_name in ("", ".", ".."):
            raise HTTPException(status_code=400, detail="Invalid filename")

        # Validate file type
        file_extension = Path(safe_name).suffix.lower()
        if file_extension not in settings.ALLOWED_EXTENSIONS:
            raise HTTPException(
                status_code=400,
                detail=f"File type {file_extension} not supported. Allowed: {settings.ALLOWED_EXTENSIONS}"
            )

        # Validate file size. Reading at most one byte past the limit is
        # enough to detect an oversized upload without pulling an
        # arbitrarily large body into memory.
        content = await file.read(settings.MAX_FILE_SIZE + 1)
        if len(content) > settings.MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Maximum size: {settings.MAX_FILE_SIZE / (1024*1024):.1f}MB"
            )

        # Save uploaded file
        upload_dir = Path(settings.UPLOAD_DIR)
        upload_dir.mkdir(parents=True, exist_ok=True)
        file_path = upload_dir / safe_name
        file_path.write_bytes(content)

        # Process document
        document = await document_processor.process_document(str(file_path), access_level)
        if not document:
            raise HTTPException(status_code=500, detail="Failed to process document")
        return DocumentUploadResponse(
            document_id=document.id,
            filename=document.filename,
            status="processed",
            entities_found=len(document.entities),
            access_level=document.access_level.value,
            message="Document uploaded and processed successfully"
        )
    except HTTPException:
        # Deliberate HTTP errors raised above pass through unchanged.
        raise
    except Exception as e:
        logger.exception(f"Upload error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}") from e
@app.get("/api/v1/documents/{document_id}")
async def get_document(
    document_id: str,
    document_processor: DocumentProcessor = Depends(get_document_processor)
):
    """
    Get document details by ID.
    - **document_id**: Unique document identifier
    """
    try:
        found = await document_processor.get_document_by_id(document_id)
        if not found:
            raise HTTPException(status_code=404, detail="Document not found")

        # Compact view of each entity: text, type and confidence only.
        entity_summaries = []
        for entity in found.entities:
            entity_summaries.append({
                "text": entity.text,
                "entity_type": entity.entity_type.value,
                "confidence": entity.confidence,
            })

        # The full document content is deliberately left out of the payload
        # for performance.
        return {
            "id": found.id,
            "filename": found.filename,
            "document_type": found.document_type.value,
            "access_level": found.access_level.value,
            "metadata": found.metadata,
            "entities": entity_summaries,
            "created_at": found.created_at,
            "updated_at": found.updated_at,
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Document retrieval error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Failed to retrieve document: {str(e)}")
@app.delete("/api/v1/documents/{document_id}")
async def delete_document(
    document_id: str,
    document_processor: DocumentProcessor = Depends(get_document_processor)
):
    """
    Delete a document by ID.
    - **document_id**: Unique document identifier
    """
    try:
        deleted = await document_processor.delete_document(document_id)
        if deleted:
            return {"message": f"Document {document_id} deleted successfully"}
        # A falsy result covers both "unknown id" and "deletion failed".
        raise HTTPException(status_code=404, detail="Document not found or deletion failed")
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Document deletion error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Failed to delete document: {str(e)}")
@app.get("/api/v1/stats", response_model=StatsResponse)
async def get_stats(
    search_service: SearchService = Depends(get_search_service),
    document_processor: DocumentProcessor = Depends(get_document_processor)
):
    """
    Get system statistics and metrics.
    """
    try:
        processing_stats = await document_processor.get_processing_stats()
        vector_stats = await search_service.vector_store.get_collection_stats()

        # Fallback used for every statistic the processor did not report.
        fallbacks = {
            "total_documents": 0,
            "document_types": {},
            "access_levels": {},
            "languages": {},
            "total_entities": 0,
            "average_entities_per_doc": 0.0,
        }
        reported = {
            key: processing_stats.get(key, fallback)
            for key, fallback in fallbacks.items()
        }
        return StatsResponse(
            vector_store_stats=vector_stats,
            system_status="operational",
            **reported,
        )
    except Exception as e:
        logger.error(f"Stats error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Failed to retrieve stats: {str(e)}")
@app.get("/api/v1/entity-types")
async def get_entity_types():
    """Get available entity types."""
    # One {"value", "label"} option per EntityType member, title-cased label.
    options = []
    for member in EntityType:
        options.append({"value": member.value, "label": member.value.title()})
    return {"entity_types": options}
@app.get("/api/v1/document-types")
async def get_document_types():
    """Get supported document types."""
    # One {"value", "label"} option per DocumentType member, upper-cased label.
    options = []
    for member in DocumentType:
        options.append({"value": member.value, "label": member.value.upper()})
    return {"document_types": options}
# Direct-run entry point. This module uses package-relative imports
# (``from ..services ...``), so it has to be started as a module
# (``python -m src.api.routes``) rather than by file path.
if __name__ == "__main__":
    import uvicorn
    # The app is passed as an import string so uvicorn can re-import it when
    # ``reload`` is enabled.
    uvicorn.run(
        "src.api.routes:app",
        host=settings.API_HOST,
        port=settings.API_PORT,
        reload=settings.DEBUG
    )
+40
View File
@@ -0,0 +1,40 @@
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field
from ..models.document import EntityType, AccessLevel
class SearchRequest(BaseModel):
    """Search request schema."""
    # Request body of POST /api/v1/search; the GET variant builds the same
    # model from its query parameters.

    # Free-text query; an empty string is rejected (min_length=1).
    query: str = Field(..., description="Search query string", min_length=1)
    # Result cap, constrained to 1..100, default 10.
    limit: int = Field(10, ge=1, le=100, description="Maximum number of results")
    # Optional entity-type filter; None when the caller supplies no filter.
    entity_types: Optional[List[EntityType]] = Field(None, description="Filter by entity types")
    # Optional cutoff in [0.0, 1.0]; None presumably lets the search service
    # fall back to its own default -- TODO confirm.
    similarity_threshold: Optional[float] = Field(None, ge=0.0, le=1.0, description="Minimum similarity score")
class DocumentUploadResponse(BaseModel):
    """Document upload response schema."""
    # Returned by POST /api/v1/documents/upload after successful processing.
    document_id: str
    filename: str
    status: str  # the upload endpoint sets this to "processed"
    entities_found: int  # number of entities on the processed document
    access_level: str  # string value of the document's AccessLevel
    message: str
class StatsResponse(BaseModel):
    """System statistics response schema."""
    # Returned by GET /api/v1/stats. Everything except vector_store_stats and
    # system_status is read from the document processor's statistics.
    total_documents: int
    document_types: Dict[str, int]  # presumably a count per document type -- TODO confirm
    access_levels: Dict[str, int]  # presumably a count per access level -- TODO confirm
    languages: Dict[str, int]  # presumably a count per language -- TODO confirm
    total_entities: int
    average_entities_per_doc: float
    vector_store_stats: Dict[str, Any]  # passed through from the vector store
    system_status: str  # the stats endpoint sets this to "operational"
class ErrorResponse(BaseModel):
    """Error response schema."""
    # NOTE(review): the visible route handlers raise HTTPException and never
    # reference this model -- confirm whether it is used elsewhere.
    error: str
    detail: str
    timestamp: float  # presumably epoch seconds -- TODO confirm
+27
View File
@@ -0,0 +1,27 @@
# app.py
from fastapi import FastAPI, Request, Form
from fastapi.templating import Jinja2Templates
from fastapi.responses import HTMLResponse
from src.services.vector_store import VectorStore
import uvicorn
import asyncio
# Minimal form-based search UI.
app = FastAPI()
# Template directory is relative to the current working directory.
templates = Jinja2Templates(directory="templates")
# Single shared vector store; initialized in the startup hook.
vector_store = VectorStore()
@app.on_event("startup")
async def startup_event() -> None:
    """Initialize the shared vector store when the application starts."""
    await vector_store.initialize()
@app.get("/", response_class=HTMLResponse)
async def form_get(request: Request):
    """Render the search form before any query was submitted."""
    # ``results`` is None on the initial page load.
    page_context = {"request": request, "results": None}
    return templates.TemplateResponse("search.html", page_context)
@app.post("/", response_class=HTMLResponse)
async def form_post(request: Request, query: str = Form(...)):
    """Run the submitted query against the vector store and render the hits."""
    hits = await vector_store.search(query, threshold=0.2)
    page_context = {"request": request, "results": hits}
    return templates.TemplateResponse("search.html", page_context)
# Direct-run entry point with auto-reload enabled.
# NOTE(review): the "app:app" import string presumably requires starting
# from the directory that contains this file -- confirm.
if __name__ == "__main__":
    uvicorn.run("app:app", reload=True)
+172
View File
@@ -0,0 +1,172 @@
"""Main application entry point."""
import asyncio
import logging
from pathlib import Path
from .services.search_service import SearchService
from .services.document_processor import DocumentProcessor
from .models.document import EntityType
from config.settings import settings
# Configure logging
# Runs at import time; the level name is looked up on the logging module,
# so settings.LOG_LEVEL has to be an upper-case name such as "INFO".
logging.basicConfig(
    level=getattr(logging, settings.LOG_LEVEL),
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class SemanticSearchPOC:
"""Main POC application class."""
def __init__(self):
"""Initialize the POC application."""
self.search_service = SearchService()
self.document_processor = DocumentProcessor()
async def initialize(self):
"""Initialize the application."""
logger.info("Initializing Semantic Search POC...")
# Create necessary directories
self._create_directories()
# Initialize services
await self.search_service.vector_store.initialize()
logger.info("Initialization complete!")
def _create_directories(self):
"""Create necessary directories."""
directories = [
settings.DATA_DIR,
settings.RAW_DATA_DIR,
settings.PROCESSED_DATA_DIR,
settings.UPLOAD_DIR,
Path(settings.LOG_FILE).parent
]
for directory in directories:
Path(directory).mkdir(parents=True, exist_ok=True)
async def process_sample_documents(self):
"""Process sample documents for testing."""
logger.info("Processing sample documents...")
sample_docs_dir = Path(settings.RAW_DATA_DIR) / "sample_documents"
if not sample_docs_dir.exists():
logger.warning(f"Sample documents directory not found: {sample_docs_dir}")
return
# Process all documents in the sample directory
for file_path in sample_docs_dir.iterdir():
if file_path.is_file():
try:
await self.document_processor.process_document(str(file_path))
logger.info(f"Processed: {file_path.name}")
except Exception as e:
logger.error(f"Error processing {file_path.name}: {str(e)}")
async def demo_search(self):
"""Demonstrate search functionality."""
logger.info("Running search demo...")
# Example searches
search_queries = [
"Napoleon Bonaparte biography",
"French Revolution events",
"Paris buildings and architecture",
"Military campaigns in Europe"
]
for query in search_queries:
logger.info(f"\nSearching for: '{query}'")
try:
results = await self.search_service.semantic_search(
query=query,
limit=5
)
if results:
for i, result in enumerate(results, 1):
logger.info(f" {i}. {result.title} (Score: {result.similarity_score:.3f})")
logger.info(f" Preview: {result.content_preview[:100]}...")
else:
logger.info(" No results found.")
except Exception as e:
logger.error(f"Error searching for '{query}': {str(e)}")
async def demo_entity_search(self):
"""Demonstrate entity-based search."""
logger.info("Running entity search demo...")
# Example entity searches
entity_searches = [
("Napoleon", EntityType.PERSON),
("Paris", EntityType.PLACE),
("Revolution", EntityType.EVENT)
]
for entity_text, entity_type in entity_searches:
logger.info(f"\nSearching for {entity_type.value}: '{entity_text}'")
try:
results = await self.search_service.search_by_entity(
entity_text=entity_text,
entity_type=entity_type,
limit=3
)
if results:
for i, result in enumerate(results, 1):
logger.info(f" {i}. {result.title}")
logger.info(f" Entities: {len(result.entities)} found")
else:
logger.info(" No results found.")
except Exception as e:
logger.error(f"Error searching for entity '{entity_text}': {str(e)}")
async def demo_relationships(self):
    """Log the top related entities for the hard-coded entity "Napoleon".

    Calls ``self.search_service.get_entity_relationships`` and, for every
    non-empty relationship group, logs the first three entries together
    with their frequency. Any exception is logged, not raised.
    """
    logger.info("Running relationship demo...")

    entity = "Napoleon"
    logger.info(f"\nFinding relationships for: '{entity}'")

    try:
        groups = await self.search_service.get_entity_relationships(entity)
        for group_name, members in groups.items():
            if not members:
                continue
            logger.info(f" {group_name.upper()}:")
            for member in members[:3]:  # Show top 3
                logger.info(f" - {member['entity']} (freq: {member['frequency']})")
    except Exception as exc:
        logger.error(f"Error finding relationships for '{entity}': {str(exc)}")
async def main():
    """Run the whole POC: initialize, ingest sample documents, run the demos."""
    logger.info("Starting Semantic Search POC")

    # Initialize application
    app = SemanticSearchPOC()
    await app.initialize()

    # Process sample documents (if any)
    await app.process_sample_documents()

    # Run demonstrations one after another, in a fixed order
    for run_demo in (app.demo_search, app.demo_entity_search, app.demo_relationships):
        await run_demo()

    logger.info("POC demonstration complete!")


if __name__ == "__main__":
    asyncio.run(main())
View File
+102
View File
@@ -0,0 +1,102 @@
"""Document data models."""
from datetime import datetime
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field
from enum import Enum
class DocumentType(str, Enum):
    """Document type enumeration.

    Members are also ``str`` instances, so each member compares equal to
    its lowercase string value (e.g. ``DocumentType.PDF == "pdf"``).
    """
    PDF = "pdf"
    XML = "xml"
    TEXT = "text"
    DOCX = "docx"
    IMAGE = "image"
    AUDIO = "audio"
    VIDEO = "video"
class AccessLevel(str, Enum):
    """Document access level.

    Members are also ``str`` instances and compare equal to their string value.
    """
    PUBLIC = "public"
    RESTRICTED = "restricted"
    PRIVATE = "private"
class EntityType(str, Enum):
    """Entity type enumeration.

    Members are also ``str`` instances; the lowercase ``value`` is what
    gets written into serialized entity records.
    """
    PERSON = "person"
    PLACE = "place"
    EVENT = "event"
    ORGANIZATION = "organization"
    BUILDING = "building"
    DATE = "date"
class Entity(BaseModel):
    """Entity extracted from document."""
    # Surface text of the entity.
    text: str
    # Raw label from the extractor (the spaCy label when built by EntityExtractor).
    label: str
    # Project-level category the raw label was mapped to.
    entity_type: EntityType
    # Start/end positions; EntityExtractor fills these with character offsets.
    start_pos: int
    end_pos: int
    # Score for the extraction; defaults to 0.0 when not supplied.
    confidence: float = 0.0
    # Free-form extra data; a fresh dict per instance.
    metadata: Dict[str, Any] = Field(default_factory=dict)
class DocumentMetadata(BaseModel):
    """Document metadata.

    Every field is optional or has a default, so ``DocumentMetadata()``
    is a valid empty record.
    """
    title: Optional[str] = None
    author: Optional[str] = None
    creation_date: Optional[datetime] = None
    # Language code; defaults to "en".
    language: str = "en"
    subject: Optional[str] = None
    description: Optional[str] = None
    # A fresh list per instance.
    keywords: List[str] = Field(default_factory=list)
    file_size: Optional[int] = None
    page_count: Optional[int] = None
class Document(BaseModel):
    """Main document model: identity, extracted content, metadata, entities and embedding."""
    id: str
    filename: str
    file_path: str
    document_type: DocumentType
    access_level: AccessLevel
    # Extracted text of the document.
    content: str
    metadata: DocumentMetadata
    entities: List[Entity] = Field(default_factory=list)
    # Embedding vector; None until one has been computed.
    embedding: Optional[List[float]] = None
    # NOTE(review): datetime.utcnow returns a naive datetime and is deprecated
    # since Python 3.12 - consider a timezone-aware default.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)

    class Config:
        """Pydantic config."""
        # Serialize datetimes as ISO-8601 strings when encoding to JSON.
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }
class DocumentChunk(BaseModel):
    """Document chunk for processing."""
    id: str
    # Id of the document this chunk belongs to.
    document_id: str
    content: str
    # Index of the chunk within its document.
    chunk_index: int
    # Start/end positions of the chunk - presumably offsets into the parent
    # document's content; TODO confirm against the chunking code.
    start_pos: int
    end_pos: int
    embedding: Optional[List[float]] = None
    entities: List[Entity] = Field(default_factory=list)
class SearchQuery(BaseModel):
    """Search query model."""
    # Query text.
    query: str
    # Optional filters; None means no filter of that kind was given.
    entity_types: Optional[List[EntityType]] = None
    document_types: Optional[List[DocumentType]] = None
    access_levels: Optional[List[AccessLevel]] = None
    # Paging: at most `limit` results, starting at `offset`.
    limit: int = 10
    offset: int = 0
    # Minimum similarity score for a result; defaults to 0.2.
    similarity_threshold: float = 0.2
View File
+53
View File
@@ -0,0 +1,53 @@
"""Search result data models."""
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
class SearchResult(BaseModel):
    """Search result model."""
    document_id: str
    title: str
    # Short excerpt of the document content.
    content_preview: str
    similarity_score: float
    # Entities as plain dicts (not Entity models).
    entities: List[Dict[str, Any]] = Field(default_factory=list)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    # Entity text that produced the hit; None when not set by the caller.
    matched_entity: Optional[str] = None
    highlights: List[str] = Field(default_factory=list)

    class Config:
        """Pydantic config."""
        # Round floats to 4 decimal places when encoding to JSON.
        json_encoders = {
            float: lambda v: round(v, 4)
        }
class EntitySearchResult(BaseModel):
    """Entity-specific search result: one entity plus the documents it was found in."""
    entity_text: str
    # Entity type as a plain string.
    entity_type: str
    documents: List[SearchResult]
    total_occurrences: int
    # Related entities as plain dicts; a fresh list per instance.
    related_entities: List[Dict[str, Any]] = Field(default_factory=list)
class SearchResponse(BaseModel):
    """Complete search response: the results plus information about the search itself."""
    query: str
    results: List[SearchResult]
    total_results: int
    # Search duration in milliseconds.
    search_time_ms: float
    filters_applied: Dict[str, Any] = Field(default_factory=dict)
    suggestions: List[str] = Field(default_factory=list)
class EntityRelationship(BaseModel):
    """Entity relationship model: a typed link from a source entity to a target entity."""
    source_entity: str
    source_type: str
    target_entity: str
    target_type: str
    relationship_type: str
    confidence: float
    # Ids of the documents associated with this relationship.
    document_ids: List[str]
    # Occurrence count; defaults to 1.
    frequency: int = 1
View File
+545
View File
@@ -0,0 +1,545 @@
import logging
import os
from pathlib import Path
from typing import List, Dict, Any, Optional
import hashlib
from datetime import datetime
import json
# Document processing libraries
import PyPDF2
from docx import Document as DocxDocument
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
# ML libraries
from sentence_transformers import SentenceTransformer
from ..models.document import Document, DocumentType, AccessLevel, DocumentMetadata
from ..services.entity_extractor import EntityExtractor
from ..services.vector_store import VectorStore
from config.settings import settings
class DocumentProcessor:
"""Service for processing and indexing documents."""
def __init__(self):
    """Initialize document processor.

    Creates the collaborators used by the processing pipeline: a logger,
    the sentence-embedding model named by ``settings.EMBEDDING_MODEL``,
    an ``EntityExtractor`` and a ``VectorStore``.
    """
    self.logger = logging.getLogger(__name__)
    self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL)
    self.entity_extractor = EntityExtractor()
    self.vector_store = VectorStore()
async def process_document(self, file_path: str, access_level: AccessLevel = AccessLevel.PUBLIC) -> Optional[Document]:
    """
    Run one file through the full pipeline and index it.

    Steps, in order: detect the type from the extension, extract text,
    derive an id, collect metadata, extract entities, embed the text,
    push the document to the vector store and write its metadata file.

    Args:
        file_path: Path to the document file
        access_level: Access level to record on the document

    Returns:
        The processed Document, or None when the file is missing, has an
        unsupported extension, yields no text, or any step raises
    """
    try:
        # Rebinding the parameter keeps the path visible to the error log below.
        file_path = Path(file_path)
        if not file_path.exists():
            self.logger.error(f"File not found: {file_path}")
            return None

        kind = self._get_document_type(file_path)
        if not kind:
            self.logger.error(f"Unsupported file type: {file_path.suffix}")
            return None

        text = self._extract_text(file_path, kind)
        if not text:
            self.logger.error(f"Failed to extract text from: {file_path}")
            return None

        identifier = self._generate_document_id(file_path, text)
        doc_metadata = self._extract_metadata(file_path, kind, text)
        found_entities = self.entity_extractor.extract_entities(text)
        vector = self.embedding_model.encode(text).tolist()

        document = Document(
            id=identifier,
            filename=file_path.name,
            file_path=str(file_path),
            document_type=kind,
            access_level=access_level,
            content=text,
            metadata=doc_metadata,
            entities=found_entities,
            embedding=vector,
        )

        # Index first, then persist the metadata sidecar file.
        await self._add_to_vector_store(document)
        await self._save_document_metadata(document)

        self.logger.info(f"Successfully processed document: {file_path.name}")
        return document
    except Exception as exc:
        self.logger.error(f"Error processing document {file_path}: {str(exc)}")
        return None
def _get_document_type(self, file_path: Path) -> Optional[DocumentType]:
    """Return the DocumentType for the file's extension (case-insensitive), or None if unknown."""
    suffix = file_path.suffix.lower()
    # Extensions grouped by the type they map to.
    suffixes_by_type = (
        (DocumentType.PDF, ('.pdf',)),
        (DocumentType.TEXT, ('.txt',)),
        (DocumentType.DOCX, ('.docx',)),
        (DocumentType.XML, ('.xml',)),
        (DocumentType.IMAGE, ('.jpg', '.jpeg', '.png', '.gif')),
        (DocumentType.AUDIO, ('.mp3', '.wav')),
        (DocumentType.VIDEO, ('.mp4', '.avi')),
    )
    for kind, suffixes in suffixes_by_type:
        if suffix in suffixes:
            return kind
    return None
def _extract_text(self, file_path: Path, doc_type: DocumentType) -> str:
    """Dispatch to the extractor for ``doc_type`` and return the extracted text.

    Returns an empty string (after logging) when no extractor exists for
    the type or when the extractor raises.
    """
    try:
        extractors = (
            (DocumentType.PDF, self._extract_text_from_pdf),
            (DocumentType.TEXT, self._extract_text_from_txt),
            (DocumentType.DOCX, self._extract_text_from_docx),
            (DocumentType.XML, self._extract_text_from_xml),
        )
        # Compared with == (not a dict lookup) so the match uses the same
        # equality semantics as an if/elif chain.
        for kind, extract in extractors:
            if doc_type == kind:
                return extract(file_path)
        self.logger.warning(f"Text extraction not implemented for type: {doc_type}")
        return ""
    except Exception as exc:
        self.logger.error(f"Error extracting text from {file_path}: {str(exc)}")
        return ""
def _extract_text_from_pdf(self, file_path: Path) -> str:
    """Return the text of every PDF page, one page per line block, stripped.

    If reading fails part-way, the error is logged and the text of the
    pages read so far is returned.
    """
    pages = []
    try:
        with open(file_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            for page in reader.pages:
                pages.append(page.extract_text() + "\n")
    except Exception as exc:
        self.logger.error(f"Error reading PDF {file_path}: {str(exc)}")
    return "".join(pages).strip()
def _extract_text_from_txt(self, file_path: Path) -> str:
    """Read a plain text file, trying several encodings in order.

    Order: UTF-8 (a leading byte-order mark is dropped), then Windows-1252,
    then Latin-1. Latin-1 maps every byte to a character and therefore
    never fails, so it must come last; placed earlier it would make every
    later fallback unreachable.

    Args:
        file_path: Path to the text file

    Returns:
        The decoded file content

    Raises:
        OSError: If the file cannot be opened or read
    """
    for encoding in ('utf-8-sig', 'cp1252', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    # Defensive only: the latin-1 attempt above accepts any byte sequence.
    self.logger.error(f"Could not decode text file: {file_path}")
    return ""
def _extract_text_from_docx(self, file_path: Path) -> str:
    """Return the paragraphs of a DOCX file joined by newlines ("" on any error)."""
    try:
        docx = DocxDocument(str(file_path))
        return '\n'.join(paragraph.text for paragraph in docx.paragraphs)
    except Exception as exc:
        self.logger.error(f"Error reading DOCX {file_path}: {str(exc)}")
        return ""
def _extract_text_from_xml(self, file_path: Path) -> str:
    """Extract all character data from an XML file.

    Uses ``itertext()`` so that text following a child element (the
    child's *tail*) is included as well; reading only ``elem.text`` loses
    it in mixed content such as ``<p>Hello <b>world</b> again</p>``.

    Args:
        file_path: Path to the XML file

    Returns:
        The non-blank text fragments in document order, each stripped and
        joined by newlines; an empty string if the file cannot be parsed
    """
    try:
        root = ET.parse(file_path).getroot()
        fragments = (fragment.strip() for fragment in root.itertext())
        return '\n'.join(fragment for fragment in fragments if fragment)
    except Exception as e:
        self.logger.error(f"Error reading XML {file_path}: {str(e)}")
        return ""
def _extract_metadata(self, file_path: Path, doc_type: DocumentType, content: str) -> DocumentMetadata:
    """Build a DocumentMetadata from the file's stat data and its content.

    The title is the filename without extension, the creation date comes
    from ``st_ctime``, and keywords are derived from ``content``. PDFs also
    get a page count. On any error an empty DocumentMetadata is returned.
    """
    try:
        info = file_path.stat()
        keywords = self._extract_keywords(content)
        result = DocumentMetadata(
            title=file_path.stem,
            creation_date=datetime.fromtimestamp(info.st_ctime),
            language="en",  # Default to English for POC
            file_size=info.st_size,
            keywords=keywords,
        )
        # Only PDFs carry a page count.
        if doc_type == DocumentType.PDF:
            result.page_count = self._get_pdf_page_count(file_path)
        return result
    except Exception as exc:
        self.logger.error(f"Error extracting metadata from {file_path}: {str(exc)}")
        return DocumentMetadata()
def _get_pdf_page_count(self, file_path: Path) -> Optional[int]:
    """Return the number of pages in the PDF, or None (after logging) if it cannot be read."""
    try:
        with open(file_path, 'rb') as handle:
            page_total = len(PyPDF2.PdfReader(handle).pages)
        return page_total
    except Exception as exc:
        self.logger.error(f"Error getting PDF page count: {str(exc)}")
        return None
def _extract_keywords(self, content: str, max_keywords: int = 10) -> List[str]:
    """Return the most frequent significant words of ``content``.

    The text is lowercased and split on whitespace; surrounding
    punctuation is trimmed from each token. Tokens of three characters or
    fewer and common stop words are ignored.

    Args:
        content: Text to analyse
        max_keywords: Maximum number of keywords to return

    Returns:
        Keywords ordered by descending frequency; ties keep the order of
        first appearance
    """
    ignored = frozenset((
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during',
        'before', 'after', 'above', 'below', 'between', 'among', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does',
        'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can',
    ))

    counts = {}
    for raw in content.lower().split():
        token = raw.strip('.,!?;:"()[]{}')
        if len(token) <= 3 or token in ignored:
            continue
        counts[token] = counts.get(token, 0) + 1

    # sorted() is stable, so equally frequent words stay in insertion order.
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:max_keywords]
def _generate_document_id(self, file_path: Path, content: str) -> str:
    """Return an MD5 hex digest over the file path and the first 1000 characters of the content."""
    path_bytes = str(file_path).encode('utf-8')
    head_bytes = content[:1000].encode('utf-8')  # First 1000 chars
    return hashlib.md5(path_bytes + head_bytes).hexdigest()
async def _add_to_vector_store(self, document: Document):
    """Add a document, its embedding and a flat metadata record to the vector store.

    Keywords and entities are JSON-encoded because the record is stored
    as flat key/value pairs.

    Args:
        document: Fully populated document, including its embedding

    Raises:
        RuntimeError: If the vector store reports that the document was not added
        Exception: Any other error is logged and re-raised
    """
    try:
        # Prepare metadata for vector store
        metadata = {
            'title': document.metadata.title or document.filename,
            'filename': document.filename,
            'document_type': document.document_type.value,
            'access_level': document.access_level.value,
            'creation_date': document.metadata.creation_date.isoformat() if document.metadata.creation_date else None,
            'language': document.metadata.language,
            'file_size': document.metadata.file_size,
            'page_count': document.metadata.page_count,
            'keywords': json.dumps(document.metadata.keywords),
            'entities': json.dumps([{
                'text': e.text,
                'entity_type': e.entity_type.value,
                'confidence': e.confidence
            } for e in document.entities])
        }

        stored = await self.vector_store.add_document(
            document_id=document.id,
            embedding=document.embedding,
            content=document.content,
            metadata=metadata
        )
        # VectorStore.add_document swallows its own errors and returns False;
        # without this check an unindexed document would count as processed.
        if stored is False:
            raise RuntimeError(f"Vector store did not add document {document.id}")
    except Exception as e:
        self.logger.error(f"Error adding document to vector store: {str(e)}")
        raise
async def _save_document_metadata(self, document: Document):
    """Write the document's metadata and entities to ``<PROCESSED_DATA_DIR>/metadata/<id>.json``.

    The content itself is not stored, only its length. Failures are
    logged and not raised.
    """
    try:
        target_dir = Path(settings.PROCESSED_DATA_DIR) / "metadata"
        target_dir.mkdir(parents=True, exist_ok=True)
        target = target_dir / f"{document.id}.json"

        meta = document.metadata
        entity_rows = [
            {
                'text': item.text,
                'label': item.label,
                'entity_type': item.entity_type.value,
                'start_pos': item.start_pos,
                'end_pos': item.end_pos,
                'confidence': item.confidence,
                'metadata': item.metadata,
            }
            for item in document.entities
        ]

        # Plain dict so the record can be serialized with json.dump.
        payload = {
            'id': document.id,
            'filename': document.filename,
            'file_path': document.file_path,
            'document_type': document.document_type.value,
            'access_level': document.access_level.value,
            'content_length': len(document.content),
            'metadata': {
                'title': meta.title,
                'author': meta.author,
                'creation_date': meta.creation_date.isoformat() if meta.creation_date else None,
                'language': meta.language,
                'subject': meta.subject,
                'description': meta.description,
                'keywords': meta.keywords,
                'file_size': meta.file_size,
                'page_count': meta.page_count,
            },
            'entities': entity_rows,
            'created_at': document.created_at.isoformat(),
            'updated_at': document.updated_at.isoformat(),
        }

        with open(target, 'w', encoding='utf-8') as out:
            json.dump(payload, out, indent=2, ensure_ascii=False)

        self.logger.debug(f"Saved metadata for document: {document.id}")
    except Exception as exc:
        self.logger.error(f"Error saving document metadata: {str(exc)}")
async def process_batch(self, directory_path: str, file_pattern: str = "*") -> List[Document]:
    """
    Process every matching file in a directory, one after another.

    Args:
        directory_path: Directory that holds the documents
        file_pattern: Glob pattern for the files to process (e.g., "*.pdf")

    Returns:
        The documents that were processed successfully; on an unexpected
        error, the documents processed up to that point
    """
    done = []
    try:
        directory = Path(directory_path)
        if not directory.exists():
            self.logger.error(f"Directory not found: {directory}")
            return done

        candidates = list(directory.glob(file_pattern))
        self.logger.info(f"Found {len(candidates)} files to process in {directory}")

        for candidate in candidates:
            # Sub-directories matched by the pattern are ignored.
            if not candidate.is_file():
                continue
            document = await self.process_document(str(candidate))
            if document:
                done.append(document)
            else:
                self.logger.warning(f"Failed to process: {candidate}")

        self.logger.info(f"Successfully processed {len(done)} documents")
        return done
    except Exception as exc:
        self.logger.error(f"Error processing batch: {str(exc)}")
        return done
async def get_document_by_id(self, document_id: str) -> Optional[Document]:
    """
    Rebuild a Document from its saved metadata file.

    The text content is not part of the metadata file; it is re-extracted
    from the original file when that file still exists, otherwise the
    content is an empty string. The embedding is not restored.

    Args:
        document_id: Document ID

    Returns:
        Document object, or None if no metadata file exists or loading fails
    """
    try:
        # Entity and EntityType are not among this module's top-level imports;
        # without this import every document with entities failed with NameError.
        from ..models.document import Entity, EntityType

        metadata_file = Path(settings.PROCESSED_DATA_DIR) / "metadata" / f"{document_id}.json"
        if not metadata_file.exists():
            self.logger.warning(f"Document metadata not found: {document_id}")
            return None

        with open(metadata_file, 'r', encoding='utf-8') as f:
            doc_dict = json.load(f)

        # Reconstruct document metadata
        stored = doc_dict['metadata']
        created = stored.get('creation_date')
        metadata = DocumentMetadata(
            title=stored.get('title'),
            author=stored.get('author'),
            creation_date=datetime.fromisoformat(created) if created else None,
            language=stored.get('language', 'en'),
            subject=stored.get('subject'),
            description=stored.get('description'),
            keywords=stored.get('keywords', []),
            file_size=stored.get('file_size'),
            page_count=stored.get('page_count')
        )

        # Reconstruct entities
        entities = [
            Entity(
                text=entity_dict['text'],
                label=entity_dict['label'],
                entity_type=EntityType(entity_dict['entity_type']),
                start_pos=entity_dict['start_pos'],
                end_pos=entity_dict['end_pos'],
                confidence=entity_dict['confidence'],
                metadata=entity_dict.get('metadata', {})
            )
            for entity_dict in doc_dict.get('entities', [])
        ]

        # Re-read the original content when the source file is still there
        doc_type = DocumentType(doc_dict['document_type'])
        source_path = Path(doc_dict['file_path'])
        content = self._extract_text(source_path, doc_type) if source_path.exists() else ""

        return Document(
            id=doc_dict['id'],
            filename=doc_dict['filename'],
            file_path=doc_dict['file_path'],
            document_type=doc_type,
            access_level=AccessLevel(doc_dict['access_level']),
            content=content,
            metadata=metadata,
            entities=entities,
            created_at=datetime.fromisoformat(doc_dict['created_at']),
            updated_at=datetime.fromisoformat(doc_dict['updated_at'])
        )
    except Exception as e:
        self.logger.error(f"Error retrieving document {document_id}: {str(e)}")
        return None
async def delete_document(self, document_id: str) -> bool:
    """
    Remove a document from the vector store and delete its metadata file.

    Args:
        document_id: ID of the document to remove

    Returns:
        True when both steps completed, False when any step raised
    """
    try:
        # Vector store first, metadata file second.
        await self.vector_store.delete_document(document_id)

        record = Path(settings.PROCESSED_DATA_DIR) / "metadata" / f"{document_id}.json"
        if record.exists():
            record.unlink()

        self.logger.info(f"Deleted document: {document_id}")
    except Exception as exc:
        self.logger.error(f"Error deleting document {document_id}: {str(exc)}")
        return False
    return True
async def get_processing_stats(self) -> Dict[str, Any]:
    """
    Summarize the saved metadata files and the vector store.

    Returns:
        Dictionary with the total document count, per-type, per-access-level
        and per-language counts, entity totals and vector store statistics;
        ``{"total_documents": 0}`` when no metadata directory exists, or
        ``{"error": ...}`` when gathering the statistics fails
    """
    try:
        metadata_dir = Path(settings.PROCESSED_DATA_DIR) / "metadata"
        if not metadata_dir.exists():
            return {"total_documents": 0}

        records = list(metadata_dir.glob("*.json"))
        total = len(records)

        by_type, by_access, by_language = {}, {}, {}
        entity_total = 0

        def bump(counter, key):
            counter[key] = counter.get(key, 0) + 1

        for record_path in records:
            # An unreadable file is skipped but still counted in the total.
            try:
                with open(record_path, 'r', encoding='utf-8') as handle:
                    record = json.load(handle)
                bump(by_type, record.get('document_type', 'unknown'))
                bump(by_access, record.get('access_level', 'unknown'))
                bump(by_language, record.get('metadata', {}).get('language', 'unknown'))
                entity_total += len(record.get('entities', []))
            except Exception as exc:
                self.logger.warning(f"Error reading metadata file {record_path}: {str(exc)}")
                continue

        vector_stats = await self.vector_store.get_collection_stats()

        return {
            "total_documents": total,
            "document_types": by_type,
            "access_levels": by_access,
            "languages": by_language,
            "total_entities": entity_total,
            "average_entities_per_doc": entity_total / total if total > 0 else 0,
            "vector_store": vector_stats,
        }
    except Exception as exc:
        self.logger.error(f"Error getting processing stats: {str(exc)}")
        return {"error": str(exc)}
View File
+239
View File
@@ -0,0 +1,239 @@
import logging
from typing import List, Dict, Any, Optional
import spacy
from spacy import displacy
from ..models.document import Entity, EntityType
from config.settings import settings
class EntityExtractor:
"""Entity extraction service for named entity recognition."""
def __init__(self):
    """Initialize entity extractor.

    Sets up the logger and loads the spaCy model right away via
    ``_load_model``, which re-raises when the model is not installed.
    """
    self.logger = logging.getLogger(__name__)
    # Populated by _load_model().
    self.nlp = None
    self._load_model()
def _load_model(self):
    """Load the spaCy model named by ``settings.SPACY_MODEL`` into ``self.nlp``.

    Raises:
        OSError: Re-raised after logging an install hint when the model is missing
    """
    model_name = settings.SPACY_MODEL
    try:
        self.nlp = spacy.load(model_name)
    except OSError:
        self.logger.error(f"spaCy model {model_name} not found. Please install it with: python -m spacy download {model_name}")
        raise
    self.logger.info(f"Loaded spaCy model: {model_name}")
def extract_entities(self, text: str) -> List[Entity]:
    """
    Run spaCy over the text and return the recognized entities.

    Entities whose spaCy label has no counterpart in EntityType are dropped.

    Args:
        text: Text to analyse

    Returns:
        Extracted entities in spaCy's order; an empty list when processing fails
    """
    if not self.nlp:
        self._load_model()

    try:
        found = []
        for span in self.nlp(text).ents:
            mapped_type = self._map_spacy_label(span.label_)
            if not mapped_type:
                # Label outside the project's entity types.
                continue
            found.append(Entity(
                text=span.text,
                label=span.label_,
                entity_type=mapped_type,
                start_pos=span.start_char,
                end_pos=span.end_char,
                confidence=self._calculate_confidence(span),
                metadata={
                    "spacy_label": span.label_,
                    "spacy_explanation": spacy.explain(span.label_),
                },
            ))

        self.logger.debug(f"Extracted {len(found)} entities from text")
        return found
    except Exception as exc:
        self.logger.error(f"Error extracting entities: {str(exc)}")
        return []
def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
    """
    Find pairs of entities linked through the dependency parse.

    A relationship is recorded when a token with dependency ``nsubj``,
    ``dobj`` or ``pobj`` and its head token both lie inside named
    entities, and those entities differ.

    Args:
        text: Text to analyse

    Returns:
        One dict per relationship; an empty list when processing fails
    """
    if not self.nlp:
        self._load_model()

    try:
        doc = self.nlp(text)
        linked = []

        for token in doc:
            # Subject, direct object, prepositional object
            if token.dep_ not in ['nsubj', 'dobj', 'pobj']:
                continue

            head = token.head
            token_ent = self._get_entity_for_token(token, doc.ents)
            head_ent = self._get_entity_for_token(head, doc.ents)
            if not (token_ent and head_ent and token_ent != head_ent):
                continue

            linked.append({
                "subject": token_ent.text,
                "subject_type": self._map_spacy_label(token_ent.label_),
                "predicate": head.text,
                "object": head_ent.text,
                "object_type": self._map_spacy_label(head_ent.label_),
                "relation_type": token.dep_,
                "confidence": 0.7,  # Basic confidence score
            })

        self.logger.debug(f"Extracted {len(linked)} relationships from text")
        return linked
    except Exception as exc:
        self.logger.error(f"Error extracting relationships: {str(exc)}")
        return []
def _map_spacy_label(self, spacy_label: str) -> Optional[EntityType]:
    """
    Translate a spaCy entity label into the project's EntityType.

    Args:
        spacy_label: Label as reported by spaCy (e.g. "GPE")

    Returns:
        The matching EntityType, or None for labels that are not mapped
    """
    labels_by_type = (
        (EntityType.PERSON, ('PERSON',)),
        (EntityType.PLACE, ('GPE', 'LOC')),  # Geopolitical entity, location
        (EntityType.BUILDING, ('FAC',)),  # Facility/Building
        (EntityType.ORGANIZATION, ('ORG',)),
        (EntityType.EVENT, ('EVENT',)),
        (EntityType.DATE, ('DATE', 'TIME')),
    )
    for entity_type, labels in labels_by_type:
        if spacy_label in labels:
            return entity_type
    return None
def _calculate_confidence(self, entity) -> float:
    """
    Heuristic confidence score for a spaCy entity.

    Starts at 0.5 and adds 0.1 for text longer than three characters,
    0.1 for title-cased text and 0.2 for PERSON, GPE or ORG labels.

    Args:
        entity: spaCy entity object (its ``text`` and ``label_`` are read)

    Returns:
        Score capped at 1.0
    """
    bonuses = (
        (len(entity.text) > 3, 0.1),
        (entity.text.istitle(), 0.1),
        (entity.label_ in ['PERSON', 'GPE', 'ORG'], 0.2),
    )
    score = 0.5  # Base confidence
    # Added in a fixed order so the floating-point result is reproducible.
    for applies, bonus in bonuses:
        if applies:
            score += bonus
    return min(score, 1.0)
def _get_entity_for_token(self, token, entities):
    """
    Find the entity whose token range covers the given token.

    Args:
        token: spaCy token (its index ``i`` is used)
        entities: Iterable of spaCy entities with ``start``/``end`` token indices

    Returns:
        The first entity with ``start <= token.i < end``, or None
    """
    return next(
        (candidate for candidate in entities if candidate.start <= token.i < candidate.end),
        None,
    )
def get_entity_summary(self, entities: List[Entity]) -> Dict[str, int]:
    """
    Count entities per entity type.

    Args:
        entities: Entities to summarize

    Returns:
        Mapping from entity type value (e.g. "person") to its count
    """
    counts = {}
    for item in entities:
        label = item.entity_type.value
        if label in counts:
            counts[label] += 1
        else:
            counts[label] = 1
    return counts
def visualize_entities(self, text: str, output_path: Optional[str] = None) -> str:
    """
    Render the entities found in the text as displaCy HTML.

    Args:
        text: Text to analyse
        output_path: When given, the HTML is also written to this file (UTF-8)

    Returns:
        The HTML markup, or an empty string when rendering or writing fails
    """
    if not self.nlp:
        self._load_model()

    try:
        markup = displacy.render(self.nlp(text), style="ent", jupyter=False)
        if output_path:
            with open(output_path, 'w', encoding='utf-8') as out:
                out.write(markup)
            self.logger.info(f"Entity visualization saved to: {output_path}")
        return markup
    except Exception as exc:
        self.logger.error(f"Error creating entity visualization: {str(exc)}")
        return ""
+216
View File
@@ -0,0 +1,216 @@
"""Main search service implementation."""
from typing import List, Dict, Any, Optional
import logging
from sentence_transformers import SentenceTransformer
from ..models.document import Document, SearchQuery, EntityType
from ..models.search_result import SearchResult
from .vector_store import VectorStore
from .entity_extractor import EntityExtractor
from config.settings import settings
class SearchService:
"""Main search service for semantic search functionality."""
def __init__(self):
    """Initialize search service.

    Creates a logger, the sentence-embedding model named by
    ``settings.EMBEDDING_MODEL``, a ``VectorStore`` and an ``EntityExtractor``.
    """
    self.logger = logging.getLogger(__name__)
    self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL)
    self.vector_store = VectorStore()
    self.entity_extractor = EntityExtractor()
async def semantic_search(
    self,
    query: str,
    limit: int = 10,
    entity_types: Optional[List[EntityType]] = None,
    similarity_threshold: Optional[float] = None
) -> List[SearchResult]:
    """
    Perform semantic search across documents.

    The query is embedded and matched against the vector store. The
    entity-type filter is applied after the vector search, so fewer than
    ``limit`` results may be returned when it is used.

    Args:
        query: Search query string
        limit: Maximum number of results requested from the vector store
        entity_types: Keep only documents containing one of these entity types
        similarity_threshold: Minimum similarity score; defaults to
            ``settings.SIMILARITY_THRESHOLD`` when None

    Returns:
        List of search results

    Raises:
        Exception: Any error is logged and re-raised
    """
    if similarity_threshold is None:
        similarity_threshold = settings.SIMILARITY_THRESHOLD

    try:
        # Generate query embedding
        query_embedding = self.embedding_model.encode(query).tolist()

        # Search vector store
        results = await self.vector_store.similarity_search(
            query_embedding=query_embedding,
            limit=limit,
            threshold=similarity_threshold
        )

        # Filter by entity types if specified
        if entity_types:
            results = self._filter_by_entities(results, entity_types)

        # Convert to SearchResult objects
        search_results = []
        for result in results:
            metadata = result.get("metadata") or {}
            # The title is stored inside the metadata record, not at the top
            # level of a hit; reading only the top level always gave "Untitled".
            title = result.get("title") or metadata.get("title") or "Untitled"
            search_results.append(SearchResult(
                document_id=result["document_id"],
                title=title,
                content_preview=result.get("content", "")[:200] + "...",
                similarity_score=result["similarity_score"],
                entities=result.get("entities", []),
                metadata=metadata
            ))
        return search_results
    except Exception as e:
        self.logger.error(f"Error in semantic search: {str(e)}")
        raise
async def search_by_entity(
    self,
    entity_text: str,
    entity_type: EntityType,
    limit: int = 10
) -> List[SearchResult]:
    """
    Search for documents containing a specific entity.

    Args:
        entity_text: Entity text to search for
        entity_type: Type of entity
        limit: Maximum number of results

    Returns:
        List of search results, each with ``similarity_score`` 1.0 and
        ``matched_entity`` set to ``entity_text``

    Raises:
        Exception: Any error is logged and re-raised
    """
    try:
        # Search for documents containing the entity
        results = await self.vector_store.search_by_entity(
            entity_text=entity_text,
            entity_type=entity_type.value,
            limit=limit
        )

        # Convert to SearchResult objects
        search_results = []
        for result in results:
            metadata = result.get("metadata") or {}
            # Documents are indexed with their title inside the metadata
            # record, so fall back to it when the hit has no top-level title.
            title = result.get("title") or metadata.get("title") or "Untitled"
            search_results.append(SearchResult(
                document_id=result["document_id"],
                title=title,
                content_preview=result.get("content", "")[:200] + "...",
                similarity_score=1.0,  # Exact entity match
                entities=result.get("entities", []),
                metadata=metadata,
                matched_entity=entity_text
            ))
        return search_results
    except Exception as e:
        self.logger.error(f"Error in entity search: {str(e)}")
        raise
async def get_entity_relationships(
    self,
    entity_text: str
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Collect the entities that co-occur with the given entity.

    Looks up to 100 documents containing the entity (searched as a
    PERSON), groups every other entity found in them by relationship
    category, and counts how often each one appears.

    Args:
        entity_text: Entity to find relationships for

    Returns:
        Dictionary with the keys "persons", "places", "events" and
        "organizations"; each value holds at most ten entries sorted by
        descending frequency

    Raises:
        Exception: Any error is logged and re-raised
    """
    try:
        matching_docs = await self.search_by_entity(
            entity_text=entity_text,
            entity_type=EntityType.PERSON,  # Default to person
            limit=100
        )

        relationships = {
            "persons": [],
            "places": [],
            "events": [],
            "organizations": []
        }

        # Pass 1: gather every co-occurring entity into its category.
        for doc in matching_docs:
            for found in doc.entities:
                if found["text"].lower() == entity_text.lower():
                    continue
                category = self._map_entity_to_relationship(found["entity_type"])
                if category not in relationships:
                    continue
                relationships[category].append({
                    "entity": found["text"],
                    "type": found["entity_type"],
                    "document_id": doc.document_id,
                    "confidence": found.get("confidence", 0.0)
                })

        # Pass 2: collapse duplicates, keeping the first entry and counting the rest.
        for category in relationships:
            merged = {}
            for entry in relationships[category]:
                name = entry["entity"]
                if name in merged:
                    merged[name]["frequency"] += 1
                else:
                    entry["frequency"] = 1
                    merged[name] = entry
            ranked = sorted(merged.values(), key=lambda item: item["frequency"], reverse=True)
            relationships[category] = ranked[:10]  # Top 10 relationships

        return relationships
    except Exception as exc:
        self.logger.error(f"Error getting entity relationships: {str(exc)}")
        raise
def _filter_by_entities(
    self,
    results: List[Dict[str, Any]],
    entity_types: List[EntityType]
) -> List[Dict[str, Any]]:
    """Keep only the results that contain at least one entity of the requested types."""
    wanted = [kind.value for kind in entity_types]
    return [
        hit
        for hit in results
        if any(item.get("entity_type") in wanted for item in hit.get("entities", []))
    ]
def _map_entity_to_relationship(self, entity_type: str) -> str:
    """Map an entity type to its relationship category.

    Indexed entities carry the lowercase ``EntityType`` values ("person",
    "place", ...), so those are the primary keys. The raw spaCy labels
    are still accepted for callers that pass them.

    Args:
        entity_type: EntityType value or spaCy label

    Returns:
        "persons", "places", "events" or "organizations"; "other" for
        anything else (including dates)
    """
    mapping = {
        # EntityType values, as stored with indexed documents
        "person": "persons",
        "place": "places",
        "building": "places",
        "event": "events",
        "organization": "organizations",
        # Raw spaCy labels, kept for backward compatibility
        "PERSON": "persons",
        "GPE": "places",  # Geopolitical entity
        "LOC": "places",  # Location
        "EVENT": "events",
        "ORG": "organizations",
        "BUILDING": "places"
    }
    return mapping.get(entity_type, "other")
+227
View File
@@ -0,0 +1,227 @@
import logging
from typing import List, Dict, Any, Optional
from pathlib import Path
import chromadb
from chromadb.config import Settings
from chromadb.errors import NotFoundError
import numpy as np
from config.settings import settings
class VectorStore:
"""Vector store for document embeddings using ChromaDB."""
def __init__(self):
    """Initialize vector store.

    No connection is opened here: ``client`` and ``collection`` stay None
    until ``initialize()`` runs.
    """
    self.logger = logging.getLogger(__name__)
    self.client = None
    self.collection = None
    # Name of the ChromaDB collection used by this store.
    self.collection_name = "documents"
async def initialize(self):
    """Open the persistent ChromaDB client and load or create the collection.

    The storage directory ``settings.CHROMA_PERSIST_DIR`` is created when
    missing. Any error is logged and re-raised.
    """
    try:
        storage = Path(settings.CHROMA_PERSIST_DIR)
        storage.mkdir(parents=True, exist_ok=True)

        client_settings = Settings(
            anonymized_telemetry=False,
            allow_reset=True,
        )
        self.client = chromadb.PersistentClient(path=str(storage), settings=client_settings)

        try:
            self.collection = self.client.get_collection(name=self.collection_name)
        except NotFoundError:
            # First run: the collection does not exist yet.
            self.collection = self.client.create_collection(
                name=self.collection_name,
                metadata={"description": "Document embeddings for semantic search"},
            )
            self.logger.info(f"Created new collection: {self.collection_name}")
        else:
            self.logger.info(f"Loaded existing collection: {self.collection_name}")
    except Exception as exc:
        self.logger.error(f"Error initializing vector store: {str(exc)}")
        raise
async def add_document(
self,
document_id: str,
embedding: List[float],
content: str,
metadata: Dict[str, Any]
) -> bool:
try:
if not self.collection:
await self.initialize()
chroma_metadata = {k: str(v) for k, v in metadata.items()}
self.collection.add(
embeddings=[embedding],
documents=[content],
metadatas=[chroma_metadata],
ids=[document_id]
)
self.logger.info(f"Added document {document_id} to vector store")
return True
except Exception as e:
self.logger.error(f"Error adding document {document_id}: {str(e)}")
return False
async def similarity_search(
self,
query_embedding: List[float],
limit: int = 10,
threshold: float = 0.3,
where: Optional[Dict[str, Any]] = None
) -> List[Dict[str, Any]]:
try:
if not self.collection:
await self.initialize()
results = self.collection.query(
query_embeddings=[query_embedding],
n_results=limit,
where=where
)
formatted_results = []
if results['ids'] and len(results['ids'][0]) > 0:
for i in range(len(results['ids'][0])):
distance = results['distances'][0][i]
similarity_score = 1 - distance
if similarity_score >= threshold:
result = {
'document_id': results['ids'][0][i],
'content': results['documents'][0][i] if results['documents'] else '',
'metadata': results['metadatas'][0][i] if results['metadatas'] else {},
'similarity_score': similarity_score
}
if 'entities' in result['metadata']:
try:
import json
result['entities'] = json.loads(result['metadata']['entities'])
except:
result['entities'] = []
else:
result['entities'] = []
formatted_results.append(result)
formatted_results.sort(key=lambda x: x['similarity_score'], reverse=True)
self.logger.info(f"Found {len(formatted_results)} results above threshold {threshold}")
return formatted_results
except Exception as e:
self.logger.error(f"Error in similarity search: {str(e)}")
return []
async def search_by_entity(
self,
entity_text: str,
entity_type: str,
limit: int = 10
) -> List[Dict[str, Any]]:
try:
if not self.collection:
await self.initialize()
results = self.collection.get()
filtered_results = []
entity_lower = entity_text.lower()
if results['ids']:
for i in range(len(results['ids'])):
document_content = results['documents'][i] if results['documents'] else ''
metadata = results['metadatas'][i] if results['metadatas'] else {}
if (entity_lower in document_content.lower() or
entity_lower in str(metadata).lower()):
result = {
'document_id': results['ids'][i],
'content': document_content,
'metadata': metadata,
'similarity_score': 1.0
}
if 'entities' in metadata:
try:
import json
result['entities'] = json.loads(metadata['entities'])
except:
result['entities'] = []
else:
result['entities'] = []
filtered_results.append(result)
if len(filtered_results) >= limit:
break
self.logger.info(f"Found {len(filtered_results)} documents containing entity '{entity_text}'")
return filtered_results
except Exception as e:
self.logger.error(f"Error searching for entity '{entity_text}': {str(e)}")
return []
async def delete_document(self, document_id: str) -> bool:
try:
if not self.collection:
await self.initialize()
self.collection.delete(ids=[document_id])
self.logger.info(f"Deleted document {document_id} from vector store")
return True
except Exception as e:
self.logger.error(f"Error deleting document {document_id}: {str(e)}")
return False
async def get_collection_stats(self) -> Dict[str, Any]:
try:
if not self.collection:
await self.initialize()
count = self.collection.count()
return {
'total_documents': count,
'collection_name': self.collection_name,
'embedding_dimension': settings.EMBEDDING_DIMENSION
}
except Exception as e:
self.logger.error(f"Error getting collection stats: {str(e)}")
return {}
async def reset_collection(self) -> bool:
try:
if not self.client:
await self.initialize()
try:
self.client.delete_collection(self.collection_name)
except NotFoundError:
pass
self.collection = self.client.create_collection(
name=self.collection_name,
metadata={"description": "Document embeddings for semantic search"}
)
self.logger.info(f"Reset collection: {self.collection_name}")
return True
except Exception as e:
self.logger.error(f"Error resetting collection: {str(e)}")
return False
+788
View File
@@ -0,0 +1,788 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Semantic Search Engine - POC</title>
<script src="https://cdnjs.cloudflare.com/ajax/libs/axios/1.6.0/axios.min.js"></script>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
line-height: 1.6;
color: #333;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
min-height: 100vh;
}
.container {
max-width: 1200px;
margin: 0 auto;
padding: 20px;
}
.header {
text-align: center;
color: white;
margin-bottom: 40px;
}
.header h1 {
font-size: 3rem;
font-weight: 300;
margin-bottom: 10px;
text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}
.header p {
font-size: 1.2rem;
opacity: 0.9;
}
.search-container {
background: white;
border-radius: 20px;
padding: 40px;
box-shadow: 0 20px 40px rgba(0,0,0,0.1);
margin-bottom: 30px;
backdrop-filter: blur(10px);
}
.search-form {
display: flex;
gap: 15px;
margin-bottom: 20px;
flex-wrap: wrap;
}
.search-input {
flex: 1;
min-width: 300px;
padding: 15px 20px;
border: 2px solid #e1e5e9;
border-radius: 12px;
font-size: 16px;
transition: all 0.3s ease;
}
.search-input:focus {
outline: none;
border-color: #667eea;
box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
}
.search-btn {
padding: 15px 30px;
background: linear-gradient(135deg, #667eea, #764ba2);
color: white;
border: none;
border-radius: 12px;
font-size: 16px;
font-weight: 600;
cursor: pointer;
transition: all 0.3s ease;
min-width: 120px;
}
.search-btn:hover {
transform: translateY(-2px);
box-shadow: 0 8px 20px rgba(102, 126, 234, 0.3);
}
.search-btn:disabled {
opacity: 0.6;
cursor: not-allowed;
transform: none;
}
.filters {
display: flex;
gap: 15px;
flex-wrap: wrap;
align-items: center;
}
.filter-group {
display: flex;
align-items: center;
gap: 8px;
}
.filter-group label {
font-weight: 500;
color: #555;
}
.filter-select {
padding: 8px 12px;
border: 1px solid #ddd;
border-radius: 8px;
font-size: 14px;
}
.stats-container {
background: white;
border-radius: 20px;
padding: 30px;
box-shadow: 0 10px 30px rgba(0,0,0,0.1);
margin-bottom: 30px;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
}
.stat-card {
text-align: center;
padding: 20px;
background: linear-gradient(135deg, #f8f9fa, #e9ecef);
border-radius: 12px;
}
.stat-number {
font-size: 2rem;
font-weight: 700;
color: #667eea;
margin-bottom: 5px;
}
.stat-label {
color: #666;
font-size: 0.9rem;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.results-container {
background: white;
border-radius: 20px;
padding: 30px;
box-shadow: 0 10px 30px rgba(0,0,0,0.1);
margin-bottom: 30px;
}
.results-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 25px;
padding-bottom: 15px;
border-bottom: 2px solid #f0f0f0;
}
.results-title {
font-size: 1.5rem;
font-weight: 600;
color: #333;
}
.results-meta {
color: #666;
font-size: 0.9rem;
}
.result-card {
border: 1px solid #e9ecef;
border-radius: 12px;
padding: 20px;
margin-bottom: 15px;
transition: all 0.3s ease;
cursor: pointer;
}
.result-card:hover {
transform: translateY(-2px);
box-shadow: 0 8px 25px rgba(0,0,0,0.1);
border-color: #667eea;
}
.result-title {
font-size: 1.2rem;
font-weight: 600;
color: #333;
margin-bottom: 10px;
}
.result-preview {
color: #666;
line-height: 1.6;
margin-bottom: 15px;
}
.result-footer {
display: flex;
justify-content: space-between;
align-items: center;
font-size: 0.9rem;
}
.result-score {
background: linear-gradient(135deg, #667eea, #764ba2);
color: white;
padding: 4px 12px;
border-radius: 20px;
font-weight: 500;
}
.result-entities {
display: flex;
gap: 8px;
flex-wrap: wrap;
}
.entity-tag {
background: #f8f9fa;
color: #495057;
padding: 3px 8px;
border-radius: 15px;
font-size: 0.8rem;
border: 1px solid #dee2e6;
}
.loading {
text-align: center;
padding: 40px;
color: #666;
}
.spinner {
width: 40px;
height: 40px;
border: 4px solid #f3f3f3;
border-top: 4px solid #667eea;
border-radius: 50%;
animation: spin 1s linear infinite;
margin: 0 auto 20px;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.error {
background: #f8d7da;
color: #721c24;
padding: 15px;
border-radius: 8px;
margin: 20px 0;
border: 1px solid #f5c6cb;
}
.no-results {
text-align: center;
padding: 40px;
color: #666;
}
.upload-container {
background: white;
border-radius: 20px;
padding: 30px;
box-shadow: 0 10px 30px rgba(0,0,0,0.1);
margin-bottom: 30px;
}
.upload-area {
border: 2px dashed #ddd;
border-radius: 12px;
padding: 40px;
text-align: center;
transition: all 0.3s ease;
cursor: pointer;
}
.upload-area:hover {
border-color: #667eea;
background: rgba(102, 126, 234, 0.05);
}
.upload-area.dragover {
border-color: #667eea;
background: rgba(102, 126, 234, 0.1);
}
@media (max-width: 768px) {
.header h1 {
font-size: 2rem;
}
.search-form {
flex-direction: column;
}
.search-input {
min-width: auto;
}
.stats-grid {
grid-template-columns: repeat(2, 1fr);
}
.filters {
flex-direction: column;
align-items: stretch;
}
}
</style>
</head>
<body>
<div class="container">
<!-- Header -->
<div class="header">
<h1>🔍 Semantic Search Engine</h1>
<p>Intelligent search across archival documents with entity recognition</p>
</div>
<!-- Stats Container -->
<div class="stats-container">
<h2 style="margin-bottom: 20px; text-align: center;">System Statistics</h2>
<div class="stats-grid" id="statsGrid">
<div class="stat-card">
<div class="stat-number" id="totalDocs">-</div>
<div class="stat-label">Total Documents</div>
</div>
<div class="stat-card">
<div class="stat-number" id="totalEntities">-</div>
<div class="stat-label">Total Entities</div>
</div>
<div class="stat-card">
<div class="stat-number" id="avgEntities">-</div>
<div class="stat-label">Avg Entities/Doc</div>
</div>
<div class="stat-card">
<div class="stat-number" id="systemStatus">-</div>
<div class="stat-label">System Status</div>
</div>
</div>
</div>
<!-- Search Container -->
<div class="search-container">
<h2 style="margin-bottom: 20px;">Search Documents</h2>
<form class="search-form" id="searchForm">
<input
type="text"
class="search-input"
id="searchInput"
placeholder="Enter your search query... (e.g., 'Napoleon Bonaparte biography')"
required
>
<button type="submit" class="search-btn" id="searchBtn">
Search
</button>
</form>
<div class="filters">
<div class="filter-group">
<label for="entityTypeFilter">Entity Type:</label>
<select class="filter-select" id="entityTypeFilter">
<option value="">All Types</option>
<option value="person">Person</option>
<option value="place">Place</option>
<option value="event">Event</option>
<option value="organization">Organization</option>
<option value="building">Building</option>
<option value="date">Date</option>
</select>
</div>
<div class="filter-group">
<label for="limitFilter">Results:</label>
<select class="filter-select" id="limitFilter">
<option value="5">5 results</option>
<option value="10" selected>10 results</option>
<option value="20">20 results</option>
<option value="50">50 results</option>
</select>
</div>
<div class="filter-group">
<label for="thresholdFilter">Min Score:</label>
<select class="filter-select" id="thresholdFilter">
<option value="">Any score</option>
<option value="0.1">0.1+</option>
<option value="0.3" selected>0.3+</option>
<option value="0.5">0.5+</option>
<option value="0.7">0.7+</option>
</select>
</div>
</div>
</div>
<!-- Upload Container -->
<div class="upload-container">
<h2 style="margin-bottom: 20px;">Upload Document</h2>
<div class="upload-area" id="uploadArea">
<p>📄 Drag and drop a document here, or click to select</p>
<p style="color: #666; font-size: 0.9rem; margin-top: 10px;">
Supported formats: PDF, TXT, DOCX, XML (Max: 50MB)
</p>
<input type="file" id="fileInput" style="display: none;" accept=".pdf,.txt,.docx,.xml">
</div>
</div>
<!-- Results Container -->
<div class="results-container" id="resultsContainer" style="display: none;">
<div class="results-header">
<h2 class="results-title">Search Results</h2>
<div class="results-meta" id="resultsMeta"></div>
</div>
<div id="resultsContent"></div>
</div>
</div>
<script>
// Versioned prefix shared by every backend endpoint this page calls.
const API_BASE = '/api/v1';

// Element handles, looked up once and reused by the handlers below.
const searchForm = document.querySelector('#searchForm');
const searchInput = document.querySelector('#searchInput');
const searchBtn = document.querySelector('#searchBtn');
const resultsContainer = document.querySelector('#resultsContainer');
const resultsContent = document.querySelector('#resultsContent');
const resultsMeta = document.querySelector('#resultsMeta');
const uploadArea = document.querySelector('#uploadArea');
const fileInput = document.querySelector('#fileInput');

// Once the DOM is parsed: fetch the statistics, then wire up the UI events.
document.addEventListener('DOMContentLoaded', () => {
    loadStats();
    setupEventListeners();
});
// Attach the search form, drop zone and file input to their handlers.
function setupEventListeners() {
    searchForm.addEventListener('submit', handleSearch);

    // Clicking anywhere in the drop zone opens the native file picker.
    uploadArea.addEventListener('click', () => fileInput.click());

    // Drag-and-drop events on the drop zone, registered in a fixed order.
    const dropZoneHandlers = [
        ['dragover', handleDragOver],
        ['dragleave', handleDragLeave],
        ['drop', handleFileDrop]
    ];
    for (const [eventName, handler] of dropZoneHandlers) {
        uploadArea.addEventListener(eventName, handler);
    }

    // The file input fires "change" once a file has been picked.
    fileInput.addEventListener('change', handleFileSelect);
}
// Fetch the statistics from the API and fill in the four stat cards.
// On any failure only the status card is updated, to "ERROR".
async function loadStats() {
    const setStat = (elementId, value) => {
        document.getElementById(elementId).textContent = value;
    };

    try {
        const { data: stats } = await axios.get(`${API_BASE}/stats`);
        setStat('totalDocs', stats.total_documents);
        setStat('totalEntities', stats.total_entities);
        setStat('avgEntities', stats.average_entities_per_doc.toFixed(1));
        setStat('systemStatus', stats.system_status.toUpperCase());
    } catch (error) {
        console.error('Failed to load stats:', error);
        setStat('systemStatus', 'ERROR');
    }
}
// Run a search for the current query and filters, then render the outcome.
async function handleSearch(event) {
    event.preventDefault();

    const query = searchInput.value.trim();
    if (query === '') {
        return;
    }

    // Read the three filter dropdowns.
    const selectedValue = (elementId) => document.getElementById(elementId).value;
    const entityType = selectedValue('entityTypeFilter');
    const limit = parseInt(selectedValue('limitFilter'));
    const threshold = selectedValue('thresholdFilter');

    // Query and limit are always sent; the other filters only when chosen.
    const params = new URLSearchParams();
    params.append('q', query);
    params.append('limit', limit.toString());
    if (entityType) {
        params.append('entity_types', entityType);
    }
    if (threshold) {
        params.append('threshold', threshold);
    }

    // Show the spinner and lock the button while the request is in flight.
    showLoading();
    searchBtn.disabled = true;
    searchBtn.textContent = 'Searching...';

    try {
        const { data } = await axios.get(`${API_BASE}/search?${params}`);
        displayResults(data);
    } catch (error) {
        const reason = error.response?.data?.detail || error.message;
        showError('Search failed: ' + reason);
    } finally {
        // Always restore the button, whether the search succeeded or not.
        searchBtn.disabled = false;
        searchBtn.textContent = 'Search';
    }
}
// Render a search response: the summary line plus one card per result,
// or a "no results" notice when the result list is empty.
function displayResults(data) {
    resultsContainer.style.display = 'block';

    // Summary: number of hits and how long the search took.
    resultsMeta.innerHTML = [
        '',
        '<div>',
        `<strong>${data.total_results}</strong> results found in`,
        `<strong>${data.search_time_ms.toFixed(0)}ms</strong>`,
        '</div>',
        ''
    ].join('\n');

    // Drop whatever the previous search (or the spinner) left behind.
    resultsContent.innerHTML = '';

    if (data.results.length === 0) {
        resultsContent.innerHTML = [
            '',
            '<div class="no-results">',
            '<h3>No results found</h3>',
            '<p>Try adjusting your search query or filters</p>',
            '</div>',
            ''
        ].join('\n');
        return;
    }

    for (const result of data.results) {
        resultsContent.appendChild(createResultCard(result));
    }
}
// Build the clickable card element for one search result.
//
// Everything interpolated into the markup is derived from uploaded documents,
// so each text value is passed through escapeHtml() before it reaches
// innerHTML. A result without an `entities` array is rendered with no tags
// instead of throwing.
function createResultCard(result) {
    const MAX_ENTITY_TAGS = 5;

    const card = document.createElement('div');
    card.className = 'result-card';

    const entities = Array.isArray(result.entities) ? result.entities : [];

    // Tags for the first few entities; text and type are escaped.
    const entitiesTags = entities.slice(0, MAX_ENTITY_TAGS).map(entity =>
        `<span class="entity-tag">${escapeHtml(entity.text)} (${escapeHtml(entity.entity_type)})</span>`
    ).join('');

    // Collapse the remaining entities into a single "+N more" tag.
    const overflowTag = entities.length > MAX_ENTITY_TAGS
        ? `<span class="entity-tag">+${entities.length - MAX_ENTITY_TAGS} more</span>`
        : '';

    card.innerHTML = `
        <div class="result-title">${escapeHtml(result.title)}</div>
        <div class="result-preview">${escapeHtml(result.content_preview)}</div>
        <div class="result-footer">
            <div class="result-entities">
                ${entitiesTags}
                ${overflowTag}
            </div>
            <div class="result-score">Score: ${result.similarity_score.toFixed(3)}</div>
        </div>
    `;

    // Clicking the card shows the document details.
    card.addEventListener('click', () => showDocumentDetails(result.document_id));
    return card;
}
// Fetch one document by id and show a short summary in an alert.
//
// The id is URL-encoded so that reserved characters cannot change the request
// path, and the response is kept in `details` rather than a local named
// `document`, which would shadow the global DOM object.
async function showDocumentDetails(documentId) {
    try {
        const response = await axios.get(
            `${API_BASE}/documents/${encodeURIComponent(documentId)}`
        );
        const details = response.data;
        // A document with no entity list is reported as having zero entities.
        const entityCount = Array.isArray(details.entities) ? details.entities.length : 0;
        alert(`Document: ${details.filename}\nType: ${details.document_type}\nEntities: ${entityCount}`);
    } catch (error) {
        console.error('Failed to load document details:', error);
        alert('Failed to load document details');
    }
}
// File upload handlers

// A file is being dragged over the drop zone: allow the drop and highlight it.
function handleDragOver(event) {
    event.preventDefault();
    uploadArea.classList.toggle('dragover', true);
}
// The dragged file left the drop zone: remove the highlight.
function handleDragLeave(event) {
    event.preventDefault();
    uploadArea.classList.toggle('dragover', false);
}
// A file was dropped on the drop zone: clear the highlight and upload the
// first dropped file (any additional files are ignored).
function handleFileDrop(event) {
    event.preventDefault();
    uploadArea.classList.remove('dragover');

    const droppedFiles = event.dataTransfer.files;
    if (droppedFiles.length === 0) {
        return;
    }
    uploadFile(droppedFiles[0]);
}
// A file was chosen through the file picker: upload the first selected file.
function handleFileSelect(event) {
    const chosenFiles = event.target.files;
    if (chosenFiles.length === 0) {
        return;
    }
    uploadFile(chosenFiles[0]);
}
// Validate a file, upload it to the API, and show the outcome in the drop zone.
//
// The file name, the fields returned by the server and the error detail are
// all written into innerHTML, so each is passed through escapeHtml() first;
// a crafted file name or server message must not be able to inject markup.
async function uploadFile(file) {
    const MAX_UPLOAD_BYTES = 50 * 1024 * 1024; // 50MB

    // Validate file type by extension.
    const allowedTypes = ['.pdf', '.txt', '.docx', '.xml'];
    const fileExtension = '.' + file.name.split('.').pop().toLowerCase();
    if (!allowedTypes.includes(fileExtension)) {
        alert(`File type ${fileExtension} not supported. Allowed: ${allowedTypes.join(', ')}`);
        return;
    }

    // Validate file size.
    if (file.size > MAX_UPLOAD_BYTES) {
        alert('File too large. Maximum size: 50MB');
        return;
    }

    const formData = new FormData();
    formData.append('file', file);
    formData.append('access_level', 'public');

    // Replace the drop zone content with a progress indicator.
    uploadArea.innerHTML = `
        <div class="loading">
            <div class="spinner"></div>
            <p>Uploading and processing "${escapeHtml(file.name)}"...</p>
        </div>
    `;

    try {
        const response = await axios.post(`${API_BASE}/documents/upload`, formData, {
            headers: {
                'Content-Type': 'multipart/form-data'
            }
        });
        const result = response.data;

        // Show success message.
        uploadArea.innerHTML = `
            <div style="color: #28a745; text-align: center;">
                <h3>✅ Upload Successful!</h3>
                <p><strong>${escapeHtml(result.filename)}</strong></p>
                <p>Document ID: ${escapeHtml(result.document_id)}</p>
                <p>Entities found: ${escapeHtml(result.entities_found)}</p>
                <button onclick="resetUploadArea()" style="margin-top: 15px; padding: 10px 20px; background: #667eea; color: white; border: none; border-radius: 8px; cursor: pointer;">
                    Upload Another
                </button>
            </div>
        `;

        // The document count changed, so refresh the statistics.
        loadStats();
    } catch (error) {
        console.error('Upload failed:', error);
        const reason = error.response?.data?.detail || error.message;
        uploadArea.innerHTML = `
            <div style="color: #dc3545; text-align: center;">
                <h3>❌ Upload Failed</h3>
                <p>${escapeHtml(reason)}</p>
                <button onclick="resetUploadArea()" style="margin-top: 15px; padding: 10px 20px; background: #dc3545; color: white; border: none; border-radius: 8px; cursor: pointer;">
                    Try Again
                </button>
            </div>
        `;
    }
}
// Put the drop zone prompt back and clear the file input's selection.
function resetUploadArea() {
    const promptMarkup = [
        '',
        '<p>📄 Drag and drop a document here, or click to select</p>',
        '<p style="color: #666; font-size: 0.9rem; margin-top: 10px;">',
        'Supported formats: PDF, TXT, DOCX, XML (Max: 50MB)',
        '</p>',
        ''
    ].join('\n');

    uploadArea.innerHTML = promptMarkup;
    // Clearing the value lets the same file be picked again.
    fileInput.value = '';
}
// Reveal the results panel and show a spinner while a search is running.
function showLoading() {
    const spinnerMarkup = [
        '',
        '<div class="loading">',
        '<div class="spinner"></div>',
        '<p>Searching documents...</p>',
        '</div>',
        ''
    ].join('\n');

    resultsContainer.style.display = 'block';
    resultsMeta.textContent = '';
    resultsContent.innerHTML = spinnerMarkup;
}
// Reveal the results panel and show an (HTML-escaped) error message in it.
function showError(message) {
    resultsContainer.style.display = 'block';
    resultsMeta.textContent = '';

    const safeMessage = escapeHtml(message);
    resultsContent.innerHTML = [
        '',
        '<div class="error">',
        `<strong>Error:</strong> ${safeMessage}`,
        '</div>',
        ''
    ].join('\n');
}
// Escape text for use as HTML element content: the text is assigned to a
// detached element's textContent and read back as serialized markup.
function escapeHtml(text) {
    const scratch = Object.assign(document.createElement('div'), { textContent: text });
    return scratch.innerHTML;
}
// Example queries offered as one-click shortcuts when testing the page.
const sampleQueries = [
    "Napoleon Bonaparte biography",
    "French Revolution events",
    "Paris architecture buildings",
    "Military campaigns Europe",
    "Historical documents France"
];

// Append a row of buttons under the search box; clicking one fills in the
// query and runs the search.
function addSampleQueries() {
    const samplesContainer = document.createElement('div');
    samplesContainer.style.marginTop = '15px';
    samplesContainer.innerHTML = '<p style="margin-bottom: 10px; color: #666;">Try these sample queries:</p>';

    // Inline style shared by every sample button.
    const chipCss = [
        '',
        'margin: 5px;',
        'padding: 8px 15px;',
        'background: #f8f9fa;',
        'border: 1px solid #dee2e6;',
        'border-radius: 20px;',
        'cursor: pointer;',
        'font-size: 0.9rem;',
        ''
    ].join('\n');

    for (const query of sampleQueries) {
        const chip = document.createElement('button');
        chip.textContent = query;
        chip.style.cssText = chipCss;
        chip.addEventListener('click', () => {
            searchInput.value = query;
            // handleSearch expects an event object, so pass a synthetic one.
            handleSearch(new Event('submit'));
        });
        samplesContainer.appendChild(chip);
    }

    document.querySelector('.search-container').appendChild(samplesContainer);
}
// Add sample queries
// setTimeout(addSampleQueries, 100);
</script>
</body>
</html>
+22
View File
@@ -0,0 +1,22 @@
<!DOCTYPE html>
<html>
<head>
<title>Vector Search</title>
</head>
<body>
<h1>Semantic Search</h1>
<form method="post">
<input type="text" name="query" placeholder="Enter your query">
<button type="submit">Search</button>
</form>
{% if results %}
<h2>Results</h2>
<ul>
{% for item in results %}
<li><strong>{{ item.document_id }}</strong>: {{ item.content }}</li>
{% endfor %}
</ul>
{% endif %}
</body>
</html>
View File
View File
View File