feat: Initial SCP project setup with AI-powered document compliance tools

This commit is contained in:
boladeE
2025-04-21 22:49:29 +01:00
commit b0ec64b883
28 changed files with 2405 additions and 0 deletions
+247
View File
@@ -0,0 +1,247 @@
from fastapi import FastAPI, UploadFile, File, HTTPException, Form, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
from fastapi.responses import HTMLResponse, RedirectResponse
from pydantic import BaseModel
from typing import List, Optional
import uuid
import os
import logging
import traceback
import json
from datetime import datetime
import markdown
from services.document_processor import DocumentProcessor
from services.vector_store import VectorStore
from services.database import Database
from dotenv import load_dotenv
# Configure logging: every record at INFO or above goes both to the
# "app.log" file and to the console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Load environment variables via python-dotenv before the app and services are built.
load_dotenv()
# The FastAPI application object that every route decorator below attaches to.
app = FastAPI(title="Mini SpecsComply Pro")
# Mount static files
# NOTE(review): "src/static" and "src/templates" are relative paths, so they
# depend on the directory the server is launched from - confirm the expected cwd.
app.mount("/static", StaticFiles(directory="src/static"), name="static")
# Templates
templates = Jinja2Templates(directory="src/templates")
# Add markdown filter to Jinja2
def markdown_filter(text):
    """Render Markdown ``text`` to an HTML fragment (used as a Jinja2 filter).

    Args:
        text: Markdown source. ``None`` and empty values are accepted so a
            template can pipe optional fields (e.g. a missing description)
            through the filter without raising.

    Returns:
        str: The rendered HTML, or ``""`` when there is nothing to render.
    """
    if not text:
        return ""
    # NOTE(review): Python-Markdown passes raw HTML in the input through to
    # the output. The text rendered here can originate from uploaded
    # documents / model output, so it should be sanitized before being
    # marked safe in a template - confirm how the templates emit it.
    return markdown.markdown(str(text), extensions=['extra', 'nl2br'])
# Expose the filter to templates under the name "markdown".
templates.env.filters.update(markdown=markdown_filter)

# CORS middleware
# NOTE(review): wildcard origins together with allow_credentials=True is a
# very permissive combination - confirm it is intended outside local use.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],
)

# Initialize services (module-level singletons shared by all route handlers).
# The document processor is handed the same vector store instance.
vector_store = VectorStore()
document_processor = DocumentProcessor(vector_store)
database = Database()
class AnalysisResponse(BaseModel):
    # Pydantic schema with the same four keys as the analysis dict that the
    # document processor builds (document_id / summary / issues / recommendations).
    # NOTE(review): no route visible in this file declares it as a
    # response_model - confirm whether it is still needed.
    document_id: str
    summary: str
    issues: List[dict]
    recommendations: List[str]
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Serve the landing page rendered from the ``index.html`` template."""
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
@app.get("/documents", response_class=HTMLResponse)
async def documents(request: Request):
    """Render ``documents.html`` with the metadata of every stored document.

    Raises:
        HTTPException: 500 when the metadata cannot be read or rendered.
    """
    try:
        # Get all documents from database
        stored_docs = database.get_all_metadata()
        context = {"request": request, "documents": stored_docs}
        return templates.TemplateResponse("documents.html", context)
    except Exception as exc:
        error_msg = f"Error fetching documents: {str(exc)}"
        # Log the short message first, then the full traceback.
        for entry in (error_msg, traceback.format_exc()):
            logging.error(entry)
        raise HTTPException(status_code=500, detail=error_msg)
@app.post("/upload-document")
async def upload_document(
    file: UploadFile = File(...),
    document_type: str = Form(...),
):
    """Store an uploaded file, run the analysis pipeline and record its metadata.

    Args:
        file: The uploaded document.
        document_type: Free-form document type passed on to the processor.

    Returns:
        dict: ``{"document_id": ..., "message": ...}`` on success.

    Raises:
        HTTPException: 500 when saving, processing or the metadata write fails.
    """
    try:
        # Generate unique document ID
        doc_id = str(uuid.uuid4())
        logging.info(f"Processing upload for document ID: {doc_id}")

        # The filename is client-controlled: keep only its final path
        # component so values like "../../x" cannot escape data/uploads.
        # Backslashes are normalised first so Windows-style paths are
        # stripped as well.
        safe_name = os.path.basename((file.filename or "").replace("\\", "/")) or "upload"

        # Save the uploaded file
        file_path = f"data/uploads/{doc_id}_{safe_name}"
        os.makedirs("data/uploads", exist_ok=True)
        with open(file_path, "wb") as buffer:
            content = await file.read()
            buffer.write(content)
        logging.info(f"File saved to {file_path}")

        # Process the document
        await document_processor.process_document(doc_id, file_path, document_type)

        # Save document metadata. The sanitized name is stored so that the
        # delete endpoint, which rebuilds the path from this field, finds
        # the file that was actually written.
        metadata = {
            "document_id": doc_id,
            "filename": safe_name,
            "document_type": document_type,
        }
        # Save metadata to database
        database.save_metadata(doc_id, metadata)
        logging.info(f"Document {doc_id} processed successfully")
        return {"document_id": doc_id, "message": "Document uploaded and processed successfully"}
    except Exception as e:
        error_msg = f"Error processing document: {str(e)}"
        logging.error(error_msg)
        logging.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=error_msg)
@app.get("/document/{doc_id}/analysis", response_class=HTMLResponse)
async def get_analysis(request: Request, doc_id: str):
    """Render ``analysis.html`` for one document.

    Args:
        request: Incoming request, passed to the template context.
        doc_id: Identifier of the document whose analysis is shown.

    Raises:
        HTTPException: 404 when the analysis or metadata does not exist
            (the database layer raises ``FileNotFoundError`` for that);
            500 for any other failure, so real server errors are no longer
            reported as "not found".
    """
    try:
        analysis = await document_processor.get_analysis(doc_id)
        metadata = database.get_metadata(doc_id)
        return templates.TemplateResponse(
            "analysis.html",
            {
                "request": request,
                "analysis": analysis,
                "metadata": metadata
            }
        )
    except FileNotFoundError as e:
        error_msg = f"Error retrieving analysis: {str(e)}"
        logging.error(error_msg)
        raise HTTPException(status_code=404, detail=error_msg)
    except Exception as e:
        error_msg = f"Error retrieving analysis: {str(e)}"
        logging.error(error_msg)
        logging.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=error_msg)
@app.post("/document/{doc_id}/resubmit")
async def resubmit_document(
    request: Request,
    doc_id: str,
    file: UploadFile = File(...),
    document_type: Optional[str] = Form(None),
    description: Optional[str] = Form(None)
):
    """Accept a new version of a document and re-run the analysis.

    Args:
        request: Incoming request (unused, kept for interface stability).
        doc_id: Identifier of the document being resubmitted.
        file: The replacement file.
        document_type: Optional new document type; overrides the stored one.
        description: Optional new description.

    Returns:
        RedirectResponse: 303 redirect to the document's analysis page.

    Raises:
        HTTPException: 500 when saving or processing fails.
    """
    try:
        logging.info(f"Received resubmit request for document {doc_id}")
        logging.info(f"File: {file.filename}, Document Type: {document_type}, Description: {description}")

        # The filename is client-controlled: keep only its final path
        # component so it cannot escape data/uploads.
        safe_name = os.path.basename((file.filename or "").replace("\\", "/")) or "upload"

        # Save the resubmitted file
        file_path = f"data/uploads/{doc_id}_resubmit_{safe_name}"
        os.makedirs("data/uploads", exist_ok=True)
        with open(file_path, "wb") as buffer:
            content = await file.read()
            buffer.write(content)
        logging.info(f"Saved resubmitted file to {file_path}")

        # Get existing metadata. This step is best-effort: a failure is
        # logged and processing continues with whatever type is known.
        metadata = None
        try:
            metadata = database.get_metadata(doc_id)
            logging.info(f"Retrieved existing metadata for document {doc_id}: {metadata}")
            # Update metadata if provided
            if document_type:
                metadata["document_type"] = document_type
            if description:
                metadata["description"] = description
            # Save updated metadata
            database.save_metadata(doc_id, metadata)
            logging.info(f"Updated metadata for resubmitted document {doc_id}")
        except Exception as e:
            logging.error(f"Error updating metadata for resubmitted document {doc_id}: {str(e)}")

        # Resolve the document type: explicit form value first, then the
        # stored value, then a default. `metadata` is always bound here.
        if document_type:
            doc_type = document_type
        elif metadata:
            doc_type = metadata.get("document_type") or "unknown"
        else:
            doc_type = "unknown"

        logging.info(f"Processing resubmitted document {doc_id} with document type {doc_type}")
        await document_processor.process_document(doc_id, file_path, doc_type, is_resubmission=True)
        logging.info(f"Document {doc_id} resubmitted successfully")
        # Redirect back to the analysis page
        return RedirectResponse(url=f"/document/{doc_id}/analysis", status_code=303)
    except Exception as e:
        error_msg = f"Error resubmitting document: {str(e)}"
        logging.error(error_msg)
        logging.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=error_msg)
@app.delete("/document/{doc_id}")
async def delete_document(doc_id: str):
    """Remove a document's files, database rows and vector-store entry.

    Args:
        doc_id: Identifier of the document to delete.

    Returns:
        dict: ``{"message": "Document deleted successfully"}``.

    Raises:
        HTTPException: 404 when no metadata exists for ``doc_id``;
            500 for any other failure.
    """
    uploads_dir = "data/uploads"
    try:
        # Get document metadata to find the filename
        try:
            metadata = database.get_metadata(doc_id)
        except FileNotFoundError as e:
            # An unknown document is a client error, not a server failure.
            raise HTTPException(status_code=404, detail=str(e))
        filename = metadata.get('filename', '')

        # Delete the uploaded file
        upload_path = f"{uploads_dir}/{doc_id}_{filename}"
        if os.path.exists(upload_path):
            os.remove(upload_path)
            logging.info(f"Deleted uploaded file: {upload_path}")

        # Delete any resubmitted files. The directory may not exist yet
        # (e.g. rows migrated from elsewhere), so guard the listing.
        if os.path.isdir(uploads_dir):
            resubmit_prefix = f"{doc_id}_resubmit_"
            for resubmit_file in os.listdir(uploads_dir):
                if resubmit_file.startswith(resubmit_prefix):
                    os.remove(os.path.join(uploads_dir, resubmit_file))
                    logging.info(f"Deleted resubmitted file: {resubmit_file}")

        # Delete from database
        database.delete_document(doc_id)
        # Remove from vector store
        vector_store.delete_document(doc_id)
        logging.info(f"Removed document {doc_id} from vector store")
        return {"message": "Document deleted successfully"}
    except HTTPException:
        raise
    except Exception as e:
        error_msg = f"Error deleting document: {str(e)}"
        logging.error(error_msg)
        logging.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=error_msg)
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on 127.0.0.1:8000.
    # uvicorn is imported here, so it is only needed when this file is run directly.
    import uvicorn
    uvicorn.run(app, host="127.0.0.1", port=8000)
+42
View File
@@ -0,0 +1,42 @@
from pinecone import Pinecone
from services.config import config
import logging
# Configure logging: INFO and above go to "reset_pinecone.log" and to the
# console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler("reset_pinecone.log"),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def reset_pinecone_index():
    """Drop the configured Pinecone index (if present) and create it afresh.

    The new index uses ``config.VECTOR_DIMENSION`` and the cosine metric.

    Returns:
        bool: ``True`` when the index was recreated, ``False`` when any step
        raised (the error is logged, not propagated).
    """
    try:
        # Imported locally: at module level ServerlessSpec is only imported
        # under the ``__main__`` guard, so without this the function would
        # fail with NameError whenever the module is imported instead of run.
        from pinecone import ServerlessSpec

        # Initialize Pinecone client
        pinecone = Pinecone(api_key=config.PINECONE_API_KEY)

        # Check if index exists
        if config.PINECONE_INDEX_NAME in pinecone.list_indexes().names():
            logging.info(f"Deleting existing index '{config.PINECONE_INDEX_NAME}'")
            pinecone.delete_index(config.PINECONE_INDEX_NAME)

        # Create a new index with the correct dimension
        logging.info(f"Creating new index '{config.PINECONE_INDEX_NAME}' with dimension {config.VECTOR_DIMENSION}")
        pinecone.create_index(
            name=config.PINECONE_INDEX_NAME,
            dimension=config.VECTOR_DIMENSION,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )
        logging.info("Pinecone index reset successfully")
        return True
    except Exception as e:
        logging.error(f"Error resetting Pinecone index: {str(e)}")
        return False
if __name__ == "__main__":
    # Script entry point. ServerlessSpec is imported here, before the call
    # below, which is what makes the name available to reset_pinecone_index()
    # when this file is executed directly.
    from pinecone import ServerlessSpec
    reset_pinecone_index()
+49
View File
@@ -0,0 +1,49 @@
import os
import json
import logging
from services.database import Database
def _migrate_json_dir(directory, save, label):
    """Pass every ``<doc_id>.json`` file in ``directory`` to ``save(doc_id, payload)``."""
    if not os.path.isdir(directory):
        # Nothing to migrate for this kind of record.
        return
    for filename in os.listdir(directory):
        if not filename.endswith('.json'):
            continue
        doc_id = filename[:-len('.json')]  # Remove .json extension
        # Explicit UTF-8 so the result does not depend on the platform's
        # default locale encoding.
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
            payload = json.load(f)
        save(doc_id, payload)
        logging.info(f"Migrated {label} for document {doc_id}")


def migrate_data():
    """Migrate existing data from filesystem to SQLite database.

    Reads ``data/metadata/*.json`` and ``data/analysis/*.json`` (file name
    without the extension is the document id) and stores each payload via
    ``Database.save_metadata`` / ``Database.save_analysis``.

    Raises:
        Exception: Any error is logged and then re-raised unchanged.
    """
    try:
        database = Database()
        # Migrate metadata
        _migrate_json_dir("data/metadata", database.save_metadata, "metadata")
        # Migrate analysis
        _migrate_json_dir("data/analysis", database.save_analysis, "analysis")
        logging.info("Migration completed successfully")
    except Exception as e:
        logging.error(f"Error during migration: {str(e)}")
        raise
if __name__ == "__main__":
    # Script entry point: logging is configured only here, so importing this
    # module does not touch the logging setup.
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler("migration.log"),
            logging.StreamHandler()
        ]
    )
    migrate_data()
View File
+24
View File
@@ -0,0 +1,24 @@
from dataclasses import dataclass
from dotenv import load_dotenv
import os
load_dotenv()
@dataclass
class Settings:
    """Application settings.

    The API-key fields take their defaults from the process environment at
    the moment this class is defined (empty string when unset); the
    remaining fields are fixed defaults.
    """

    # API Keys
    COHERE_API_KEY: str = os.environ.get("COHERE_API_KEY", "")
    DEEPSEEK_API_KEY: str = os.environ.get("DEEPSEEK_API_KEY", "")
    PINECONE_API_KEY: str = os.environ.get("PINECONE_API_KEY", "")
    PINECONE_ENVIRONMENT: str = os.environ.get("PINECONE_ENVIRONMENT", "")

    # Vector DB Settings
    PINECONE_INDEX_NAME: str = "document-compliance"

    # Model Settings
    COHERE_EMBEDDING_MODEL: str = "embed-english-v3.0"
    COHERE_RERANKER_MODEL: str = "rerank-english-v2.0"
    DEEPSEEK_MODEL: str = "deepseek-r1"
    VECTOR_DIMENSION: int = 1024  # Updated to match Cohere's embedding dimension


# Shared settings instance imported by the service modules.
config = Settings()
+162
View File
@@ -0,0 +1,162 @@
import sqlite3
import json
import logging
from typing import Dict, Any, Optional
import os
class Database:
    """SQLite-backed store for document metadata and analysis results.

    Two tables are used, both keyed by ``document_id``: ``metadata``
    (filename, document_type, description) and ``analysis`` (summary plus
    JSON-encoded issues and recommendations). Every method opens its own
    connection and closes it before returning.
    """

    def __init__(self, db_path: str = "data/app.db"):
        """Create the parent directory if needed and ensure the tables exist.

        Args:
            db_path: Path of the SQLite file. A bare filename (no directory
                part) is accepted and created in the working directory.
        """
        self.db_path = db_path
        parent_dir = os.path.dirname(db_path)
        # os.makedirs("") raises, so only create a directory when there is one.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        self._init_db()

    def _connect(self):
        """Return a context manager yielding a connection that is closed on exit.

        ``with sqlite3.connect(...)`` on its own only commits or rolls back;
        it does not close the connection. Wrapping it in ``closing`` releases
        the file handle deterministically.
        """
        from contextlib import closing
        return closing(sqlite3.connect(self.db_path))

    def _init_db(self):
        """Initialize the database with required tables."""
        try:
            # First manager closes the connection, second commits/rolls back.
            with self._connect() as conn, conn:
                cursor = conn.cursor()
                # Create analysis table
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS analysis (
                        document_id TEXT PRIMARY KEY,
                        summary TEXT,
                        issues TEXT,
                        recommendations TEXT,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                ''')
                # Create metadata table
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS metadata (
                        document_id TEXT PRIMARY KEY,
                        filename TEXT,
                        document_type TEXT,
                        description TEXT,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                ''')
                conn.commit()
        except Exception as e:
            logging.error(f"Error initializing database: {str(e)}")
            raise

    def save_analysis(self, document_id: str, analysis: Dict[str, Any]):
        """Save analysis results to the database.

        Args:
            document_id: Primary key of the row (an existing row is replaced).
            analysis: Mapping with ``summary``, ``issues`` and
                ``recommendations``; the latter two are stored as JSON text.

        Raises:
            KeyError: If a required key is missing from ``analysis``.
        """
        try:
            with self._connect() as conn, conn:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR REPLACE INTO analysis (document_id, summary, issues, recommendations)
                    VALUES (?, ?, ?, ?)
                ''', (
                    document_id,
                    analysis['summary'],
                    json.dumps(analysis['issues']),
                    json.dumps(analysis['recommendations'])
                ))
                conn.commit()
        except Exception as e:
            logging.error(f"Error saving analysis for document {document_id}: {str(e)}")
            raise

    def get_analysis(self, document_id: str) -> Dict[str, Any]:
        """Retrieve analysis results from the database.

        Returns:
            Dict with ``document_id``, ``summary``, ``issues`` and
            ``recommendations`` (the last two decoded from JSON).

        Raises:
            FileNotFoundError: If no analysis row exists for ``document_id``.
        """
        try:
            with self._connect() as conn, conn:
                cursor = conn.cursor()
                cursor.execute('SELECT summary, issues, recommendations FROM analysis WHERE document_id = ?', (document_id,))
                result = cursor.fetchone()
            if not result:
                raise FileNotFoundError(f"Analysis not found for document {document_id}")
            return {
                'document_id': document_id,
                'summary': result[0],
                'issues': json.loads(result[1]),
                'recommendations': json.loads(result[2])
            }
        except Exception as e:
            logging.error(f"Error retrieving analysis for document {document_id}: {str(e)}")
            raise

    def save_metadata(self, document_id: str, metadata: Dict[str, Any]):
        """Save document metadata to the database.

        Args:
            document_id: Primary key of the row (an existing row is replaced).
            metadata: Mapping with ``filename`` and ``document_type``;
                ``description`` is optional.

        Raises:
            KeyError: If ``filename`` or ``document_type`` is missing.
        """
        try:
            with self._connect() as conn, conn:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR REPLACE INTO metadata (document_id, filename, document_type, description)
                    VALUES (?, ?, ?, ?)
                ''', (
                    document_id,
                    metadata['filename'],
                    metadata['document_type'],
                    metadata.get('description')
                ))
                conn.commit()
        except Exception as e:
            logging.error(f"Error saving metadata for document {document_id}: {str(e)}")
            raise

    def get_metadata(self, document_id: str) -> Dict[str, Any]:
        """Retrieve document metadata from the database.

        Returns:
            Dict with ``document_id``, ``filename``, ``document_type`` and
            ``description``.

        Raises:
            FileNotFoundError: If no metadata row exists for ``document_id``.
        """
        try:
            with self._connect() as conn, conn:
                cursor = conn.cursor()
                cursor.execute('SELECT filename, document_type, description FROM metadata WHERE document_id = ?', (document_id,))
                result = cursor.fetchone()
            if not result:
                raise FileNotFoundError(f"Metadata not found for document {document_id}")
            return {
                'document_id': document_id,
                'filename': result[0],
                'document_type': result[1],
                'description': result[2]
            }
        except Exception as e:
            logging.error(f"Error retrieving metadata for document {document_id}: {str(e)}")
            raise

    def get_all_metadata(self) -> list:
        """Retrieve metadata for all documents, newest first.

        Returns:
            List of dicts with ``document_id``, ``filename``,
            ``document_type``, ``description``, ``upload_date`` and
            ``status`` (``'completed'`` when an analysis row exists,
            otherwise ``'processing'``).
        """
        try:
            with self._connect() as conn, conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT m.document_id, m.filename, m.document_type, m.description, m.created_at,
                           CASE WHEN a.document_id IS NOT NULL THEN 1 ELSE 0 END as has_analysis
                    FROM metadata m
                    LEFT JOIN analysis a ON m.document_id = a.document_id
                    ORDER BY m.created_at DESC
                ''')
                results = cursor.fetchall()
            return [{
                'document_id': row[0],
                'filename': row[1],
                'document_type': row[2],
                'description': row[3],
                'upload_date': row[4],
                'status': 'completed' if row[5] == 1 else 'processing'
            } for row in results]
        except Exception as e:
            logging.error(f"Error retrieving all metadata: {str(e)}")
            raise

    def delete_document(self, document_id: str):
        """Delete a document and its associated data from the database.

        Removes the rows for ``document_id`` from both tables; deleting an
        unknown id is a no-op.
        """
        try:
            with self._connect() as conn, conn:
                cursor = conn.cursor()
                cursor.execute('DELETE FROM analysis WHERE document_id = ?', (document_id,))
                cursor.execute('DELETE FROM metadata WHERE document_id = ?', (document_id,))
                conn.commit()
        except Exception as e:
            logging.error(f"Error deleting document {document_id}: {str(e)}")
            raise
+248
View File
@@ -0,0 +1,248 @@
import cohere
import requests
from typing import List, Dict, Any
import json
import os
import logging
from services.config import config
from services.database import Database
class DocumentProcessor:
    """Runs the document analysis pipeline.

    For each document: embed the text with Cohere and store it in the vector
    store, ask DeepSeek for a section summary and a compliance review, rank
    the extracted issues with Cohere rerank, ask DeepSeek for
    recommendations, and persist the result through ``Database``.
    """

    _ANALYSIS_FAILED = "Document analysis could not be completed"
    _RECOMMENDATION_FAILED = "Recommendation could not be generated"

    def __init__(self, vector_store):
        """Create API clients and the database handle.

        Args:
            vector_store: Object providing ``store_embedding`` and
                ``add_document``.
        """
        self.vector_store = vector_store
        self.cohere_client = cohere.Client(config.COHERE_API_KEY)
        self.deepseek_url = "https://api.deepseek.com/v1/chat/completions"
        self.deepseek_headers = {
            "Authorization": f"Bearer {config.DEEPSEEK_API_KEY}",
            "Content-Type": "application/json"
        }
        self.database = Database()

    async def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False):
        """Analyse the file at ``file_path`` and store the result under ``doc_id``.

        Args:
            doc_id: Identifier used for the vector and the analysis row.
            file_path: Path of the document; it is read as text.
            document_type: Type label included in the compliance prompt.
            is_resubmission: When true, the stored metadata's document type
                is refreshed after the analysis is saved (best-effort).

        Returns:
            bool: ``True`` on success.

        Raises:
            Exception: Any failure in reading, embedding, storing or saving
                is logged and re-raised. DeepSeek and rerank failures are
                not fatal; they produce placeholder text instead.
        """
        # NOTE(review): the HTTP calls below use the blocking `requests`
        # client inside an async function, so the event loop is held for the
        # duration of each call - consider moving them to a worker thread.
        try:
            content = self._read_text(file_path)
            logging.info(f"Processing document {doc_id} with content length: {len(content)}")

            # Generate embeddings
            embeddings = self.cohere_client.embed(
                texts=[content],
                model=config.COHERE_EMBEDDING_MODEL,
                input_type="search_document"  # Required parameter for the model
            ).embeddings[0]
            # Store in vector database
            self.vector_store.store_embedding(doc_id, embeddings, content)

            # Process with DeepSeek for initial parsing. On failure the
            # returned text is the placeholder message, used as the summary.
            _, summary = self._chat(
                "You are a document analysis assistant. Extract key sections and requirements from the following document.",
                content,
                4000,
                self._ANALYSIS_FAILED,
            )

            # Process with DeepSeek for deep reasoning
            analysed, feedback = self._chat(
                "You are an expert in document compliance analysis. Analyze the following document for compliance issues and provide detailed feedback.",
                f"Analyze this type of document {document_type} for compliance issues and provide detailed feedback:\n\n{content}\n"
                f"and these are the main sections of the document:\n\n{summary}",
                4000,
                self._ANALYSIS_FAILED,
            )
            issues = self._extract_issues(feedback) if analysed else [feedback]

            # Use Cohere reranker to prioritize issues
            ranked_issues = self._rank_issues(issues)

            # Store analysis results
            analysis = {
                "document_id": doc_id,
                "summary": summary,
                "issues": self._format_issues(ranked_issues),
                "recommendations": self._generate_recommendations(ranked_issues)
            }
            # Save analysis to database
            self.database.save_analysis(doc_id, analysis)

            # If this is a resubmission, update the metadata in the database
            if is_resubmission:
                try:
                    # Get existing metadata
                    existing_metadata = self.database.get_metadata(doc_id)
                    # Update with new document type if provided
                    if document_type:
                        existing_metadata["document_type"] = document_type
                    # Save updated metadata
                    self.database.save_metadata(doc_id, existing_metadata)
                    logging.info(f"Updated metadata for resubmitted document {doc_id}")
                except Exception as e:
                    logging.error(f"Error updating metadata for resubmitted document {doc_id}: {str(e)}")

            logging.info(f"Document {doc_id} processed successfully")
            return True
        except Exception as e:
            logging.error(f"Error processing document {doc_id}: {str(e)}")
            raise

    async def get_analysis(self, doc_id: str) -> Dict[str, Any]:
        """Return the stored analysis for ``doc_id`` (see ``Database.get_analysis``)."""
        return self.database.get_analysis(doc_id)

    @staticmethod
    def _read_text(file_path: str) -> str:
        """Read ``file_path`` as UTF-8 text, falling back to latin-1."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            # Try with a different encoding if UTF-8 fails
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()

    def _chat(self, system_prompt: str, user_prompt: str, max_tokens: int, failure_prefix: str):
        """Send one chat completion request to DeepSeek.

        Returns:
            tuple: ``(True, reply_text)`` on success, otherwise
            ``(False, "<failure_prefix> due to ...")`` describing whether the
            connection, the HTTP status or the response parsing failed.
        """
        payload = {
            "model": "deepseek-chat",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "max_tokens": max_tokens
        }
        try:
            response = requests.post(
                self.deepseek_url,
                json=payload,
                headers=self.deepseek_headers,
                timeout=60
            )
        except requests.exceptions.RequestException as e:
            logging.error(f"Error calling DeepSeek API: {str(e)}")
            return False, f"{failure_prefix} due to API connection issues."

        if response.status_code != 200:
            logging.error(f"DeepSeek API error: {response.status_code} - {response.text}")
            return False, f"{failure_prefix} due to API limitations."

        try:
            return True, response.json()['choices'][0]['message']['content']
        except (ValueError, KeyError, IndexError, TypeError) as e:
            # ValueError covers JSON decoding errors; the rest cover a reply
            # whose shape is not the expected choices/message/content.
            logging.error(f"Error parsing DeepSeek response: {str(e)}")
            logging.error(f"Response text: {response.text}")
            return False, f"{failure_prefix} due to parsing errors."

    def _rank_issues(self, issues: List[str]) -> List[str]:
        """Return ``issues`` ordered by the Cohere reranker.

        Falls back to the original order when reranking fails; an empty
        list is returned as-is without calling the API.
        """
        if not issues:
            return []
        try:
            response = self.cohere_client.rerank(
                query="Compliance issues in technical document",
                documents=issues,
                model=config.COHERE_RERANKER_MODEL
            )
            results = response.results if hasattr(response, 'results') else response
            # Each result carries the position of the document in the input
            # list; map it back to the issue text.
            return [issues[result.index] for result in results]
        except Exception as e:
            logging.error(f"Error using Cohere reranker: {str(e)}")
            return list(issues)

    @staticmethod
    def _issue_text(issue) -> str:
        """Return the text of an issue given as str, tuple or rerank result."""
        if isinstance(issue, str):
            return issue
        if isinstance(issue, tuple):
            return str(issue[0])
        document = getattr(issue, 'document', None)
        if document is None:
            return str(issue)
        if isinstance(document, dict):
            return str(document.get('text', document))
        text = getattr(document, 'text', None)
        return text if isinstance(text, str) else str(document)

    def _extract_issues(self, deepseek_response: str) -> List[str]:
        """Split a numbered-list reply ("1. ... 2. ...") into individual issues."""
        # Imported locally because this module does not import ``re`` at the
        # top; the missing import previously made this method raise NameError.
        import re

        # Simple extraction of issues from DeepSeek's response
        # In a real implementation, this would be more sophisticated
        logging.debug(deepseek_response)
        return [issue.strip() for issue in re.split(r'\d+\.', deepseek_response) if issue.strip()]

    def _format_issues(self, reranked_issues) -> List[Dict[str, Any]]:
        """Turn ranked issues into dicts with ``issue``, ``severity`` and ``rank``.

        Severity is positional: ranks 1-3 are "high", 4-6 "medium", the rest
        "low". Accepts a list of strings or a rerank response/result list.
        """
        results = reranked_issues.results if hasattr(reranked_issues, 'results') else reranked_issues
        return [
            {
                "issue": self._issue_text(issue),
                "severity": "high" if i < 3 else "medium" if i < 6 else "low",
                "rank": i + 1
            }
            for i, issue in enumerate(results)
        ]

    def _generate_recommendations(self, reranked_issues) -> List[str]:
        """Ask DeepSeek for one recommendation per top-ranked issue (at most 5).

        A failed request contributes a placeholder sentence instead of
        raising, so the returned list has one entry per processed issue.
        """
        # Extract the results from the RerankResponse object
        results = reranked_issues.results if hasattr(reranked_issues, 'results') else reranked_issues
        results = list(results)
        logging.debug(f"Generating recommendations for {len(results)} issues")

        recommendations = []
        for issue in results[:5]:  # Focus on top 5 issues
            # Send the issue text itself; interpolating the result object
            # would put its repr into the prompt.
            _, recommendation = self._chat(
                "You are an expert in document compliance. Provide specific, actionable recommendations to fix compliance issues.",
                f"Provide a specific, actionable recommendation to fix this compliance issue: {self._issue_text(issue)}",
                1000,
                self._RECOMMENDATION_FAILED,
            )
            recommendations.append(recommendation)
        return recommendations

    def _store_document(self, doc_id: str, file_path: str):
        """Embed and store the file through the vector store's ``add_document``."""
        # save document to vector store
        self.vector_store.add_document(doc_id, file_path)
+57
View File
@@ -0,0 +1,57 @@
import cohere
from typing import List, Union
from services.config import config
class EmbeddingService:
    """Thin wrapper around Cohere's embed endpoint for document text."""

    def __init__(self):
        """Create the Cohere client and pick the model from ``config``."""
        self.cohere_client = cohere.Client(config.COHERE_API_KEY)
        self.model = config.COHERE_EMBEDDING_MODEL

    def create_embedding(self, text: str) -> List[float]:
        """
        Create an embedding for a single text using Cohere.

        Args:
            text (str): The text to create an embedding for

        Returns:
            List[float]: The embedding vector
        """
        return self.create_embeddings([text])[0]

    def create_embeddings(self, texts: List[str]) -> List[List[float]]:
        """
        Create embeddings for multiple texts using Cohere.

        Args:
            texts (List[str]): List of texts to create embeddings for

        Returns:
            List[List[float]]: List of embedding vectors, one per input
            text; an empty input returns an empty list without an API call.
        """
        if not texts:
            return []
        # No ``dimension`` keyword: the single-text path never sent one, and
        # the embed call is not expected to accept it. The vector size is
        # determined by the model.
        response = self.cohere_client.embed(
            texts=texts,
            model=self.model,
            input_type="search_document"
        )
        return response.embeddings

    def create_embedding_from_file(self, file_path: str) -> List[float]:
        """
        Create an embedding from a file's contents.

        The file is read as UTF-8, falling back to latin-1 when it is not
        valid UTF-8 (the same policy the other readers in this project use).

        Args:
            file_path (str): Path to the file to create an embedding for

        Returns:
            List[float]: The embedding vector
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()
        return self.create_embedding(content)
+139
View File
@@ -0,0 +1,139 @@
from pinecone import Pinecone, ServerlessSpec
from typing import List, Any, Optional
from services.config import config
from services.embedding_service import EmbeddingService
import logging
import os
class VectorStore:
    """Pinecone-backed vector index holding one embedding per document."""

    # Index creation parameters, kept in one place so that the initial
    # create and the recreate-on-mismatch path cannot drift apart.
    _METRIC = "cosine"
    _CLOUD = "aws"
    _REGION = "us-east-1"

    def __init__(self, pinecone_client: Optional[Pinecone] = None, embedding_service: Optional[EmbeddingService] = None):
        """Connect to Pinecone and make sure the configured index is usable.

        Args:
            pinecone_client: Client to use; one is built from
                ``config.PINECONE_API_KEY`` when omitted.
            embedding_service: Embedding provider; a default
                ``EmbeddingService`` is built when omitted.
        """
        self.pinecone = pinecone_client or Pinecone(api_key=config.PINECONE_API_KEY)
        self.index_name = config.PINECONE_INDEX_NAME
        self.embedding_service = embedding_service or EmbeddingService()
        self._ensure_index()

    def _create_index(self):
        """Create the index with the configured dimension and fixed metric/spec."""
        self.pinecone.create_index(
            name=self.index_name,
            dimension=config.VECTOR_DIMENSION,
            metric=self._METRIC,
            spec=ServerlessSpec(cloud=self._CLOUD, region=self._REGION)
        )

    def _ensure_index(self):
        """Ensure the Pinecone index exists, create if it doesn't."""
        try:
            # Check if index exists, create if it doesn't
            if self.index_name not in self.pinecone.list_indexes().names():
                self._create_index()
                logging.info(f"Created new index '{self.index_name}' with dimension {config.VECTOR_DIMENSION}")
            self.index = self.pinecone.Index(self.index_name)
            # Check if the index dimension matches the config dimension
            self._check_index_dimension()
        except Exception as e:
            logging.error(f"Error ensuring index exists: {str(e)}")
            raise

    def _check_index_dimension(self):
        """Check if the index dimension matches the config dimension and fix if needed.

        WARNING: on a mismatch the existing index is deleted and recreated,
        which discards every vector stored in it.
        """
        try:
            # Get the index description
            index_dimension = self.pinecone.describe_index(self.index_name).dimension
            if index_dimension == config.VECTOR_DIMENSION:
                return
            logging.warning(f"Index dimension {index_dimension} does not match config dimension {config.VECTOR_DIMENSION}")
            logging.info("Recreating index with correct dimension...")
            # Delete the existing index
            self.pinecone.delete_index(self.index_name)
            # Create a new index with the correct dimension
            self._create_index()
            # Reinitialize the index
            self.index = self.pinecone.Index(self.index_name)
            logging.info(f"Index recreated with dimension {config.VECTOR_DIMENSION}")
        except Exception as e:
            logging.error(f"Error checking index dimension: {str(e)}")
            raise

    def store_embedding(self, doc_id: str, embedding: List[float], content: str):
        """Store document embedding in Pinecone.

        Args:
            doc_id: Vector id (an existing vector with this id is replaced).
            embedding: Vector whose length must equal ``config.VECTOR_DIMENSION``.
            content: Document text, stored as the vector's ``content`` metadata.

        Raises:
            ValueError: If the embedding length does not match the index.
        """
        try:
            # Verify embedding dimension matches the index dimension
            if len(embedding) != config.VECTOR_DIMENSION:
                raise ValueError(f"Embedding dimension {len(embedding)} does not match index dimension {config.VECTOR_DIMENSION}")
            # NOTE(review): the whole document text is sent as metadata;
            # Pinecone limits metadata size per vector, so very large
            # documents may be rejected - confirm against the service limits.
            self.index.upsert(
                vectors=[{
                    "id": doc_id,
                    "values": embedding,
                    "metadata": {
                        "content": content
                    }
                }]
            )
            logging.info(f"Stored embedding for document {doc_id}")
        except Exception as e:
            logging.error(f"Error storing embedding for document {doc_id}: {str(e)}")
            raise

    def search_similar(self, query_embedding: List[float], top_k: int = 5) -> List[Any]:
        """Search for similar documents.

        Args:
            query_embedding: Query vector of length ``config.VECTOR_DIMENSION``.
            top_k: Maximum number of matches to return.

        Returns:
            The ``matches`` of the Pinecone query, metadata included.

        Raises:
            ValueError: If the query vector length does not match the index.
        """
        try:
            # Verify query embedding dimension matches the index dimension
            if len(query_embedding) != config.VECTOR_DIMENSION:
                raise ValueError(f"Query embedding dimension {len(query_embedding)} does not match index dimension {config.VECTOR_DIMENSION}")
            results = self.index.query(
                vector=query_embedding,
                top_k=top_k,
                include_metadata=True
            )
            return results.matches
        except Exception as e:
            logging.error(f"Error searching for similar documents: {str(e)}")
            raise

    def delete_document(self, doc_id: str):
        """Delete a document from the index."""
        try:
            self.index.delete(ids=[doc_id])
            logging.info(f"Deleted document {doc_id} from index")
        except Exception as e:
            logging.error(f"Error deleting document {doc_id}: {str(e)}")
            raise

    def add_document(self, doc_id: str, file_path: str):
        """Add a document to the index.

        Reads the file as text (UTF-8, then latin-1), embeds it with the
        embedding service and stores the vector under ``doc_id``.

        Returns:
            bool: ``True`` on success.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        try:
            # Check if file exists
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")
            # read document content with error handling for encoding
            try:
                with open(file_path, "r", encoding="utf-8") as file:
                    content = file.read()
            except UnicodeDecodeError:
                # Try with a different encoding if UTF-8 fails
                with open(file_path, "r", encoding="latin-1") as file:
                    content = file.read()
            # create embedding
            embedding = self.embedding_service.create_embedding(content)
            # store embedding
            logging.info(f"Storing embedding for document {doc_id}")
            self.store_embedding(doc_id, embedding, content)
            return True
        except Exception as e:
            logging.error(f"Error adding document {doc_id}: {str(e)}")
            raise
+104
View File
@@ -0,0 +1,104 @@
/* Markdown Styles */
/* Every rule below is scoped under .markdown-body, so it only affects
   content inside an element carrying that class. */
/* Base typography; long words are allowed to break instead of overflowing. */
.markdown-body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif;
font-size: 16px;
line-height: 1.5;
word-wrap: break-word;
}
/* Headings: shared spacing, weight and line height. */
.markdown-body h1,
.markdown-body h2,
.markdown-body h3,
.markdown-body h4,
.markdown-body h5,
.markdown-body h6 {
margin-top: 24px;
margin-bottom: 16px;
font-weight: 600;
line-height: 1.25;
}
/* Heading sizes, descending from h1 to h6. */
.markdown-body h1 { font-size: 2em; }
.markdown-body h2 { font-size: 1.5em; }
.markdown-body h3 { font-size: 1.25em; }
.markdown-body h4 { font-size: 1em; }
.markdown-body h5 { font-size: 0.875em; }
.markdown-body h6 { font-size: 0.85em; }
/* Paragraphs: spacing below only. */
.markdown-body p {
margin-top: 0;
margin-bottom: 16px;
}
/* Lists: indented, same bottom spacing as paragraphs. */
.markdown-body ul,
.markdown-body ol {
padding-left: 2em;
margin-top: 0;
margin-bottom: 16px;
}
/* Inline code: monospace on a faint tinted background. */
.markdown-body code {
padding: 0.2em 0.4em;
margin: 0;
font-size: 85%;
background-color: rgba(27, 31, 35, 0.05);
border-radius: 3px;
font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
}
/* Code blocks: padded box that scrolls when its content overflows. */
.markdown-body pre {
padding: 16px;
overflow: auto;
font-size: 85%;
line-height: 1.45;
background-color: #f6f8fa;
border-radius: 3px;
margin-top: 0;
margin-bottom: 16px;
}
/* Code inside a block: undo the inline-code padding/background and keep
   whitespace exactly as written. */
.markdown-body pre code {
padding: 0;
margin: 0;
font-size: 100%;
word-break: normal;
white-space: pre;
background: transparent;
border: 0;
}
/* Blockquotes: muted text with a left rule. */
.markdown-body blockquote {
padding: 0 1em;
color: #6a737d;
border-left: 0.25em solid #dfe2e5;
margin: 0 0 16px 0;
}
/* Tables: display:block plus overflow:auto lets a wide table scroll
   within its own box. */
.markdown-body table {
display: block;
width: 100%;
overflow: auto;
margin-top: 0;
margin-bottom: 16px;
border-spacing: 0;
border-collapse: collapse;
}
.markdown-body table th {
font-weight: 600;
}
.markdown-body table th,
.markdown-body table td {
padding: 6px 13px;
border: 1px solid #dfe2e5;
}
.markdown-body table tr {
background-color: #fff;
border-top: 1px solid #c6cbd1;
}
/* Zebra striping: every even row gets the tinted background. */
.markdown-body table tr:nth-child(2n) {
background-color: #f6f8fa;
}
+62
View File
@@ -0,0 +1,62 @@
/* Custom styles for Mini SpecsComply Pro */
/* Page layout: the body is a full-height flex column; the auto top margin
   on .footer pushes it to the bottom when the content is short. */
body {
min-height: 100vh;
display: flex;
flex-direction: column;
}
.footer {
margin-top: auto;
}
/* Cards: borderless with a soft shadow and rounded corners. */
.card {
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
border: none;
border-radius: 8px;
}
/* Round only the top corners so the header matches the card radius.
   NOTE(review): !important presumably overrides a framework rule - confirm. */
.card-header {
border-radius: 8px 8px 0 0 !important;
}
/* Focus ring for text inputs and selects. */
.form-control:focus, .form-select:focus {
border-color: #0d6efd;
box-shadow: 0 0 0 0.25rem rgba(13, 110, 253, 0.25);
}
.btn-primary {
padding: 0.5rem 1.5rem;
font-weight: 500;
}
/* List groups: no side borders, and no outer border on the first/last item. */
.list-group-item {
border-left: none;
border-right: none;
}
.list-group-item:first-child {
border-top: none;
}
.list-group-item:last-child {
border-bottom: none;
}
/* File upload styling */
input[type="file"] {
padding: 0.375rem 0.75rem;
}
/* Spinner styling */
/* Right margin separates the spinner from whatever follows it inline. */
.spinner-border {
margin-right: 0.5rem;
}
/* Responsive adjustments */
/* Applies at viewport widths of 768px and below. */
@media (max-width: 768px) {
.container {
padding-left: 15px;
padding-right: 15px;
}
}
+79
View File
@@ -0,0 +1,79 @@
// Main JavaScript file for Mini SpecsComply Pro
// Function to show toast notifications
/**
 * Display a Bootstrap toast in the page's toast container.
 *
 * The toast is appended to the element with id "toastContainer" (created on
 * demand via createToastContainer), shown immediately, and removed from the
 * DOM once Bootstrap fires "hidden.bs.toast".
 *
 * The message is inserted with textContent, never parsed as HTML, so text
 * that originates from a server response or a file name cannot inject markup.
 *
 * @param {string} message - Text shown in the toast body.
 * @param {string} [type='info'] - Suffix for the Bootstrap `bg-*` class
 *     (for example 'success' or 'danger').
 */
function showToast(message, type = 'info') {
    // Create toast element
    const toast = document.createElement('div');
    toast.className = `toast align-items-center text-white bg-${type} border-0`;
    toast.setAttribute('role', 'alert');
    toast.setAttribute('aria-live', 'assertive');
    toast.setAttribute('aria-atomic', 'true');

    // Create toast content with DOM nodes rather than an HTML string so the
    // message is always treated as text.
    const row = document.createElement('div');
    row.className = 'd-flex';

    const body = document.createElement('div');
    body.className = 'toast-body';
    body.textContent = message;

    const closeButton = document.createElement('button');
    closeButton.type = 'button';
    closeButton.className = 'btn-close btn-close-white me-2 m-auto';
    closeButton.setAttribute('data-bs-dismiss', 'toast');
    closeButton.setAttribute('aria-label', 'Close');

    row.appendChild(body);
    row.appendChild(closeButton);
    toast.appendChild(row);

    // Add toast to container
    const toastContainer = document.getElementById('toastContainer') || createToastContainer();
    toastContainer.appendChild(toast);

    // Initialize and show toast
    const bsToast = new bootstrap.Toast(toast);
    bsToast.show();

    // Remove toast after it's hidden so dismissed toasts do not pile up in the DOM
    toast.addEventListener('hidden.bs.toast', () => {
        toast.remove();
    });
}
// Function to create toast container if it doesn't exist
/**
 * Build the toast host element, attach it to the end of <body>, and return it.
 *
 * The element gets id "toastContainer" and the Bootstrap classes that pin it
 * to the bottom-right corner of the viewport.
 *
 * @returns {HTMLDivElement} The newly appended container.
 */
function createToastContainer() {
    const host = Object.assign(document.createElement('div'), {
        id: 'toastContainer',
        className: 'toast-container position-fixed bottom-0 end-0 p-3',
    });
    document.body.appendChild(host);
    return host;
}
// Function to validate file type
/**
 * Check that the file chosen in a file input is one of the supported formats
 * (PDF, DOCX, TXT, MD).
 *
 * The MIME type reported by the browser is checked first. Browsers report an
 * empty type when they cannot map the file extension to a MIME type (this is
 * common for ".md" files), so in that case only the file extension is used
 * as a fallback.
 *
 * On rejection a "danger" toast is shown and the input is cleared.
 *
 * @param {HTMLInputElement} input - The file input to validate.
 * @returns {boolean} true when no file is selected or the file is allowed,
 *     false when the selection was rejected and cleared.
 */
function validateFileType(input) {
    const allowedTypes = ['application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'text/plain', 'text/markdown'];
    const allowedExtensions = ['.pdf', '.docx', '.txt', '.md'];

    const file = input.files ? input.files[0] : undefined;
    if (!file) {
        // Nothing selected: nothing to reject.
        return true;
    }

    if (allowedTypes.includes(file.type)) {
        return true;
    }

    // Extension fallback applies only when the browser gave no MIME type at
    // all; an explicit but unsupported MIME type is still rejected.
    if (!file.type) {
        const lowerName = String(file.name || '').toLowerCase();
        if (allowedExtensions.some((ext) => lowerName.endsWith(ext))) {
            return true;
        }
    }

    showToast('Please upload a PDF, DOCX, TXT, or MD file.', 'danger');
    input.value = '';
    return false;
}
// Add event listeners when DOM is loaded
// Both controls are optional: a page that lacks #documentFile or
// #documentType simply gets no listener for it.
document.addEventListener('DOMContentLoaded', () => {
    // File input validation: re-check the selection every time it changes.
    const uploadInput = document.getElementById('documentFile');
    if (uploadInput !== null) {
        uploadInput.addEventListener('change', (event) => {
            validateFileType(event.currentTarget);
        });
    }

    // Document type change handler: mirror the chosen type's label into the
    // description field's placeholder, when such a field exists.
    const typeSelect = document.getElementById('documentType');
    if (typeSelect !== null) {
        typeSelect.addEventListener('change', (event) => {
            const select = event.currentTarget;
            const descriptionBox = document.getElementById('documentDescription');
            if (!descriptionBox) {
                return;
            }
            descriptionBox.placeholder = `Brief description of your ${select.options[select.selectedIndex].text.toLowerCase()}...`;
        });
    }
});
+135
View File
@@ -0,0 +1,135 @@
{% extends "base.html" %}
<!-- Analysis results page: extends base.html and overrides its title, extra_css, content and extra_js blocks. -->
{% block title %}Analysis Results - Mini SpecsComply Pro{% endblock %}
<!-- Loads the stylesheet for elements that carry the markdown-body class (the rendered summary and recommendations). -->
{% block extra_css %}
<link rel="stylesheet" href="{{ url_for('static', path='css/markdown.css') }}">
{% endblock %}
{% block content %}
{#
  Analysis results body.

  Context variables read here:
    metadata - document_id, filename, document_type, description (optional)
    analysis - summary (markdown text), issues (each with severity, issue, rank),
               recommendations (markdown text, paired with issues by position)
#}
<div class="row">
    <div class="col-md-8 mx-auto">
        <div class="card mb-4">
            <div class="card-header bg-primary text-white d-flex justify-content-between align-items-center">
                <h4 class="mb-0">Analysis Results</h4>
                <span class="badge bg-light text-primary">{{ metadata.document_type|replace('_', ' ')|title }}</span>
            </div>
            <div class="card-body">
                <h5 class="card-title">Document Information</h5>
                <p class="card-text"><strong>Filename:</strong> {{ metadata.filename }}</p>
                {% if metadata.description %}
                <p class="card-text"><strong>Description:</strong> {{ metadata.description }}</p>
                {% endif %}
                <hr>
                <h5 class="card-title">Summary</h5>
                <div class="card mb-3">
                    <div class="card-body bg-light markdown-body">
                        {# NOTE(review): |safe emits the markdown filter's HTML unescaped; confirm the filter sanitises raw HTML in the summary. #}
                        {{ analysis.summary|markdown|safe }}
                    </div>
                </div>
                <h5 class="card-title">Compliance Issues</h5>
                <div class="accordion" id="issuesAccordion">
                    {% for issue in analysis.issues %}
                    <div class="accordion-item">
                        <h2 class="accordion-header" id="heading{{ loop.index }}">
                            {# Only the first item starts expanded; aria-expanded mirrors that state for assistive technology. #}
                            <button class="accordion-button {% if not loop.first %}collapsed{% endif %}" type="button" data-bs-toggle="collapse" data-bs-target="#collapse{{ loop.index }}" aria-expanded="{{ 'true' if loop.first else 'false' }}" aria-controls="collapse{{ loop.index }}">
                                <span class="badge bg-{{ 'danger' if issue.severity == 'high' else 'warning' if issue.severity == 'medium' else 'info' }} me-2">
                                    {{ issue.severity|title }}
                                </span>
                                {{ issue.issue }}
                            </button>
                        </h2>
                        <div id="collapse{{ loop.index }}" class="accordion-collapse collapse {% if loop.first %}show{% endif %}" aria-labelledby="heading{{ loop.index }}" data-bs-parent="#issuesAccordion">
                            <div class="accordion-body">
                                <p><strong>Rank:</strong> {{ issue.rank }}</p>
                                <p><strong>Recommendation:</strong></p>
                                {# Recommendations are matched to issues by position; guard the index so a shorter
                                   recommendations list does not pass an undefined value to the markdown filter. #}
                                {% if analysis.recommendations and loop.index0 < analysis.recommendations|length %}
                                <div class="alert alert-info markdown-body">
                                    {{ analysis.recommendations[loop.index0]|markdown|safe }}
                                </div>
                                {% else %}
                                <p class="text-muted mb-0">No recommendation is available for this issue.</p>
                                {% endif %}
                            </div>
                        </div>
                    </div>
                    {% else %}
                    {# for/else: rendered only when analysis.issues is empty. #}
                    <p class="text-muted mb-0">No compliance issues were reported for this document.</p>
                    {% endfor %}
                </div>
            </div>
            <div class="card-footer">
                <div class="d-flex justify-content-between">
                    <a href="/" class="btn btn-outline-primary">Back to Home</a>
                    <button type="button" class="btn btn-primary" data-bs-toggle="modal" data-bs-target="#resubmitModal">
                        Resubmit Document
                    </button>
                </div>
            </div>
        </div>
    </div>
</div>
<!-- Resubmit Modal -->
<div class="modal fade" id="resubmitModal" tabindex="-1" aria-labelledby="resubmitModalLabel" aria-hidden="true">
    <div class="modal-dialog">
        <div class="modal-content">
            <div class="modal-header">
                <h5 class="modal-title" id="resubmitModalLabel">Resubmit Document</h5>
                <button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
            </div>
            <div class="modal-body">
                <form id="resubmitForm" action="/document/{{ metadata.document_id }}/resubmit" method="post" enctype="multipart/form-data">
                    <input type="hidden" name="document_id" value="{{ metadata.document_id }}">
                    <div class="mb-3">
                        <label for="resubmitFile" class="form-label">Updated Document</label>
                        {# accept pre-filters the file picker to the formats validateFileType allows. #}
                        <input class="form-control" type="file" id="resubmitFile" name="file" accept=".pdf,.docx,.txt,.md" required>
                    </div>
                    <div class="mb-3">
                        <label for="resubmitDocumentType" class="form-label">Document Type (Optional)</label>
                        <select class="form-select" id="resubmitDocumentType" name="document_type">
                            {# An empty value is submitted when the user keeps the current type. #}
                            <option value="" selected>Keep current type ({{ metadata.document_type|replace('_', ' ')|title }})</option>
                            <option value="technical_specification">Technical Specification</option>
                            <option value="requirement_document">Requirement Document</option>
                            <option value="design_document">Design Document</option>
                            <option value="test_document">Test Document</option>
                            <option value="user_manual">User Manual</option>
                            <option value="other">Other</option>
                        </select>
                    </div>
                    <div class="mb-3">
                        <label for="resubmitDescription" class="form-label">Changes Made (Optional)</label>
                        <textarea class="form-control" id="resubmitDescription" name="description" rows="3" placeholder="Describe the changes you made to address the issues..."></textarea>
                    </div>
                    <div class="modal-footer">
                        <button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Cancel</button>
                        <button type="submit" class="btn btn-primary" id="resubmitButton">
                            <span class="spinner-border spinner-border-sm d-none" role="status" aria-hidden="true"></span>
                            Submit
                        </button>
                    </div>
                </form>
            </div>
        </div>
    </div>
</div>
{% endblock %}
{% block extra_js %}
<script>
    // Re-validate the replacement file every time the user picks one.
    document.getElementById('resubmitFile').addEventListener('change', (event) => {
        validateFileType(event.currentTarget);
    });

    // The form is submitted natively (no preventDefault); this handler only
    // logs the submission and switches the submit button into its busy state.
    document.getElementById('resubmitForm').addEventListener('submit', (event) => {
        const form = event.currentTarget;
        console.log('Form submitted');
        console.log('Form action:', form.action);

        const busyButton = document.getElementById('resubmitButton');
        busyButton.disabled = true;
        busyButton.querySelector('.spinner-border').classList.remove('d-none');
    });
</script>
{% endblock %}
+52
View File
@@ -0,0 +1,52 @@
<!DOCTYPE html>
{#
  Shared page layout. Child templates can override these blocks:
    title     - contents of the <title> element
    extra_css - additional stylesheets, emitted after css/style.css
    content   - main page body, rendered inside the .container
    extra_js  - page scripts, emitted after Bootstrap's bundle and js/main.js
#}
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block title %}Mini SpecsComply Pro{% endblock %}</title>
    <!-- Bootstrap CSS -->
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    <!-- Custom CSS -->
    <link rel="stylesheet" href="{{ url_for('static', path='css/style.css') }}">
    {% block extra_css %}{% endblock %}
</head>
<body>
    <nav class="navbar navbar-expand-lg navbar-dark bg-primary">
        <div class="container">
            <a class="navbar-brand" href="/">Mini SpecsComply Pro</a>
            <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
                <span class="navbar-toggler-icon"></span>
            </button>
            <div class="collapse navbar-collapse" id="navbarNav">
                <ul class="navbar-nav">
                    <li class="nav-item">
                        <a class="nav-link" href="/">Home</a>
                    </li>
                    <li class="nav-item">
                        <a class="nav-link" href="/documents">My Documents</a>
                    </li>
                </ul>
            </div>
        </div>
    </nav>
    {# mb-5 keeps the 3rem gap above the footer that the footer's former mt-5 used to provide. #}
    <div class="container mt-4 mb-5">
        {% block content %}{% endblock %}
    </div>
    <!-- Toast Container -->
    <div class="toast-container position-fixed bottom-0 end-0 p-3" id="toastContainer"></div>
    {# mt-auto (not mt-5): Bootstrap spacing utilities are !important, so mt-5 overrode the
       "margin-top: auto" rule for .footer in css/style.css and the footer no longer stuck to
       the bottom of short pages. #}
    <footer class="footer mt-auto py-3 bg-light">
        <div class="container text-center">
            <span class="text-muted">© 2025 Mini SpecsComply Pro</span>
        </div>
    </footer>
    <!-- Bootstrap JS -->
    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
    <!-- Custom JS -->
    <script src="{{ url_for('static', path='js/main.js') }}"></script>
    {% block extra_js %}{% endblock %}
</body>
</html>
+131
View File
@@ -0,0 +1,131 @@
{% extends "base.html" %}
<!-- "My Documents" page: extends base.html and overrides its title, content and extra_js blocks. -->
{% block title %}My Documents - Mini SpecsComply Pro{% endblock %}
{% block content %}
{#
  Document list body.

  Context variable read here:
    documents - iterable of records with document_id, filename, description (optional),
                document_type, upload_date and status
#}
<div class="row">
    <div class="col-md-10 mx-auto">
        <div class="card">
            <div class="card-header bg-primary text-white d-flex justify-content-between align-items-center">
                <h4 class="mb-0">My Documents</h4>
                <a href="/" class="btn btn-light btn-sm">Upload New Document</a>
            </div>
            <div class="card-body">
                {% if documents %}
                <div class="table-responsive">
                    <table class="table table-hover">
                        <thead>
                            <tr>
                                <th scope="col">Document</th>
                                <th scope="col">Type</th>
                                <th scope="col">Upload Date</th>
                                <th scope="col">Status</th>
                                <th scope="col">Actions</th>
                            </tr>
                        </thead>
                        <tbody>
                            {% for doc in documents %}
                            <tr>
                                <td>
                                    <div>
                                        <strong>{{ doc.filename }}</strong>
                                        {% if doc.description %}
                                        <div class="text-muted small">{{ doc.description }}</div>
                                        {% endif %}
                                    </div>
                                </td>
                                <td>
                                    <span class="badge bg-info">{{ doc.document_type|replace('_', ' ')|title }}</span>
                                </td>
                                <td>{{ doc.upload_date }}</td>
                                <td>
                                    {# Any status other than 'completed' or 'processing' is shown as Failed. #}
                                    {% if doc.status == 'completed' %}
                                    <span class="badge bg-success">Completed</span>
                                    {% elif doc.status == 'processing' %}
                                    <span class="badge bg-warning">Processing</span>
                                    {% else %}
                                    <span class="badge bg-danger">Failed</span>
                                    {% endif %}
                                </td>
                                <td>
                                    <div class="btn-group">
                                        <a href="/document/{{ doc.document_id }}/analysis" class="btn btn-sm btn-outline-primary">View Analysis</a>
                                        <button type="button" class="btn btn-sm btn-outline-danger" data-bs-toggle="modal" data-bs-target="#deleteModal{{ doc.document_id }}">
                                            Delete
                                        </button>
                                    </div>
                                    <!-- Delete Modal -->
                                    {# One modal per row; its id must stay "deleteModal" + document_id because
                                       deleteDocument() looks the modal up by that id. aria-labelledby gives the
                                       dialog an accessible name, as the resubmit modal on the analysis page has. #}
                                    <div class="modal fade" id="deleteModal{{ doc.document_id }}" tabindex="-1" aria-labelledby="deleteModalLabel{{ doc.document_id }}" aria-hidden="true">
                                        <div class="modal-dialog">
                                            <div class="modal-content">
                                                <div class="modal-header">
                                                    <h5 class="modal-title" id="deleteModalLabel{{ doc.document_id }}">Confirm Delete</h5>
                                                    <button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
                                                </div>
                                                <div class="modal-body">
                                                    <p>Are you sure you want to delete "{{ doc.filename }}"?</p>
                                                    <p class="text-danger">This action cannot be undone.</p>
                                                </div>
                                                <div class="modal-footer">
                                                    <button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Cancel</button>
                                                    <button type="button" class="btn btn-danger" onclick="deleteDocument('{{ doc.document_id }}')">Delete</button>
                                                </div>
                                            </div>
                                        </div>
                                    </div>
                                </td>
                            </tr>
                            {% endfor %}
                        </tbody>
                    </table>
                </div>
                {% else %}
                <div class="text-center py-5">
                    <h5 class="text-muted">No documents uploaded yet</h5>
                    <p>Upload your first document to get started with compliance analysis.</p>
                    <a href="/" class="btn btn-primary mt-3">Upload Document</a>
                </div>
                {% endif %}
            </div>
        </div>
    </div>
</div>
{% endblock %}
{% block extra_js %}
<script>
    /**
     * Delete a document through the API, then refresh the list.
     *
     * Sends DELETE /document/<docId>. On any 2xx response it shows a success
     * toast, closes the matching confirmation modal (if one is open) and
     * reloads the page after one second. On a non-2xx response or a network
     * error it logs the error and shows a failure toast.
     *
     * @param {string} docId - Identifier of the document to delete; also the
     *     suffix of the confirmation modal's element id.
     */
    function deleteDocument(docId) {
        // Encode the id so it can never alter the request path.
        fetch(`/document/${encodeURIComponent(docId)}`, {
            method: 'DELETE',
            headers: {
                'Content-Type': 'application/json'
            }
        })
            .then(response => {
                if (!response.ok) {
                    throw new Error('Failed to delete document');
                }
                // The response body is not used, so it is deliberately not
                // parsed: an empty (e.g. 204) success response must not be
                // reported as a failure.

                // Show success message
                showToast('Document deleted successfully.', 'success');

                // Close the modal. getInstance() returns null when no modal
                // instance exists for the element; skipping hide() in that case
                // avoids throwing into the catch handler after a successful delete.
                const modalElement = document.getElementById(`deleteModal${docId}`);
                const modal = modalElement ? bootstrap.Modal.getInstance(modalElement) : null;
                if (modal) {
                    modal.hide();
                }

                // Reload the page after a short delay so the toast is visible
                setTimeout(() => {
                    window.location.reload();
                }, 1000);
            })
            .catch(error => {
                console.error('Error:', error);
                showToast('Failed to delete document. Please try again.', 'danger');
            });
    }
</script>
{% endblock %}
+99
View File
@@ -0,0 +1,99 @@
{% extends "base.html" %}
<!-- Home / upload page: extends base.html and overrides its title, content and extra_js blocks. -->
{% block title %}Home - Mini SpecsComply Pro{% endblock %}
{% block content %}
{# Upload form plus a short "How It Works" explainer. The form posts multipart data
   (fields: document_type, file) to /upload-document. #}
<div class="row justify-content-center">
    <div class="col-md-8">
        <div class="card">
            <div class="card-header bg-primary text-white">
                <h4 class="mb-0">Upload Document for Compliance Analysis</h4>
            </div>
            <div class="card-body">
                <form id="uploadForm" action="/upload-document" method="post" enctype="multipart/form-data">
                    <div class="mb-3">
                        <label for="documentType" class="form-label">Document Type</label>
                        <select class="form-select" id="documentType" name="document_type" required>
                            {# Disabled placeholder: with "required" the user has to pick a real type. #}
                            <option value="" selected disabled>Select document type</option>
                            <option value="technical_specification">Technical Specification</option>
                            <option value="requirement_document">Requirement Document</option>
                            <option value="design_document">Design Document</option>
                            <option value="test_document">Test Document</option>
                            <option value="user_manual">User Manual</option>
                            <option value="other">Other</option>
                        </select>
                    </div>
                    <div class="mb-3">
                        <label for="documentFile" class="form-label">Document File</label>
                        {# accept pre-filters the file picker to the formats listed in the help text;
                           aria-describedby ties that help text to the input. #}
                        <input class="form-control" type="file" id="documentFile" name="file" accept=".pdf,.docx,.txt,.md" aria-describedby="documentFileHelp" required>
                        <div class="form-text" id="documentFileHelp">Supported formats: PDF, DOCX, TXT, MD</div>
                    </div>
                    <div class="d-grid">
                        <button type="submit" class="btn btn-primary" id="uploadButton">
                            <span class="spinner-border spinner-border-sm d-none" role="status" aria-hidden="true"></span>
                            Upload and analyze
                        </button>
                    </div>
                </form>
            </div>
        </div>
        <div class="card mt-4">
            <div class="card-header bg-info text-white">
                <h5 class="mb-0">How It Works</h5>
            </div>
            <div class="card-body">
                <ol class="list-group list-group-numbered">
                    <li class="list-group-item">Upload your document and select its type</li>
                    <li class="list-group-item">Our AI analyzes the document for compliance issues</li>
                    <li class="list-group-item">Receive a detailed report with issues and recommendations</li>
                    <li class="list-group-item">Make necessary changes and resubmit if needed</li>
                </ol>
            </div>
        </div>
    </div>
</div>
{% endblock %}
{% block extra_js %}
<script>
    // Submit the upload form with fetch so the page can show a busy state,
    // then move to the document list once the server accepts the upload.
    document.getElementById('uploadForm').addEventListener('submit', function(e) {
        e.preventDefault();

        const uploadButton = document.getElementById('uploadButton');
        const spinner = uploadButton.querySelector('.spinner-border');

        // Show loading state
        uploadButton.disabled = true;
        spinner.classList.remove('d-none');

        // Create FormData object from the form's fields (document_type, file)
        const formData = new FormData(this);

        // Send the form data
        fetch('/upload-document', {
            method: 'POST',
            body: formData
        })
            .then(response => {
                if (!response.ok) {
                    throw new Error('Failed to upload document');
                }
                // Redirect to /documents route on success. The button is left
                // disabled on purpose: re-enabling it while the navigation is
                // pending would allow a second, duplicate upload.
                window.location.href = '/documents';
            })
            .catch(error => {
                console.error('Error:', error);
                // Use the shared toast helper from js/main.js, like the other pages.
                showToast('Error uploading document. Please try again.', 'danger');
                // Reset button state so the user can retry
                uploadButton.disabled = false;
                spinner.classList.add('d-none');
            });
    });
</script>
{% endblock %}