update functions
This commit is contained in:
@@ -29,6 +29,13 @@ data/
|
|||||||
*.feather
|
*.feather
|
||||||
*.pkl
|
*.pkl
|
||||||
*.pickle
|
*.pickle
|
||||||
|
*.pdf
|
||||||
|
*.docx
|
||||||
|
*.xlsx
|
||||||
|
data/uploads/
|
||||||
|
*.pdf
|
||||||
|
*.docx
|
||||||
|
*.xlsx
|
||||||
|
|
||||||
# Vector database files
|
# Vector database files
|
||||||
*.faiss
|
*.faiss
|
||||||
|
|||||||
@@ -0,0 +1,95 @@
|
|||||||
|
import os
|
||||||
|
from docx import Document
|
||||||
|
from typing import Dict, List
|
||||||
|
from .config import Config
|
||||||
|
from .embeddings import EmbeddingGenerator
|
||||||
|
|
||||||
|
|
||||||
|
class ComplianceLoader:
    """Load reference tender documents and expose them as compliance standards.

    Each ``.docx`` standard found in the data folder is read, split into
    sections and embedded. The text is used to build LLM prompt context and
    the embeddings are used to find the standards closest to a document.
    """

    # A line containing any of these words (case-insensitive) starts a new
    # section in ``_extract_sections``.
    _SECTION_KEYWORDS = ('requirement', 'specification', 'must', 'shall')

    def __init__(self):
        self.embedding_generator = EmbeddingGenerator()
        # doc_key -> {'filename', 'content', 'sections'}
        self.compliance_docs = {}
        # doc_key -> embedding returned by the embedding generator
        self.compliance_embeddings = {}

    def load_compliance_standards(self, data_folder: str = "data/"):
        """Load all compliance documents and generate embeddings.

        Files that are missing or fail to load are reported on stdout and
        skipped; loading continues with the next file.

        Args:
            data_folder: Directory that holds the compliance ``.docx`` files.
        """
        compliance_files = [
            "Invitation to Tender.docx",
            "Tender Specifications.docx",
            "Bill of Quantities.docx",
            "Scope of Work.docx",
            # NOTE(review): "SQualification" looks like a typo, but it must
            # match the file name on disk - confirm before changing.
            "Supplier SQualification requirements.docx",
            "form of tender.docx",
            "confidentiality agreement.docx",
            "Project1-FEED CONTRACTOR-MUL-E000-PR-LST-000.docx",
        ]

        for filename in compliance_files:
            file_path = os.path.join(data_folder, filename)
            if not os.path.exists(file_path):
                print(f"Compliance standard not found, skipping: {file_path}")
                continue

            try:
                # Extract text from compliance document
                doc = Document(file_path)
                text = '\n'.join(para.text for para in doc.paragraphs)

                doc_key = filename.replace('.docx', '').replace(' ', '_').lower()

                # Embed before storing anything, so a failed embedding never
                # leaves a document registered without its embedding.
                embedding = self.embedding_generator.generate_embeddings(text)

                self.compliance_docs[doc_key] = {
                    'filename': filename,
                    'content': text,
                    'sections': self._extract_sections(text),
                }
                self.compliance_embeddings[doc_key] = embedding

                print(f"Loaded compliance standard: {filename}")

            except Exception as e:
                # Best effort: one unreadable standard must not stop the rest.
                print(f"Error loading {filename}: {str(e)}")

    def _extract_sections(self, text: str) -> List[str]:
        """Split ``text`` into sections using simple heading heuristics.

        A non-empty line starts a new section when it is all upper case,
        ends with ``:`` or contains one of ``_SECTION_KEYWORDS``. Blank lines
        are dropped and every line is stripped.

        Returns:
            The sections in document order, each joined with newlines.
        """
        sections = []
        current_section = []

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            lowered = line.lower()
            starts_section = (
                line.isupper()
                or line.endswith(':')
                or any(keyword in lowered for keyword in self._SECTION_KEYWORDS)
            )
            if starts_section:
                if current_section:
                    sections.append('\n'.join(current_section))
                current_section = [line]
            else:
                current_section.append(line)

        if current_section:
            sections.append('\n'.join(current_section))

        return sections

    @staticmethod
    def _cosine_similarity(first, second) -> float:
        """Return the cosine similarity of two equal-length vectors.

        Returns 0.0 when either vector is missing, empty, of a different
        length, or has zero magnitude, so such pairs never count as relevant
        at a positive threshold.
        """
        if first is None or second is None:
            return 0.0
        if len(first) == 0 or len(first) != len(second):
            return 0.0

        dot = sum(a * b for a, b in zip(first, second))
        norm_first = sum(a * a for a in first) ** 0.5
        norm_second = sum(b * b for b in second) ** 0.5
        if norm_first == 0 or norm_second == 0:
            return 0.0
        return dot / (norm_first * norm_second)

    def get_relevant_standards(self, document_embedding: List[float], threshold: float = 0.7) -> List[Dict]:
        """Find relevant compliance standards for a document.

        Args:
            document_embedding: Embedding of the document being checked.
            threshold: Minimum cosine similarity for a standard to be kept.

        Returns:
            One dict per matching standard with the keys ``standard``,
            ``filename``, ``content``, ``sections`` and ``similarity``,
            ordered from most to least similar.
        """
        relevant_standards = []

        for doc_key, compliance_embedding in self.compliance_embeddings.items():
            similarity = self._cosine_similarity(document_embedding, compliance_embedding)
            if similarity < threshold:
                continue

            doc_data = self.compliance_docs[doc_key]
            relevant_standards.append({
                'standard': doc_key,
                'filename': doc_data['filename'],
                'content': doc_data['content'],
                'sections': doc_data['sections'],
                'similarity': similarity,
            })

        relevant_standards.sort(key=lambda entry: entry['similarity'], reverse=True)
        return relevant_standards

    def get_compliance_context(self) -> str:
        """Get formatted compliance context for LLM prompts.

        Each loaded standard contributes a header with its filename followed
        by the first 1000 characters of its content and a literal ``...``
        (appended even when the content is shorter).
        """
        parts = ["COMPLIANCE STANDARDS:\n\n"]

        for doc_data in self.compliance_docs.values():
            parts.append(f"=== {doc_data['filename']} ===\n")
            parts.append(f"{doc_data['content'][:1000]}...\n\n")

        return "".join(parts)
|
||||||
+4
-3
@@ -12,21 +12,22 @@ class Config:
|
|||||||
|
|
||||||
# Groq
|
# Groq
|
||||||
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
|
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
|
||||||
GROQ_MODEL = "mixtral-8x7b-32768"
|
GROQ_MODEL = "llama3-70b-8192"
|
||||||
|
|
||||||
# Claude
|
# Claude
|
||||||
CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY")
|
CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY")
|
||||||
CLAUDE_MODEL = "claude-3-5-sonnet-20240620"
|
CLAUDE_MODEL = "claude-3-5-sonnet-20241022"
|
||||||
|
|
||||||
# Vector Store
|
# Vector Store
|
||||||
VECTOR_STORE_TYPE = "pinecone"
|
VECTOR_STORE_TYPE = "pinecone"
|
||||||
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
|
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
|
||||||
PINECONE_INDEX = "scp-docs"
|
PINECONE_INDEX = "scp-docs"
|
||||||
PINECONE_ENV = "gcp-starter"
|
PINECONE_ENV = "gcp-starter"
|
||||||
|
EMBEDDING_DIMENSION = 1024
|
||||||
|
|
||||||
# Document Processing
|
# Document Processing
|
||||||
MAX_DOC_SIZE = 10 * 1024 * 1024 # 10MB
|
MAX_DOC_SIZE = 10 * 1024 * 1024 # 10MB
|
||||||
ALLOWED_EXTENSIONS = {'.pdf', '.docx', '.txt'}
|
ALLOWED_EXTENSIONS = {'.pdf', '.docx', '.txt'}
|
||||||
|
|
||||||
# Paths
|
# Paths
|
||||||
UPLOAD_FOLDER = "documents/"
|
UPLOAD_FOLDER = "documents/"
|
||||||
@@ -10,15 +10,19 @@ class EmbeddingGenerator:
|
|||||||
response = self.client.embed(
|
response = self.client.embed(
|
||||||
texts=[text],
|
texts=[text],
|
||||||
model=Config.EMBED_MODEL,
|
model=Config.EMBED_MODEL,
|
||||||
input_type="document"
|
input_type="search_document"
|
||||||
)
|
)
|
||||||
return response.embeddings[0]
|
return response.embeddings[0]
|
||||||
|
|
||||||
def rerank_issues(self, issues: list, query: str, top_n: int = 5):
    """Rerank ``issues`` against ``query`` and return the top entries.

    Args:
        issues: Issue texts to rank.
        query: Text the issues are ranked against.
        top_n: Maximum number of issues to return; capped at ``len(issues)``.

    Returns:
        The selected entries of ``issues``, in the order the rerank call
        returns its results. An empty list when ``issues`` is empty.
    """
    # Handle empty issues list without calling the API
    if not issues:
        return []

    response = self.client.rerank(
        query=query,
        documents=issues,
        top_n=min(top_n, len(issues)),
        model=Config.RERANK_MODEL,
    )
    # Map each result back to the input by its index: the `document` field
    # of a result is only populated when the API is asked to echo documents
    # back, while `index` always identifies the original entry.
    return [issues[result.index] for result in response.results]
|
||||||
|
|||||||
+410
-69
@@ -3,20 +3,32 @@ from fastapi.responses import JSONResponse
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
import os
|
import os
|
||||||
import uuid
|
import uuid
|
||||||
|
from docx import Document
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
import io
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from .config import Config
|
from .config import Config
|
||||||
from .embeddings import EmbeddingGenerator
|
from .embeddings import EmbeddingGenerator
|
||||||
from .vector_stores import VectorStore
|
from .vector_stores import VectorStore
|
||||||
|
from .compliance_loader import ComplianceLoader
|
||||||
import groq
|
import groq
|
||||||
import anthropic
|
import json
|
||||||
|
|
||||||
app = FastAPI(title="Mini SpecsComply Pro")
|
app = FastAPI(title="Mini SpecsComply Pro")
|
||||||
|
|
||||||
|
# Initialize components
|
||||||
embeddings = EmbeddingGenerator()
|
embeddings = EmbeddingGenerator()
|
||||||
vector_store = VectorStore()
|
vector_store = VectorStore()
|
||||||
|
compliance_loader = ComplianceLoader()
|
||||||
|
|
||||||
|
# Load compliance standards on startup
|
||||||
|
compliance_loader.load_compliance_standards()
|
||||||
|
|
||||||
# Initialize clients
|
# Initialize clients
|
||||||
groq_client = groq.Client(api_key=Config.GROQ_API_KEY)
|
groq_client = groq.Client(api_key=Config.GROQ_API_KEY)
|
||||||
claude_client = anthropic.Anthropic(api_key=Config.CLAUDE_API_KEY)
|
|
||||||
|
# In-memory storage for analysis results
|
||||||
|
analysis_storage = {}
|
||||||
|
|
||||||
|
|
||||||
def save_document(file: UploadFile) -> str:
|
def save_document(file: UploadFile) -> str:
|
||||||
@@ -35,87 +47,368 @@ def save_document(file: UploadFile) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def extract_text(file_path: str) -> str:
    """Extract text from a ``.docx``, ``.pdf`` or ``.txt`` file.

    The file type is chosen from the path's extension, compared
    case-insensitively.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text, with non-empty paragraphs (docx) or pages (pdf)
        joined by newlines; an empty string when nothing could be extracted.

    Raises:
        HTTPException: With status 400 when the extension is not supported
            or the file cannot be read or parsed.
    """
    # Lower-case once so "REPORT.PDF" is treated like "report.pdf".
    lowered_path = file_path.lower()
    try:
        if lowered_path.endswith('.docx'):
            doc = Document(file_path)
            return '\n'.join(para.text for para in doc.paragraphs if para.text)

        if lowered_path.endswith('.pdf'):
            with open(file_path, 'rb') as f:
                reader = PdfReader(f)
                # Extract while the file is still open.
                pages_text = []
                for page in reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        pages_text.append(page_text)
            return '\n'.join(pages_text)

        if lowered_path.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()

        raise ValueError("Unsupported file type")

    except Exception as e:
        # Surface every extraction problem as a client error, keeping the
        # original exception attached for debugging.
        raise HTTPException(
            status_code=400,
            detail=f"Failed to extract text: {str(e)}"
        ) from e
|
||||||
|
|
||||||
|
|
||||||
def analyze_compliance(text: str) -> dict:
|
def analyze_compliance(text: str) -> dict:
|
||||||
# Parsing with Groq
|
"""Enhanced compliance analysis using Groq"""
|
||||||
groq_response = groq_client.chat.completions.create(
|
|
||||||
messages=[{"role": "user", "content": f"Extract key sections from this document:\n{text}"}],
|
|
||||||
model=Config.GROQ_MODEL
|
|
||||||
)
|
|
||||||
|
|
||||||
# Reasoning with Claude
|
try:
|
||||||
claude_response = claude_client.messages.create(
|
# Get compliance context
|
||||||
model=Config.CLAUDE_MODEL,
|
compliance_context = compliance_loader.get_compliance_context()
|
||||||
max_tokens=4000,
|
|
||||||
messages=[
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": f"Analyze this document for compliance issues:\n{text}"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Rerank by importance
|
# Document parsing and section extraction with Groq
|
||||||
issues = claude_response.content
|
groq_parsing_prompt = f"""Extract key sections from this document and identify what type of tender document this appears to be:
|
||||||
ranked_issues = embeddings.rerank_issues(
|
|
||||||
issues=[issue.text for issue in issues],
|
DOCUMENT TO ANALYZE:
|
||||||
query="Most critical compliance issues"
|
{text[:3000]}...
|
||||||
)
|
|
||||||
|
Please provide:
|
||||||
|
1. Document type (e.g., tender response, technical proposal, etc.)
|
||||||
|
2. Key sections found
|
||||||
|
3. Main requirements mentioned
|
||||||
|
4. Document structure analysis
|
||||||
|
|
||||||
|
Be concise but thorough."""
|
||||||
|
|
||||||
|
parsing_response = groq_client.chat.completions.create(
|
||||||
|
messages=[{"role": "user", "content": groq_parsing_prompt}],
|
||||||
|
model=Config.GROQ_MODEL,
|
||||||
|
temperature=0.1
|
||||||
|
)
|
||||||
|
|
||||||
|
# Safe extraction of parsing response
|
||||||
|
document_analysis = ""
|
||||||
|
if parsing_response and parsing_response.choices and len(parsing_response.choices) > 0:
|
||||||
|
if parsing_response.choices[0].message and parsing_response.choices[0].message.content:
|
||||||
|
document_analysis = parsing_response.choices[0].message.content
|
||||||
|
|
||||||
|
# Comprehensive compliance analysis with Groq
|
||||||
|
groq_compliance_prompt = f"""You are a compliance expert analyzing tender documents.
|
||||||
|
|
||||||
|
COMPLIANCE STANDARDS TO CHECK AGAINST:
|
||||||
|
{compliance_context[:4000]}
|
||||||
|
|
||||||
|
DOCUMENT TO ANALYZE:
|
||||||
|
{text[:4000]}
|
||||||
|
|
||||||
|
Please analyze this document for compliance issues and provide a structured response:
|
||||||
|
|
||||||
|
1. COMPLIANCE SUMMARY: Overall compliance status (Compliant/Non-Compliant/Partial)
|
||||||
|
|
||||||
|
2. SPECIFIC ISSUES: List specific compliance violations found, including:
|
||||||
|
- Which standard is violated
|
||||||
|
- What is missing or incorrect
|
||||||
|
- Severity (Critical/High/Medium/Low)
|
||||||
|
- Specific location in document if possible
|
||||||
|
|
||||||
|
3. REQUIREMENTS CHECK: Verify if the document meets requirements from:
|
||||||
|
- Tender specifications
|
||||||
|
- Supplier qualification requirements
|
||||||
|
- Form of tender requirements
|
||||||
|
- Confidentiality agreement requirements
|
||||||
|
|
||||||
|
4. RECOMMENDATIONS: Specific actions to fix each issue
|
||||||
|
|
||||||
|
5. MISSING ELEMENTS: What key elements are completely missing
|
||||||
|
|
||||||
|
Please be detailed and specific in your analysis. Focus on actionable feedback."""
|
||||||
|
|
||||||
|
compliance_response = groq_client.chat.completions.create(
|
||||||
|
messages=[{"role": "user", "content": groq_compliance_prompt}],
|
||||||
|
model=Config.GROQ_MODEL,
|
||||||
|
temperature=0.1,
|
||||||
|
max_tokens=4000
|
||||||
|
)
|
||||||
|
|
||||||
|
# Safe extraction of compliance response
|
||||||
|
compliance_analysis = ""
|
||||||
|
if compliance_response and compliance_response.choices and len(compliance_response.choices) > 0:
|
||||||
|
if compliance_response.choices[0].message and compliance_response.choices[0].message.content:
|
||||||
|
compliance_analysis = compliance_response.choices[0].message.content
|
||||||
|
|
||||||
|
# Extract and structure issues from the compliance analysis
|
||||||
|
# Parse the structured compliance analysis directly
|
||||||
|
issues_list = []
|
||||||
|
|
||||||
|
# Extract issues from the numbered list in compliance_analysis
|
||||||
|
if compliance_analysis:
|
||||||
|
lines = compliance_analysis.split('\n')
|
||||||
|
current_issue = None
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
line = line.strip()
|
||||||
|
|
||||||
|
# Look for numbered issues (1. **Issue name**, 2. **Issue name**, etc.)
|
||||||
|
if line and (line.startswith('1.') or line.startswith('2.') or line.startswith('3.') or
|
||||||
|
line.startswith('4.') or line.startswith('5.') or line.startswith('6.') or
|
||||||
|
line.startswith('7.') or line.startswith('8.') or line.startswith('9.') or
|
||||||
|
line.startswith('10.')):
|
||||||
|
|
||||||
|
# Extract the issue title from lines
|
||||||
|
if '**' in line:
|
||||||
|
try:
|
||||||
|
# Extract text between ** markers
|
||||||
|
issue_title = line.split('**')[1].strip()
|
||||||
|
if issue_title and len(issue_title) > 3:
|
||||||
|
current_issue = issue_title
|
||||||
|
except IndexError:
|
||||||
|
# Fallback: extract everything after the number
|
||||||
|
issue_title = line.split('.', 1)[1].strip().replace('*', '').strip()
|
||||||
|
if issue_title and len(issue_title) > 3:
|
||||||
|
current_issue = issue_title
|
||||||
|
else:
|
||||||
|
# Extract everything after the number
|
||||||
|
issue_title = line.split('.', 1)[1].strip().replace('*', '').strip()
|
||||||
|
if issue_title and len(issue_title) > 3:
|
||||||
|
current_issue = issue_title
|
||||||
|
|
||||||
|
# Look for "What's missing or incorrect" to get more details
|
||||||
|
elif current_issue and line.startswith('* What\'s missing or incorrect:'):
|
||||||
|
details = line.replace('* What\'s missing or incorrect:', '').strip()
|
||||||
|
if details and len(details) > 10:
|
||||||
|
# Combine issue title with details for better context
|
||||||
|
full_issue = f"{current_issue}: {details}"
|
||||||
|
issues_list.append(full_issue)
|
||||||
|
current_issue = None # Reset
|
||||||
|
|
||||||
|
# Fallback
|
||||||
|
elif current_issue and (line.startswith('* Severity:') or line.startswith('* Location:')):
|
||||||
|
if current_issue not in [issue.split(':')[0] for issue in issues_list]:
|
||||||
|
issues_list.append(current_issue)
|
||||||
|
current_issue = None
|
||||||
|
|
||||||
|
# If no issues found via structured parsing, try fallback extraction
|
||||||
|
if not issues_list and compliance_analysis:
|
||||||
|
# Fallback method: look for bullet points or dashes
|
||||||
|
for line in compliance_analysis.split('\n'):
|
||||||
|
line = line.strip()
|
||||||
|
if line.startswith('- ') or line.startswith('• ') or line.startswith('* '):
|
||||||
|
clean_issue = line[2:].strip()
|
||||||
|
if clean_issue and len(clean_issue) > 10 and not clean_issue.startswith(('Violated', 'What', 'Severity', 'Location')):
|
||||||
|
issues_list.append(clean_issue)
|
||||||
|
|
||||||
|
# Remove duplicates and filter valid issues
|
||||||
|
seen = set()
|
||||||
|
unique_issues = []
|
||||||
|
for issue in issues_list:
|
||||||
|
if issue and len(str(issue)) > 10 and issue not in seen:
|
||||||
|
seen.add(issue)
|
||||||
|
unique_issues.append(str(issue))
|
||||||
|
|
||||||
|
# Rerank issues by importance using Cohere
|
||||||
|
ranked_issues = []
|
||||||
|
if unique_issues:
|
||||||
|
try:
|
||||||
|
ranked_issues = embeddings.rerank_issues(
|
||||||
|
issues=unique_issues,
|
||||||
|
query="Most critical compliance violations and missing requirements",
|
||||||
|
top_n=min(10, len(unique_issues))
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Reranking failed: {e}")
|
||||||
|
ranked_issues = unique_issues[:10] # Fallback to first 10 issues
|
||||||
|
|
||||||
|
if not ranked_issues:
|
||||||
|
# Emergency fallback: extract from compliance_analysis manually
|
||||||
|
fallback_issues = []
|
||||||
|
if compliance_analysis:
|
||||||
|
for line in compliance_analysis.split('\n'):
|
||||||
|
line = line.strip()
|
||||||
|
if ('missing' in line.lower() or 'violation' in line.lower() or
|
||||||
|
'non-compliant' in line.lower() or 'issue' in line.lower()) and len(line) > 15:
|
||||||
|
fallback_issues.append(line)
|
||||||
|
|
||||||
|
ranked_issues = fallback_issues[:5] if fallback_issues else ["No specific issues identified"]
|
||||||
|
|
||||||
|
return {
|
||||||
|
"document_analysis": document_analysis,
|
||||||
|
"compliance_analysis": compliance_analysis,
|
||||||
|
"issues": ranked_issues,
|
||||||
|
"total_issues": len(ranked_issues),
|
||||||
|
"timestamp": datetime.now().isoformat()
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error in analyze_compliance: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
|
# Return a safe fallback response
|
||||||
|
return {
|
||||||
|
"document_analysis": "Error occurred during document analysis",
|
||||||
|
"compliance_analysis": "Error occurred during compliance analysis",
|
||||||
|
"issues": ["Analysis failed due to technical error"],
|
||||||
|
"total_issues": 1,
|
||||||
|
"timestamp": datetime.now().isoformat()
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_metadata_for_pinecone(analysis: dict, filename: str) -> dict:
    """Flatten an analysis result into string-valued metadata for Pinecone.

    Args:
        analysis: Analysis dict; the keys read are ``issues``,
            ``total_issues``, ``timestamp``, ``document_analysis`` and
            ``compliance_analysis``.
        filename: Name of the analysed file; ``"unknown"`` is stored when
            it is empty.

    Returns:
        A flat dict with the keys ``filename``, ``upload_time``, ``status``,
        ``total_issues``, ``timestamp``, ``issues_summary``,
        ``document_type`` and ``compliance_summary``.
    """
    def truncate_string(s: str, max_length: int = 30000) -> str:
        # Cap long strings to avoid Pinecone limits; mark the cut with "...".
        if not s:
            return ""
        if len(s) > max_length:
            return s[:max_length] + "..."
        return s

    # Keep only issues that are not None and not blank once stringified.
    raw_issues = analysis.get("issues", [])
    issues_str = ""
    if raw_issues:
        kept = []
        for entry in raw_issues:
            if entry is None:
                continue
            if str(entry).strip():
                kept.append(str(entry))
        issues_str = " | ".join(kept)

    # A missing or None analysis field becomes an empty string.
    doc_text = analysis.get("document_analysis", "") or ""
    compliance_text = analysis.get("compliance_analysis", "") or ""

    metadata = {
        "filename": filename or "unknown",
        "upload_time": datetime.now().isoformat(),
        "status": "analyzed",
        "total_issues": str(analysis.get("total_issues", 0)),
        "timestamp": analysis.get("timestamp", datetime.now().isoformat()),
        "issues_summary": truncate_string(issues_str),
        "document_type": truncate_string(doc_text[:500]),
        "compliance_summary": truncate_string(compliance_text[:1000]),
    }
    return metadata
|
||||||
|
|
||||||
|
|
||||||
@app.post("/upload-document")
|
@app.get("/")
|
||||||
async def upload_document(file: UploadFile = File(...)):
|
async def root():
|
||||||
try:
|
return {"message": "Mini SpecsComply Pro API", "status": "running"}
|
||||||
doc_id, file_path = save_document(file)
|
|
||||||
text = extract_text(file_path)
|
|
||||||
embedding = embeddings.generate_embeddings(text)
|
|
||||||
|
|
||||||
# Store in vector DB
|
|
||||||
vector_store.upsert_document(
|
|
||||||
doc_id=doc_id,
|
|
||||||
embedding=embedding,
|
|
||||||
metadata={
|
|
||||||
"filename": file.filename,
|
|
||||||
"upload_time": datetime.now().isoformat(),
|
|
||||||
"status": "pending"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Start analysis
|
|
||||||
analysis = analyze_compliance(text)
|
|
||||||
|
|
||||||
return JSONResponse({
|
|
||||||
"document_id": doc_id,
|
|
||||||
"status": "analysis_complete",
|
|
||||||
"analysis": analysis
|
|
||||||
})
|
|
||||||
except Exception as e:
|
|
||||||
raise HTTPException(500, str(e))
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/document/{doc_id}/analysis")
|
@app.get("/document/{doc_id}/analysis")
|
||||||
async def get_analysis(doc_id: str):
|
async def get_analysis(doc_id: str):
|
||||||
|
"""Get detailed analysis for a specific document"""
|
||||||
doc = vector_store.get_document(doc_id)
|
doc = vector_store.get_document(doc_id)
|
||||||
if not doc:
|
if not doc:
|
||||||
raise HTTPException(404, "Document not found")
|
raise HTTPException(404, "Document not found")
|
||||||
|
|
||||||
|
# Get full analysis from storage
|
||||||
|
full_analysis = analysis_storage.get(doc_id, {})
|
||||||
|
|
||||||
return JSONResponse({
|
return JSONResponse({
|
||||||
"document_id": doc_id,
|
"document_id": doc_id,
|
||||||
"metadata": doc.metadata,
|
"metadata": doc.metadata,
|
||||||
"analysis": doc.metadata.get("analysis", {})
|
"analysis": full_analysis
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/upload-document")
async def upload_document(file: UploadFile = File(...)):
    """Upload and process a document - returns only basic info, not full analysis.

    The upload is written to a temporary file, its text is extracted,
    embedded and analysed, the full analysis is kept in ``analysis_storage``
    and a flattened summary is upserted to the vector store.

    Raises:
        HTTPException: 400 for an unsupported extension or when no text can
            be extracted; 500 for any other processing failure.
    """
    file_path = None
    try:
        # Validate file extension (a missing filename has no extension)
        ext = os.path.splitext(file.filename or "")[1].lower()
        if ext not in Config.ALLOWED_EXTENSIONS:
            raise HTTPException(400, "Unsupported file type")

        # Save the file temporarily
        doc_id = str(uuid.uuid4())
        file_path = os.path.join(Config.UPLOAD_FOLDER, f"{doc_id}{ext}")

        # Ensure upload directory exists
        os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True)

        with open(file_path, "wb") as buffer:
            buffer.write(await file.read())

        # Process the file
        print(f"Extracting text from {file_path}")
        text = extract_text(file_path)

        # Validate extracted text
        if not text or not text.strip():
            raise HTTPException(400, "Could not extract any text from the uploaded file")

        print(f"Generating embeddings for document {doc_id}")
        embedding = embeddings.generate_embeddings(text)

        # Perform compliance analysis
        print(f"Analyzing compliance for document {doc_id}")
        analysis = analyze_compliance(text)

        # Store full analysis in memory/cache
        print(f"Storing analysis for document {doc_id}")
        analysis_storage[doc_id] = analysis

        # Prepare Pinecone-compatible metadata
        print(f"Preparing metadata for document {doc_id}")
        pinecone_metadata = prepare_metadata_for_pinecone(analysis, file.filename)

        # Store in vector DB with simplified metadata
        print(f"Upserting document {doc_id} to vector store")
        vector_store.upsert_document(
            doc_id=doc_id,
            embedding=embedding,
            metadata=pinecone_metadata
        )

        # Return only basic info - NOT the full analysis
        return JSONResponse({
            "document_id": doc_id,
            "status": "success",
            "message": "Document processed and analyzed successfully",
            "filename": file.filename,
            "total_issues": analysis.get("total_issues", 0),
            "timestamp": analysis.get("timestamp", datetime.now().isoformat())
        })

    except HTTPException:
        raise
    except Exception as e:
        print(f"Error in upload_document: {e}")
        print(f"Error type: {type(e)}")
        import traceback
        traceback.print_exc()
        raise HTTPException(500, f"Document processing failed: {str(e)}") from e
    finally:
        # Clean up the temp file on every exit path, including the
        # HTTPException re-raise above, so rejected uploads never stay on disk.
        if file_path and os.path.exists(file_path):
            os.remove(file_path)
|
||||||
|
|
||||||
|
|
||||||
@app.post("/document/{doc_id}/resubmit")
|
@app.post("/document/{doc_id}/resubmit")
|
||||||
async def resubmit_document(doc_id: str, file: UploadFile = File(...)):
|
async def resubmit_document(doc_id: str, file: UploadFile = File(...)):
|
||||||
|
"""Resubmit a document for re-analysis"""
|
||||||
try:
|
try:
|
||||||
# Verify original exists
|
# Verify original exists
|
||||||
original = vector_store.get_document(doc_id)
|
original = vector_store.get_document(doc_id)
|
||||||
@@ -123,33 +416,81 @@ async def resubmit_document(doc_id: str, file: UploadFile = File(...)):
|
|||||||
raise HTTPException(404, "Original document not found")
|
raise HTTPException(404, "Original document not found")
|
||||||
|
|
||||||
# Process new version
|
# Process new version
|
||||||
new_doc_id, file_path = save_document(file)
|
ext = os.path.splitext(file.filename)[1].lower()
|
||||||
|
if ext not in Config.ALLOWED_EXTENSIONS:
|
||||||
|
raise HTTPException(400, "Unsupported file type")
|
||||||
|
|
||||||
|
new_doc_id = str(uuid.uuid4())
|
||||||
|
file_path = os.path.join(Config.UPLOAD_FOLDER, f"{new_doc_id}{ext}")
|
||||||
|
|
||||||
|
os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True)
|
||||||
|
|
||||||
|
with open(file_path, "wb") as buffer:
|
||||||
|
buffer.write(await file.read())
|
||||||
|
|
||||||
text = extract_text(file_path)
|
text = extract_text(file_path)
|
||||||
|
|
||||||
|
# Validate extracted text
|
||||||
|
if not text or not text.strip():
|
||||||
|
raise HTTPException(400, "Could not extract any text from the uploaded file")
|
||||||
|
|
||||||
embedding = embeddings.generate_embeddings(text)
|
embedding = embeddings.generate_embeddings(text)
|
||||||
|
|
||||||
|
# Analyze new version
|
||||||
|
analysis = analyze_compliance(text)
|
||||||
|
|
||||||
|
# Store full analysis in memory/cache
|
||||||
|
analysis_storage[new_doc_id] = analysis
|
||||||
|
|
||||||
|
# Prepare Pinecone-compatible metadata
|
||||||
|
pinecone_metadata = prepare_metadata_for_pinecone(analysis, file.filename)
|
||||||
|
pinecone_metadata["original_id"] = doc_id
|
||||||
|
pinecone_metadata["status"] = "resubmitted"
|
||||||
|
|
||||||
# Store new version
|
# Store new version
|
||||||
vector_store.upsert_document(
|
vector_store.upsert_document(
|
||||||
doc_id=new_doc_id,
|
doc_id=new_doc_id,
|
||||||
embedding=embedding,
|
embedding=embedding,
|
||||||
metadata={
|
metadata=pinecone_metadata
|
||||||
"filename": file.filename,
|
|
||||||
"upload_time": datetime.now().isoformat(),
|
|
||||||
"status": "resubmitted",
|
|
||||||
"original_id": doc_id
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Analyze new version
|
# Clean up temp file
|
||||||
analysis = analyze_compliance(text)
|
os.remove(file_path)
|
||||||
|
|
||||||
|
# Return basic info
|
||||||
return JSONResponse({
|
return JSONResponse({
|
||||||
"document_id": new_doc_id,
|
"document_id": new_doc_id,
|
||||||
"status": "analysis_complete",
|
"original_id": doc_id,
|
||||||
"analysis": analysis
|
"status": "success",
|
||||||
|
"message": "Document resubmitted and analyzed successfully",
|
||||||
|
"filename": file.filename,
|
||||||
|
"total_issues": analysis.get("total_issues", 0),
|
||||||
|
"timestamp": analysis.get("timestamp", datetime.now().isoformat())
|
||||||
})
|
})
|
||||||
|
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if 'file_path' in locals() and os.path.exists(file_path):
|
||||||
|
os.remove(file_path)
|
||||||
raise HTTPException(500, str(e))
|
raise HTTPException(500, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/compliance-standards")
async def get_compliance_standards():
    """List the compliance standards currently held by the loader.

    Returns:
        A JSON object whose ``standards`` array has one entry per loaded
        standard, giving its ``key``, ``filename`` and ``sections_count``.
    """
    summaries = []
    for standard_key, standard in compliance_loader.compliance_docs.items():
        summaries.append({
            "key": standard_key,
            "filename": standard["filename"],
            "sections_count": len(standard["sections"]),
        })

    return JSONResponse({"standards": summaries})
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import uvicorn
|
import uvicorn
|
||||||
uvicorn.run(app, host="0.0.0.0", port=8000)
|
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||||
|
|||||||
@@ -0,0 +1,19 @@
|
|||||||
|
import anthropic
|
||||||
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()

# Read the key once and reuse it for both the client and the status line.
api_key = os.getenv("CLAUDE_API_KEY")

client = anthropic.Anthropic(api_key=api_key)
# Show only the last four characters: enough to tell keys apart without
# writing a usable part of the secret to the terminal or to logs.
print("API Key loaded:", f"...{api_key[-4:]}" if api_key else "NOT FOUND")

# Test the API with a minimal request
try:
    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=100,
        messages=[{"role": "user", "content": "Hello"}]
    )
    print("API test successful!")
except Exception as e:
    print(f"API test failed: {e}")
|
||||||
@@ -1,12 +1,50 @@
|
|||||||
from .config import Config
|
from .config import Config
|
||||||
from pinecone import Pinecone
|
from pinecone import Pinecone, ServerlessSpec
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
class VectorStore:
|
class VectorStore:
|
||||||
def __init__(self):
    """Connect to Pinecone and make sure the configured index exists.

    When ``Config.PINECONE_INDEX`` is not listed, a serverless cosine index
    is created, first on AWS and then on GCP if the AWS call fails, and the
    constructor waits for the new index to report ready before opening it.
    """
    if Config.VECTOR_STORE_TYPE == "pinecone":
        self.pc = Pinecone(api_key=Config.PINECONE_API_KEY)

        # Free tier supported regions
        FREE_TIER_SUPPORTED_REGIONS = {
            'aws': 'us-east-1',
            'gcp': 'us-central1'
        }

        # Single source of truth for the vector size; 1024 is the
        # Cohere embed-english-v3.0 dimension used as the fallback.
        dimension = getattr(Config, "EMBEDDING_DIMENSION", 1024)

        # Check if index exists
        if Config.PINECONE_INDEX not in self.pc.list_indexes().names():
            print(f"Creating new Pinecone index: {Config.PINECONE_INDEX}")
            try:
                # First try AWS free tier region
                self.pc.create_index(
                    name=Config.PINECONE_INDEX,
                    dimension=dimension,
                    metric="cosine",
                    spec=ServerlessSpec(
                        cloud="aws",
                        region=FREE_TIER_SUPPORTED_REGIONS['aws']
                    )
                )
            except Exception as e:
                print(f"AWS region failed, trying GCP: {str(e)}")
                # Fallback to GCP if AWS fails
                self.pc.create_index(
                    name=Config.PINECONE_INDEX,
                    dimension=dimension,
                    metric="cosine",
                    spec=ServerlessSpec(
                        cloud="gcp",
                        region=FREE_TIER_SUPPORTED_REGIONS['gcp']
                    )
                )

            # Wait for index to initialize: poll readiness instead of
            # assuming one second is enough, but give up after about a
            # minute so startup cannot hang forever.
            for _ in range(60):
                if self.pc.describe_index(Config.PINECONE_INDEX).status['ready']:
                    break
                time.sleep(1)

        self.index = self.pc.Index(Config.PINECONE_INDEX)
|
||||||
|
|
||||||
def upsert_document(self, doc_id: str, embedding: List[float], metadata: dict):
|
def upsert_document(self, doc_id: str, embedding: List[float], metadata: dict):
|
||||||
|
|||||||
Reference in New Issue
Block a user