initial commit

This commit is contained in:
Ayomide
2025-07-11 22:29:45 +01:00
commit 0b5a7218b0
8 changed files with 370 additions and 0 deletions
+32
View File
@@ -0,0 +1,32 @@
import os
from dotenv import load_dotenv
load_dotenv()
class Config:
    """Static settings shared by the embedding, LLM and vector-store modules.

    API keys are read from the process environment when this class body is
    executed; every other attribute is a fixed constant.
    """

    # --- Cohere (embeddings + reranking) ---
    COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
    EMBED_MODEL = "embed-english-v3.0"
    RERANK_MODEL = "rerank-english-v3.0"

    # --- Groq ---
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
    GROQ_MODEL = "mixtral-8x7b-32768"

    # --- Claude ---
    CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY")
    CLAUDE_MODEL = "claude-3-5-sonnet-20240620"

    # --- Vector store ---
    VECTOR_STORE_TYPE = "pinecone"
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    PINECONE_INDEX = "scp-docs"
    PINECONE_ENV = "gcp-starter"

    # --- Document processing ---
    MAX_DOC_SIZE = 10 * 1024 ** 2  # 10MB, expressed in bytes
    ALLOWED_EXTENSIONS = {".txt", ".docx", ".pdf"}

    # --- Paths ---
    UPLOAD_FOLDER = "documents/"
+24
View File
@@ -0,0 +1,24 @@
import cohere
from .config import Config
class EmbeddingGenerator:
    """Wrapper around the Cohere client for embedding and reranking text."""

    def __init__(self):
        """Create the Cohere client from ``Config.COHERE_API_KEY``."""
        self.client = cohere.Client(Config.COHERE_API_KEY)

    def generate_embeddings(self, text: str):
        """Embed a single piece of text with ``Config.EMBED_MODEL``.

        Args:
            text: The text to embed; it is sent as a one-element batch.

        Returns:
            The first (and only) embedding of the Cohere response.
        """
        response = self.client.embed(
            texts=[text],
            model=Config.EMBED_MODEL,
            # The v3 embed models require one of Cohere's documented
            # input_type values. "search_document" is the one meant for
            # text that is stored in a vector index; the previous value
            # "document" is not among the accepted values.
            input_type="search_document",
        )
        return response.embeddings[0]

    def rerank_issues(self, issues: list, query: str, top_n: int = 5):
        """Order ``issues`` by relevance to ``query`` and keep the best ones.

        Args:
            issues: Candidate issue texts.
            query: The text the candidates are ranked against.
            top_n: Maximum number of results requested from the reranker.

        Returns:
            The selected entries of ``issues``, most relevant first. An
            empty input yields an empty list without calling the API.
        """
        if not issues:
            # Nothing to rank; avoid a pointless (and rejected) API call.
            return []

        response = self.client.rerank(
            query=query,
            documents=issues,
            top_n=top_n,
            model=Config.RERANK_MODEL,
        )
        # Each result carries the position of the document in the request.
        # Mapping that index back to the input is reliable, whereas
        # ``result.document`` is only populated when the request explicitly
        # asks for documents to be returned.
        return [issues[result.index] for result in response.results]
+155
View File
@@ -0,0 +1,155 @@
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from typing import Optional
import os
import uuid
from datetime import datetime
from .config import Config
from .embeddings import EmbeddingGenerator
from .vector_stores import VectorStore
import groq
import anthropic
# FastAPI application object; the route decorators in this module register
# their handlers on it.
app = FastAPI(title="Mini SpecsComply Pro")
# Module-level helpers, constructed once when this module is imported.
embeddings = EmbeddingGenerator()
vector_store = VectorStore()
# Initialize the LLM clients with the API keys held on Config.
groq_client = groq.Client(api_key=Config.GROQ_API_KEY)
claude_client = anthropic.Anthropic(api_key=Config.CLAUDE_API_KEY)
def save_document(file: UploadFile) -> tuple:
    """Validate an uploaded file and write it into ``Config.UPLOAD_FOLDER``.

    Validation happens before anything is written, so a rejected upload
    leaves no directory or file behind.

    Args:
        file: The uploaded file as received by the route handler.

    Returns:
        A ``(doc_id, file_path)`` pair: a freshly generated UUID string and
        the path the content was written to (``<doc_id><extension>``).

    Raises:
        HTTPException: 400 when the extension is not listed in
            ``Config.ALLOWED_EXTENSIONS``; 413 when the content is larger
            than ``Config.MAX_DOC_SIZE`` bytes.
    """
    # The filename can be absent; treat that as "no extension" so it is
    # rejected below instead of crashing inside splitext.
    ext = os.path.splitext(file.filename or "")[1].lower()
    if ext not in Config.ALLOWED_EXTENSIONS:
        raise HTTPException(400, "Unsupported file type")

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without loading an arbitrarily large body into memory.
    payload = file.file.read(Config.MAX_DOC_SIZE + 1)
    if len(payload) > Config.MAX_DOC_SIZE:
        raise HTTPException(413, "File too large")

    os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True)
    doc_id = str(uuid.uuid4())
    file_path = os.path.join(Config.UPLOAD_FOLDER, f"{doc_id}{ext}")
    with open(file_path, "wb") as f:
        f.write(payload)
    return doc_id, file_path
def extract_text(file_path: str) -> str:
    """Return the plain text contained in a saved document.

    The format is chosen from the file extension (case-insensitive).

    Args:
        file_path: Path of a ``.txt``, ``.pdf`` or ``.docx`` file.

    Returns:
        The extracted text. PDF pages and DOCX paragraphs are joined with
        newlines.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".txt":
        # Undecodable bytes are replaced rather than aborting the upload.
        with open(file_path, encoding="utf-8", errors="replace") as handle:
            return handle.read()

    if ext == ".pdf":
        # Imported lazily so the module loads (and .txt works) even when
        # the PDF dependency from the requirements list is not installed.
        from PyPDF2 import PdfReader

        reader = PdfReader(file_path)
        # A page without extractable text may yield None or "".
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    if ext == ".docx":
        # Lazy import, same reasoning as for the PDF branch.
        from docx import Document

        return "\n".join(
            paragraph.text for paragraph in Document(file_path).paragraphs
        )

    raise ValueError(f"Unsupported file type: {ext!r}")
def analyze_compliance(text: str) -> dict:
    """Run a document through Groq, Claude and the reranker, in that order.

    Args:
        text: Document text; it is interpolated verbatim into both prompts.

    Returns:
        A dict with three keys: ``"summary"`` (message content of the first
        Groq choice), ``"issues"`` (the result of
        ``embeddings.rerank_issues`` over the text of Claude's content
        blocks) and ``"timestamp"`` (``datetime.now()`` in ISO format).
    """
    # Step 1 - Groq: ask for the key sections of the document.
    extraction_prompt = f"Extract key sections from this document:\n{text}"
    groq_response = groq_client.chat.completions.create(
        messages=[{"role": "user", "content": extraction_prompt}],
        model=Config.GROQ_MODEL,
    )

    # Step 2 - Claude: ask for compliance issues.
    review_message = {
        "role": "user",
        "content": f"Analyze this document for compliance issues:\n{text}",
    }
    claude_response = claude_client.messages.create(
        model=Config.CLAUDE_MODEL,
        max_tokens=4000,
        messages=[review_message],
    )

    # Step 3 - Cohere: rank the text of each returned content block.
    issue_texts = []
    for block in claude_response.content:
        issue_texts.append(block.text)
    ranked_issues = embeddings.rerank_issues(
        issues=issue_texts,
        query="Most critical compliance issues",
    )

    result = {"summary": groq_response.choices[0].message.content}
    result["issues"] = ranked_issues
    result["timestamp"] = datetime.now().isoformat()
    return result
@app.post("/upload-document")
async def upload_document(file: UploadFile = File(...)):
    """Store an uploaded document, index its embedding and analyse it.

    Args:
        file: The uploaded document.

    Returns:
        A JSON response with the new ``document_id``, the status
        ``"analysis_complete"`` and the analysis result.

    Raises:
        HTTPException: Whatever HTTP error a processing step raised (for
            example the 400 for an unsupported file type), passed through
            unchanged; 500 with the error text for any other failure.
    """
    try:
        doc_id, file_path = save_document(file)
        text = extract_text(file_path)
        embedding = embeddings.generate_embeddings(text)

        # Store in vector DB
        vector_store.upsert_document(
            doc_id=doc_id,
            embedding=embedding,
            metadata={
                "filename": file.filename,
                "upload_time": datetime.now().isoformat(),
                "status": "pending",
            },
        )

        # Start analysis
        analysis = analyze_compliance(text)
        return JSONResponse({
            "document_id": doc_id,
            "status": "analysis_complete",
            "analysis": analysis,
        })
    except HTTPException:
        # HTTPException is itself an Exception: without this clause a
        # deliberate client error would be rewrapped as a 500 below.
        raise
    except Exception as e:
        raise HTTPException(500, str(e)) from e
@app.get("/document/{doc_id}/analysis")
async def get_analysis(doc_id: str):
    """Return the stored metadata and analysis of a document.

    Args:
        doc_id: Identifier the document is looked up by in the vector store.

    Returns:
        A JSON response with ``document_id``, the record's ``metadata`` and
        its ``"analysis"`` metadata entry (``{}`` when that entry is absent).

    Raises:
        HTTPException: 404 when the vector store has no such document.
    """
    record = vector_store.get_document(doc_id)
    if record:
        payload = {
            "document_id": doc_id,
            "metadata": record.metadata,
            "analysis": record.metadata.get("analysis", {}),
        }
        return JSONResponse(payload)
    raise HTTPException(404, "Document not found")
@app.post("/document/{doc_id}/resubmit")
async def resubmit_document(doc_id: str, file: UploadFile = File(...)):
    """Upload a new version of an existing document and analyse it.

    The new version is stored under its own id; its metadata records the
    id of the document it replaces in ``"original_id"``.

    Args:
        doc_id: Identifier of the original document.
        file: The uploaded replacement document.

    Returns:
        A JSON response with the new version's ``document_id``, the status
        ``"analysis_complete"`` and the analysis result.

    Raises:
        HTTPException: 404 when the original document does not exist; any
            HTTP error raised by a processing step, passed through
            unchanged; 500 with the error text for any other failure.
    """
    try:
        # Verify original exists
        original = vector_store.get_document(doc_id)
        if not original:
            raise HTTPException(404, "Original document not found")

        # Process new version
        new_doc_id, file_path = save_document(file)
        text = extract_text(file_path)
        embedding = embeddings.generate_embeddings(text)

        # Store new version
        vector_store.upsert_document(
            doc_id=new_doc_id,
            embedding=embedding,
            metadata={
                "filename": file.filename,
                "upload_time": datetime.now().isoformat(),
                "status": "resubmitted",
                "original_id": doc_id,
            },
        )

        # Analyze new version
        analysis = analyze_compliance(text)
        return JSONResponse({
            "document_id": new_doc_id,
            "status": "analysis_complete",
            "analysis": analysis,
        })
    except HTTPException:
        # HTTPException is itself an Exception: without this clause the 404
        # above (and any 4xx from save_document) would be turned into a 500.
        raise
    except Exception as e:
        raise HTTPException(500, str(e)) from e
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on every interface,
    # port 8000. The import is local so uvicorn is only needed here.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=8000)
+9
View File
@@ -0,0 +1,9 @@
fastapi
uvicorn
python-dotenv
cohere
pinecone
groq
anthropic
PyPDF2
python-docx
+32
View File
@@ -0,0 +1,32 @@
from .config import Config
from pinecone import Pinecone
from typing import List, Optional
class VectorStore:
    """Document vector storage backed by a Pinecone index."""

    def __init__(self):
        """Connect to the index named by ``Config.PINECONE_INDEX``.

        Raises:
            ValueError: If ``Config.VECTOR_STORE_TYPE`` is not ``"pinecone"``.
        """
        if Config.VECTOR_STORE_TYPE != "pinecone":
            # Fail here rather than leave ``self.index`` undefined and hit
            # an AttributeError on the first store operation.
            raise ValueError(
                f"Unsupported VECTOR_STORE_TYPE: {Config.VECTOR_STORE_TYPE!r}"
            )
        self.pc = Pinecone(api_key=Config.PINECONE_API_KEY)
        self.index = self.pc.Index(Config.PINECONE_INDEX)

    def upsert_document(self, doc_id: str, embedding: List[float], metadata: dict):
        """Insert or overwrite one vector.

        Args:
            doc_id: Identifier the vector is stored under.
            embedding: The vector values.
            metadata: Metadata stored alongside the vector.
        """
        self.index.upsert(
            vectors=[{
                "id": doc_id,
                "values": embedding,
                "metadata": metadata,
            }]
        )

    def search_similar(self, embedding: List[float], top_k: int = 3):
        """Query the index for the ``top_k`` nearest vectors.

        Args:
            embedding: The query vector.
            top_k: Number of matches requested.

        Returns:
            The index's query response, with metadata included.
        """
        return self.index.query(
            vector=embedding,
            top_k=top_k,
            include_metadata=True,
        )

    def get_document(self, doc_id: str) -> Optional[object]:
        """Fetch one stored record by id.

        Args:
            doc_id: Identifier to look up.

        Returns:
            The entry of the fetch response's ``vectors`` mapping for
            ``doc_id``, or ``None`` when the id is not present.
        """
        fetch_response = self.index.fetch(ids=[doc_id])
        return fetch_response.vectors.get(doc_id)