initial commit

2025-07-11 22:29:45 +01:00
commit 0b5a7218b0
8 changed files with 370 additions and 0 deletions
@@ -0,0 +1,32 @@
+import os
+from dotenv import load_dotenv
+
+# Runs at import time, before the Config class body below evaluates its
+# os.getenv(...) lookups, so values supplied via a .env file are visible to
+# them (presumably a .env next to the app — confirm deployment layout).
+load_dotenv()
+
+
+class Config:
+    """Static settings for the service.
+
+    API keys are read from the process environment when this class body is
+    evaluated; a key that is not set is stored as None. Everything else is
+    a hard-coded constant.
+    """
+
+    # --- Cohere: embedding and reranking models ---
+    COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
+    EMBED_MODEL = "embed-english-v3.0"
+    RERANK_MODEL = "rerank-english-v3.0"
+
+    # --- Groq: chat model ---
+    GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+    GROQ_MODEL = "mixtral-8x7b-32768"
+
+    # --- Claude: chat model ---
+    CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY")
+    CLAUDE_MODEL = "claude-3-5-sonnet-20240620"
+
+    # --- Vector store backend and its Pinecone settings ---
+    VECTOR_STORE_TYPE = "pinecone"
+    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
+    PINECONE_INDEX = "scp-docs"
+    PINECONE_ENV = "gcp-starter"
+
+    # --- Upload limits ---
+    MAX_DOC_SIZE = 10 * 1024 ** 2  # 10MB, expressed in bytes
+    ALLOWED_EXTENSIONS = {".pdf", ".docx", ".txt"}
+
+    # --- Where uploaded files are written ---
+    UPLOAD_FOLDER = "documents/"
@@ -0,0 +1,24 @@
+import cohere
+from .config import Config
+
+
+class EmbeddingGenerator:
+    """Wrapper around a Cohere client for embedding and reranking text."""
+
+    def __init__(self):
+        # One client per instance, authenticated with the configured key.
+        self.client = cohere.Client(Config.COHERE_API_KEY)
+
+    def generate_embeddings(self, text: str):
+        """Embed a single document and return its vector.
+
+        Args:
+            text: The document text to embed.
+
+        Returns:
+            The first (and only) embedding in the Cohere response.
+
+        Raises:
+            ValueError: If ``text`` is None, empty, or only whitespace;
+                there is nothing meaningful to send to the API.
+        """
+        if not text or not text.strip():
+            raise ValueError("Cannot generate an embedding for empty text")
+
+        response = self.client.embed(
+            texts=[text],
+            model=Config.EMBED_MODEL,
+            # v3 embed models require one of the documented input types;
+            # "search_document" is the one for text stored for retrieval
+            # (plain "document" is not an accepted value).
+            input_type="search_document",
+        )
+        return response.embeddings[0]
+
+    def rerank_issues(self, issues: list, query: str, top_n: int = 5):
+        """Return up to ``top_n`` of ``issues``, most relevant to ``query`` first.
+
+        Args:
+            issues: Candidate issue strings.
+            query: The text the issues are ranked against.
+            top_n: Maximum number of issues to return.
+
+        Returns:
+            A list of the original issue strings in ranked order; an empty
+            list when ``issues`` is empty.
+        """
+        if not issues:
+            # Nothing to rank; skip the API call entirely.
+            return []
+
+        response = self.client.rerank(
+            query=query,
+            documents=issues,
+            top_n=top_n,
+            model=Config.RERANK_MODEL,
+        )
+        # Each result carries the position of the document in the request.
+        # Mapping that back to the input returns plain strings, whereas
+        # result.document is only populated when the API is asked to echo
+        # documents and is not a plain string.
+        return [issues[result.index] for result in response.results]
@@ -0,0 +1,155 @@
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.responses import JSONResponse
+from typing import Optional
+import os
+import uuid
+from datetime import datetime
+from .config import Config
+from .embeddings import EmbeddingGenerator
+from .vector_stores import VectorStore
+import groq
+import anthropic
+
+# Application object; the route decorators further down register on it.
+app = FastAPI(title="Mini SpecsComply Pro")
+# Module-level helpers, constructed once at import time and shared by every
+# request handler in this module.
+embeddings = EmbeddingGenerator()
+vector_store = VectorStore()
+
+# Initialize clients
+# Both LLM clients take their API keys from Config; analyze_compliance uses
+# the Groq client for the summary and the Claude client for the issues.
+groq_client = groq.Client(api_key=Config.GROQ_API_KEY)
+claude_client = anthropic.Anthropic(api_key=Config.CLAUDE_API_KEY)
+
+
+def save_document(file: UploadFile) -> tuple:
+    """Validate an upload and write it into Config.UPLOAD_FOLDER.
+
+    The file is stored under a freshly generated UUID, keeping only the
+    (lower-cased) extension of the client-supplied name.
+
+    Args:
+        file: The uploaded file.
+
+    Returns:
+        A ``(doc_id, file_path)`` pair of strings.
+
+    Raises:
+        HTTPException: 400 if the extension is not in
+            Config.ALLOWED_EXTENSIONS (including a missing filename);
+            413 if the content exceeds Config.MAX_DOC_SIZE bytes.
+    """
+    # The filename can be absent; treat that as "no extension" so the
+    # request is rejected cleanly instead of failing inside splitext.
+    ext = os.path.splitext(file.filename or "")[1].lower()
+    if ext not in Config.ALLOWED_EXTENSIONS:
+        raise HTTPException(400, "Unsupported file type")
+
+    # Read at most one byte past the limit: enough to detect an oversized
+    # upload without buffering all of it.
+    data = file.file.read(Config.MAX_DOC_SIZE + 1)
+    if len(data) > Config.MAX_DOC_SIZE:
+        raise HTTPException(413, "File too large")
+
+    # Only touch the filesystem once the upload has been accepted.
+    os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True)
+    doc_id = str(uuid.uuid4())
+    file_path = os.path.join(Config.UPLOAD_FOLDER, f"{doc_id}{ext}")
+    with open(file_path, "wb") as f:
+        f.write(data)
+
+    return doc_id, file_path
+
+
+def extract_text(file_path: str) -> str:
+    """Return the plain text of a saved .txt, .pdf or .docx document.
+
+    The format is chosen from the file extension (case-insensitive).
+
+    Args:
+        file_path: Path of the document on disk.
+
+    Returns:
+        The extracted text; PDF pages and DOCX paragraphs are joined with
+        newlines.
+
+    Raises:
+        ValueError: If the extension is not one of the supported formats.
+    """
+    ext = os.path.splitext(file_path)[1].lower()
+
+    if ext == ".txt":
+        # Undecodable bytes are replaced rather than aborting the upload.
+        with open(file_path, encoding="utf-8", errors="replace") as f:
+            return f.read()
+
+    if ext == ".pdf":
+        # Imported lazily so plain-text handling works without the PDF
+        # package being importable.
+        from PyPDF2 import PdfReader
+
+        reader = PdfReader(file_path)
+        # extract_text() may yield nothing for image-only pages.
+        return "\n".join(page.extract_text() or "" for page in reader.pages)
+
+    if ext == ".docx":
+        from docx import Document
+
+        return "\n".join(p.text for p in Document(file_path).paragraphs)
+
+    raise ValueError(f"Unsupported file type: {ext!r}")
+
+
+def analyze_compliance(text: str) -> dict:
+    """Run the document text through Groq, Claude and the Cohere reranker.
+
+    Args:
+        text: Full text of the document to analyze.
+
+    Returns:
+        A dict with three keys: "summary" (the Groq reply), "issues" (the
+        reranked Claude output) and "timestamp" (local time, ISO format).
+    """
+    # Step 1: ask Groq to pull out the key sections.
+    section_prompt = f"Extract key sections from this document:\n{text}"
+    groq_response = groq_client.chat.completions.create(
+        messages=[{"role": "user", "content": section_prompt}],
+        model=Config.GROQ_MODEL,
+    )
+
+    # Step 2: ask Claude for the compliance findings.
+    review_prompt = f"Analyze this document for compliance issues:\n{text}"
+    claude_response = claude_client.messages.create(
+        model=Config.CLAUDE_MODEL,
+        max_tokens=4000,
+        messages=[{"role": "user", "content": review_prompt}],
+    )
+
+    # Step 3: order Claude's content blocks by importance.
+    issue_texts = []
+    for block in claude_response.content:
+        issue_texts.append(block.text)
+    ranked_issues = embeddings.rerank_issues(
+        issues=issue_texts,
+        query="Most critical compliance issues",
+    )
+
+    summary = groq_response.choices[0].message.content
+    generated_at = datetime.now().isoformat()
+    return {
+        "summary": summary,
+        "issues": ranked_issues,
+        "timestamp": generated_at,
+    }
+
+
+@app.post("/upload-document")
+async def upload_document(file: UploadFile = File(...)):
+    """Save an uploaded document, index its embedding, and analyze it.
+
+    The vector-store record is written with status "pending"; the analysis
+    is returned in the response and is not written back to that record
+    here.
+
+    Args:
+        file: The uploaded document.
+
+    Returns:
+        A JSON response with the new document id, a status string and the
+        analysis result.
+
+    Raises:
+        HTTPException: With whatever status a helper chose (for example
+            400 for an unsupported file type), or 500 wrapping any other
+            failure.
+    """
+    try:
+        doc_id, file_path = save_document(file)
+        text = extract_text(file_path)
+        embedding = embeddings.generate_embeddings(text)
+
+        # Store in vector DB
+        vector_store.upsert_document(
+            doc_id=doc_id,
+            embedding=embedding,
+            metadata={
+                "filename": file.filename,
+                "upload_time": datetime.now().isoformat(),
+                "status": "pending"
+            }
+        )
+
+        # Start analysis
+        analysis = analyze_compliance(text)
+
+        return JSONResponse({
+            "document_id": doc_id,
+            "status": "analysis_complete",
+            "analysis": analysis
+        })
+    except HTTPException:
+        # Deliberate client-facing errors (such as the 400 raised by
+        # save_document) keep their status instead of being turned into
+        # a 500 by the catch-all below.
+        raise
+    except Exception as e:
+        raise HTTPException(500, str(e)) from e
+
+
+@app.get("/document/{doc_id}/analysis")
+async def get_analysis(doc_id: str):
+    """Return the stored metadata, and any stored analysis, for a document.
+
+    Args:
+        doc_id: Identifier of the document in the vector store.
+
+    Returns:
+        A JSON response with the id, the record's metadata and the
+        "analysis" entry of that metadata (an empty object if absent).
+
+    Raises:
+        HTTPException: 404 when the vector store has no such document.
+    """
+    record = vector_store.get_document(doc_id)
+    if record:
+        stored_meta = record.metadata
+        payload = {
+            "document_id": doc_id,
+            "metadata": stored_meta,
+            "analysis": stored_meta.get("analysis", {}),
+        }
+        return JSONResponse(payload)
+
+    raise HTTPException(404, "Document not found")
+
+
+@app.post("/document/{doc_id}/resubmit")
+async def resubmit_document(doc_id: str, file: UploadFile = File(...)):
+    """Upload a new version of an existing document and analyze it.
+
+    The new version is stored under its own id, with metadata status
+    "resubmitted" and "original_id" pointing at ``doc_id``.
+
+    Args:
+        doc_id: Identifier of the original document.
+        file: The revised document.
+
+    Returns:
+        A JSON response with the new document id, a status string and the
+        analysis result.
+
+    Raises:
+        HTTPException: 404 when the original document does not exist, any
+            status raised by a helper (for example 400 for an unsupported
+            file type), or 500 wrapping any other failure.
+    """
+    try:
+        # Verify original exists
+        original = vector_store.get_document(doc_id)
+        if not original:
+            raise HTTPException(404, "Original document not found")
+
+        # Process new version
+        new_doc_id, file_path = save_document(file)
+        text = extract_text(file_path)
+        embedding = embeddings.generate_embeddings(text)
+
+        # Store new version
+        vector_store.upsert_document(
+            doc_id=new_doc_id,
+            embedding=embedding,
+            metadata={
+                "filename": file.filename,
+                "upload_time": datetime.now().isoformat(),
+                "status": "resubmitted",
+                "original_id": doc_id
+            }
+        )
+
+        # Analyze new version
+        analysis = analyze_compliance(text)
+
+        return JSONResponse({
+            "document_id": new_doc_id,
+            "status": "analysis_complete",
+            "analysis": analysis
+        })
+    except HTTPException:
+        # Without this clause the 404 raised above (and any 4xx from
+        # save_document) would be caught below and reported as a 500.
+        raise
+    except Exception as e:
+        raise HTTPException(500, str(e)) from e
+
+if __name__ == "__main__":
+    # Direct execution only: serve the app on every interface, port 8000.
+    from uvicorn import run as serve
+
+    serve(app, host="0.0.0.0", port=8000)
@@ -0,0 +1,9 @@
+fastapi
+uvicorn
+python-dotenv
+cohere
+pinecone
+groq
+anthropic
+PyPDF2
+python-docx
@@ -0,0 +1,32 @@
+from .config import Config
+from pinecone import Pinecone
+from typing import List, Optional
+
+
+class VectorStore:
+    """Vector database access; only the Pinecone backend is implemented."""
+
+    def __init__(self):
+        """Connect to the configured index.
+
+        Raises:
+            ValueError: If Config.VECTOR_STORE_TYPE names a backend other
+                than "pinecone".
+        """
+        if Config.VECTOR_STORE_TYPE != "pinecone":
+            # Fail here rather than leaving self.index undefined and
+            # surfacing an AttributeError on the first query.
+            raise ValueError(
+                f"Unsupported vector store type: {Config.VECTOR_STORE_TYPE!r}"
+            )
+        self.pc = Pinecone(api_key=Config.PINECONE_API_KEY)
+        self.index = self.pc.Index(Config.PINECONE_INDEX)
+
+    def upsert_document(self, doc_id: str, embedding: List[float], metadata: dict):
+        """Insert or overwrite the vector stored under ``doc_id``.
+
+        Args:
+            doc_id: Identifier of the record.
+            embedding: The vector values.
+            metadata: Metadata stored alongside the vector.
+        """
+        self.index.upsert(
+            vectors=[{
+                "id": doc_id,
+                "values": embedding,
+                "metadata": metadata,
+            }]
+        )
+
+    def search_similar(self, embedding: List[float], top_k: int = 3):
+        """Query the index for the ``top_k`` nearest vectors, with metadata.
+
+        Returns:
+            The index's query response, unmodified.
+        """
+        return self.index.query(
+            vector=embedding,
+            top_k=top_k,
+            include_metadata=True,
+        )
+
+    def get_document(self, doc_id: str) -> Optional[object]:
+        """Fetch the record stored under ``doc_id``.
+
+        Returns:
+            The entry for ``doc_id`` from the fetch response's ``vectors``
+            mapping, or None when the id is not present.
+        """
+        fetch_response = self.index.fetch(ids=[doc_id])
+        return fetch_response.vectors.get(doc_id)