initial commit

2025-07-11 22:29:45 +01:00
commit 0b5a7218b0
8 changed files with 370 additions and 0 deletions
@@ -0,0 +1,48 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+### Environment ###
+.env
+.venv
+env/
+venv/
+
+# Virtual environment
+pythonenv*
+
+# VS Code
+.vscode/
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+
+### Data Files ###
+# Raw and processed documents
+data/
+data/
+*.csv
+*.json
+*.parquet
+*.feather
+*.pkl
+*.pickle
+
+# Vector database files
+*.faiss
+*.index
+*.bin
+*.vec
+
+### Logs ###
+*.log
+logs/
+
+### Groq/Cohere Cache ###
+.cache/
+model_cache/
+
+### Documentation ###
+docs/_build/
@@ -0,0 +1,32 @@
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+class Config:
+    # Cohere
+    COHERE_API_KEY = os.getenv("COHERE_API_KEY")
+    EMBED_MODEL = "embed-english-v3.0"
+    RERANK_MODEL = "rerank-english-v3.0"
+
+    # Groq
+    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+    GROQ_MODEL = "mixtral-8x7b-32768"
+
+    # Claude
+    CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY")
+    CLAUDE_MODEL = "claude-3-5-sonnet-20240620"
+
+    # Vector Store
+    VECTOR_STORE_TYPE = "pinecone"
+    PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+    PINECONE_INDEX = "scp-docs"
+    PINECONE_ENV = "gcp-starter"
+
+    # Document Processing
+    MAX_DOC_SIZE = 10 * 1024 * 1024  # 10MB
+    ALLOWED_EXTENSIONS = {'.pdf', '.docx', '.txt'}
+
+    # Paths
+    UPLOAD_FOLDER = "documents/"
@@ -0,0 +1,24 @@
+import cohere
+from .config import Config
+
+
+class EmbeddingGenerator:
+    def __init__(self):
+        self.client = cohere.Client(Config.COHERE_API_KEY)
+
+    def generate_embeddings(self, text: str):
+        response = self.client.embed(
+            texts=[text],
+            model=Config.EMBED_MODEL,
+            input_type="document"
+        )
+        return response.embeddings[0]
+
+    def rerank_issues(self, issues: list, query: str, top_n: int = 5):
+        response = self.client.rerank(
+            query=query,
+            documents=issues,
+            top_n=top_n,
+            model=Config.RERANK_MODEL
+        )
+        return [result.document for result in response.results]
@@ -0,0 +1,155 @@
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.responses import JSONResponse
+from typing import Optional
+import os
+import uuid
+from datetime import datetime
+from .config import Config
+from .embeddings import EmbeddingGenerator
+from .vector_stores import VectorStore
+import groq
+import anthropic
+
+# Module-level singletons: all of these are constructed once, as a side
+# effect of importing this module.
+app = FastAPI(title="Mini SpecsComply Pro")
+embeddings = EmbeddingGenerator()
+vector_store = VectorStore()
+
+# LLM API clients; their API keys are taken from Config.
+groq_client = groq.Client(api_key=Config.GROQ_API_KEY)
+claude_client = anthropic.Anthropic(api_key=Config.CLAUDE_API_KEY)
+
+
+def save_document(file: UploadFile) -> str:
+    os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True)
+    doc_id = str(uuid.uuid4())
+    ext = os.path.splitext(file.filename)[1].lower()
+
+    if ext not in Config.ALLOWED_EXTENSIONS:
+        raise HTTPException(400, "Unsupported file type")
+
+    file_path = os.path.join(Config.UPLOAD_FOLDER, f"{doc_id}{ext}")
+    with open(file_path, "wb") as f:
+        f.write(file.file.read())
+
+    return doc_id, file_path
+
+
+def extract_text(file_path: str) -> str:
+    pass
+
+
+def analyze_compliance(text: str) -> dict:
+    # Parsing with Groq
+    groq_response = groq_client.chat.completions.create(
+        messages=[{"role": "user", "content": f"Extract key sections from this document:\n{text}"}],
+        model=Config.GROQ_MODEL
+    )
+
+    # Reasoning with Claude
+    claude_response = claude_client.messages.create(
+        model=Config.CLAUDE_MODEL,
+        max_tokens=4000,
+        messages=[
+            {
+                "role": "user",
+                "content": f"Analyze this document for compliance issues:\n{text}"
+            }
+        ]
+    )
+
+    # Rerank by importance
+    issues = claude_response.content
+    ranked_issues = embeddings.rerank_issues(
+        issues=[issue.text for issue in issues],
+        query="Most critical compliance issues"
+    )
+
+    return {
+        "summary": groq_response.choices[0].message.content,
+        "issues": ranked_issues,
+        "timestamp": datetime.now().isoformat()
+    }
+
+
+@app.post("/upload-document")
+async def upload_document(file: UploadFile = File(...)):
+    try:
+        doc_id, file_path = save_document(file)
+        text = extract_text(file_path)
+        embedding = embeddings.generate_embeddings(text)
+
+        # Store in vector DB
+        vector_store.upsert_document(
+            doc_id=doc_id,
+            embedding=embedding,
+            metadata={
+                "filename": file.filename,
+                "upload_time": datetime.now().isoformat(),
+                "status": "pending"
+            }
+        )
+
+        # Start analysis
+        analysis = analyze_compliance(text)
+
+        return JSONResponse({
+            "document_id": doc_id,
+            "status": "analysis_complete",
+            "analysis": analysis
+        })
+    except Exception as e:
+        raise HTTPException(500, str(e))
+
+
+@app.get("/document/{doc_id}/analysis")
+async def get_analysis(doc_id: str):
+    doc = vector_store.get_document(doc_id)
+    if not doc:
+        raise HTTPException(404, "Document not found")
+
+    return JSONResponse({
+        "document_id": doc_id,
+        "metadata": doc.metadata,
+        "analysis": doc.metadata.get("analysis", {})
+    })
+
+
+@app.post("/document/{doc_id}/resubmit")
+async def resubmit_document(doc_id: str, file: UploadFile = File(...)):
+    try:
+        # Verify original exists
+        original = vector_store.get_document(doc_id)
+        if not original:
+            raise HTTPException(404, "Original document not found")
+
+        # Process new version
+        new_doc_id, file_path = save_document(file)
+        text = extract_text(file_path)
+        embedding = embeddings.generate_embeddings(text)
+
+        # Store new version
+        vector_store.upsert_document(
+            doc_id=new_doc_id,
+            embedding=embedding,
+            metadata={
+                "filename": file.filename,
+                "upload_time": datetime.now().isoformat(),
+                "status": "resubmitted",
+                "original_id": doc_id
+            }
+        )
+
+        # Analyze new version
+        analysis = analyze_compliance(text)
+
+        return JSONResponse({
+            "document_id": new_doc_id,
+            "status": "analysis_complete",
+            "analysis": analysis
+        })
+    except Exception as e:
+        raise HTTPException(500, str(e))
+
+# Script entry point: serve the app with uvicorn on all interfaces, port 8000.
+# NOTE(review): this module uses relative imports (from .config import ...),
+# so executing the file directly will presumably fail; it likely has to be
+# started as part of its package (python -m ... / uvicorn <pkg>.<module>:app).
+# Confirm against the package layout.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
@@ -0,0 +1,9 @@
+fastapi
+uvicorn
+# Required by FastAPI to parse multipart form uploads (UploadFile / File(...)).
+python-multipart
+python-dotenv
+cohere
+pinecone
+groq
+anthropic
+PyPDF2
+python-docx
@@ -0,0 +1,32 @@
+from .config import Config
+from pinecone import Pinecone
+from typing import List, Optional
+
+
+class VectorStore:
+    def __init__(self):
+        if Config.VECTOR_STORE_TYPE == "pinecone":
+            self.pc = Pinecone(api_key=Config.PINECONE_API_KEY)
+            self.index = self.pc.Index(Config.PINECONE_INDEX)
+
+    def upsert_document(self, doc_id: str, embedding: List[float], metadata: dict):
+        self.index.upsert(
+            vectors=[{
+                "id": doc_id,
+                "values": embedding,
+                "metadata": metadata
+            }]
+        )
+
+    def search_similar(self, embedding: List[float], top_k: int = 3):
+        return self.index.query(
+            vector=embedding,
+            top_k=top_k,
+            include_metadata=True
+        )
+
+    def get_document(self, doc_id: str) -> Optional[dict]:
+        fetch_response = self.index.fetch(ids=[doc_id])
+        if doc_id in fetch_response.vectors:
+            return fetch_response.vectors[doc_id]
+        return None
@@ -0,0 +1,6 @@
+git init
+git checkout -b main
+git add README.md
+git commit -m "initial commit"
+git remote add origin http://23.29.118.76:3000/ayomide/ds_task_scp.git
+git push -u origin main
@@ -0,0 +1,64 @@
+# Mini SpecsComply Pro (SCP)
+
+## Overview
+
+Mini SpecsComply Pro (SCP) is a lightweight document compliance and validation tool designed to analyze and verify technical documents against predefined standards and project-specific requirements. It leverages advanced AI models for embedding, reasoning, and ranking to ensure fast and accurate document processing.
+
+## Key Features
+
+- **Document Embedding:** Uses Cohere Embedding Model to generate vector representations for efficient comparison.
+- **Fast LLM Processing:** GROQ LLM provides rapid document parsing and analysis.
+- **Advanced Reasoning:** Claude 3.5 Sonnet is used for deep reasoning and compliance verification.
+- **Enhanced Ranking:** Cohere Reranker ensures the most relevant compliance issues are prioritized.
+- **Structured Compliance Feedback:** Generates summaries and detailed issue breakdowns for document corrections.
+- **Efficient Resubmission Workflow:** Allows users to revise and resubmit documents based on feedback.
+
+## Tech Stack
+
+- **Backend:** Python (FastAPI or Flask for API development)
+- **Vector Database:** Pinecone or Weaviate for document embeddings storage and retrieval
+- **LLMs:**
+
+  - GROQ for quick responses
+  - Claude 3.5 Sonnet for reasoning
+- **Embedding & Reranking:**
+
+  - Cohere Embedding Model
+  - Cohere Reranker
+
+## Workflow
+
+1. **Document Upload**
+
+   - User uploads a document for compliance verification.
+   - Document is converted into embeddings using the Cohere Embedding Model.
+   - Stored in the vector database for efficient retrieval.
+2. **Processing & Analysis**
+
+   - GROQ LLM parses the document and extracts key sections.
+   - Claude 3.5 Sonnet performs reasoning to check compliance against standards.
+   - Cohere Reranker prioritizes the most critical compliance issues.
+3. **Compliance Report Generation**
+
+   - A structured report is generated, including:
+     - **Summary of Findings**
+     - **Detailed Compliance Issues**
+     - **Recommended Fixes**
+4. **Feedback & Resubmission**
+
+   - User receives feedback and revises the document.
+   - Resubmitted documents undergo the same pipeline for re-evaluation.
+
+## API Endpoints (Example)
+
+```text
+POST /upload-document
+  - Uploads a document for analysis
+  - Returns document ID for tracking
+
+GET /document/{doc_id}/analysis
+  - Retrieves the compliance report for a document
+
+POST /document/{doc_id}/resubmit
+  - Allows resubmission of a revised document
+```