initial commit

2025-07-11 22:29:45 +01:00
commit 0b5a7218b0
8 changed files with 370 additions and 0 deletions
@@ -0,0 +1,48 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 ### Environment ###
 .env
 .venv
 env/
 venv/
 # Virtual environment
 pythonenv*
 ### Data Files ###
 # Raw and processed data
 data/
 # Runtime uploads (Config.UPLOAD_FOLDER)
 documents/
 *.csv
 *.json
 *.parquet
 *.feather
 *.pkl
 *.pickle
 # Vector database files
 *.faiss
 *.index
 *.bin
 *.vec
 ### Logs ###
 *.log
 logs/
 ### Groq/Cohere Cache ###
 .cache/
 model_cache/
 ### Documentation ###
 docs/_build/
 # VS Code
 # Ignore the folder's contents rather than the folder itself: git cannot
 # re-include a file whose parent directory is excluded. This section also
 # has to come after `*.json` above, because the last matching pattern wins.
 .vscode/*
 !.vscode/settings.json
 !.vscode/tasks.json
 !.vscode/launch.json
 !.vscode/extensions.json
@@ -0,0 +1,32 @@
 import os
 from dotenv import load_dotenv
 load_dotenv()
 class Config:
    """Static settings for the service: credentials, model names and limits.

    Credentials are looked up in the process environment when this class
    body executes; a credential whose variable is not set is ``None``.
    """

    # --- Cohere (embedding + reranking) ---
    COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
    EMBED_MODEL = "embed-english-v3.0"
    RERANK_MODEL = "rerank-english-v3.0"

    # --- Groq ---
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
    GROQ_MODEL = "mixtral-8x7b-32768"

    # --- Claude ---
    CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY")
    CLAUDE_MODEL = "claude-3-5-sonnet-20240620"

    # --- Vector store ---
    VECTOR_STORE_TYPE = "pinecone"
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    PINECONE_INDEX = "scp-docs"
    PINECONE_ENV = "gcp-starter"

    # --- Document processing ---
    # Upper bound for an uploaded document, in bytes (10 MiB).
    MAX_DOC_SIZE = 10 * 1024 ** 2
    # File suffixes accepted for upload (lower-case, including the dot).
    ALLOWED_EXTENSIONS = {".txt", ".docx", ".pdf"}

    # --- Paths ---
    # Directory that uploaded documents are written to.
    UPLOAD_FOLDER = "documents/"
@@ -0,0 +1,24 @@
 import cohere
 from .config import Config
 class EmbeddingGenerator:
    """Wrapper around the Cohere client for embedding and reranking text."""

    def __init__(self):
        self.client = cohere.Client(Config.COHERE_API_KEY)

    def generate_embeddings(self, text: str):
        """Embed a single text and return its vector.

        Args:
            text: The text to embed.

        Returns:
            The first (and only) embedding from the Cohere response.
        """
        response = self.client.embed(
            texts=[text],
            model=Config.EMBED_MODEL,
            # The v3 embed models only accept Cohere's documented input
            # types; "search_document" is the one for text that is stored
            # in a vector database ("document" is rejected by the API).
            input_type="search_document",
        )
        return response.embeddings[0]

    def rerank_issues(self, issues: list, query: str, top_n: int = 5):
        """Return the ``top_n`` entries of ``issues`` most relevant to ``query``.

        Args:
            issues: Candidate issue texts.
            query: The text the issues are ranked against.
            top_n: Maximum number of issues to return.

        Returns:
            Items taken from ``issues``, ordered as in the rerank response.
            An empty ``issues`` list yields ``[]`` without calling the API.
        """
        if not issues:
            # Nothing to rank; skip the API call entirely.
            return []
        response = self.client.rerank(
            query=query,
            documents=issues,
            top_n=top_n,
            model=Config.RERANK_MODEL,
        )
        # ``result.document`` is only populated when ``return_documents`` is
        # requested, so map each result back to the input through its index.
        return [issues[result.index] for result in response.results]
@@ -0,0 +1,155 @@
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.responses import JSONResponse
 from typing import Optional
 import os
 import uuid
 from datetime import datetime
 from .config import Config
 from .embeddings import EmbeddingGenerator
 from .vector_stores import VectorStore
 import groq
 import anthropic
 # FastAPI application object; the route decorators in this module register on it.
 app = FastAPI(title="Mini SpecsComply Pro")
 # Shared helper instances, constructed once when this module is imported.
 embeddings = EmbeddingGenerator()
 vector_store = VectorStore()
 # Initialize clients
 # Both clients take their API key from Config.
 groq_client = groq.Client(api_key=Config.GROQ_API_KEY)
 claude_client = anthropic.Anthropic(api_key=Config.CLAUDE_API_KEY)
 def save_document(file: UploadFile) -> tuple:
    """Validate an upload and write it to ``Config.UPLOAD_FOLDER``.

    Args:
        file: The uploaded file.

    Returns:
        A ``(doc_id, file_path)`` tuple: the generated UUID string and the
        path the file was written to (``<UPLOAD_FOLDER>/<doc_id><ext>``).

    Raises:
        HTTPException: 400 when the extension is not in
            ``Config.ALLOWED_EXTENSIONS``; 413 when the content is larger
            than ``Config.MAX_DOC_SIZE`` bytes.
    """
    # ``filename`` may be missing on a malformed upload; treat it as "no
    # extension" so the request is rejected with a 400 instead of crashing.
    ext = os.path.splitext(file.filename or "")[1].lower()
    if ext not in Config.ALLOWED_EXTENSIONS:
        raise HTTPException(400, "Unsupported file type")
    # Read one byte past the limit: enough to detect an oversized upload
    # without buffering an arbitrarily large body in memory.
    payload = file.file.read(Config.MAX_DOC_SIZE + 1)
    if len(payload) > Config.MAX_DOC_SIZE:
        raise HTTPException(413, "File too large")
    os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True)
    doc_id = str(uuid.uuid4())
    file_path = os.path.join(Config.UPLOAD_FOLDER, f"{doc_id}{ext}")
    with open(file_path, "wb") as f:
        f.write(payload)
    return doc_id, file_path
 def extract_text(file_path: str) -> str:
    """Return the plain text contained in the document at ``file_path``.

    The format is chosen from the file extension (case-insensitive):
    ``.txt`` is read as UTF-8, ``.pdf`` via PyPDF2 and ``.docx`` via
    python-docx.

    Args:
        file_path: Path to a ``.txt``, ``.pdf`` or ``.docx`` file.

    Returns:
        The extracted text; PDF pages and DOCX paragraphs are joined with
        newlines.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".txt":
        # Undecodable bytes are replaced rather than failing the upload.
        with open(file_path, encoding="utf-8", errors="replace") as f:
            return f.read()
    if ext == ".pdf":
        # Imported lazily so the module loads without the PDF dependency.
        from PyPDF2 import PdfReader

        reader = PdfReader(file_path)
        # Pages without a text layer can yield no text; treat them as empty.
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    if ext == ".docx":
        # Imported lazily so the module loads without the DOCX dependency.
        import docx

        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    raise ValueError(f"Unsupported file type: {ext!r}")
 def analyze_compliance(text: str) -> dict:
    """Run the three-stage compliance pipeline over ``text``.

    Groq is asked to extract the key sections, Claude is asked to analyze
    the document for compliance issues, and the text of Claude's response
    blocks is reranked through ``embeddings.rerank_issues``.

    Args:
        text: The document text to analyze.

    Returns:
        A dict with ``"summary"`` (Groq's reply), ``"issues"`` (the reranked
        issues) and ``"timestamp"`` (``datetime.now()`` in ISO format).
    """
    # Stage 1: section extraction with Groq.
    extraction_prompt = f"Extract key sections from this document:\n{text}"
    groq_response = groq_client.chat.completions.create(
        messages=[{"role": "user", "content": extraction_prompt}],
        model=Config.GROQ_MODEL,
    )

    # Stage 2: compliance reasoning with Claude.
    review_prompt = f"Analyze this document for compliance issues:\n{text}"
    claude_response = claude_client.messages.create(
        model=Config.CLAUDE_MODEL,
        max_tokens=4000,
        messages=[{"role": "user", "content": review_prompt}],
    )

    # Stage 3: order Claude's response blocks by importance.
    issue_texts = [block.text for block in claude_response.content]
    ranked_issues = embeddings.rerank_issues(
        issues=issue_texts,
        query="Most critical compliance issues",
    )

    report = {}
    report["summary"] = groq_response.choices[0].message.content
    report["issues"] = ranked_issues
    report["timestamp"] = datetime.now().isoformat()
    return report
@app.post("/upload-document")
async def upload_document(file: UploadFile = File(...)):
    """Store an uploaded document, index its embedding and analyze it.

    The file is saved to disk, its text is extracted and embedded, the
    embedding is upserted into the vector store with status ``"pending"``,
    and the text is then run through ``analyze_compliance``.

    Args:
        file: The uploaded document.

    Returns:
        A JSON response with ``document_id``, ``status`` and ``analysis``.

    Raises:
        HTTPException: Any ``HTTPException`` raised while processing (for
            example the 400 for an unsupported file type) is passed through
            unchanged; every other failure is reported as a 500.
    """
    try:
        doc_id, file_path = save_document(file)
        text = extract_text(file_path)
        embedding = embeddings.generate_embeddings(text)
        # Store in vector DB
        vector_store.upsert_document(
            doc_id=doc_id,
            embedding=embedding,
            metadata={
                "filename": file.filename,
                "upload_time": datetime.now().isoformat(),
                "status": "pending",
            },
        )
        # Start analysis
        analysis = analyze_compliance(text)
        return JSONResponse({
            "document_id": doc_id,
            "status": "analysis_complete",
            "analysis": analysis,
        })
    except HTTPException:
        # Keep deliberate client-error status codes; without this clause the
        # generic handler below would turn them into a 500.
        raise
    except Exception as e:
        raise HTTPException(500, str(e)) from e
@app.get("/document/{doc_id}/analysis")
async def get_analysis(doc_id: str):
    """Return the stored metadata and analysis for a document.

    Args:
        doc_id: Identifier of the document to look up in the vector store.

    Returns:
        A JSON response with ``document_id``, the record's ``metadata`` and
        its ``"analysis"`` metadata entry (``{}`` when that key is absent).

    Raises:
        HTTPException: 404 when the vector store has no such document.
    """
    record = vector_store.get_document(doc_id)
    if record:
        payload = {
            "document_id": doc_id,
            "metadata": record.metadata,
            "analysis": record.metadata.get("analysis", {}),
        }
        return JSONResponse(payload)
    raise HTTPException(404, "Document not found")
@app.post("/document/{doc_id}/resubmit")
async def resubmit_document(doc_id: str, file: UploadFile = File(...)):
    """Process a revised version of an existing document.

    The revised file is stored under a new document ID, embedded, upserted
    with status ``"resubmitted"`` and a reference to the original ID, and
    then analyzed.

    Args:
        doc_id: Identifier of the original document.
        file: The revised document.

    Returns:
        A JSON response with the new ``document_id``, ``status`` and
        ``analysis``.

    Raises:
        HTTPException: 404 when the original document does not exist; any
            other ``HTTPException`` raised while processing is passed
            through unchanged; every other failure is reported as a 500.
    """
    try:
        # Verify original exists
        original = vector_store.get_document(doc_id)
        if not original:
            raise HTTPException(404, "Original document not found")
        # Process new version
        new_doc_id, file_path = save_document(file)
        text = extract_text(file_path)
        embedding = embeddings.generate_embeddings(text)
        # Store new version
        vector_store.upsert_document(
            doc_id=new_doc_id,
            embedding=embedding,
            metadata={
                "filename": file.filename,
                "upload_time": datetime.now().isoformat(),
                "status": "resubmitted",
                "original_id": doc_id,
            },
        )
        # Analyze new version
        analysis = analyze_compliance(text)
        return JSONResponse({
            "document_id": new_doc_id,
            "status": "analysis_complete",
            "analysis": analysis,
        })
    except HTTPException:
        # Without this clause the 404 above (and any other deliberate status
        # code) would be caught below and reported as a 500.
        raise
    except Exception as e:
        raise HTTPException(500, str(e)) from e
 if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 8000.
    from uvicorn import run as run_server

    run_server(app, host="0.0.0.0", port=8000)
@@ -0,0 +1,9 @@
 fastapi
 uvicorn
 # Required by FastAPI to parse multipart/form-data uploads (UploadFile / File)
 python-multipart
 python-dotenv
 cohere
 pinecone
 groq
 anthropic
 PyPDF2
 python-docx
@@ -0,0 +1,32 @@
 from .config import Config
 from pinecone import Pinecone
 from typing import List, Optional
 class VectorStore:
    """Access to the vector index that holds document embeddings."""

    def __init__(self):
        """Connect to the index named by ``Config.PINECONE_INDEX``.

        Raises:
            ValueError: If ``Config.VECTOR_STORE_TYPE`` is not ``"pinecone"``.
        """
        if Config.VECTOR_STORE_TYPE != "pinecone":
            # Fail at construction time; otherwise ``self.index`` would be
            # missing and every later call would raise AttributeError.
            raise ValueError(
                f"Unsupported vector store type: {Config.VECTOR_STORE_TYPE!r}"
            )
        self.pc = Pinecone(api_key=Config.PINECONE_API_KEY)
        self.index = self.pc.Index(Config.PINECONE_INDEX)

    def upsert_document(self, doc_id: str, embedding: List[float], metadata: dict):
        """Insert or overwrite one vector.

        Args:
            doc_id: Identifier the vector is stored under.
            embedding: The vector values.
            metadata: Metadata stored alongside the vector.
        """
        self.index.upsert(
            vectors=[{
                "id": doc_id,
                "values": embedding,
                "metadata": metadata,
            }]
        )

    def search_similar(self, embedding: List[float], top_k: int = 3):
        """Query the index for the ``top_k`` nearest vectors, with metadata.

        Args:
            embedding: The query vector.
            top_k: Number of matches to request.

        Returns:
            The index's query response, unmodified.
        """
        return self.index.query(
            vector=embedding,
            top_k=top_k,
            include_metadata=True,
        )

    def get_document(self, doc_id: str) -> Optional[object]:
        """Fetch a stored vector by ID.

        Args:
            doc_id: Identifier of the vector to fetch.

        Returns:
            The entry for ``doc_id`` from the fetch response's ``vectors``
            mapping, or ``None`` when the ID is not present.
        """
        fetch_response = self.index.fetch(ids=[doc_id])
        return fetch_response.vectors.get(doc_id)
@@ -0,0 +1,6 @@
 # Create a new git repository in the current directory.
 git init
 # Create a branch named "main" and switch to it.
 git checkout -b main
 # Stage README.md (only this file is staged here).
 git add README.md
 # Record the staged changes as the first commit.
 git commit -m "initial commit"
 # Register the remote repository under the name "origin".
 git remote add origin http://23.29.118.76:3000/ayomide/ds_task_scp.git
 # Push "main" to origin and set it as the upstream branch.
 git push -u origin main
@@ -0,0 +1,64 @@
 # Mini SpecsComply Pro (SCP)
 ## Overview
 Mini SpecsComply Pro (SCP) is a lightweight document compliance and validation tool designed to analyze and verify technical documents against predefined standards and project-specific requirements. It leverages advanced AI models for embedding, reasoning, and ranking to ensure fast and accurate document processing.
 ## Key Features
 - **Document Embedding:** Uses Cohere Embedding Model to generate vector representations for efficient comparison.
 - **Fast LLM Processing:** GROQ LLM provides rapid document parsing and analysis.
 - **Advanced Reasoning:** Claude 3.5 Sonnet is used for deep reasoning and compliance verification.
 - **Enhanced Ranking:** Cohere Reranker ensures the most relevant compliance issues are prioritized.
 - **Structured Compliance Feedback:** Generates summaries and detailed issue breakdowns for document corrections.
 - **Efficient Resubmission Workflow:** Allows users to revise and resubmit documents based on feedback.
 ## Tech Stack
 - **Backend:** Python (FastAPI or Flask for API development)
 - **Vector Database:** Pinecone or Weaviate for document embeddings storage and retrieval
 - **LLMs:**
  - GROQ for quick responses
  - Claude 3.5 Sonnet for reasoning
 - **Embedding & Reranking:**
  - Cohere Embedding Model
  - Cohere Reranker
 ## Workflow
 1. **Document Upload**
   - User uploads a document for compliance verification.
   - Document is converted into embeddings using the Cohere Embedding Model.
   - Stored in the vector database for efficient retrieval.
 2. **Processing & Analysis**
   - GROQ LLM parses the document and extracts key sections.
   - Claude 3.5 Sonnet performs reasoning to check compliance against standards.
   - Cohere Reranker prioritizes the most critical compliance issues.
 3. **Compliance Report Generation**
   - A structured report is generated, including:
     - **Summary of Findings**
     - **Detailed Compliance Issues**
     - **Recommended Fixes**
 4. **Feedback & Resubmission**
   - User receives feedback and revises the document.
   - Resubmitted documents undergo the same pipeline for re-evaluation.
 ## API Endpoints (Example)
 ```text
 POST /upload-document
  - Uploads a document for analysis
  - Returns the document ID for tracking, together with the completed compliance analysis
 GET /document/{doc_id}/analysis
  - Retrieves the compliance report for a document
 POST /document/{doc_id}/resubmit
  - Allows resubmission of a revised document
 ```