From 0b5a7218b0271966303583522ab61ef38b279511 Mon Sep 17 00:00:00 2001 From: Ayomide Date: Fri, 11 Jul 2025 22:29:45 +0100 Subject: [PATCH] initial commit --- .gitignore | 48 ++++++++++++ backend/config.py | 32 ++++++++ backend/embeddings.py | 24 ++++++ backend/main.py | 155 ++++++++++++++++++++++++++++++++++++++ backend/requirements.txt | 9 +++ backend/vector_stores.py | 32 ++++++++ docs/API_Documentation.md | 6 ++ docs/README.md | 64 ++++++++++++++++ 8 files changed, 370 insertions(+) create mode 100644 .gitignore create mode 100644 backend/config.py create mode 100644 backend/embeddings.py create mode 100644 backend/main.py create mode 100644 backend/requirements.txt create mode 100644 backend/vector_stores.py create mode 100644 docs/API_Documentation.md create mode 100644 docs/README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5960347 --- /dev/null +++ b/.gitignore @@ -0,0 +1,48 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +### Environment ### +.env +.venv +env/ +venv/ + +# Virtual environment +pythonenv* + +# VS Code +.vscode/ +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json + +### Data Files ### +# Raw and processed news +data/ +data/ +*.csv +*.json +*.parquet +*.feather +*.pkl +*.pickle + +# Vector database files +*.faiss +*.index +*.bin +*.vec + +### Logs ### +*.log +logs/ + +### Groq/Cohere Cache ### +.cache/ +model_cache/ + +### Documentation ### +docs/_build/ \ No newline at end of file diff --git a/backend/config.py b/backend/config.py new file mode 100644 index 0000000..7b8c372 --- /dev/null +++ b/backend/config.py @@ -0,0 +1,32 @@ +import os +from dotenv import load_dotenv + +load_dotenv() + + +class Config: + # Cohere + COHERE_API_KEY = os.getenv("COHERE_API_KEY") + EMBED_MODEL = "embed-english-v3.0" + RERANK_MODEL = "rerank-english-v3.0" + + # Groq + GROQ_API_KEY = os.getenv("GROQ_API_KEY") + GROQ_MODEL = "mixtral-8x7b-32768" + + # Claude + CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY") + CLAUDE_MODEL = "claude-3-5-sonnet-20240620" + + # Vector Store + VECTOR_STORE_TYPE = "pinecone" + PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") + PINECONE_INDEX = "scp-docs" + PINECONE_ENV = "gcp-starter" + + # Document Processing + MAX_DOC_SIZE = 10 * 1024 * 1024 # 10MB + ALLOWED_EXTENSIONS = {'.pdf', '.docx', '.txt'} + + # Paths + UPLOAD_FOLDER = "documents/" diff --git a/backend/embeddings.py b/backend/embeddings.py new file mode 100644 index 0000000..3d71c4a --- /dev/null +++ b/backend/embeddings.py @@ -0,0 +1,24 @@ +import cohere +from .config import Config + + +class EmbeddingGenerator: + def __init__(self): + self.client = cohere.Client(Config.COHERE_API_KEY) + + def generate_embeddings(self, text: str): + response = self.client.embed( + texts=[text], + model=Config.EMBED_MODEL, + input_type="document" + ) + return response.embeddings[0] + + def rerank_issues(self, issues: list, query: str, top_n: int = 5): + response = self.client.rerank( + query=query, + documents=issues, + top_n=top_n, + model=Config.RERANK_MODEL + ) + return [result.document for result in response.results] diff --git a/backend/main.py b/backend/main.py new file mode 100644 index 0000000..6b9378a --- /dev/null +++ b/backend/main.py @@ -0,0 +1,155 @@ +from fastapi import FastAPI, UploadFile, File, HTTPException +from fastapi.responses import JSONResponse +from typing import Optional +import os +import uuid +from datetime import datetime +from .config import Config +from .embeddings import EmbeddingGenerator +from .vector_stores import VectorStore +import groq +import anthropic + +app = FastAPI(title="Mini SpecsComply Pro") +embeddings = EmbeddingGenerator() +vector_store = VectorStore() + +# Initialize clients +groq_client = groq.Client(api_key=Config.GROQ_API_KEY) +claude_client = anthropic.Anthropic(api_key=Config.CLAUDE_API_KEY) + + +def save_document(file: UploadFile) -> str: + os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True) + doc_id = str(uuid.uuid4()) + ext = os.path.splitext(file.filename)[1].lower() + + if ext not in Config.ALLOWED_EXTENSIONS: + raise HTTPException(400, "Unsupported file type") + + file_path = os.path.join(Config.UPLOAD_FOLDER, f"{doc_id}{ext}") + with open(file_path, "wb") as f: + f.write(file.file.read()) + + return doc_id, file_path + + +def extract_text(file_path: str) -> str: + pass + + +def analyze_compliance(text: str) -> dict: + # Parsing with Groq + groq_response = groq_client.chat.completions.create( + messages=[{"role": "user", "content": f"Extract key sections from this document:\n{text}"}], + model=Config.GROQ_MODEL + ) + + # Reasoning with Claude + claude_response = claude_client.messages.create( + model=Config.CLAUDE_MODEL, + max_tokens=4000, + messages=[ + { + "role": "user", + "content": f"Analyze this document for compliance issues:\n{text}" + } + ] + ) + + # Rerank by importance + issues = claude_response.content + ranked_issues = embeddings.rerank_issues( + issues=[issue.text for issue in issues], + query="Most critical compliance issues" + ) + + return { + "summary": groq_response.choices[0].message.content, + "issues": ranked_issues, + "timestamp": datetime.now().isoformat() + } + + +@app.post("/upload-document") +async def upload_document(file: UploadFile = File(...)): + try: + doc_id, file_path = save_document(file) + text = extract_text(file_path) + embedding = embeddings.generate_embeddings(text) + + # Store in vector DB + vector_store.upsert_document( + doc_id=doc_id, + embedding=embedding, + metadata={ + "filename": file.filename, + "upload_time": datetime.now().isoformat(), + "status": "pending" + } + ) + + # Start analysis + analysis = analyze_compliance(text) + + return JSONResponse({ + "document_id": doc_id, + "status": "analysis_complete", + "analysis": analysis + }) + except Exception as e: + raise HTTPException(500, str(e)) + + +@app.get("/document/{doc_id}/analysis") +async def get_analysis(doc_id: str): + doc = vector_store.get_document(doc_id) + if not doc: + raise HTTPException(404, "Document not found") + + return JSONResponse({ + "document_id": doc_id, + "metadata": doc.metadata, + "analysis": doc.metadata.get("analysis", {}) + }) + + +@app.post("/document/{doc_id}/resubmit") +async def resubmit_document(doc_id: str, file: UploadFile = File(...)): + try: + # Verify original exists + original = vector_store.get_document(doc_id) + if not original: + raise HTTPException(404, "Original document not found") + + # Process new version + new_doc_id, file_path = save_document(file) + text = extract_text(file_path) + embedding = embeddings.generate_embeddings(text) + + # Store new version + vector_store.upsert_document( + doc_id=new_doc_id, + embedding=embedding, + metadata={ + "filename": file.filename, + "upload_time": datetime.now().isoformat(), + "status": "resubmitted", + "original_id": doc_id + } + ) + + # Analyze new version + analysis = analyze_compliance(text) + + return JSONResponse({ + "document_id": new_doc_id, + "status": "analysis_complete", + "analysis": analysis + }) + except Exception as e: + raise HTTPException(500, str(e)) + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file diff --git a/backend/requirements.txt b/backend/requirements.txt new file mode 100644 index 0000000..df44fcd --- /dev/null +++ b/backend/requirements.txt @@ -0,0 +1,9 @@ +fastapi +uvicorn +python-dotenv +cohere +pinecone +groq +anthropic +PyPDF2 +python-docx diff --git a/backend/vector_stores.py b/backend/vector_stores.py new file mode 100644 index 0000000..088a897 --- /dev/null +++ b/backend/vector_stores.py @@ -0,0 +1,32 @@ +from .config import Config +from pinecone import Pinecone +from typing import List, Optional + + +class VectorStore: + def __init__(self): + if Config.VECTOR_STORE_TYPE == "pinecone": + self.pc = Pinecone(api_key=Config.PINECONE_API_KEY) + self.index = self.pc.Index(Config.PINECONE_INDEX) + + def upsert_document(self, doc_id: str, embedding: List[float], metadata: dict): + self.index.upsert( + vectors=[{ + "id": doc_id, + "values": embedding, + "metadata": metadata + }] + ) + + def search_similar(self, embedding: List[float], top_k: int = 3): + return self.index.query( + vector=embedding, + top_k=top_k, + include_metadata=True + ) + + def get_document(self, doc_id: str) -> Optional[dict]: + fetch_response = self.index.fetch(ids=[doc_id]) + if doc_id in fetch_response.vectors: + return fetch_response.vectors[doc_id] + return None diff --git a/docs/API_Documentation.md b/docs/API_Documentation.md new file mode 100644 index 0000000..d3f6dc5 --- /dev/null +++ b/docs/API_Documentation.md @@ -0,0 +1,6 @@ +git init +git checkout -b main +git add README.md +git commit -m "initial commit" +git remote add origin http://23.29.118.76:3000/ayomide/ds_task_scp.git +git push -u origin main \ No newline at end of file diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..a3094cc --- /dev/null +++ b/docs/README.md @@ -0,0 +1,64 @@ +# Mini SpecsComply Pro (SCP) + +## Overview + +Mini SpecsComply Pro (SCP) is a lightweight document compliance and validation tool designed to analyze and verify technical documents against predefined standards and project-specific requirements. It leverages advanced AI models for embedding, reasoning, and ranking to ensure fast and accurate document processing. + +## Key Features + +- **Document Embedding:** Uses Cohere Embedding Model to generate vector representations for efficient comparison. +- **Fast LLM Processing:** GROQ LLM provides rapid document parsing and analysis. +- **Advanced Reasoning:** Claude 3.5 Sonnet is used for deep reasoning and compliance verification. +- **Enhanced Ranking:** Cohere Reranker ensures the most relevant compliance issues are prioritized. +- **Structured Compliance Feedback:** Generates summaries and detailed issue breakdowns for document corrections. +- **Efficient Resubmission Workflow:** Allows users to revise and resubmit documents based on feedback. + +## Tech Stack + +- **Backend:** Python (FastAPI or Flask for API development) +- **Vector Database:** Pinecone or Weaviate for document embeddings storage and retrieval +- **LLMs:** + + - GROQ for quick responses + - Claude 3.5 Sonnet for reasoning +- **Embedding & Reranking:** + + - Cohere Embedding Model + - Cohere Reranker + +## Workflow + +1. **Document Upload** + + - User uploads a document for compliance verification. + - Document is converted into embeddings using the Cohere Embedding Model. + - Stored in the vector database for efficient retrieval. +2. **Processing & Analysis** + + - GROQ LLM parses the document and extracts key sections. + - Claude 3.5 Sonnet performs reasoning to check compliance against standards. + - Cohere Reranker prioritizes the most critical compliance issues. +3. **Compliance Report Generation** + + - A structured report is generated, including: + - **Summary of Findings** + - **Detailed Compliance Issues** + - **Recommended Fixes** +4. **Feedback & Resubmission** + + - User receives feedback and revises the document. + - Resubmitted documents undergo the same pipeline for re-evaluation. + +## API Endpoints (Example) + +```yaml +POST /upload-document + - Uploads a document for analysis + - Returns document ID for tracking + +GET /document/{doc_id}/analysis + - Retrieves the compliance report for a document + +POST /document/{doc_id}/resubmit + - Allows resubmission of a revised document +```