initial commit

This commit is contained in:
Ayomide
2025-07-11 22:29:45 +01:00
commit 0b5a7218b0
8 changed files with 370 additions and 0 deletions
+48
View File
@@ -0,0 +1,48 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
### Environment ###
.env
.venv
env/
venv/
# Virtual environment
pythonenv*
# VS Code
.vscode/
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
### Data Files ###
# Raw and processed news
data/
*.csv
*.json
*.parquet
*.feather
*.pkl
*.pickle
# Vector database files
*.faiss
*.index
*.bin
*.vec
### Logs ###
*.log
logs/
### Groq/Cohere Cache ###
.cache/
model_cache/
### Documentation ###
docs/_build/
+32
View File
@@ -0,0 +1,32 @@
import os
from dotenv import load_dotenv
load_dotenv()
class Config:
    """Static settings for the compliance service.

    API keys are looked up in the process environment when this class body
    executes (a missing variable yields ``None``); every other attribute is
    a fixed constant.
    """

    # --- Cohere (embedding + reranking) ---
    COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
    EMBED_MODEL = "embed-english-v3.0"
    RERANK_MODEL = "rerank-english-v3.0"

    # --- Groq ---
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
    GROQ_MODEL = "mixtral-8x7b-32768"

    # --- Claude ---
    CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY")
    CLAUDE_MODEL = "claude-3-5-sonnet-20240620"

    # --- Vector store ---
    VECTOR_STORE_TYPE = "pinecone"
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    PINECONE_INDEX = "scp-docs"
    PINECONE_ENV = "gcp-starter"

    # --- Document processing ---
    MAX_DOC_SIZE = 10 * 1024 ** 2  # upload size cap in bytes (10MB)
    ALLOWED_EXTENSIONS = {".pdf", ".docx", ".txt"}

    # --- Paths ---
    UPLOAD_FOLDER = "documents/"
+24
View File
@@ -0,0 +1,24 @@
import cohere
from .config import Config
class EmbeddingGenerator:
    """Wrapper around the Cohere client for embedding and reranking.

    The client is created from ``Config.COHERE_API_KEY``; model names come
    from ``Config.EMBED_MODEL`` and ``Config.RERANK_MODEL``.
    """

    def __init__(self):
        self.client = cohere.Client(Config.COHERE_API_KEY)

    def generate_embeddings(self, text: str, input_type: str = "search_document"):
        """Return the embedding vector for a single piece of text.

        Args:
            text: Text to embed; must be non-empty.
            input_type: Cohere ``input_type`` sent with the request. The v3
                embedding models expect values such as ``"search_document"``
                (content being indexed) or ``"search_query"`` (a lookup
                query); the previous hard-coded ``"document"`` is not one of
                the accepted values.

        Returns:
            The first (and only) embedding in the Cohere response.

        Raises:
            ValueError: If ``text`` is empty or ``None``.
        """
        if not text:
            raise ValueError("Cannot generate an embedding for empty text")
        response = self.client.embed(
            texts=[text],
            model=Config.EMBED_MODEL,
            input_type=input_type,
        )
        return response.embeddings[0]

    def rerank_issues(self, issues: list, query: str, top_n: int = 5):
        """Return up to ``top_n`` of ``issues`` ordered by relevance to ``query``.

        Args:
            issues: Candidate issue strings to rank.
            query: Text the issues are ranked against.
            top_n: Maximum number of results requested from the reranker.

        Returns:
            The selected items from ``issues`` in the order the reranker
            returned them; an empty list when ``issues`` is empty.
        """
        # Nothing to rank: skip the API call entirely.
        if not issues:
            return []
        response = self.client.rerank(
            query=query,
            documents=issues,
            top_n=top_n,
            model=Config.RERANK_MODEL,
        )
        # Map each result back to the caller's input via its index rather
        # than relying on `result.document`, which the API only populates
        # when documents are explicitly requested back.
        return [issues[result.index] for result in response.results]
+155
View File
@@ -0,0 +1,155 @@
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from typing import Optional
import os
import uuid
from datetime import datetime
from .config import Config
from .embeddings import EmbeddingGenerator
from .vector_stores import VectorStore
import groq
import anthropic
# Module-level singletons, created once when this module is imported and
# used by the helper functions and route handlers defined in this file.
app = FastAPI(title="Mini SpecsComply Pro")
embeddings = EmbeddingGenerator()
vector_store = VectorStore()
# Initialize clients
# Both LLM clients take their API keys from Config (which reads the environment).
groq_client = groq.Client(api_key=Config.GROQ_API_KEY)
claude_client = anthropic.Anthropic(api_key=Config.CLAUDE_API_KEY)
def save_document(file: UploadFile) -> tuple[str, str]:
    """Validate an upload and write it under ``Config.UPLOAD_FOLDER``.

    The file is stored as ``<uuid4><ext>`` so client-supplied names never
    reach the filesystem.

    Args:
        file: The uploaded file received by the route handler.

    Returns:
        A ``(doc_id, file_path)`` pair: the generated document id and the
        path the bytes were written to.

    Raises:
        HTTPException: 400 if the extension is not in
            ``Config.ALLOWED_EXTENSIONS``; 413 if the payload is larger than
            ``Config.MAX_DOC_SIZE`` bytes.
    """
    # The filename can be absent on an upload; treat that as "no extension".
    ext = os.path.splitext(file.filename or "")[1].lower()
    if ext not in Config.ALLOWED_EXTENSIONS:
        raise HTTPException(400, "Unsupported file type")

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large body into memory.
    payload = file.file.read(Config.MAX_DOC_SIZE + 1)
    if len(payload) > Config.MAX_DOC_SIZE:
        raise HTTPException(413, "File too large")

    # Touch the filesystem only after validation has passed.
    os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True)
    doc_id = str(uuid.uuid4())
    file_path = os.path.join(Config.UPLOAD_FOLDER, f"{doc_id}{ext}")
    with open(file_path, "wb") as f:
        f.write(payload)
    return doc_id, file_path
def extract_text(file_path: str) -> str:
    """Return the plain-text content of a saved document.

    The format is chosen from the file extension (case-insensitive):
    ``.txt`` is read as UTF-8, ``.pdf`` is read page by page with PyPDF2,
    and ``.docx`` is read paragraph by paragraph with python-docx.

    Args:
        file_path: Path to the document on disk.

    Returns:
        The extracted text; pages/paragraphs are joined with newlines.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".txt":
        # Undecodable bytes are replaced rather than aborting the upload.
        with open(file_path, encoding="utf-8", errors="replace") as f:
            return f.read()

    if ext == ".pdf":
        # Imported lazily so the parser is only required for PDF inputs.
        from PyPDF2 import PdfReader

        reader = PdfReader(file_path)
        # A page with no extractable text contributes an empty string.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    if ext == ".docx":
        # Imported lazily; the `docx` module is provided by python-docx.
        import docx

        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    raise ValueError(f"Unsupported file type: {ext!r}")
def analyze_compliance(text: str) -> dict:
    """Run the LLM analysis pipeline over a document's text.

    Groq is asked to extract key sections, Claude is asked for compliance
    issues, and the Claude output is passed through the Cohere reranker.

    Args:
        text: Full document text to analyze.

    Returns:
        A dict with ``"summary"`` (Groq's reply), ``"issues"`` (reranked
        Claude output, possibly empty) and ``"timestamp"`` (ISO-8601 string
        of the local time the analysis finished).
    """
    # Parsing with Groq
    groq_response = groq_client.chat.completions.create(
        messages=[{"role": "user", "content": f"Extract key sections from this document:\n{text}"}],
        model=Config.GROQ_MODEL,
    )

    # Reasoning with Claude
    claude_response = claude_client.messages.create(
        model=Config.CLAUDE_MODEL,
        max_tokens=4000,
        messages=[
            {
                "role": "user",
                "content": f"Analyze this document for compliance issues:\n{text}",
            }
        ],
    )

    # Keep only content blocks that actually carry non-empty text; touching
    # `.text` on any other kind of block would raise AttributeError.
    # NOTE(review): the reply is presumably a single text block, in which
    # case the reranker only ever sees one candidate - confirm whether the
    # prompt should request one issue per block/line.
    issue_texts = [
        block.text
        for block in claude_response.content
        if getattr(block, "text", None)
    ]

    # Rerank by importance; skip the call when there is nothing to rank.
    if issue_texts:
        ranked_issues = embeddings.rerank_issues(
            issues=issue_texts,
            query="Most critical compliance issues",
        )
    else:
        ranked_issues = []

    return {
        "summary": groq_response.choices[0].message.content,
        "issues": ranked_issues,
        "timestamp": datetime.now().isoformat(),
    }
@app.post("/upload-document")
async def upload_document(file: UploadFile = File(...)):
    """Store an uploaded document, index its embedding and analyze it.

    The file is saved to disk, its text is extracted and embedded, the
    embedding is upserted into the vector store with basic metadata, and the
    compliance analysis is returned in the response.

    Raises:
        HTTPException: Validation errors raised while saving the file are
            passed through with their original status code; any other
            failure is reported as a 500 carrying the error message.
    """
    try:
        doc_id, file_path = save_document(file)
        text = extract_text(file_path)
        embedding = embeddings.generate_embeddings(text)

        # Store in vector DB
        vector_store.upsert_document(
            doc_id=doc_id,
            embedding=embedding,
            metadata={
                "filename": file.filename,
                "upload_time": datetime.now().isoformat(),
                "status": "pending",
            },
        )

        # Start analysis
        analysis = analyze_compliance(text)

        return JSONResponse({
            "document_id": doc_id,
            "status": "analysis_complete",
            "analysis": analysis,
        })
    except HTTPException:
        # Deliberate HTTP errors (e.g. 400 for an unsupported file type)
        # must keep their status code instead of being rewrapped as a 500.
        raise
    except Exception as e:
        raise HTTPException(500, str(e)) from e
@app.get("/document/{doc_id}/analysis")
async def get_analysis(doc_id: str):
    """Return the stored metadata and analysis for a document.

    Args:
        doc_id: Identifier the document was stored under.

    Raises:
        HTTPException: 404 if the vector store has no entry for ``doc_id``.
    """
    doc = vector_store.get_document(doc_id)
    if not doc:
        raise HTTPException(404, "Document not found")

    # A stored vector may carry no metadata at all; fall back to an empty
    # mapping so the lookup below cannot fail on None.
    metadata = doc.metadata or {}
    return JSONResponse({
        "document_id": doc_id,
        "metadata": metadata,
        "analysis": metadata.get("analysis", {}),
    })
@app.post("/document/{doc_id}/resubmit")
async def resubmit_document(doc_id: str, file: UploadFile = File(...)):
    """Process a revised version of a previously uploaded document.

    The revision is saved under a new document id, embedded, upserted with
    metadata linking it to ``doc_id``, and analyzed.

    Args:
        doc_id: Identifier of the original document.
        file: The revised document.

    Raises:
        HTTPException: 404 if the original document does not exist;
            validation errors raised while saving the file keep their
            status code; any other failure is reported as a 500.
    """
    try:
        # Verify original exists
        original = vector_store.get_document(doc_id)
        if not original:
            raise HTTPException(404, "Original document not found")

        # Process new version
        new_doc_id, file_path = save_document(file)
        text = extract_text(file_path)
        embedding = embeddings.generate_embeddings(text)

        # Store new version
        vector_store.upsert_document(
            doc_id=new_doc_id,
            embedding=embedding,
            metadata={
                "filename": file.filename,
                "upload_time": datetime.now().isoformat(),
                "status": "resubmitted",
                "original_id": doc_id,
            },
        )

        # Analyze new version
        analysis = analyze_compliance(text)

        return JSONResponse({
            "document_id": new_doc_id,
            "status": "analysis_complete",
            "analysis": analysis,
        })
    except HTTPException:
        # Without this, the 404 raised above (and any 4xx from
        # save_document) would be caught below and turned into a 500.
        raise
    except Exception as e:
        raise HTTPException(500, str(e)) from e
# Script entry point: serve `app` with uvicorn on all interfaces, port 8000.
# NOTE(review): this module uses relative imports (e.g. `from .config import
# Config`), so running the file directly as a script will presumably fail at
# import time - confirm, and prefer launching it as part of its package.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
+9
View File
@@ -0,0 +1,9 @@
fastapi
uvicorn
python-dotenv
cohere
pinecone
groq
anthropic
PyPDF2
python-docx
+32
View File
@@ -0,0 +1,32 @@
from .config import Config
from pinecone import Pinecone
from typing import List, Optional
class VectorStore:
    """Pinecone-backed storage for document embeddings and metadata.

    The index name and API key come from ``Config``. Only the ``"pinecone"``
    store type is implemented.
    """

    def __init__(self):
        """Connect to the configured Pinecone index.

        Raises:
            ValueError: If ``Config.VECTOR_STORE_TYPE`` is not ``"pinecone"``.
        """
        # Fail fast: previously an unknown store type left `self.index`
        # undefined and every later call died with AttributeError.
        if Config.VECTOR_STORE_TYPE != "pinecone":
            raise ValueError(
                f"Unsupported vector store type: {Config.VECTOR_STORE_TYPE!r}"
            )
        self.pc = Pinecone(api_key=Config.PINECONE_API_KEY)
        self.index = self.pc.Index(Config.PINECONE_INDEX)

    def upsert_document(self, doc_id: str, embedding: List[float], metadata: dict):
        """Insert or overwrite the vector stored under ``doc_id``."""
        self.index.upsert(
            vectors=[{
                "id": doc_id,
                "values": embedding,
                "metadata": metadata,
            }]
        )

    def search_similar(self, embedding: List[float], top_k: int = 3):
        """Query the index for the ``top_k`` nearest vectors, with metadata."""
        return self.index.query(
            vector=embedding,
            top_k=top_k,
            include_metadata=True,
        )

    def get_document(self, doc_id: str) -> Optional[object]:
        """Fetch the stored vector record for ``doc_id``.

        Returns:
            The record from the fetch response (callers read its
            ``.metadata`` attribute), or ``None`` if the id is not present.
        """
        fetch_response = self.index.fetch(ids=[doc_id])
        return fetch_response.vectors.get(doc_id)
+6
View File
@@ -0,0 +1,6 @@
# One-time repository bootstrap: create the repo on branch `main`, commit the
# README, and push to the remote.
git init
git checkout -b main
# Only README.md is staged for the first commit.
git add README.md
git commit -m "initial commit"
# NOTE(review): the remote is a plain-HTTP URL on a raw IP address - confirm
# this is intended before pushing credentials over it.
git remote add origin http://23.29.118.76:3000/ayomide/ds_task_scp.git
git push -u origin main
+64
View File
@@ -0,0 +1,64 @@
# Mini SpecsComply Pro (SCP)
## Overview
Mini SpecsComply Pro (SCP) is a lightweight document compliance and validation tool designed to analyze and verify technical documents against predefined standards and project-specific requirements. It leverages advanced AI models for embedding, reasoning, and ranking to ensure fast and accurate document processing.
## Key Features
- **Document Embedding:** Uses Cohere Embedding Model to generate vector representations for efficient comparison.
- **Fast LLM Processing:** GROQ LLM provides rapid document parsing and analysis.
- **Advanced Reasoning:** Claude 3.5 Sonnet is used for deep reasoning and compliance verification.
- **Enhanced Ranking:** Cohere Reranker ensures the most relevant compliance issues are prioritized.
- **Structured Compliance Feedback:** Generates summaries and detailed issue breakdowns for document corrections.
- **Efficient Resubmission Workflow:** Allows users to revise and resubmit documents based on feedback.
## Tech Stack
- **Backend:** Python (FastAPI or Flask for API development)
- **Vector Database:** Pinecone or Weaviate for document embeddings storage and retrieval
- **LLMs:**
  - Groq for quick responses
  - Claude 3.5 Sonnet for reasoning
- **Embedding & Reranking:**
  - Cohere Embedding Model
  - Cohere Reranker
## Workflow
1. **Document Upload**
   - The user uploads a document for compliance verification.
   - The document is converted into embeddings using the Cohere Embedding Model.
   - The embeddings are stored in the vector database for efficient retrieval.
2. **Processing & Analysis**
   - The Groq LLM parses the document and extracts key sections.
   - Claude 3.5 Sonnet performs reasoning to check compliance against standards.
   - Cohere Reranker prioritizes the most critical compliance issues.
3. **Compliance Report Generation**
   - A structured report is generated, including:
     - **Summary of Findings**
     - **Detailed Compliance Issues**
     - **Recommended Fixes**
4. **Feedback & Resubmission**
   - The user receives feedback and revises the document.
   - Resubmitted documents undergo the same pipeline for re-evaluation.
## API Endpoints (Example)
```yaml
POST /upload-document
  - Uploads a document for analysis
  - Returns the document ID (for tracking) together with the analysis result
GET /document/{doc_id}/analysis
  - Retrieves the compliance report for a document
POST /document/{doc_id}/resubmit
  - Allows resubmission of a revised document; the revision is analyzed under a new document ID
```