initial commit
This commit is contained in:
+48
@@ -0,0 +1,48 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
### Environment ###
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
|
||||||
|
# Virtual environment
|
||||||
|
pythonenv*
|
||||||
|
|
||||||
|
# VS Code
|
||||||
|
.vscode/
|
||||||
|
!.vscode/settings.json
|
||||||
|
!.vscode/tasks.json
|
||||||
|
!.vscode/launch.json
|
||||||
|
!.vscode/extensions.json
|
||||||
|
|
||||||
|
### Data Files ###
|
||||||
|
# Raw and processed data
|
||||||
|
data/
|
||||||
|
data/
|
||||||
|
*.csv
|
||||||
|
*.json
|
||||||
|
*.parquet
|
||||||
|
*.feather
|
||||||
|
*.pkl
|
||||||
|
*.pickle
|
||||||
|
|
||||||
|
# Vector database files
|
||||||
|
*.faiss
|
||||||
|
*.index
|
||||||
|
*.bin
|
||||||
|
*.vec
|
||||||
|
|
||||||
|
### Logs ###
|
||||||
|
*.log
|
||||||
|
logs/
|
||||||
|
|
||||||
|
### Groq/Cohere Cache ###
|
||||||
|
.cache/
|
||||||
|
model_cache/
|
||||||
|
|
||||||
|
### Documentation ###
|
||||||
|
docs/_build/
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
class Config:
    """Static application settings.

    API keys are looked up in the process environment when this class body
    is executed; every other value is a hard-coded constant.
    """

    # --- Cohere: embedding and reranking models ---
    COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
    EMBED_MODEL = "embed-english-v3.0"
    RERANK_MODEL = "rerank-english-v3.0"

    # --- Groq ---
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
    GROQ_MODEL = "mixtral-8x7b-32768"

    # --- Claude ---
    CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY")
    CLAUDE_MODEL = "claude-3-5-sonnet-20240620"

    # --- Vector store ---
    VECTOR_STORE_TYPE = "pinecone"
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    PINECONE_INDEX = "scp-docs"
    PINECONE_ENV = "gcp-starter"

    # --- Document processing ---
    MAX_DOC_SIZE = 10 * 1024 ** 2  # 10MB
    ALLOWED_EXTENSIONS = {".pdf", ".docx", ".txt"}

    # --- Paths ---
    UPLOAD_FOLDER = "documents/"
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
import cohere
|
||||||
|
from .config import Config
|
||||||
|
|
||||||
|
|
||||||
|
class EmbeddingGenerator:
    """Thin wrapper around the Cohere client for embedding and reranking."""

    def __init__(self):
        """Create the Cohere client from ``Config.COHERE_API_KEY``."""
        self.client = cohere.Client(Config.COHERE_API_KEY)

    def generate_embeddings(self, text: str, input_type: str = "search_document"):
        """Return the embedding vector for a single piece of text.

        Args:
            text: The text to embed. Must be non-empty.
            input_type: Cohere embed input type. The v3 embed models require
                one of the documented values (e.g. ``"search_document"`` for
                content being stored, ``"search_query"`` for queries); the
                previous hard-coded ``"document"`` is not among them.

        Returns:
            The first (and only) embedding in the Cohere response.

        Raises:
            ValueError: If ``text`` is empty or ``None``.
        """
        if not text:
            raise ValueError("text must be a non-empty string")

        response = self.client.embed(
            texts=[text],
            model=Config.EMBED_MODEL,
            input_type=input_type,
        )
        return response.embeddings[0]

    def rerank_issues(self, issues: list, query: str, top_n: int = 5):
        """Return up to ``top_n`` entries of ``issues``, most relevant first.

        Args:
            issues: Candidate texts to rank.
            query: The query the candidates are ranked against.
            top_n: Maximum number of results requested from the reranker.

        Returns:
            A list of the original ``issues`` entries in the order the
            reranker returned them. Empty when ``issues`` is empty.
        """
        # Nothing to rank: skip the API round-trip entirely.
        if not issues:
            return []

        response = self.client.rerank(
            query=query,
            documents=issues,
            top_n=top_n,
            model=Config.RERANK_MODEL,
        )
        # Map back through ``index`` instead of reading ``result.document``:
        # the document field is only filled in when the request asks for
        # documents to be returned, and even then it is not a plain string.
        return [issues[result.index] for result in response.results]
|
||||||
+155
@@ -0,0 +1,155 @@
|
|||||||
|
from fastapi import FastAPI, UploadFile, File, HTTPException
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from typing import Optional
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime
|
||||||
|
from .config import Config
|
||||||
|
from .embeddings import EmbeddingGenerator
|
||||||
|
from .vector_stores import VectorStore
|
||||||
|
import groq
|
||||||
|
import anthropic
|
||||||
|
|
||||||
|
# FastAPI application object; the route decorators further down register on it.
app = FastAPI(title="Mini SpecsComply Pro")
# Shared helper instances, constructed once when this module is imported.
embeddings = EmbeddingGenerator()
vector_store = VectorStore()

# Initialize clients
# Both LLM clients take their API keys from Config.
groq_client = groq.Client(api_key=Config.GROQ_API_KEY)
claude_client = anthropic.Anthropic(api_key=Config.CLAUDE_API_KEY)
|
||||||
|
|
||||||
|
|
||||||
|
def save_document(file: UploadFile) -> "tuple[str, str]":
    """Validate an uploaded file and write it to ``Config.UPLOAD_FOLDER``.

    Args:
        file: The uploaded file.

    Returns:
        ``(doc_id, file_path)`` where ``doc_id`` is a freshly generated UUID4
        string and ``file_path`` is ``<UPLOAD_FOLDER>/<doc_id><ext>``.

    Raises:
        HTTPException: 400 if the extension is not in
            ``Config.ALLOWED_EXTENSIONS`` (including a missing filename);
            413 if the payload is larger than ``Config.MAX_DOC_SIZE``.
    """
    # ``filename`` may be None; treat that as "no extension" so it is
    # rejected with a 400 instead of crashing inside splitext.
    ext = os.path.splitext(file.filename or "")[1].lower()
    if ext not in Config.ALLOWED_EXTENSIONS:
        raise HTTPException(400, "Unsupported file type")

    # Read one byte past the limit: enough to detect an oversize upload
    # without pulling an arbitrarily large body into memory.
    payload = file.file.read(Config.MAX_DOC_SIZE + 1)
    if len(payload) > Config.MAX_DOC_SIZE:
        raise HTTPException(413, "File too large")

    os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True)
    doc_id = str(uuid.uuid4())
    file_path = os.path.join(Config.UPLOAD_FOLDER, f"{doc_id}{ext}")
    with open(file_path, "wb") as f:
        f.write(payload)

    return doc_id, file_path
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text(file_path: str) -> str:
    """Return the text content of the document stored at ``file_path``.

    Only plain-text (``.txt``) files are handled so far.

    Args:
        file_path: Path to a saved document.

    Returns:
        The decoded text of the file.

    Raises:
        NotImplementedError: For any extension other than ``.txt``. This
            replaces the previous stub, which silently returned ``None``.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".txt":
        # errors="replace" keeps a stray non-UTF-8 byte from aborting the
        # whole upload; it shows up as U+FFFD in the extracted text.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()

    # TODO: add .pdf and .docx extraction (PyPDF2 and python-docx are listed
    # in requirements.txt but are not imported by this module yet).
    raise NotImplementedError(
        f"Text extraction for '{ext}' files is not implemented yet"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_compliance(text: str) -> dict:
    """Run the Groq -> Claude -> rerank pipeline over a document's text.

    Args:
        text: The extracted document text.

    Returns:
        A dict with:
            ``summary``: content of the first Groq completion choice,
            ``issues``: Claude's text output passed through the reranker
                (empty list when Claude returned no text),
            ``timestamp``: ``datetime.now().isoformat()`` at return time.
    """
    # Parsing with Groq
    groq_response = groq_client.chat.completions.create(
        messages=[{"role": "user", "content": f"Extract key sections from this document:\n{text}"}],
        model=Config.GROQ_MODEL,
    )

    # Reasoning with Claude
    claude_response = claude_client.messages.create(
        model=Config.CLAUDE_MODEL,
        max_tokens=4000,
        messages=[
            {
                "role": "user",
                "content": f"Analyze this document for compliance issues:\n{text}",
            }
        ],
    )

    # Keep only content blocks that actually carry text; a block without a
    # ``text`` attribute would otherwise raise AttributeError here.
    issues = [
        block.text
        for block in claude_response.content
        if getattr(block, "text", None)
    ]

    # Rerank by importance (skipped when there is nothing to rank, so the
    # reranker is never called with an empty document list).
    ranked_issues = (
        embeddings.rerank_issues(
            issues=issues,
            query="Most critical compliance issues",
        )
        if issues
        else []
    )

    return {
        "summary": groq_response.choices[0].message.content,
        "issues": ranked_issues,
        "timestamp": datetime.now().isoformat(),
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/upload-document")
async def upload_document(file: UploadFile = File(...)):
    """Save, embed, index and analyse a newly uploaded document.

    Args:
        file: The uploaded document.

    Returns:
        JSON with ``document_id``, ``status`` ("analysis_complete") and the
        ``analysis`` dict produced by ``analyze_compliance``.

    Raises:
        HTTPException: Any HTTPException raised by the helpers (e.g. the 400
            for an unsupported file type) is passed through unchanged; 422 if
            no text could be extracted; 500 for any other failure.
    """
    try:
        doc_id, file_path = save_document(file)
        text = extract_text(file_path)
        if not text:
            # Without text there is nothing to embed or analyse.
            raise HTTPException(422, "No text could be extracted from the document")
        embedding = embeddings.generate_embeddings(text)

        # Store in vector DB
        vector_store.upsert_document(
            doc_id=doc_id,
            embedding=embedding,
            metadata={
                "filename": file.filename,
                "upload_time": datetime.now().isoformat(),
                "status": "pending",
            },
        )

        # Start analysis
        analysis = analyze_compliance(text)

        return JSONResponse({
            "document_id": doc_id,
            "status": "analysis_complete",
            "analysis": analysis,
        })
    except HTTPException:
        # Deliberate client-facing errors must keep their own status code
        # instead of being rewrapped as a 500 by the handler below.
        raise
    except Exception as e:
        raise HTTPException(500, str(e)) from e
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/document/{doc_id}/analysis")
async def get_analysis(doc_id: str):
    """Return the stored metadata (and analysis, if any) for a document.

    Args:
        doc_id: Identifier of the document in the vector store.

    Returns:
        JSON with ``document_id``, ``metadata`` and ``analysis``.

    Raises:
        HTTPException: 404 if the vector store has no entry for ``doc_id``.
    """
    doc = vector_store.get_document(doc_id)
    if not doc:
        raise HTTPException(404, "Document not found")

    # A stored vector may carry no metadata at all; fall back to an empty
    # dict so the ``.get`` below cannot fail on None.
    metadata = doc.metadata or {}

    # NOTE(review): the upload/resubmit handlers in this file never write an
    # "analysis" key into the metadata, so this currently falls back to {}.
    return JSONResponse({
        "document_id": doc_id,
        "metadata": metadata,
        "analysis": metadata.get("analysis", {}),
    })
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/document/{doc_id}/resubmit")
async def resubmit_document(doc_id: str, file: UploadFile = File(...)):
    """Process a revised version of an already-uploaded document.

    The new file is stored under a new document id whose metadata points
    back to the original via ``original_id``.

    Args:
        doc_id: Identifier of the original document.
        file: The revised document.

    Returns:
        JSON with the new ``document_id``, ``status`` ("analysis_complete")
        and the ``analysis`` dict produced by ``analyze_compliance``.

    Raises:
        HTTPException: 404 if the original document does not exist; any
            HTTPException from the helpers is passed through unchanged; 422
            if no text could be extracted; 500 for any other failure.
    """
    try:
        # Verify original exists
        original = vector_store.get_document(doc_id)
        if not original:
            raise HTTPException(404, "Original document not found")

        # Process new version
        new_doc_id, file_path = save_document(file)
        text = extract_text(file_path)
        if not text:
            # Without text there is nothing to embed or analyse.
            raise HTTPException(422, "No text could be extracted from the document")
        embedding = embeddings.generate_embeddings(text)

        # Store new version
        vector_store.upsert_document(
            doc_id=new_doc_id,
            embedding=embedding,
            metadata={
                "filename": file.filename,
                "upload_time": datetime.now().isoformat(),
                "status": "resubmitted",
                "original_id": doc_id,
            },
        )

        # Analyze new version
        analysis = analyze_compliance(text)

        return JSONResponse({
            "document_id": new_doc_id,
            "status": "analysis_complete",
            "analysis": analysis,
        })
    except HTTPException:
        # Keep the 404/400/422 raised above; without this clause the generic
        # handler below would turn every one of them into a 500.
        raise
    except Exception as e:
        raise HTTPException(500, str(e)) from e
|
||||||
|
|
||||||
|
# Script entry point: serve the app with uvicorn when this file is run directly.
if __name__ == "__main__":
    import uvicorn

    # Listens on all interfaces (0.0.0.0), port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
fastapi
|
||||||
|
uvicorn
|
||||||
|
python-dotenv
|
||||||
|
cohere
|
||||||
|
pinecone
|
||||||
|
groq
|
||||||
|
anthropic
|
||||||
|
PyPDF2
|
||||||
|
python-docx
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
from .config import Config
|
||||||
|
from pinecone import Pinecone
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class VectorStore:
    """Document vector storage backed by a Pinecone index."""

    def __init__(self):
        """Connect to the index named by ``Config.PINECONE_INDEX``.

        Raises:
            ValueError: If ``Config.VECTOR_STORE_TYPE`` is anything other
                than ``"pinecone"``. Failing here avoids a confusing
                AttributeError on ``self.index`` at first use.
        """
        if Config.VECTOR_STORE_TYPE != "pinecone":
            raise ValueError(
                f"Unsupported VECTOR_STORE_TYPE: {Config.VECTOR_STORE_TYPE!r}"
            )
        self.pc = Pinecone(api_key=Config.PINECONE_API_KEY)
        self.index = self.pc.Index(Config.PINECONE_INDEX)

    def upsert_document(self, doc_id: str, embedding: List[float], metadata: dict):
        """Insert or overwrite one vector.

        Args:
            doc_id: Vector id.
            embedding: The vector values.
            metadata: Metadata stored alongside the vector.
        """
        self.index.upsert(
            vectors=[{
                "id": doc_id,
                "values": embedding,
                "metadata": metadata,
            }]
        )

    def search_similar(self, embedding: List[float], top_k: int = 3):
        """Query the index for the ``top_k`` nearest vectors, with metadata.

        Returns:
            The raw response object from ``index.query``.
        """
        return self.index.query(
            vector=embedding,
            top_k=top_k,
            include_metadata=True,
        )

    def get_document(self, doc_id: str) -> Optional[object]:
        """Fetch a single stored vector by id.

        Returns:
            The entry for ``doc_id`` from the fetch response's ``vectors``
            mapping, or ``None`` when the id is not present.
        """
        fetch_response = self.index.fetch(ids=[doc_id])
        return fetch_response.vectors.get(doc_id)
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
git init
|
||||||
|
git checkout -b main
|
||||||
|
git add README.md
|
||||||
|
git commit -m "initial commit"
|
||||||
|
git remote add origin http://23.29.118.76:3000/ayomide/ds_task_scp.git
|
||||||
|
git push -u origin main
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
# Mini SpecsComply Pro (SCP)
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Mini SpecsComply Pro (SCP) is a lightweight document compliance and validation tool designed to analyze and verify technical documents against predefined standards and project-specific requirements. It leverages advanced AI models for embedding, reasoning, and ranking to ensure fast and accurate document processing.
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
- **Document Embedding:** Uses Cohere Embedding Model to generate vector representations for efficient comparison.
|
||||||
|
- **Fast LLM Processing:** GROQ LLM provides rapid document parsing and analysis.
|
||||||
|
- **Advanced Reasoning:** Claude 3.5 Sonnet is used for deep reasoning and compliance verification.
|
||||||
|
- **Enhanced Ranking:** Cohere Reranker ensures the most relevant compliance issues are prioritized.
|
||||||
|
- **Structured Compliance Feedback:** Generates summaries and detailed issue breakdowns for document corrections.
|
||||||
|
- **Efficient Resubmission Workflow:** Allows users to revise and resubmit documents based on feedback.
|
||||||
|
|
||||||
|
## Tech Stack
|
||||||
|
|
||||||
|
- **Backend:** Python (FastAPI or Flask for API development)
|
||||||
|
- **Vector Database:** Pinecone or Weaviate for document embeddings storage and retrieval
|
||||||
|
- **LLMs:**
|
||||||
|
|
||||||
|
- GROQ for quick responses
|
||||||
|
- Claude 3.5 Sonnet for reasoning
|
||||||
|
- **Embedding & Reranking:**
|
||||||
|
|
||||||
|
- Cohere Embedding Model
|
||||||
|
- Cohere Reranker
|
||||||
|
|
||||||
|
## Workflow
|
||||||
|
|
||||||
|
1. **Document Upload**
|
||||||
|
|
||||||
|
- User uploads a document for compliance verification.
|
||||||
|
- Document is converted into embeddings using the Cohere Embedding Model.
|
||||||
|
- Stored in the vector database for efficient retrieval.
|
||||||
|
2. **Processing & Analysis**
|
||||||
|
|
||||||
|
- GROQ LLM parses the document and extracts key sections.
|
||||||
|
- Claude 3.5 Sonnet performs reasoning to check compliance against standards.
|
||||||
|
- Cohere Reranker prioritizes the most critical compliance issues.
|
||||||
|
3. **Compliance Report Generation**
|
||||||
|
|
||||||
|
- A structured report is generated, including:
|
||||||
|
- **Summary of Findings**
|
||||||
|
- **Detailed Compliance Issues**
|
||||||
|
- **Recommended Fixes**
|
||||||
|
4. **Feedback & Resubmission**
|
||||||
|
|
||||||
|
- User receives feedback and revises the document.
|
||||||
|
- Resubmitted documents undergo the same pipeline for re-evaluation.
|
||||||
|
|
||||||
|
## API Endpoints (Example)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
POST /upload-document
|
||||||
|
- Uploads a document for analysis
|
||||||
|
- Returns document ID for tracking
|
||||||
|
|
||||||
|
GET /document/{doc_id}/analysis
|
||||||
|
- Retrieves the compliance report for a document
|
||||||
|
|
||||||
|
POST /document/{doc_id}/resubmit
|
||||||
|
- Allows resubmission of a revised document
|
||||||
|
```
|
||||||
Reference in New Issue
Block a user