From 4a10c67c93aaf01f89d7b5c074f9f790a432d5a7 Mon Sep 17 00:00:00 2001 From: boladeE Date: Wed, 23 Apr 2025 21:45:45 +0100 Subject: [PATCH] feat: Implement background document processing and namespace support - Added background task processing for document uploads to improve responsiveness. - Updated the DocumentProcessor to use synchronous processing. - Introduced namespace configuration in VectorStore for better organization of stored embeddings. - Enhanced logging to reflect changes in document processing and embedding storage. --- src/main.py | 18 +++++++++++++----- src/services/document_processor.py | 2 +- src/services/vector_store.py | 13 ++++++++----- src/templates/documents.html | 3 ++- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/main.py b/src/main.py index a5c02f1..65cba9a 100644 --- a/src/main.py +++ b/src/main.py @@ -1,4 +1,4 @@ -from fastapi import FastAPI, UploadFile, File, HTTPException, Form, Request +from fastapi import FastAPI, UploadFile, File, HTTPException, Form, Request, BackgroundTasks from fastapi.middleware.cors import CORSMiddleware from fastapi.templating import Jinja2Templates from fastapi.staticfiles import StaticFiles @@ -90,6 +90,7 @@ async def documents(request: Request): @app.post("/upload-document") async def upload_document( + background_tasks: BackgroundTasks, file: UploadFile = File(...), document_type: str = Form(...), ): @@ -108,8 +109,8 @@ async def upload_document( logging.info(f"File saved to {file_path}") - # Process the document - await document_processor.process_document(doc_id, file_path, document_type) + # Add document processing to background tasks + background_tasks.add_task(document_processor.process_document, doc_id, file_path, document_type) # Save document metadata metadata = { @@ -121,14 +122,21 @@ async def upload_document( # Save metadata to database database.save_metadata(doc_id, metadata) - logging.info(f"Document {doc_id} processed successfully") - return {"document_id": doc_id, "message": "Document uploaded and processed successfully"} + logging.info(f"Document {doc_id} upload initiated successfully") + return {"document_id": doc_id, "message": "Document uploaded successfully, processing in background"} except Exception as e: error_msg = f"Error processing document: {str(e)}" logging.error(error_msg) logging.error(traceback.format_exc()) raise HTTPException(status_code=500, detail=error_msg) +def process_document_background(doc_id: str, file_path: str, document_type: str): + try: + document_processor.process_document(doc_id, file_path, document_type) + logging.info(f"Document {doc_id} processed successfully in background") + except Exception as e: + logging.error(f"Error processing document {doc_id} in background: {str(e)}") + @app.get("/document/{doc_id}/analysis", response_class=HTMLResponse) async def get_analysis(request: Request, doc_id: str): try: diff --git a/src/services/document_processor.py b/src/services/document_processor.py index c5afb8b..09045bb 100644 --- a/src/services/document_processor.py +++ b/src/services/document_processor.py @@ -18,7 +18,7 @@ class DocumentProcessor: } self.database = Database() or database - async def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False): + def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False): try: # Read document content with error handling for encoding diff --git a/src/services/vector_store.py b/src/services/vector_store.py index 839a4f2..27917ff 100644 --- a/src/services/vector_store.py +++ b/src/services/vector_store.py @@ -9,6 +9,7 @@ class VectorStore: def __init__(self, pinecone_client: Optional[Pinecone] = None, embedding_service: Optional[EmbeddingService] = None): self.pinecone = pinecone_client or Pinecone(api_key=config.PINECONE_API_KEY) self.index_name = config.PINECONE_INDEX_NAME + self.namespace = config.PINECONE_NAMESPACE # Added namespace configuration self.embedding_service = embedding_service or EmbeddingService() self._ensure_index() @@ -77,9 +78,10 @@ class VectorStore: "metadata": { "content": content } - }] + }], + namespace=self.namespace # Use the namespace when storing embeddings ) - logging.info(f"Stored embedding for document {doc_id}") + logging.info(f"Stored embedding for document {doc_id} in namespace '{self.namespace}'") except Exception as e: logging.error(f"Error storing embedding for document {doc_id}: {str(e)}") raise @@ -94,7 +96,8 @@ class VectorStore: results = self.index.query( vector=query_embedding, top_k=top_k, - include_metadata=True + include_metadata=True, + namespace=self.namespace # Use the namespace when querying ) return results.matches except Exception as e: @@ -104,8 +107,8 @@ class VectorStore: def delete_document(self, doc_id: str): """Delete a document from the index.""" try: - self.index.delete(ids=[doc_id]) - logging.info(f"Deleted document {doc_id} from index") + self.index.delete(ids=[doc_id], namespace=self.namespace) # Use the namespace when deleting + logging.info(f"Deleted document {doc_id} from namespace '{self.namespace}'") except Exception as e: logging.error(f"Error deleting document {doc_id}: {str(e)}") raise diff --git a/src/templates/documents.html b/src/templates/documents.html index f54e1dc..a401da2 100644 --- a/src/templates/documents.html +++ b/src/templates/documents.html @@ -48,13 +48,14 @@ {% endif %} + {% if doc.status == 'completed' %}
View Analysis
- + {% endif %}