feat: Implement background document processing and namespace support

- Added background task processing for document uploads to improve responsiveness.
- Changed DocumentProcessor.process_document from an async coroutine to a regular synchronous method.
- Introduced namespace configuration in VectorStore for better organization of stored embeddings.
- Enhanced logging to reflect changes in document processing and embedding storage.
This commit is contained in:
boladeE
2025-04-23 21:45:45 +01:00
parent 932f76b603
commit 4a10c67c93
4 changed files with 24 additions and 12 deletions
+13 -5
View File
@@ -1,4 +1,4 @@
from fastapi import FastAPI, UploadFile, File, HTTPException, Form, Request from fastapi import FastAPI, UploadFile, File, HTTPException, Form, Request, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
@@ -90,6 +90,7 @@ async def documents(request: Request):
@app.post("/upload-document") @app.post("/upload-document")
async def upload_document( async def upload_document(
background_tasks: BackgroundTasks,
file: UploadFile = File(...), file: UploadFile = File(...),
document_type: str = Form(...), document_type: str = Form(...),
): ):
@@ -108,8 +109,8 @@ async def upload_document(
logging.info(f"File saved to {file_path}") logging.info(f"File saved to {file_path}")
# Process the document # Add document processing to background tasks
await document_processor.process_document(doc_id, file_path, document_type) background_tasks.add_task(document_processor.process_document, doc_id, file_path, document_type)
# Save document metadata # Save document metadata
metadata = { metadata = {
@@ -121,14 +122,21 @@ async def upload_document(
# Save metadata to database # Save metadata to database
database.save_metadata(doc_id, metadata) database.save_metadata(doc_id, metadata)
logging.info(f"Document {doc_id} processed successfully") logging.info(f"Document {doc_id} upload initiated successfully")
return {"document_id": doc_id, "message": "Document uploaded and processed successfully"} return {"document_id": doc_id, "message": "Document uploaded successfully, processing in background"}
except Exception as e: except Exception as e:
error_msg = f"Error processing document: {str(e)}" error_msg = f"Error processing document: {str(e)}"
logging.error(error_msg) logging.error(error_msg)
logging.error(traceback.format_exc()) logging.error(traceback.format_exc())
raise HTTPException(status_code=500, detail=error_msg) raise HTTPException(status_code=500, detail=error_msg)
def process_document_background(doc_id: str, file_path: str, document_type: str) -> None:
    """Process a document and log the outcome instead of raising.

    Delegates to ``document_processor.process_document`` and catches any
    exception it raises, so a failure is recorded in the log rather than
    propagated to the caller.

    Args:
        doc_id: Identifier of the document being processed.
        file_path: Path of the document file, passed through to the processor.
        document_type: Document type label, passed through to the processor.

    NOTE(review): the upload handler appears to queue
    ``document_processor.process_document`` directly rather than this
    wrapper -- confirm which of the two is meant to be registered.
    """
    try:
        document_processor.process_document(doc_id, file_path, document_type)
        logging.info(f"Document {doc_id} processed successfully in background")
    except Exception as e:
        # logging.exception logs at ERROR level and appends the traceback;
        # logging only str(e) would discard where the failure happened.
        logging.exception(f"Error processing document {doc_id} in background: {str(e)}")
@app.get("/document/{doc_id}/analysis", response_class=HTMLResponse) @app.get("/document/{doc_id}/analysis", response_class=HTMLResponse)
async def get_analysis(request: Request, doc_id: str): async def get_analysis(request: Request, doc_id: str):
try: try:
+1 -1
View File
@@ -18,7 +18,7 @@ class DocumentProcessor:
} }
self.database = Database() or database self.database = Database() or database
async def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False): def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False):
try: try:
# Read document content with error handling for encoding # Read document content with error handling for encoding
+8 -5
View File
@@ -9,6 +9,7 @@ class VectorStore:
def __init__(self, pinecone_client: Optional[Pinecone] = None, embedding_service: Optional[EmbeddingService] = None): def __init__(self, pinecone_client: Optional[Pinecone] = None, embedding_service: Optional[EmbeddingService] = None):
self.pinecone = pinecone_client or Pinecone(api_key=config.PINECONE_API_KEY) self.pinecone = pinecone_client or Pinecone(api_key=config.PINECONE_API_KEY)
self.index_name = config.PINECONE_INDEX_NAME self.index_name = config.PINECONE_INDEX_NAME
self.namespace = config.PINECONE_NAMESPACE # Added namespace configuration
self.embedding_service = embedding_service or EmbeddingService() self.embedding_service = embedding_service or EmbeddingService()
self._ensure_index() self._ensure_index()
@@ -77,9 +78,10 @@ class VectorStore:
"metadata": { "metadata": {
"content": content "content": content
} }
}] }],
namespace=self.namespace # Use the namespace when storing embeddings
) )
logging.info(f"Stored embedding for document {doc_id}") logging.info(f"Stored embedding for document {doc_id} in namespace '{self.namespace}'")
except Exception as e: except Exception as e:
logging.error(f"Error storing embedding for document {doc_id}: {str(e)}") logging.error(f"Error storing embedding for document {doc_id}: {str(e)}")
raise raise
@@ -94,7 +96,8 @@ class VectorStore:
results = self.index.query( results = self.index.query(
vector=query_embedding, vector=query_embedding,
top_k=top_k, top_k=top_k,
include_metadata=True include_metadata=True,
namespace=self.namespace # Use the namespace when querying
) )
return results.matches return results.matches
except Exception as e: except Exception as e:
@@ -104,8 +107,8 @@ class VectorStore:
def delete_document(self, doc_id: str): def delete_document(self, doc_id: str):
"""Delete a document from the index.""" """Delete a document from the index."""
try: try:
self.index.delete(ids=[doc_id]) self.index.delete(ids=[doc_id], namespace=self.namespace) # Use the namespace when deleting
logging.info(f"Deleted document {doc_id} from index") logging.info(f"Deleted document {doc_id} from namespace '{self.namespace}'")
except Exception as e: except Exception as e:
logging.error(f"Error deleting document {doc_id}: {str(e)}") logging.error(f"Error deleting document {doc_id}: {str(e)}")
raise raise
+2 -1
View File
@@ -48,13 +48,14 @@
{% endif %} {% endif %}
</td> </td>
<td> <td>
{% if doc.status == 'completed' %}
<div class="btn-group"> <div class="btn-group">
<a href="/document/{{ doc.document_id }}/analysis" class="btn btn-sm btn-outline-primary">View Analysis</a> <a href="/document/{{ doc.document_id }}/analysis" class="btn btn-sm btn-outline-primary">View Analysis</a>
<button type="button" class="btn btn-sm btn-outline-danger" data-bs-toggle="modal" data-bs-target="#deleteModal{{ doc.document_id }}"> <button type="button" class="btn btn-sm btn-outline-danger" data-bs-toggle="modal" data-bs-target="#deleteModal{{ doc.document_id }}">
Delete Delete
</button> </button>
</div> </div>
{% endif %}
<!-- Delete Modal --> <!-- Delete Modal -->
<div class="modal fade" id="deleteModal{{ doc.document_id }}" tabindex="-1" aria-hidden="true"> <div class="modal fade" id="deleteModal{{ doc.document_id }}" tabindex="-1" aria-hidden="true">
<div class="modal-dialog"> <div class="modal-dialog">