feat: Implement background document processing and namespace support

- Added background task processing for document uploads to improve responsiveness.
- Updated DocumentProcessor.process_document from an async coroutine to a synchronous method so it can be run as a background task.
- Introduced namespace configuration in VectorStore for better organization of stored embeddings.
- Enhanced logging to reflect changes in document processing and embedding storage.
This commit is contained in:
boladeE
2025-04-23 21:45:45 +01:00
parent 932f76b603
commit 4a10c67c93
4 changed files with 24 additions and 12 deletions
+13 -5
View File
@@ -1,4 +1,4 @@
from fastapi import FastAPI, UploadFile, File, HTTPException, Form, Request
from fastapi import FastAPI, UploadFile, File, HTTPException, Form, Request, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
@@ -90,6 +90,7 @@ async def documents(request: Request):
@app.post("/upload-document")
async def upload_document(
background_tasks: BackgroundTasks,
file: UploadFile = File(...),
document_type: str = Form(...),
):
@@ -108,8 +109,8 @@ async def upload_document(
logging.info(f"File saved to {file_path}")
# Process the document
await document_processor.process_document(doc_id, file_path, document_type)
# Add document processing to background tasks
background_tasks.add_task(document_processor.process_document, doc_id, file_path, document_type)
# Save document metadata
metadata = {
@@ -121,14 +122,21 @@ async def upload_document(
# Save metadata to database
database.save_metadata(doc_id, metadata)
logging.info(f"Document {doc_id} processed successfully")
return {"document_id": doc_id, "message": "Document uploaded and processed successfully"}
logging.info(f"Document {doc_id} upload initiated successfully")
return {"document_id": doc_id, "message": "Document uploaded successfully, processing in background"}
except Exception as e:
error_msg = f"Error processing document: {str(e)}"
logging.error(error_msg)
logging.error(traceback.format_exc())
raise HTTPException(status_code=500, detail=error_msg)
def process_document_background(doc_id: str, file_path: str, document_type: str) -> None:
    """Process an uploaded document, logging the outcome instead of raising.

    Delegates to ``document_processor.process_document`` and catches any
    exception it raises, so a failure is recorded in the log rather than
    propagated to the caller.

    NOTE(review): the visible upload handler registers
    ``document_processor.process_document`` directly with
    ``background_tasks.add_task`` rather than this wrapper; confirm which
    of the two is meant to be scheduled.

    Args:
        doc_id: Identifier of the document being processed.
        file_path: Location of the saved upload on disk.
        document_type: Document type submitted with the upload form.
    """
    try:
        document_processor.process_document(doc_id, file_path, document_type)
        logging.info(f"Document {doc_id} processed successfully in background")
    except Exception as e:
        # logging.exception keeps the same message but also records the
        # traceback, matching the upload handler's error logging; without it
        # a failure here cannot be traced to its source.
        logging.exception(f"Error processing document {doc_id} in background: {str(e)}")
@app.get("/document/{doc_id}/analysis", response_class=HTMLResponse)
async def get_analysis(request: Request, doc_id: str):
try:
+1 -1
View File
@@ -18,7 +18,7 @@ class DocumentProcessor:
}
self.database = Database() or database
async def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False):
def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False):
try:
# Read document content with error handling for encoding
+8 -5
View File
@@ -9,6 +9,7 @@ class VectorStore:
def __init__(self, pinecone_client: Optional[Pinecone] = None, embedding_service: Optional[EmbeddingService] = None):
self.pinecone = pinecone_client or Pinecone(api_key=config.PINECONE_API_KEY)
self.index_name = config.PINECONE_INDEX_NAME
self.namespace = config.PINECONE_NAMESPACE # Added namespace configuration
self.embedding_service = embedding_service or EmbeddingService()
self._ensure_index()
@@ -77,9 +78,10 @@ class VectorStore:
"metadata": {
"content": content
}
}]
}],
namespace=self.namespace # Use the namespace when storing embeddings
)
logging.info(f"Stored embedding for document {doc_id}")
logging.info(f"Stored embedding for document {doc_id} in namespace '{self.namespace}'")
except Exception as e:
logging.error(f"Error storing embedding for document {doc_id}: {str(e)}")
raise
@@ -94,7 +96,8 @@ class VectorStore:
results = self.index.query(
vector=query_embedding,
top_k=top_k,
include_metadata=True
include_metadata=True,
namespace=self.namespace # Use the namespace when querying
)
return results.matches
except Exception as e:
@@ -104,8 +107,8 @@ class VectorStore:
def delete_document(self, doc_id: str):
"""Delete a document from the index."""
try:
self.index.delete(ids=[doc_id])
logging.info(f"Deleted document {doc_id} from index")
self.index.delete(ids=[doc_id], namespace=self.namespace) # Use the namespace when deleting
logging.info(f"Deleted document {doc_id} from namespace '{self.namespace}'")
except Exception as e:
logging.error(f"Error deleting document {doc_id}: {str(e)}")
raise
+2 -1
View File
@@ -48,13 +48,14 @@
{% endif %}
</td>
<td>
{% if doc.status == 'completed' %}
<div class="btn-group">
<a href="/document/{{ doc.document_id }}/analysis" class="btn btn-sm btn-outline-primary">View Analysis</a>
<button type="button" class="btn btn-sm btn-outline-danger" data-bs-toggle="modal" data-bs-target="#deleteModal{{ doc.document_id }}">
Delete
</button>
</div>
{% endif %}
<!-- Delete Modal -->
<div class="modal fade" id="deleteModal{{ doc.document_id }}" tabindex="-1" aria-hidden="true">
<div class="modal-dialog">