diff --git a/app.log b/app.log index 1089802..c0e0073 100644 --- a/app.log +++ b/app.log @@ -862,3 +862,35 @@ sqlite3.OperationalError: table analysis has no column named issues_and_recommen 2025-04-22 11:55:33,740 - root - INFO - Stored embedding for document 9dc21524-8c93-427b-a6cc-04b7585a9545 2025-04-22 11:56:27,580 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully 2025-04-22 11:56:27,588 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully +2025-04-23 22:03:47,212 - root - INFO - Processing upload for document ID: 65dd5884-72ef-4eb4-bb03-b9fad0886da4 +2025-04-23 22:03:47,215 - root - INFO - File saved to data/uploads/65dd5884-72ef-4eb4-bb03-b9fad0886da4_3.Bill of Quantities.docx +2025-04-23 22:03:47,260 - root - INFO - Document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 upload initiated successfully +2025-04-23 22:03:47,905 - root - INFO - Processing document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 with content length: 2057 +2025-04-23 22:03:48,399 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-23 22:03:49,194 - root - INFO - Stored embedding for document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 in namespace '' +2025-04-23 22:04:54,737 - root - INFO - Document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 processed successfully +2025-04-23 22:08:00,451 - root - INFO - Processing upload for document ID: 74bcb446-dccb-4707-afb3-dbd45abba9a2 +2025-04-23 22:08:00,455 - root - INFO - File saved to data/uploads/74bcb446-dccb-4707-afb3-dbd45abba9a2_8.form of tender.docx +2025-04-23 22:08:00,522 - root - INFO - Document 74bcb446-dccb-4707-afb3-dbd45abba9a2 upload initiated successfully +2025-04-23 22:08:01,027 - root - INFO - Processing document 74bcb446-dccb-4707-afb3-dbd45abba9a2 with content length: 523 +2025-04-23 22:08:01,262 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-23 22:08:01,797 - root - INFO - Stored embedding for document 74bcb446-dccb-4707-afb3-dbd45abba9a2 in namespace 'gcp-starter' +2025-04-23 22:08:46,050 - root - INFO - Document 74bcb446-dccb-4707-afb3-dbd45abba9a2 processed successfully +2025-04-28 21:04:14,200 - root - INFO - Processing upload for document ID: de1e735c-15a7-4d03-9fbf-92e8acc822fd +2025-04-28 21:04:14,200 - root - INFO - File saved to data/uploads/de1e735c-15a7-4d03-9fbf-92e8acc822fd_2.Tender Specifications.docx +2025-04-28 21:04:14,215 - root - INFO - Document de1e735c-15a7-4d03-9fbf-92e8acc822fd upload initiated successfully +2025-04-28 21:04:16,258 - tika.tika - INFO - Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/3.1.0/tika-server-standard-3.1.0.jar to C:\Users\babaw\AppData\Local\Temp\tika-server.jar. +2025-04-28 21:04:52,162 - tika.tika - INFO - Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/3.1.0/tika-server-standard-3.1.0.jar.md5 to C:\Users\babaw\AppData\Local\Temp\tika-server.jar.md5. +2025-04-28 21:04:53,338 - tika.tika - WARNING - Failed to see startup log message; retrying... +2025-04-28 21:04:58,353 - tika.tika - WARNING - Failed to see startup log message; retrying... +2025-04-28 21:05:03,354 - tika.tika - WARNING - Failed to see startup log message; retrying... +2025-04-28 21:05:08,367 - tika.tika - ERROR - Tika startup log message not received after 3 tries. +2025-04-28 21:05:08,367 - tika.tika - ERROR - Failed to receive startup confirmation from startServer. +2025-04-28 21:05:08,367 - root - ERROR - Error parsing document with Tika: Unable to start Tika server. +2025-04-28 21:05:08,369 - root - INFO - Processing document de1e735c-15a7-4d03-9fbf-92e8acc822fd with content length: 0 +2025-04-28 21:05:19,572 - root - ERROR - Error processing document de1e735c-15a7-4d03-9fbf-92e8acc822fd: [Errno 11001] getaddrinfo failed +2025-04-28 21:53:58,710 - root - INFO - Processing upload for document ID: c7b13f74-699b-4735-b9bc-a3c2e66545a6 +2025-04-28 21:53:58,715 - root - INFO - File saved to data/uploads/c7b13f74-699b-4735-b9bc-a3c2e66545a6_9.confidentiality agreement.pdf +2025-04-28 21:53:58,731 - root - INFO - Document c7b13f74-699b-4735-b9bc-a3c2e66545a6 upload initiated successfully +2025-04-28 21:53:59,507 - root - INFO - Processing document c7b13f74-699b-4735-b9bc-a3c2e66545a6 with content length: 217 +2025-04-28 21:54:10,920 - root - ERROR - Error processing document c7b13f74-699b-4735-b9bc-a3c2e66545a6: [Errno 11001] getaddrinfo failed diff --git a/data/9.confidentiality agreement.pdf b/data/9.confidentiality agreement.pdf new file mode 100644 index 0000000..f59dfa0 Binary files /dev/null and b/data/9.confidentiality agreement.pdf differ diff --git a/requirements.txt b/requirements.txt index 5653fce..45f2d65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ cohere==4.47 groq==0.4.2 python-dotenv==1.0.1 pydantic==2.6.3 -sqlalchemy==2.0.27 \ No newline at end of file +sqlalchemy==2.0.27 +tika==2.6.0 diff --git a/src/file_reader.py b/src/file_reader.py new file mode 100644 index 0000000..c4df886 --- /dev/null +++ b/src/file_reader.py @@ -0,0 +1,26 @@ +from tika import parser +import tika + +# Path to the file you want to parse +file_path = "C:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\ds_task_scp\\data\\9.confidentiality agreement.pdf" + + +try: + # Parse the file + parsed = parser.from_file(file_path) + + # Extract content + content = parsed["content"] + print("--- Content ---") + print(str(content).strip()) + + # Extract metadata + metadata = parsed["metadata"] + print("\n--- Metadata ---") + for key, value in metadata.items(): + print(f"{key}: {value}") + +except FileNotFoundError: + print(f"Error: File not found at {file_path}") +except Exception as e: + print(f"An error occurred: {e}") \ No newline at end of file diff --git a/src/services/config.py b/src/services/config.py index 7d55410..39edab2 100644 --- a/src/services/config.py +++ b/src/services/config.py @@ -10,7 +10,7 @@ class Settings: COHERE_API_KEY: str = os.getenv("COHERE_API_KEY", "") DEEPSEEK_API_KEY: str = os.getenv("DEEPSEEK_API_KEY", "") PINECONE_API_KEY: str = os.getenv("PINECONE_API_KEY", "") - PINECONE_ENVIRONMENT: str = os.getenv("PINECONE_ENVIRONMENT", "") + PINECONE_NAMESPACE: str = os.getenv("PINECONE_NAMESPACE", "document-compliance") # Vector DB Settings PINECONE_INDEX_NAME: str = "document-compliance" diff --git a/src/services/document_processor.py b/src/services/document_processor.py index 09045bb..3e283be 100644 --- a/src/services/document_processor.py +++ b/src/services/document_processor.py @@ -6,6 +6,7 @@ import os import logging from services.config import config from services.database import Database +from tika import parser class DocumentProcessor: def __init__(self, vector_store, database: Optional[Database] = None): @@ -21,13 +22,14 @@ class DocumentProcessor: def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False): try: - # Read document content with error handling for encoding + # Read document content using Apache Tika for multiple formats try: - import docx - doc = docx.Document(file_path) - content = "\n".join([para.text for para in doc.paragraphs]) + parsed = parser.from_file(file_path) + content = parsed["content"] or "" + if not content.strip(): + logging.warning(f"Tika parsed empty content from {file_path}") except Exception as e: - logging.error(f"Error reading Word document: {str(e)}") + logging.error(f"Error parsing document with Tika: {str(e)}") content = "" logging.info(f"Processing document {doc_id} with content length: {len(content)}") @@ -293,4 +295,4 @@ class DocumentProcessor: except Exception as e: logging.error(f"Error reranking issues: {str(e)}") # If reranking fails, return the original order - return issues_and_recommendations \ No newline at end of file + return issues_and_recommendations