Using tika to support multiple file formats

This commit is contained in:
boladeE
2025-04-28 22:00:47 +01:00
parent 4a10c67c93
commit 1b7fb0c64b
6 changed files with 69 additions and 8 deletions
+32
View File
@@ -862,3 +862,35 @@ sqlite3.OperationalError: table analysis has no column named issues_and_recommen
2025-04-22 11:55:33,740 - root - INFO - Stored embedding for document 9dc21524-8c93-427b-a6cc-04b7585a9545 2025-04-22 11:55:33,740 - root - INFO - Stored embedding for document 9dc21524-8c93-427b-a6cc-04b7585a9545
2025-04-22 11:56:27,580 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully 2025-04-22 11:56:27,580 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully
2025-04-22 11:56:27,588 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully 2025-04-22 11:56:27,588 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully
2025-04-23 22:03:47,212 - root - INFO - Processing upload for document ID: 65dd5884-72ef-4eb4-bb03-b9fad0886da4
2025-04-23 22:03:47,215 - root - INFO - File saved to data/uploads/65dd5884-72ef-4eb4-bb03-b9fad0886da4_3.Bill of Quantities.docx
2025-04-23 22:03:47,260 - root - INFO - Document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 upload initiated successfully
2025-04-23 22:03:47,905 - root - INFO - Processing document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 with content length: 2057
2025-04-23 22:03:48,399 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2025-04-23 22:03:49,194 - root - INFO - Stored embedding for document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 in namespace ''
2025-04-23 22:04:54,737 - root - INFO - Document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 processed successfully
2025-04-23 22:08:00,451 - root - INFO - Processing upload for document ID: 74bcb446-dccb-4707-afb3-dbd45abba9a2
2025-04-23 22:08:00,455 - root - INFO - File saved to data/uploads/74bcb446-dccb-4707-afb3-dbd45abba9a2_8.form of tender.docx
2025-04-23 22:08:00,522 - root - INFO - Document 74bcb446-dccb-4707-afb3-dbd45abba9a2 upload initiated successfully
2025-04-23 22:08:01,027 - root - INFO - Processing document 74bcb446-dccb-4707-afb3-dbd45abba9a2 with content length: 523
2025-04-23 22:08:01,262 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2025-04-23 22:08:01,797 - root - INFO - Stored embedding for document 74bcb446-dccb-4707-afb3-dbd45abba9a2 in namespace 'gcp-starter'
2025-04-23 22:08:46,050 - root - INFO - Document 74bcb446-dccb-4707-afb3-dbd45abba9a2 processed successfully
2025-04-28 21:04:14,200 - root - INFO - Processing upload for document ID: de1e735c-15a7-4d03-9fbf-92e8acc822fd
2025-04-28 21:04:14,200 - root - INFO - File saved to data/uploads/de1e735c-15a7-4d03-9fbf-92e8acc822fd_2.Tender Specifications.docx
2025-04-28 21:04:14,215 - root - INFO - Document de1e735c-15a7-4d03-9fbf-92e8acc822fd upload initiated successfully
2025-04-28 21:04:16,258 - tika.tika - INFO - Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/3.1.0/tika-server-standard-3.1.0.jar to C:\Users\babaw\AppData\Local\Temp\tika-server.jar.
2025-04-28 21:04:52,162 - tika.tika - INFO - Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/3.1.0/tika-server-standard-3.1.0.jar.md5 to C:\Users\babaw\AppData\Local\Temp\tika-server.jar.md5.
2025-04-28 21:04:53,338 - tika.tika - WARNING - Failed to see startup log message; retrying...
2025-04-28 21:04:58,353 - tika.tika - WARNING - Failed to see startup log message; retrying...
2025-04-28 21:05:03,354 - tika.tika - WARNING - Failed to see startup log message; retrying...
2025-04-28 21:05:08,367 - tika.tika - ERROR - Tika startup log message not received after 3 tries.
2025-04-28 21:05:08,367 - tika.tika - ERROR - Failed to receive startup confirmation from startServer.
2025-04-28 21:05:08,367 - root - ERROR - Error parsing document with Tika: Unable to start Tika server.
2025-04-28 21:05:08,369 - root - INFO - Processing document de1e735c-15a7-4d03-9fbf-92e8acc822fd with content length: 0
2025-04-28 21:05:19,572 - root - ERROR - Error processing document de1e735c-15a7-4d03-9fbf-92e8acc822fd: [Errno 11001] getaddrinfo failed
2025-04-28 21:53:58,710 - root - INFO - Processing upload for document ID: c7b13f74-699b-4735-b9bc-a3c2e66545a6
2025-04-28 21:53:58,715 - root - INFO - File saved to data/uploads/c7b13f74-699b-4735-b9bc-a3c2e66545a6_9.confidentiality agreement.pdf
2025-04-28 21:53:58,731 - root - INFO - Document c7b13f74-699b-4735-b9bc-a3c2e66545a6 upload initiated successfully
2025-04-28 21:53:59,507 - root - INFO - Processing document c7b13f74-699b-4735-b9bc-a3c2e66545a6 with content length: 217
2025-04-28 21:54:10,920 - root - ERROR - Error processing document c7b13f74-699b-4735-b9bc-a3c2e66545a6: [Errno 11001] getaddrinfo failed
Binary file not shown.
+1
View File
@@ -7,3 +7,4 @@ groq==0.4.2
python-dotenv==1.0.1 python-dotenv==1.0.1
pydantic==2.6.3 pydantic==2.6.3
sqlalchemy==2.0.27 sqlalchemy==2.0.27
tika==2.6.0
+26
View File
@@ -0,0 +1,26 @@
from tika import parser
import tika
# Standalone smoke test: parse one document with Apache Tika and print the
# extracted text followed by its metadata.

# Path to the file you want to parse
file_path = "C:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\ds_task_scp\\data\\9.confidentiality agreement.pdf"

try:
    # Parse the file. Tika runs through a local server process; if that
    # server cannot be started the call raises and is reported by the
    # generic handler below.
    parsed = parser.from_file(file_path)

    # Extract content. Tika reports None when it could not extract any text,
    # so fall back to an empty string instead of printing the literal "None".
    content = parsed.get("content") or ""
    print("--- Content ---")
    print(content.strip())

    # Extract metadata. Fall back to an empty mapping so a missing metadata
    # entry yields an empty section rather than an AttributeError.
    metadata = parsed.get("metadata") or {}
    print("\n--- Metadata ---")
    for key, value in metadata.items():
        print(f"{key}: {value}")
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
except Exception as e:
    # Top-level boundary of a throwaway script: report the failure and exit
    # normally instead of dumping a traceback.
    print(f"An error occurred: {e}")
+1 -1
View File
@@ -10,7 +10,7 @@ class Settings:
COHERE_API_KEY: str = os.getenv("COHERE_API_KEY", "") COHERE_API_KEY: str = os.getenv("COHERE_API_KEY", "")
DEEPSEEK_API_KEY: str = os.getenv("DEEPSEEK_API_KEY", "") DEEPSEEK_API_KEY: str = os.getenv("DEEPSEEK_API_KEY", "")
PINECONE_API_KEY: str = os.getenv("PINECONE_API_KEY", "") PINECONE_API_KEY: str = os.getenv("PINECONE_API_KEY", "")
PINECONE_ENVIRONMENT: str = os.getenv("PINECONE_ENVIRONMENT", "") PINECONE_NAMESPACE: str = os.getenv("PINECONE_NAMESPACE", "document-compliance")
# Vector DB Settings # Vector DB Settings
PINECONE_INDEX_NAME: str = "document-compliance" PINECONE_INDEX_NAME: str = "document-compliance"
+7 -5
View File
@@ -6,6 +6,7 @@ import os
import logging import logging
from services.config import config from services.config import config
from services.database import Database from services.database import Database
from tika import parser
class DocumentProcessor: class DocumentProcessor:
def __init__(self, vector_store, database: Optional[Database] = None): def __init__(self, vector_store, database: Optional[Database] = None):
@@ -21,13 +22,14 @@ class DocumentProcessor:
def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False): def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False):
try: try:
# Read document content with error handling for encoding # Read document content using Apache Tika for multiple formats
try: try:
import docx parsed = parser.from_file(file_path)
doc = docx.Document(file_path) content = parsed["content"] or ""
content = "\n".join([para.text for para in doc.paragraphs]) if not content.strip():
logging.warning(f"Tika parsed empty content from {file_path}")
except Exception as e: except Exception as e:
logging.error(f"Error reading Word document: {str(e)}") logging.error(f"Error parsing document with Tika: {str(e)}")
content = "" content = ""
logging.info(f"Processing document {doc_id} with content length: {len(content)}") logging.info(f"Processing document {doc_id} with content length: {len(content)}")