Using tika to support multiple file formats
This commit is contained in:
@@ -862,3 +862,35 @@ sqlite3.OperationalError: table analysis has no column named issues_and_recommen
|
|||||||
2025-04-22 11:55:33,740 - root - INFO - Stored embedding for document 9dc21524-8c93-427b-a6cc-04b7585a9545
|
2025-04-22 11:55:33,740 - root - INFO - Stored embedding for document 9dc21524-8c93-427b-a6cc-04b7585a9545
|
||||||
2025-04-22 11:56:27,580 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully
|
2025-04-22 11:56:27,580 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully
|
||||||
2025-04-22 11:56:27,588 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully
|
2025-04-22 11:56:27,588 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully
|
||||||
|
2025-04-23 22:03:47,212 - root - INFO - Processing upload for document ID: 65dd5884-72ef-4eb4-bb03-b9fad0886da4
|
||||||
|
2025-04-23 22:03:47,215 - root - INFO - File saved to data/uploads/65dd5884-72ef-4eb4-bb03-b9fad0886da4_3.Bill of Quantities.docx
|
||||||
|
2025-04-23 22:03:47,260 - root - INFO - Document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 upload initiated successfully
|
||||||
|
2025-04-23 22:03:47,905 - root - INFO - Processing document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 with content length: 2057
|
||||||
|
2025-04-23 22:03:48,399 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
|
||||||
|
2025-04-23 22:03:49,194 - root - INFO - Stored embedding for document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 in namespace ''
|
||||||
|
2025-04-23 22:04:54,737 - root - INFO - Document 65dd5884-72ef-4eb4-bb03-b9fad0886da4 processed successfully
|
||||||
|
2025-04-23 22:08:00,451 - root - INFO - Processing upload for document ID: 74bcb446-dccb-4707-afb3-dbd45abba9a2
|
||||||
|
2025-04-23 22:08:00,455 - root - INFO - File saved to data/uploads/74bcb446-dccb-4707-afb3-dbd45abba9a2_8.form of tender.docx
|
||||||
|
2025-04-23 22:08:00,522 - root - INFO - Document 74bcb446-dccb-4707-afb3-dbd45abba9a2 upload initiated successfully
|
||||||
|
2025-04-23 22:08:01,027 - root - INFO - Processing document 74bcb446-dccb-4707-afb3-dbd45abba9a2 with content length: 523
|
||||||
|
2025-04-23 22:08:01,262 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
|
||||||
|
2025-04-23 22:08:01,797 - root - INFO - Stored embedding for document 74bcb446-dccb-4707-afb3-dbd45abba9a2 in namespace 'gcp-starter'
|
||||||
|
2025-04-23 22:08:46,050 - root - INFO - Document 74bcb446-dccb-4707-afb3-dbd45abba9a2 processed successfully
|
||||||
|
2025-04-28 21:04:14,200 - root - INFO - Processing upload for document ID: de1e735c-15a7-4d03-9fbf-92e8acc822fd
|
||||||
|
2025-04-28 21:04:14,200 - root - INFO - File saved to data/uploads/de1e735c-15a7-4d03-9fbf-92e8acc822fd_2.Tender Specifications.docx
|
||||||
|
2025-04-28 21:04:14,215 - root - INFO - Document de1e735c-15a7-4d03-9fbf-92e8acc822fd upload initiated successfully
|
||||||
|
2025-04-28 21:04:16,258 - tika.tika - INFO - Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/3.1.0/tika-server-standard-3.1.0.jar to C:\Users\babaw\AppData\Local\Temp\tika-server.jar.
|
||||||
|
2025-04-28 21:04:52,162 - tika.tika - INFO - Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/3.1.0/tika-server-standard-3.1.0.jar.md5 to C:\Users\babaw\AppData\Local\Temp\tika-server.jar.md5.
|
||||||
|
2025-04-28 21:04:53,338 - tika.tika - WARNING - Failed to see startup log message; retrying...
|
||||||
|
2025-04-28 21:04:58,353 - tika.tika - WARNING - Failed to see startup log message; retrying...
|
||||||
|
2025-04-28 21:05:03,354 - tika.tika - WARNING - Failed to see startup log message; retrying...
|
||||||
|
2025-04-28 21:05:08,367 - tika.tika - ERROR - Tika startup log message not received after 3 tries.
|
||||||
|
2025-04-28 21:05:08,367 - tika.tika - ERROR - Failed to receive startup confirmation from startServer.
|
||||||
|
2025-04-28 21:05:08,367 - root - ERROR - Error parsing document with Tika: Unable to start Tika server.
|
||||||
|
2025-04-28 21:05:08,369 - root - INFO - Processing document de1e735c-15a7-4d03-9fbf-92e8acc822fd with content length: 0
|
||||||
|
2025-04-28 21:05:19,572 - root - ERROR - Error processing document de1e735c-15a7-4d03-9fbf-92e8acc822fd: [Errno 11001] getaddrinfo failed
|
||||||
|
2025-04-28 21:53:58,710 - root - INFO - Processing upload for document ID: c7b13f74-699b-4735-b9bc-a3c2e66545a6
|
||||||
|
2025-04-28 21:53:58,715 - root - INFO - File saved to data/uploads/c7b13f74-699b-4735-b9bc-a3c2e66545a6_9.confidentiality agreement.pdf
|
||||||
|
2025-04-28 21:53:58,731 - root - INFO - Document c7b13f74-699b-4735-b9bc-a3c2e66545a6 upload initiated successfully
|
||||||
|
2025-04-28 21:53:59,507 - root - INFO - Processing document c7b13f74-699b-4735-b9bc-a3c2e66545a6 with content length: 217
|
||||||
|
2025-04-28 21:54:10,920 - root - ERROR - Error processing document c7b13f74-699b-4735-b9bc-a3c2e66545a6: [Errno 11001] getaddrinfo failed
|
||||||
|
|||||||
Binary file not shown.
+2
-1
@@ -6,4 +6,5 @@ cohere==4.47
|
|||||||
groq==0.4.2
|
groq==0.4.2
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
pydantic==2.6.3
|
pydantic==2.6.3
|
||||||
sqlalchemy==2.0.27
|
sqlalchemy==2.0.27
|
||||||
|
tika==2.6.0
|
||||||
|
|||||||
@@ -0,0 +1,26 @@
|
|||||||
|
"""Manual smoke test: parse one document with Apache Tika and print its text and metadata.

Usage:
    python <this script> [path-to-document]

When no path is given, DEFAULT_FILE_PATH is parsed.
"""
import sys

# Path to the file you want to parse (used when no path is passed on the command line)
DEFAULT_FILE_PATH = "C:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\ds_task_scp\\data\\9.confidentiality agreement.pdf"


def format_report(parsed):
    """Render a Tika parse result as the text report this script prints.

    Args:
        parsed: Mapping returned by ``tika.parser.from_file``. Its "content"
            and "metadata" entries are treated as optional: a missing key or
            a ``None`` value is rendered as empty instead of raising or
            printing the literal string "None".

    Returns:
        The report as a single string: a content section followed by one
        ``key: value`` line per metadata entry.
    """
    content = parsed.get("content")
    text = "" if content is None else str(content).strip()
    metadata = parsed.get("metadata") or {}

    # The empty element reproduces the blank line between the two sections.
    lines = ["--- Content ---", text, "", "--- Metadata ---"]
    lines.extend(f"{key}: {value}" for key, value in metadata.items())
    return "\n".join(lines)


def main(argv=None):
    """Parse the requested file with Tika and print the report.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if present, is the path of
            the document to parse.

    Returns:
        Process exit status: 0 on success, 1 if the file could not be parsed.
    """
    args = sys.argv[1:] if argv is None else argv
    file_path = args[0] if args else DEFAULT_FILE_PATH

    # Imported here so format_report stays importable on machines where the
    # tika package is not installed.
    from tika import parser

    try:
        # Parse the file
        parsed = parser.from_file(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return 1
    except Exception as e:
        # Top-level boundary of a diagnostic script: report any failure and exit non-zero.
        print(f"An error occurred: {e}")
        return 1

    print(format_report(parsed))
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
@@ -10,7 +10,7 @@ class Settings:
|
|||||||
COHERE_API_KEY: str = os.getenv("COHERE_API_KEY", "")
|
COHERE_API_KEY: str = os.getenv("COHERE_API_KEY", "")
|
||||||
DEEPSEEK_API_KEY: str = os.getenv("DEEPSEEK_API_KEY", "")
|
DEEPSEEK_API_KEY: str = os.getenv("DEEPSEEK_API_KEY", "")
|
||||||
PINECONE_API_KEY: str = os.getenv("PINECONE_API_KEY", "")
|
PINECONE_API_KEY: str = os.getenv("PINECONE_API_KEY", "")
|
||||||
PINECONE_ENVIRONMENT: str = os.getenv("PINECONE_ENVIRONMENT", "")
|
PINECONE_NAMESPACE: str = os.getenv("PINECONE_NAMESPACE", "document-compliance")
|
||||||
|
|
||||||
# Vector DB Settings
|
# Vector DB Settings
|
||||||
PINECONE_INDEX_NAME: str = "document-compliance"
|
PINECONE_INDEX_NAME: str = "document-compliance"
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import os
|
|||||||
import logging
|
import logging
|
||||||
from services.config import config
|
from services.config import config
|
||||||
from services.database import Database
|
from services.database import Database
|
||||||
|
from tika import parser
|
||||||
|
|
||||||
class DocumentProcessor:
|
class DocumentProcessor:
|
||||||
def __init__(self, vector_store, database: Optional[Database] = None):
|
def __init__(self, vector_store, database: Optional[Database] = None):
|
||||||
@@ -21,13 +22,14 @@ class DocumentProcessor:
|
|||||||
def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False):
|
def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Read document content with error handling for encoding
|
# Read document content using Apache Tika for multiple formats
|
||||||
try:
|
try:
|
||||||
import docx
|
parsed = parser.from_file(file_path)
|
||||||
doc = docx.Document(file_path)
|
content = parsed["content"] or ""
|
||||||
content = "\n".join([para.text for para in doc.paragraphs])
|
if not content.strip():
|
||||||
|
logging.warning(f"Tika parsed empty content from {file_path}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Error reading Word document: {str(e)}")
|
logging.error(f"Error parsing document with Tika: {str(e)}")
|
||||||
content = ""
|
content = ""
|
||||||
|
|
||||||
logging.info(f"Processing document {doc_id} with content length: {len(content)}")
|
logging.info(f"Processing document {doc_id} with content length: {len(content)}")
|
||||||
@@ -293,4 +295,4 @@ class DocumentProcessor:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Error reranking issues: {str(e)}")
|
logging.error(f"Error reranking issues: {str(e)}")
|
||||||
# If reranking fails, return the original order
|
# If reranking fails, return the original order
|
||||||
return issues_and_recommendations
|
return issues_and_recommendations
|
||||||
|
|||||||
Reference in New Issue
Block a user