diff --git a/app.log b/app.log index bcfef8c..1089802 100644 --- a/app.log +++ b/app.log @@ -693,3 +693,172 @@ jinja2.exceptions.UndefinedError: list object has no element 1 2025-04-21 22:33:04,556 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK" 2025-04-21 22:34:43,603 - root - INFO - Document e79aeb90-799a-4d06-9efd-1d19315eebcc processed successfully 2025-04-21 22:34:43,611 - root - INFO - Document e79aeb90-799a-4d06-9efd-1d19315eebcc processed successfully +2025-04-22 09:39:48,962 - root - INFO - Processing upload for document ID: e838ee14-75a7-483a-9a6e-46b218127dc5 +2025-04-22 09:39:48,968 - root - INFO - File saved to data/uploads/e838ee14-75a7-483a-9a6e-46b218127dc5_7.Supplier SQualification requirements.docx +2025-04-22 09:39:49,014 - root - INFO - Processing document e838ee14-75a7-483a-9a6e-46b218127dc5 with content length: 15335 +2025-04-22 09:39:50,452 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 09:39:51,219 - root - INFO - Stored embedding for document e838ee14-75a7-483a-9a6e-46b218127dc5 +2025-04-22 09:41:03,267 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK" +2025-04-22 09:41:56,158 - root - INFO - Document e838ee14-75a7-483a-9a6e-46b218127dc5 processed successfully +2025-04-22 09:41:56,169 - root - INFO - Document e838ee14-75a7-483a-9a6e-46b218127dc5 processed successfully +2025-04-22 09:50:01,851 - root - INFO - Processing upload for document ID: eaed1290-2993-4133-9a04-6a4e5cb8b431 +2025-04-22 09:50:01,854 - root - INFO - File saved to data/uploads/eaed1290-2993-4133-9a04-6a4e5cb8b431_7.Supplier SQualification requirements.docx +2025-04-22 09:50:02,083 - root - ERROR - Error reading Word document: No module named 'exceptions' +2025-04-22 09:50:02,083 - root - INFO - Processing document eaed1290-2993-4133-9a04-6a4e5cb8b431 with content length: 0 +2025-04-22 09:50:02,520 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 09:50:03,207 - root - INFO - Stored embedding for document eaed1290-2993-4133-9a04-6a4e5cb8b431 +2025-04-22 09:50:47,257 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK" +2025-04-22 09:51:36,358 - root - INFO - Document eaed1290-2993-4133-9a04-6a4e5cb8b431 processed successfully +2025-04-22 09:51:36,377 - root - INFO - Document eaed1290-2993-4133-9a04-6a4e5cb8b431 processed successfully +2025-04-22 09:55:01,750 - root - INFO - Processing upload for document ID: aec927db-9f43-49b9-90af-2af8ad64e793 +2025-04-22 09:55:01,752 - root - INFO - File saved to data/uploads/aec927db-9f43-49b9-90af-2af8ad64e793_7.Supplier SQualification requirements.docx +2025-04-22 09:55:01,791 - root - ERROR - Error reading Word document: No module named 'exceptions' +2025-04-22 09:55:01,791 - root - INFO - Processing document aec927db-9f43-49b9-90af-2af8ad64e793 with content length: 0 +2025-04-22 09:55:02,033 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 09:55:02,565 - root - INFO - Stored embedding for document aec927db-9f43-49b9-90af-2af8ad64e793 +2025-04-22 09:55:57,175 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK" +2025-04-22 09:56:48,889 - root - INFO - Document aec927db-9f43-49b9-90af-2af8ad64e793 processed successfully +2025-04-22 09:56:48,898 - root - INFO - Document aec927db-9f43-49b9-90af-2af8ad64e793 processed successfully +2025-04-22 10:00:35,648 - root - INFO - Processing upload for document ID: 48480333-d451-4907-988b-f059166fd1a5 +2025-04-22 10:00:35,652 - root - INFO - File saved to data/uploads/48480333-d451-4907-988b-f059166fd1a5_9.confidentiality agreement.docx +2025-04-22 10:00:36,025 - root - INFO - Processing document 48480333-d451-4907-988b-f059166fd1a5 with content length: 161 +2025-04-22 10:00:36,689 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 10:00:37,466 - root - INFO - Stored embedding for document 48480333-d451-4907-988b-f059166fd1a5 +2025-04-22 10:01:31,476 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK" +2025-04-22 10:02:20,575 - root - INFO - Document 48480333-d451-4907-988b-f059166fd1a5 processed successfully +2025-04-22 10:02:20,592 - root - INFO - Document 48480333-d451-4907-988b-f059166fd1a5 processed successfully +2025-04-22 10:06:04,139 - root - INFO - Deleted document f1d07dde-5de4-4bf6-a14d-b69a433aa855 from index +2025-04-22 10:06:04,142 - root - INFO - Removed document f1d07dde-5de4-4bf6-a14d-b69a433aa855 from vector store +2025-04-22 10:06:04,289 - root - ERROR - Error retrieving metadata for document f1d07dde-5de4-4bf6-a14d-b69a433aa855: Metadata not found for document f1d07dde-5de4-4bf6-a14d-b69a433aa855 +2025-04-22 10:06:04,289 - root - ERROR - Error deleting document: Metadata not found for document f1d07dde-5de4-4bf6-a14d-b69a433aa855 +2025-04-22 10:06:04,289 - root - ERROR - Traceback (most recent call last): + File "C:\Users\babaw\Documents\Work\Mana Knight Digital\ds_task_scp\src\main.py", line 213, in delete_document + metadata = database.get_metadata(doc_id) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\babaw\Documents\Work\Mana Knight Digital\ds_task_scp\src\services\database.py", line 114, in get_metadata + raise FileNotFoundError(f"Metadata not found for document {document_id}") +FileNotFoundError: Metadata not found for document f1d07dde-5de4-4bf6-a14d-b69a433aa855 + +2025-04-22 10:06:08,722 - root - INFO - Deleted document 82d9da57-6291-4ddb-b2e3-d43f467f4dd0 from index +2025-04-22 10:06:08,722 - root - INFO - Removed document 82d9da57-6291-4ddb-b2e3-d43f467f4dd0 from vector store +2025-04-22 10:06:13,324 - root - INFO - Deleted uploaded file: data/uploads/78afc395-9b7c-4388-8d8b-aa1d02fbf75f_2.Tender Specifications.docx +2025-04-22 10:06:13,533 - root - INFO - Deleted document 78afc395-9b7c-4388-8d8b-aa1d02fbf75f from index +2025-04-22 10:06:13,533 - root - INFO - Removed document 78afc395-9b7c-4388-8d8b-aa1d02fbf75f from vector store +2025-04-22 10:06:17,591 - root - INFO - Deleted uploaded file: data/uploads/aecbb62c-b7ed-4c2e-beff-fe5e292de9f1_4.Scope of Work.docx +2025-04-22 10:06:17,860 - root - INFO - Deleted document aecbb62c-b7ed-4c2e-beff-fe5e292de9f1 from index +2025-04-22 10:06:17,860 - root - INFO - Removed document aecbb62c-b7ed-4c2e-beff-fe5e292de9f1 from vector store +2025-04-22 10:06:22,432 - root - INFO - Deleted uploaded file: data/uploads/77063b1d-633c-421e-9591-cde2eb90a979_7.Supplier SQualification requirements.docx +2025-04-22 10:06:22,572 - root - INFO - Deleted document 77063b1d-633c-421e-9591-cde2eb90a979 from index +2025-04-22 10:06:22,572 - root - INFO - Removed document 77063b1d-633c-421e-9591-cde2eb90a979 from vector store +2025-04-22 10:06:26,959 - root - INFO - Deleted uploaded file: data/uploads/e79aeb90-799a-4d06-9efd-1d19315eebcc_2.Tender Specifications.docx +2025-04-22 10:06:27,114 - root - INFO - Deleted document e79aeb90-799a-4d06-9efd-1d19315eebcc from index +2025-04-22 10:06:27,119 - root - INFO - Removed document e79aeb90-799a-4d06-9efd-1d19315eebcc from vector store +2025-04-22 10:06:31,340 - root - INFO - Deleted uploaded file: data/uploads/e838ee14-75a7-483a-9a6e-46b218127dc5_7.Supplier SQualification requirements.docx +2025-04-22 10:06:31,518 - root - INFO - Deleted document e838ee14-75a7-483a-9a6e-46b218127dc5 from index +2025-04-22 10:06:31,520 - root - INFO - Removed document e838ee14-75a7-483a-9a6e-46b218127dc5 from vector store +2025-04-22 10:06:50,772 - root - INFO - Processing upload for document ID: 788f85e4-a873-407b-bfbc-0b7d6a676b9a +2025-04-22 10:06:50,779 - root - INFO - File saved to data/uploads/788f85e4-a873-407b-bfbc-0b7d6a676b9a_2.Tender Specifications.docx +2025-04-22 10:06:51,181 - root - INFO - Processing document 788f85e4-a873-407b-bfbc-0b7d6a676b9a with content length: 2363 +2025-04-22 10:06:51,477 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 10:06:52,399 - root - INFO - Stored embedding for document 788f85e4-a873-407b-bfbc-0b7d6a676b9a +2025-04-22 10:08:26,609 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK" +2025-04-22 10:09:18,265 - root - INFO - Document 788f85e4-a873-407b-bfbc-0b7d6a676b9a processed successfully +2025-04-22 10:09:18,287 - root - INFO - Document 788f85e4-a873-407b-bfbc-0b7d6a676b9a processed successfully +2025-04-22 10:31:42,225 - root - INFO - Processing upload for document ID: 804d09d1-22b8-4e49-9bab-6fd7d34181b7 +2025-04-22 10:31:42,227 - root - INFO - File saved to data/uploads/804d09d1-22b8-4e49-9bab-6fd7d34181b7_7.Supplier SQualification requirements.docx +2025-04-22 10:31:42,381 - root - INFO - Processing document 804d09d1-22b8-4e49-9bab-6fd7d34181b7 with content length: 229 +2025-04-22 10:31:42,613 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 10:31:44,246 - root - INFO - Stored embedding for document 804d09d1-22b8-4e49-9bab-6fd7d34181b7 +2025-04-22 10:32:27,429 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK" +2025-04-22 10:33:19,434 - root - INFO - Document 804d09d1-22b8-4e49-9bab-6fd7d34181b7 processed successfully +2025-04-22 10:33:19,446 - root - INFO - Document 804d09d1-22b8-4e49-9bab-6fd7d34181b7 processed successfully +2025-04-22 10:51:21,371 - root - INFO - Processing upload for document ID: 8d580b49-94bb-473e-90dd-66ee00f77048 +2025-04-22 10:51:21,372 - root - INFO - File saved to data/uploads/8d580b49-94bb-473e-90dd-66ee00f77048_8.form of tender.docx +2025-04-22 10:51:21,496 - root - INFO - Processing document 8d580b49-94bb-473e-90dd-66ee00f77048 with content length: 523 +2025-04-22 10:51:21,994 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 10:51:22,895 - root - INFO - Stored embedding for document 8d580b49-94bb-473e-90dd-66ee00f77048 +2025-04-22 10:52:05,978 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 400 Bad Request" +2025-04-22 10:52:05,978 - root - ERROR - Error using Cohere reranker: status_code: 400, body: {'message': 'invalid request: number of total max chunks (number of documents * max chunks per doc) must be less than 10000'} +2025-04-22 10:56:57,188 - root - INFO - Processing upload for document ID: 5ab90386-7d4e-45b2-a1a6-40ad23f59428 +2025-04-22 10:56:57,189 - root - INFO - File saved to data/uploads/5ab90386-7d4e-45b2-a1a6-40ad23f59428_8.form of tender.docx +2025-04-22 10:56:57,312 - root - INFO - Processing document 5ab90386-7d4e-45b2-a1a6-40ad23f59428 with content length: 523 +2025-04-22 10:56:57,661 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 10:56:58,794 - root - INFO - Stored embedding for document 5ab90386-7d4e-45b2-a1a6-40ad23f59428 +2025-04-22 10:57:57,544 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 400 Bad Request" +2025-04-22 10:57:57,544 - root - ERROR - Error using Cohere reranker: status_code: 400, body: {'message': 'invalid request: list of documents must not be empty'} +2025-04-22 10:57:57,563 - root - INFO - Document 5ab90386-7d4e-45b2-a1a6-40ad23f59428 processed successfully +2025-04-22 10:57:57,563 - root - INFO - Document 5ab90386-7d4e-45b2-a1a6-40ad23f59428 processed successfully +2025-04-22 11:04:23,803 - root - INFO - Processing upload for document ID: 3065e0dd-0b2e-454c-8f7e-dd0e464dbf7a +2025-04-22 11:04:23,806 - root - INFO - File saved to data/uploads/3065e0dd-0b2e-454c-8f7e-dd0e464dbf7a_3.Bill of Quantities.docx +2025-04-22 11:04:23,950 - root - INFO - Processing document 3065e0dd-0b2e-454c-8f7e-dd0e464dbf7a with content length: 2057 +2025-04-22 11:04:24,294 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 11:04:24,927 - root - INFO - Stored embedding for document 3065e0dd-0b2e-454c-8f7e-dd0e464dbf7a +2025-04-22 11:05:28,953 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 400 Bad Request" +2025-04-22 11:05:28,953 - root - ERROR - Error using Cohere reranker: status_code: 400, body: {'message': 'invalid request: list of documents must not be empty'} +2025-04-22 11:05:28,969 - root - INFO - Document 3065e0dd-0b2e-454c-8f7e-dd0e464dbf7a processed successfully +2025-04-22 11:05:28,980 - root - INFO - Document 3065e0dd-0b2e-454c-8f7e-dd0e464dbf7a processed successfully +2025-04-22 11:13:54,204 - root - INFO - Processing upload for document ID: 201e4896-3d89-466b-852d-783ef3e30f83 +2025-04-22 11:13:54,206 - root - INFO - File saved to data/uploads/201e4896-3d89-466b-852d-783ef3e30f83_7.Supplier SQualification requirements.docx +2025-04-22 11:13:54,311 - root - INFO - Processing document 201e4896-3d89-466b-852d-783ef3e30f83 with content length: 229 +2025-04-22 11:13:54,726 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 11:13:55,243 - root - INFO - Stored embedding for document 201e4896-3d89-466b-852d-783ef3e30f83 +2025-04-22 11:14:42,644 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 400 Bad Request" +2025-04-22 11:14:42,644 - root - ERROR - Error using Cohere reranker: status_code: 400, body: {'message': 'invalid request: list of documents must not be empty'} +2025-04-22 11:14:42,644 - root - INFO - Document 201e4896-3d89-466b-852d-783ef3e30f83 processed successfully +2025-04-22 11:14:42,659 - root - INFO - Document 201e4896-3d89-466b-852d-783ef3e30f83 processed successfully +2025-04-22 11:18:42,103 - root - INFO - Processing upload for document ID: bc1e71ac-6b65-4b2c-b1e8-81844b49ba5d +2025-04-22 11:18:42,104 - root - INFO - File saved to data/uploads/bc1e71ac-6b65-4b2c-b1e8-81844b49ba5d_7.Supplier SQualification requirements.docx +2025-04-22 11:18:42,225 - root - INFO - Processing document bc1e71ac-6b65-4b2c-b1e8-81844b49ba5d with content length: 229 +2025-04-22 11:18:42,443 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 11:18:45,759 - root - INFO - Stored embedding for document bc1e71ac-6b65-4b2c-b1e8-81844b49ba5d +2025-04-22 11:19:35,113 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK" +2025-04-22 11:19:35,138 - root - INFO - Document bc1e71ac-6b65-4b2c-b1e8-81844b49ba5d processed successfully +2025-04-22 11:19:35,145 - root - INFO - Document bc1e71ac-6b65-4b2c-b1e8-81844b49ba5d processed successfully +2025-04-22 11:27:23,936 - root - INFO - Processing upload for document ID: a76f7f9c-59ec-4f7b-8b4e-168f2db5f92e +2025-04-22 11:27:23,938 - root - INFO - File saved to data/uploads/a76f7f9c-59ec-4f7b-8b4e-168f2db5f92e_7.Supplier SQualification requirements.docx +2025-04-22 11:27:24,051 - root - INFO - Processing document a76f7f9c-59ec-4f7b-8b4e-168f2db5f92e with content length: 229 +2025-04-22 11:27:24,259 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 11:27:24,775 - root - INFO - Stored embedding for document a76f7f9c-59ec-4f7b-8b4e-168f2db5f92e +2025-04-22 11:28:04,075 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK" +2025-04-22 11:28:04,108 - root - INFO - Document a76f7f9c-59ec-4f7b-8b4e-168f2db5f92e processed successfully +2025-04-22 11:28:04,108 - root - INFO - Document a76f7f9c-59ec-4f7b-8b4e-168f2db5f92e processed successfully +2025-04-22 11:33:42,952 - root - INFO - Processing upload for document ID: 6fd270a4-e76a-4234-a4c4-a40d2ac64d56 +2025-04-22 11:33:42,954 - root - INFO - File saved to data/uploads/6fd270a4-e76a-4234-a4c4-a40d2ac64d56_4.Scope of Work.docx +2025-04-22 11:33:43,183 - root - INFO - Processing document 6fd270a4-e76a-4234-a4c4-a40d2ac64d56 with content length: 282 +2025-04-22 11:33:43,425 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 11:33:43,875 - root - INFO - Stored embedding for document 6fd270a4-e76a-4234-a4c4-a40d2ac64d56 +2025-04-22 11:34:30,942 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK" +2025-04-22 11:34:30,960 - root - INFO - Document 6fd270a4-e76a-4234-a4c4-a40d2ac64d56 processed successfully +2025-04-22 11:34:30,960 - root - INFO - Document 6fd270a4-e76a-4234-a4c4-a40d2ac64d56 processed successfully +2025-04-22 11:43:04,651 - root - INFO - Processing upload for document ID: e92e078c-6d36-46b1-89ba-03f7387947e9 +2025-04-22 11:43:04,652 - root - INFO - File saved to data/uploads/e92e078c-6d36-46b1-89ba-03f7387947e9_9.confidentiality agreement.docx +2025-04-22 11:43:04,758 - root - INFO - Processing document e92e078c-6d36-46b1-89ba-03f7387947e9 with content length: 161 +2025-04-22 11:43:05,057 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 11:43:05,924 - root - INFO - Stored embedding for document e92e078c-6d36-46b1-89ba-03f7387947e9 +2025-04-22 11:43:46,791 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK" +2025-04-22 11:43:46,807 - root - INFO - Document e92e078c-6d36-46b1-89ba-03f7387947e9 processed successfully +2025-04-22 11:43:46,807 - root - INFO - Document e92e078c-6d36-46b1-89ba-03f7387947e9 processed successfully +2025-04-22 11:52:38,067 - root - INFO - Processing upload for document ID: 2813c3dc-8496-4aff-b945-0b04e5c439c0 +2025-04-22 11:52:38,069 - root - INFO - File saved to data/uploads/2813c3dc-8496-4aff-b945-0b04e5c439c0_7.Supplier SQualification requirements.docx +2025-04-22 11:52:38,184 - root - INFO - Processing document 2813c3dc-8496-4aff-b945-0b04e5c439c0 with content length: 229 +2025-04-22 11:52:38,574 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 11:52:39,190 - root - INFO - Stored embedding for document 2813c3dc-8496-4aff-b945-0b04e5c439c0 +2025-04-22 11:53:23,757 - root - ERROR - Error saving analysis for document 2813c3dc-8496-4aff-b945-0b04e5c439c0: table analysis has no column named issues_and_recommendations +2025-04-22 11:53:23,757 - root - ERROR - Error processing document 2813c3dc-8496-4aff-b945-0b04e5c439c0: table analysis has no column named issues_and_recommendations +2025-04-22 11:53:23,757 - root - ERROR - Error processing document: table analysis has no column named issues_and_recommendations +2025-04-22 11:53:23,783 - root - ERROR - Traceback (most recent call last): + File "C:\Users\babaw\Documents\Work\Mana Knight Digital\ds_task_scp\src\main.py", line 112, in upload_document + await document_processor.process_document(doc_id, file_path, document_type) + File "C:\Users\babaw\Documents\Work\Mana Knight Digital\ds_task_scp\src\services\document_processor.py", line 141, in process_document + self.database.save_analysis(doc_id, analysis) + File "C:\Users\babaw\Documents\Work\Mana Knight Digital\ds_task_scp\src\services\database.py", line 50, in save_analysis + cursor.execute(''' +sqlite3.OperationalError: table analysis has no column named issues_and_recommendations + +2025-04-22 11:55:32,835 - root - INFO - Processing upload for document ID: 9dc21524-8c93-427b-a6cc-04b7585a9545 +2025-04-22 11:55:32,836 - root - INFO - File saved to data/uploads/9dc21524-8c93-427b-a6cc-04b7585a9545_8.form of tender.docx +2025-04-22 11:55:32,958 - root - INFO - Processing document 9dc21524-8c93-427b-a6cc-04b7585a9545 with content length: 523 +2025-04-22 11:55:33,174 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK" +2025-04-22 11:55:33,740 - root - INFO - Stored embedding for document 9dc21524-8c93-427b-a6cc-04b7585a9545 +2025-04-22 11:56:27,580 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully +2025-04-22 11:56:27,588 - root - INFO - Document 9dc21524-8c93-427b-a6cc-04b7585a9545 processed successfully diff --git a/src/main.py b/src/main.py index a5c02f1..f435e1c 100644 --- a/src/main.py +++ b/src/main.py @@ -135,6 +135,7 @@ async def get_analysis(request: Request, doc_id: str): analysis = await document_processor.get_analysis(doc_id) metadata = database.get_metadata(doc_id) + print(f"analysis: {analysis}") return templates.TemplateResponse( "analysis.html", { diff --git a/src/services/database.py b/src/services/database.py index 547db65..4c468e6 100644 --- a/src/services/database.py +++ b/src/services/database.py @@ -16,16 +16,70 @@ class Database: with sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() - # Create analysis table - cursor.execute(''' - CREATE TABLE IF NOT EXISTS analysis ( - document_id TEXT PRIMARY KEY, - summary TEXT, - issues TEXT, - recommendations TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - ''') + # Check if we need to migrate the old schema + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='analysis'") + table_exists = cursor.fetchone() is not None + + if table_exists: + # Check if we need to migrate + cursor.execute("PRAGMA table_info(analysis)") + columns = [column[1] for column in cursor.fetchall()] + + if 'issues_and_recommendations' not in columns: + # Backup old data + cursor.execute("SELECT document_id, summary, issues, recommendations FROM analysis") + old_data = cursor.fetchall() + + # Drop the old table + cursor.execute("DROP TABLE analysis") + + # Create the new table + cursor.execute(''' + CREATE TABLE analysis ( + document_id TEXT PRIMARY KEY, + summary TEXT, + issues_and_recommendations TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + ''') + + # Migrate old data to new format + for row in old_data: + doc_id, summary, issues, recommendations = row + try: + old_issues = json.loads(issues) if issues else [] + old_recommendations = json.loads(recommendations) if recommendations else [] + + # Combine issues and recommendations + issues_and_recommendations = [] + for i in range(max(len(old_issues), len(old_recommendations))): + issue = old_issues[i]['issue'] if i < len(old_issues) else "Unknown Issue" + recommendation = old_recommendations[i] if i < len(old_recommendations) else "No recommendation provided" + issues_and_recommendations.append({ + "issue": issue, + "recommendation": recommendation + }) + + cursor.execute(''' + INSERT INTO analysis (document_id, summary, issues_and_recommendations) + VALUES (?, ?, ?) + ''', ( + doc_id, + summary, + json.dumps(issues_and_recommendations) + )) + except Exception as e: + logging.error(f"Error migrating data for document {doc_id}: {str(e)}") + else: + # Create the new table if it doesn't exist + cursor.execute(''' + CREATE TABLE IF NOT EXISTS analysis ( + document_id TEXT PRIMARY KEY, + summary TEXT, + issues_and_recommendations TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + ''') # Create metadata table cursor.execute(''' @@ -49,13 +103,12 @@ class Database: with sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() cursor.execute(''' - INSERT OR REPLACE INTO analysis (document_id, summary, issues, recommendations) - VALUES (?, ?, ?, ?) + INSERT OR REPLACE INTO analysis (document_id, summary, issues_and_recommendations) + VALUES (?, ?, ?) ''', ( document_id, analysis['summary'], - json.dumps(analysis['issues']), - json.dumps(analysis['recommendations']) + json.dumps(analysis['issues_and_recommendations']) )) conn.commit() except Exception as e: @@ -67,7 +120,7 @@ class Database: try: with sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() - cursor.execute('SELECT summary, issues, recommendations FROM analysis WHERE document_id = ?', (document_id,)) + cursor.execute('SELECT summary, issues_and_recommendations FROM analysis WHERE document_id = ?', (document_id,)) result = cursor.fetchone() if not result: @@ -76,8 +129,7 @@ class Database: return { 'document_id': document_id, 'summary': result[0], - 'issues': json.loads(result[1]), - 'recommendations': json.loads(result[2]) + 'issues_and_recommendations': json.loads(result[1]) } except Exception as e: logging.error(f"Error retrieving analysis for document {document_id}: {str(e)}") diff --git a/src/services/document_processor.py b/src/services/document_processor.py index f725992..df776cc 100644 --- a/src/services/document_processor.py +++ b/src/services/document_processor.py @@ -19,15 +19,16 @@ class DocumentProcessor: self.database = Database() async def process_document(self, doc_id: str, file_path: str, document_type: str, is_resubmission: bool = False): + try: # Read document content with error handling for encoding try: - with open(file_path, 'r', encoding='utf-8') as f: - content = f.read() - except UnicodeDecodeError: - # Try with a different encoding if UTF-8 fails - with open(file_path, 'r', encoding='latin-1') as f: - content = f.read() + import docx + doc = docx.Document(file_path) + content = "\n".join([para.text for para in doc.paragraphs]) + except Exception as e: + logging.error(f"Error reading Word document: {str(e)}") + content = "" logging.info(f"Processing document {doc_id} with content length: {len(content)}") @@ -54,7 +55,7 @@ class DocumentProcessor: "content": content } ], - "max_tokens": 4000 + "max_tokens": 1000 } # Make the API call with error handling @@ -84,21 +85,22 @@ class DocumentProcessor: logging.error(f"Error calling DeepSeek API: {str(e)}") summary = "Document analysis could not be completed due to API connection issues." - # Process with DeepSeek for deep reasoning using URL + # Process with DeepSeek for issues and recommendations deepseek_payload = { "model": "deepseek-chat", "messages": [ { "role": "system", - "content": "You are an expert in document compliance analysis. Analyze the following document for compliance issues and provide detailed feedback." + "content": f"You are an expert in document compliance analysis for this type of document: {document_type}. Analyze the following document for compliance issues and provide detailed feedback." }, { "role": "user", "content": f"""Analyze this type of document {document_type} for compliance issues and provide detailed feedback:\n\n{content} - and these are the main sections of the document:\n\n{summary}""" + and these are the main sections of the document:\n\n{summary}.. + Return the issues and recommendations in a structured format: 'Issue: . Recommendation: .'""" } ], - "max_tokens": 4000 + "max_tokens": 1000 } # Make the API call with error handling @@ -113,39 +115,26 @@ class DocumentProcessor: # Check if the response is successful if deepseek_response.status_code != 200: logging.error(f"DeepSeek API error: {deepseek_response.status_code} - {deepseek_response.text}") - # Use a fallback for issues if the API call fails - issues = ["Document analysis could not be completed due to API limitations."] + # Use a fallback for issues and recommendations if the API call fails + issues_and_recommendations = [{"issue": "Document analysis could not be completed due to API limitations.", "recommendation": "Please try again later."}] else: # Try to parse the JSON response try: deepseek_result = deepseek_response.json() - issues = self._extract_issues(deepseek_result['choices'][0]['message']['content']) + issues_and_recommendations = self._extract_issues_and_recommendations(deepseek_result['choices'][0]['message']['content']) except (json.JSONDecodeError, KeyError) as e: logging.error(f"Error parsing DeepSeek response: {str(e)}") logging.error(f"Response text: {deepseek_response.text}") - issues = ["Document analysis could not be completed due to parsing errors."] + issues_and_recommendations = [{"issue": "Document analysis could not be completed due to parsing errors.", "recommendation": "Please try again later."}] except requests.exceptions.RequestException as e: logging.error(f"Error calling DeepSeek API: {str(e)}") - issues = ["Document analysis could not be completed due to API connection issues."] - - # Use Cohere reranker to prioritize issues - try: - reranked_issues = self.cohere_client.rerank( - query="Compliance issues in technical document", - documents=issues, - model=config.COHERE_RERANKER_MODEL - ) - except Exception as e: - logging.error(f"Error using Cohere reranker: {str(e)}") - # Create a simple reranked issues list if Cohere fails - reranked_issues = [type('obj', (object,), {'document': issue, 'index': i}) for i, issue in enumerate(issues)] + issues_and_recommendations = [{"issue": "Document analysis could not be completed due to API connection issues.", "recommendation": "Please try again later."}] # Store analysis results analysis = { "document_id": doc_id, "summary": summary, - "issues": self._format_issues(reranked_issues), - "recommendations": self._generate_recommendations(reranked_issues) + "issues_and_recommendations": issues_and_recommendations } # Save analysis to database @@ -176,73 +165,94 @@ class DocumentProcessor: async def get_analysis(self, doc_id: str) -> Dict[str, Any]: return self.database.get_analysis(doc_id) - def _extract_issues(self, deepseek_response: str) -> List[str]: - # Simple extraction of issues from DeepSeek's response - # In a real implementation, this would be more sophisticated - print(deepseek_response) - return [issue.strip() for issue in re.split(r'\d+\.', deepseek_response) if issue.strip()] + def _extract_issues_and_recommendations(self, deepseek_response: str) -> List[Dict[str, str]]: + # Extract issues and recommendations from DeepSeek's response - def _format_issues(self, reranked_issues) -> List[Dict[str, Any]]: - return [ - { - "issue": issue[0] if isinstance(issue, tuple) else issue.document, - "severity": "high" if i < 3 else "medium" if i < 6 else "low", - "rank": i + 1 - } - for i, issue in enumerate(reranked_issues) - ] + issues_and_recommendations = [] - def _generate_recommendations(self, reranked_issues) -> List[str]: - # Generate specific recommendations for each issue - recommendations = [] - print(f"Generating recommendations for {reranked_issues} issues") - # Extract the results from the RerankResponse object - results = reranked_issues.results if hasattr(reranked_issues, 'results') else reranked_issues + # Split the response into lines + lines = deepseek_response.split('\n') - for issue in results[:5]: # Focus on top 5 issues - recommendation_payload = { - "model": "deepseek-chat", - "messages": [ - { - "role": "system", - "content": "You are an expert in document compliance. Provide specific, actionable recommendations to fix compliance issues." - }, - { - "role": "user", - "content": f"Provide a specific, actionable recommendation to fix this compliance issue: {issue}" - } - ], - "max_tokens": 1000 - } + current_issue = None + current_recommendation = None + + for i, line in enumerate(lines): + line = line.strip() - # Make the API call with error handling - try: - recommendation_response = requests.post( - self.deepseek_url, - json=recommendation_payload, - headers=self.deepseek_headers, - timeout=60 # Add timeout - ) + # Check if this line contains an issue + if '**Issue:**' in line: + # If we already have an issue and recommendation, add them to the list + if current_issue and current_recommendation: + issues_and_recommendations.append({ + 'issue': current_issue, + 'recommendation': current_recommendation + }) - # Check if the response is successful - if recommendation_response.status_code != 200: - logging.error(f"DeepSeek API error: {recommendation_response.status_code} - {recommendation_response.text}") - recommendations.append("Recommendation could not be generated due to API limitations.") - else: - # Try to parse the JSON response - try: - recommendation_result = recommendation_response.json() - recommendations.append(recommendation_result['choices'][0]['message']['content']) - except (json.JSONDecodeError, KeyError) as e: - logging.error(f"Error parsing DeepSeek response: {str(e)}") - logging.error(f"Response text: {recommendation_response.text}") - recommendations.append("Recommendation could not be generated due to parsing errors.") - except requests.exceptions.RequestException as e: - logging.error(f"Error calling DeepSeek API: {str(e)}") - recommendations.append("Recommendation could not be generated due to API connection issues.") - - return recommendations - + # Extract the issue text + issue_text = line.split('**Issue:**')[1].strip() + current_issue = issue_text + current_recommendation = None + + # Check if this line contains a recommendation + elif '**Recommendation:**' in line: + # Extract the recommendation text + recommendation_text = line.split('**Recommendation:**')[1].strip() + current_recommendation = recommendation_text + + # If we're at the last line and have both issue and recommendation, add them + elif i == len(lines) - 1 and current_issue and current_recommendation: + issues_and_recommendations.append({ + 'issue': current_issue, + 'recommendation': current_recommendation + }) + + # If we still have an issue and recommendation at the end, add them + if current_issue and current_recommendation: + issues_and_recommendations.append({ + 'issue': current_issue, + 'recommendation': current_recommendation + }) + + # If no issues were found, try an alternative approach + if not issues_and_recommendations: + # Look for numbered issues in the format "1. **Issue:** ... **Recommendation:** ..." + for line in lines: + if line.strip().startswith(('1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.')): + if '**Issue:**' in line and '**Recommendation:**' in line: + parts = line.split('**Recommendation:**') + if len(parts) == 2: + issue_part = parts[0] + recommendation_part = parts[1] + + # Clean up the issue text + issue = issue_part.split('**Issue:**')[1].strip() + + # Clean up the recommendation text + recommendation = recommendation_part.strip() + + issues_and_recommendations.append({'issue': issue, 'recommendation': recommendation}) + + # If still no issues found, try one more approach + if not issues_and_recommendations: + # Look for any line containing both Issue and Recommendation + for line in lines: + if 'Issue:' in line and 'Recommendation:' in line: + parts = line.split('Recommendation:') + if len(parts) == 2: + issue_part = parts[0] + recommendation_part = parts[1] + + # Clean up the issue text + issue = issue_part.replace('Issue:', '').strip() + + # Clean up the recommendation text + recommendation = recommendation_part.strip() + + issues_and_recommendations.append({'issue': issue, 'recommendation': recommendation}) + + print(f"issues_and_recommendations: {issues_and_recommendations}") + return issues_and_recommendations + def _store_document(self, doc_id: str, file_path: str): # save document to vector store self.vector_store.add_document(doc_id, file_path) \ No newline at end of file diff --git a/src/templates/analysis.html b/src/templates/analysis.html index 372bb19..c82b9ef 100644 --- a/src/templates/analysis.html +++ b/src/templates/analysis.html @@ -30,25 +30,16 @@ -
Compliance Issues
-
- {% for issue in analysis.issues %} -
-

- -

-
-
-

Rank: {{ issue.rank }}

-

Recommendation:

-
- {{ analysis.recommendations[loop.index0]|markdown|safe }} -
+
Issues and Recommendations
+
+ {% for item in analysis.issues_and_recommendations %} +
+
+ Issue: +

{{ item.issue }}

+ Recommendation: +
+ {{ item.recommendation|markdown|safe }}
diff --git a/src/templates/index.html b/src/templates/index.html index 862c6ce..ecdaf0c 100644 --- a/src/templates/index.html +++ b/src/templates/index.html @@ -27,7 +27,7 @@
-
Supported formats: PDF, DOCX, TXT, MD
+
Supported formats: DOCX