diff --git a/__pycache__/utils.cpython-311.pyc b/__pycache__/utils.cpython-311.pyc index e43127e3..98455ab4 100644 Binary files a/__pycache__/utils.cpython-311.pyc and b/__pycache__/utils.cpython-311.pyc differ diff --git a/data/documents.json b/data/documents.json deleted file mode 100644 index 40071c7b..00000000 --- a/data/documents.json +++ /dev/null @@ -1 +0,0 @@ -{"doc_names": ["audio-2", "data\\Car-Repair-Receipt-repair", "data\\Car-Repair-Receipt-service", "data\\Car-Repair-Receipt-tire", "data\\Car-Repair-Receipt-tuning", "data\\Car-Repair-Receipt-wash", "data\\corolla-2020-toyota-owners-manual", "data\\dodge-challenger-auto-body-repair-after", "data\\dodge-challenger-auto-body-repair-before", "data\\How to change engine oil and filter on TOYOTA Corolla", "data\\How to change front brake pads on TOYOTA Corolla", "How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]", "How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]", "data\\How to change rear windshield wipers on TOYOTA Corolla", "data\\How to change spark plugs on TOYOTA COROLLA", "data\\hyundai-sonata-auto-body-repair-after", "data\\hyundai-sonata-auto-body-repair-before", "data\\IMG_1436", "data\\IMG_1437", "data\\IMG_1438", "data\\IMG_1440", "data\\IMG_1441", "data\\IMG_1442", "data\\IMG_1443", "data\\IMG_1444", "data\\pontiac-vibe-auto-body-repair-after", "data\\pontiac-vibe-auto-body-repair-before", "test_rec", "data\\toyota-tacoma-auto-body-repair-after", "data\\toyota-tacoma-auto-body-repair-before"], "docs_id": ["e7280b3ec313491f8ce5c5d59b52788e", "880e0ad1d2ce43c39716e8d45d584000", "3b1c312fe9d0490ba3b7a841ce3fb136", "a6f5d1b0ba8d4fa7a828fdad8ae17bb7", "41f450a6b2c24af6a45065b2d3eba6e5", "d7af8ae82db341bf818e8f16420a9570", "eb95b3ced46548ba9d18eacba3c3e00c", "806b676fecfd47339c506ece2af3122d", "fcdc8661e11541c0a56c825626f2467e", "5ab12ad4f0ce4ce98e27e56fe5663ebd", "adf95aa758254b069cac03f52f0993b8", "1c3ef8dd242f40418dd34bfd3ce95b8a", "47d4abbea3da4121930fc33b3c472fdf", "10963a5e3985497888ce30cc66ae6314", "111c4a4e07534f47b1ed671ee2b684df", "2aa09a5f3fb54728b8370875ddc3d1fa", "7f902694851e41218ef8498b6c5c1553", "e1b0b06f6dad4c56bf976f6c2135df35", "00564531a33c4896b0d329170ce1f04f", "3aa1149c564c47c4b7b1f6a641e37770", "34a3e18b638b454ba283ba5741bad3a7", "0c7b7bf52d684f198dee19e87a5000bc", "9f1fc0cdcf104e4d948321f0ee5875b9", "113fab4522f94b7b956236b769e391c7", "8a75069c830640b5bddd661adcd612a9", "428d5aa38b9148ef95cf0662d50e0e18", "4cbb1c9f8d8a44feb44b554f9a39bd59", "04fe5073479c462aa819b0131fc9bc9b", "037fd77b66f045a0ac0682a07d57885b", "299625168b8e492baafdbb2a871cd061"], "num_pages": [7, 1, 2, 2, 2, 1, 588, 1, 1, 6, 7, 21, 12, 6, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} \ No newline at end of file diff --git a/index/faiss_index_data/index.faiss b/index/faiss_index_data/index.faiss index 90f5f6b4..2fe59d49 100644 Binary files a/index/faiss_index_data/index.faiss and b/index/faiss_index_data/index.faiss differ diff --git a/index/faiss_index_data/index.pkl b/index/faiss_index_data/index.pkl index fd62b2c3..bcc36d55 100644 Binary files a/index/faiss_index_data/index.pkl and b/index/faiss_index_data/index.pkl differ diff --git a/loggings/app.log b/loggings/app.log index f3e75e05..17a9c530 100644 --- a/loggings/app.log +++ b/loggings/app.log @@ -533,3 +533,131 @@ To avoid injury, hold up the wheel when unscrewing the bolts. 2024-08-15 12:50:08,732 - INFO - Vector store created 2024-08-15 12:50:08,732 - INFO - Saving the vector store 2024-08-15 12:50:08,745 - INFO - Vector store saved +2024-08-15 22:27:53,640 - INFO - Loading data from ./data +2024-08-15 22:28:26,454 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:15,773 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:18,312 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:21,194 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:23,563 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:25,707 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:28,079 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:30,046 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:32,244 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:33,863 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:34,996 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:37,244 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:39,593 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:41,416 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:42,381 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:43,283 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:45,605 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:46,734 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:50,144 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:53,144 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:54,149 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:29:56,683 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:01,059 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:04,416 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:07,578 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:08,304 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:12,904 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:16,304 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:19,593 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:20,690 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:22,950 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:27,134 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:29,830 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 503 Service Unavailable" +2024-08-15 22:30:31,234 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:44,310 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:53,513 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:30:57,355 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:30:57,849 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:30:58,324 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:30:58,763 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:30:59,189 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:30:59,624 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:31:00,064 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:31:00,434 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:31:00,843 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:31:01,323 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:31:01,834 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:38:18,873 - INFO - Loading data from ./data +2024-08-15 22:38:45,628 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:39:49,550 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:39:54,348 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:39:57,845 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:01,631 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:04,163 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:06,901 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:07,582 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:08,282 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:09,997 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:12,182 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:12,765 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:15,845 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:17,389 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:18,451 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:21,247 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:21,334 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:23,360 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:24,218 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:24,552 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:28,755 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:33,043 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:35,045 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:37,390 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:41,155 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:45,315 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:49,518 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:50,481 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:53,073 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:40:56,877 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:01,323 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:02,802 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:05,165 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:09,849 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:13,516 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:13,872 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:16,590 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:20,432 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:23,509 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:25,284 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:25,999 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK" +2024-08-15 22:41:32,085 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:32,567 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:32,987 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:33,440 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:33,916 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:34,331 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:34,824 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:35,191 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:35,573 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:36,235 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:36,731 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:37,163 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:37,556 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:38,165 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:38,601 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:38,987 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:39,409 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:39,901 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:40,290 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:40,640 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:41,031 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:41,393 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:41,835 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:42,190 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:42,604 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:42,974 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:43,339 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:43,773 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:44,140 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:44,574 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK" +2024-08-15 22:41:44,574 - INFO - Data loaded +2024-08-15 22:41:44,574 - INFO - Creating vector store +2024-08-15 22:41:46,104 - WARNING - C:\Users\timmy_3aupohg\anaconda3\envs\smog_env\Lib\site-packages\transformers\models\bert\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\cb\pytorch_1000000000000\work\aten\src\ATen\native\transformers\cuda\sdp_utils.cpp:555.) + attn_output = torch.nn.functional.scaled_dot_product_attention( + +2024-08-15 22:41:56,183 - INFO - Vector store created +2024-08-15 22:41:56,198 - INFO - Saving the vector store +2024-08-15 22:41:56,198 - INFO - Vector store saved diff --git a/requirements.txt b/requirements.txt index 0cf4c531..13f49dfa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,5 @@ groq python-dotenv pydub moviepy -ffmpeg-python \ No newline at end of file +ffmpeg-python +langchain-groq \ No newline at end of file diff --git a/search_note.ipynb b/search_note.ipynb index ad86dd88..a00b0774 100644 --- a/search_note.ipynb +++ b/search_note.ipynb @@ -6,30 +6,299 @@ "metadata": {}, "outputs": [], "source": [ - "from utils import search\n", - "import sys, os" + "# !pip install langchain-groq" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import search\n", + "import sys, os\n", + "from dotenv import load_dotenv\n", + "from langchain_groq import ChatGroq\n", + "from langchain_core.prompts.prompt import PromptTemplate\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "load_dotenv()" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# setting up groq api key\n", + "os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "\n", + "# chat set up\n", + "GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n", + "\n", + "\n", + "### Chains #####\n", + "# Initiator\n", + "def doc_summarizer(document_page: list) -> str:\n", + " initiator_prompt = PromptTemplate(\n", + " template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n", + " Create a short summary of the document based on the provided text. \n", + " \n", + " Start with: This document is about...\n", + " \n", + " <|eot_id|><|start_header_id|>user<|end_header_id|>\n", + " DOCUMENT: {document_page} \\n\n", + " \n", + " <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n", + " input_variables=[\"document_page\"],\n", + " )\n", + "\n", + " initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n", + " output = initiator_router.invoke({\"document_page\":document_page})\n", + " return output\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "document_page = 'How to change the engine oil of a toyota corrolla.'\n", + "# testing the function\n", + "summary = doc_summarizer(document_page)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'This document is about providing a step-by-step guide on how to change the engine oil of a Toyota Corolla.'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summary" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "docs = search(document_page)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", + " 'page': 1,\n", + " 'file_type': 'text'},\n", + " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 438},\n", + " {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", + " 'page': 3,\n", + " 'file_type': 'text'},\n", + " {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", + " 'page': 2,\n", + " 'file_type': 'text'},\n", + " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 525},\n", + " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", + " 'page': 2,\n", + " 'file_type': 'text'},\n", + " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", + " 'page': 3,\n", + " 'file_type': 'text'},\n", + " {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", + " 'page': 0,\n", + " 'file_type': 'text'},\n", + " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", + " 'page': 5,\n", + " 'file_type': 'text'},\n", + " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", + " 'page': 6,\n", + " 'file_type': 'text'},\n", + " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 526},\n", + " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422},\n", + " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 514},\n", + " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 153},\n", + " {'filename': 'audio-2', 'duration': '0-3 minutes', 'file_type': 'audio'},\n", + " {'filename': 'audio-2', 'duration': '3-6 minutes', 'file_type': 'audio'},\n", + " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 149},\n", + " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 513},\n", + " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 436},\n", + " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 148}]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "def transform_file_data(input_data):\n", + " # Create a dictionary to aggregate data by filename\n", + " aggregated_data = defaultdict(lambda: {\n", + " 'filename': '',\n", + " 'pages': [],\n", + " 'timestamps': [],\n", + " 'description': 'lorem ipsum',\n", + " 'filetype': '',\n", + " 'thumbnail': '',\n", + " 'track_id': 123\n", + " })\n", + "\n", + " for item in input_data:\n", + " if 'source' in item:\n", + " file_path = item['source']\n", + " filename = file_path.split('\\\\')[-1]\n", + " extension = filename.split('.')[-1]\n", + "\n", + " aggregated_data[filename]['filename'] = filename\n", + " aggregated_data[filename]['filetype'] = extension\n", + " aggregated_data[filename]['thumbnail'] = f\"{filename.split('.')[0]}.jpg\"\n", + "\n", + " if extension in ['pdf', 'txt', 'docx']:\n", + " aggregated_data[filename]['pages'].append(item['page'])\n", + " elif extension in ['mp4', 'mkv', 'flv']:\n", + " aggregated_data[filename]['timestamps'].append(item['page'])\n", + " elif extension in ['mp3', 'wav', 'flac']:\n", + " aggregated_data[filename]['timestamps'].append(item['page'])\n", + " elif extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:\n", + " aggregated_data[filename].pop('pages', None) # Remove pages if it's an image\n", + " aggregated_data[filename].pop('timestamps', None) # Remove timestamps if it's an image\n", + "\n", + " elif 'filename' in item:\n", + " filename = item['filename']\n", + " extension = item['file_type']\n", + " aggregated_data[filename]['filename'] = f\"{filename}.{extension}\"\n", + " aggregated_data[filename]['filetype'] = extension\n", + " aggregated_data[filename]['thumbnail'] = f\"{filename}.jpg\"\n", + " if 'duration' in item:\n", + " start_time, end_time = item['duration'].split(' minutes')[0].split('-')\n", + " aggregated_data[filename]['timestamps'].append((int(start_time), int(end_time)))\n", + "\n", + " # Convert aggregated data to the desired output format\n", + " output_data = []\n", + " for filename, data in aggregated_data.items():\n", + " # Remove empty lists for pages and timestamps\n", + " if not data['pages']:\n", + " data.pop('pages', None)\n", + " if not data['timestamps']:\n", + " data.pop('timestamps', None)\n", + " output_data.append(data)\n", + "\n", + " return output_data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt', 'pages': [1, 3, 2, 0], 'description': 'lorem ipsum', 'filetype': 'txt', 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg', 'track_id': 123}\n", + "{'filename': 'corolla-2020-toyota-owners-manual.pdf', 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148], 'description': 'lorem ipsum', 'filetype': 'pdf', 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg', 'track_id': 123}\n", + "{'filename': 'How to change spark plugs on TOYOTA COROLLA.docx', 'pages': [2, 3, 5, 6], 'description': 'lorem ipsum', 'filetype': 'docx', 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg', 'track_id': 123}\n", + "{'filename': 'audio-2.audio', 'timestamps': [(0, 3), (3, 6)], 'description': 'lorem ipsum', 'filetype': 'audio', 'thumbnail': 'audio-2.jpg', 'track_id': 123}\n" + ] + } + ], + "source": [ + "output = transform_file_data(docs)\n", + "for item in output:\n", + " print(item)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt',\n", + " 'pages': [1, 3, 2, 0],\n", + " 'description': 'lorem ipsum',\n", + " 'filetype': 'txt',\n", + " 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg',\n", + " 'track_id': 123},\n", + " {'filename': 'corolla-2020-toyota-owners-manual.pdf',\n", + " 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148],\n", + " 'description': 'lorem ipsum',\n", + " 'filetype': 'pdf',\n", + " 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg',\n", + " 'track_id': 123},\n", + " {'filename': 'How to change spark plugs on TOYOTA COROLLA.docx',\n", + " 'pages': [2, 3, 5, 6],\n", + " 'description': 'lorem ipsum',\n", + " 'filetype': 'docx',\n", + " 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg',\n", + " 'track_id': 123},\n", + " {'filename': 'audio-2.audio',\n", + " 'timestamps': [(0, 3), (3, 6)],\n", + " 'description': 'lorem ipsum',\n", + " 'filetype': 'audio',\n", + " 'thumbnail': 'audio-2.jpg',\n", + " 'track_id': 123}]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output" + ] }, { "cell_type": "code", diff --git a/utils.py b/utils.py index 499ff917..9f79117c 100644 --- a/utils.py +++ b/utils.py @@ -6,14 +6,19 @@ from langchain_community.vectorstores import FAISS from langchain_community.document_loaders import PyPDFLoader from langchain_community.document_loaders import TextLoader from langchain_community.document_loaders import Docx2txtLoader +from langchain_groq import ChatGroq +from langchain_core.prompts.prompt import PromptTemplate +from langchain_core.output_parsers import StrOutputParser from uuid import uuid4 from langchain_core.documents import Document from text_extractor import TextExtractor import os +from concurrent.futures import ThreadPoolExecutor import math import json from groq import Groq import re +import time import shutil import numpy as np from pydub import AudioSegment @@ -26,10 +31,15 @@ load_dotenv() # OpenAI API Key api_key = os.getenv('OPENAI_API_KEY') +# setting up groq api key +os.environ["GROQ_API_KEY"] = os.getenv('GROQ_API_KEY') client = Groq(api_key = os.getenv('GROQ_API_KEY')) model = 'whisper-large-v3' +# chat set up +GROQ_LLM = ChatGroq(temperature=0, model_name="llama3-8b-8192", max_tokens=100) + # ---------------------------------------------------------------------------------------------------- # loading the embedding model def load_embedding_model(): @@ -337,6 +347,25 @@ def preprocess_video_data(video_path: str, time_interval: int): return documents +#----------------------------------------------------DOC SUMMARIZER -------------------------------------------------- +def doc_summarizer(document_page: list) -> str: + initiator_prompt = PromptTemplate( + template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + Create a short summary of the document based on the provided text. + + Start with: This document is about... + + <|eot_id|><|start_header_id|>user<|end_header_id|> + DOCUMENT: {document_page} \n + + <|eot_id|><|start_header_id|>assistant<|end_header_id|>""", + input_variables=["document_page"], + ) + + initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser() + output = initiator_router.invoke({"document_page":document_page}) + return output + #-----------------------------------------------------OTHERS-------------------------------------------------------------- @@ -348,88 +377,86 @@ def load_embedded_data(embeddings=embeddings, key="data"): embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True) return embed_db +#-----------------------------------------------------Data Loading Process---------------------------------------------------- # creating a function to load all documents from a directory. +def process_document(path, extension, text_doc, image_doc, audio_doc, video_doc): + doc_name = os.path.basename(path).split('.')[0] + + process_map = { + "text": load_document, + "image": create_image_document, + "audio": create_audio_document, + "video": preprocess_video_data + } + + if extension in text_doc: + doc = process_map["text"](path) + num_pages = len(doc) + elif extension in image_doc: + doc = process_map["image"](path) + num_pages = 1 + doc_name = doc[0].metadata['filename'] + elif extension in audio_doc: + doc = process_map["audio"](path) + num_pages = len(doc) + doc_name = doc[0].metadata['filename'] + elif extension in video_doc: + doc = process_map["video"](path, time_interval=30) + num_pages = len(doc) + doc_name = doc[0].metadata['filename'] + else: + return None, None, None # Unhandled extension + + print(f"Document {doc_name} loaded") + return doc, doc_name, num_pages + def load_documents_from_directory(directory_path: str): text_doc = ['pdf', 'txt', 'docx', 'doc', 'md'] image_doc = ['jpg', 'jpeg', 'png', 'gif', 'bmp'] audio_doc = ['mp3', 'wav', 'flac', 'ogg', 'm4a'] video_doc = ['mp4', 'avi', 'mkv', 'flv', 'mov'] - # accessing the name of the files in the directory files = os.listdir(directory_path) - # creating a list to store the documents documents = [] - # another list for the document names doc_names = [] - # counting the number of pages in the document - num_pages= [] - # iterating through the files in the directory - for file in files: - # updating the path - path = os.path.join(directory_path, file) - # getting the file extension and doc name - doc_name, extension = path.split('/')[-1].split('.')[0] , file.split('.')[-1] - # checking if the file is a text document - if extension in text_doc: - # loading the document - doc = load_document(path) - # appending the document to the documents list + num_pages = [] + doc_summary = [] + + def process_with_delay(file): + result = process_document(os.path.join(directory_path, file), file.split('.')[-1], text_doc, image_doc, audio_doc, video_doc) + time.sleep(0.1) # Introduce a 0.1s delay between processing each document + return result + + with ThreadPoolExecutor() as executor: + results = executor.map(process_with_delay, files) + + for doc, doc_name, pages in results: + if doc is not None: documents.append(doc) - # appending the number of pages in the document - num_pages.append(len(doc)) - # adding the document name to the doc_names list doc_names.append(doc_name) - print(f"Document {doc_name} loaded") - elif extension in image_doc: - # creating an image document - doc = create_image_document(path) - # appending the document to the documents list - documents.append(doc) - # appending the number of pages in the document - num_pages.append(1) - # adding the document name to the doc_names list - doc_names.append(doc[0].metadata['filename']) - print(f"Document {doc[0].metadata['filename']} loaded") - elif extension in audio_doc: - # creating an audio document - doc = create_audio_document(path) - # appending the document to the documents list - documents.append(doc) - # appending the number of pages in the document - num_pages.append(len(doc)) - # adding the document name to the doc_names list - doc_names.append(doc[0].metadata['filename']) - print(f"Document {doc[0].metadata['filename']} loaded") - elif extension in video_doc: - # creating a video document - doc = preprocess_video_data(path, time_interval=30) - # appending the document to the documents list - documents.append(doc) - # appending the number of pages in the document - num_pages.append(len(doc)) - # adding the document name to the doc_names list - doc_names.append(doc[0].metadata['filename']) - print(f"Document {doc[0].metadata['filename']} loaded") + num_pages.append(pages) - # so we need to create a document id for each document - docs_id = [uuid4().hex for i in range(len(documents))] - # creating a json file to store the documents, checking if it exists then open it, else create it - json_file = f"{directory_path}/documents.json" + # creating doc summary + first_page = doc[0].page_content + summary = doc_summarizer(first_page) + doc_summary.append(summary) + + docs_id = [uuid4().hex for _ in range(len(documents))] + + json_file = os.path.join(directory_path, 'data.json') + data = {'doc_names': doc_names, 'docs_id': docs_id, 'num_pages': num_pages, 'doc_summaary': doc_summary} + if os.path.exists(json_file): - with open(json_file, 'r') as f: - data = json.load(f) - data['doc_names'] = doc_names - data['docs_id'] = docs_id - data['num_pages'] = num_pages - with open(json_file, 'w') as f: - json.dump(data, f) + with open(json_file, 'r+') as f: + existing_data = json.load(f) + existing_data.update(data) + f.seek(0) + json.dump(existing_data, f) else: - data = {'doc_names': doc_names, 'docs_id': docs_id, 'num_pages': num_pages} with open(json_file, 'w') as f: json.dump(data, f) - # returning the documents, and doc ids return documents, docs_id, num_pages @@ -475,6 +502,6 @@ def search(query, k=20): all = [] info = [] for doc in docs: - all.append({doc.page_content}) + # all.append({doc.page_content}) info.append(dict(doc.metadata)) - return docs[0].page_content, all, info + return info