parallel processing added
This commit is contained in:
Binary file not shown.
@@ -1 +0,0 @@
|
|||||||
{"doc_names": ["audio-2", "data\\Car-Repair-Receipt-repair", "data\\Car-Repair-Receipt-service", "data\\Car-Repair-Receipt-tire", "data\\Car-Repair-Receipt-tuning", "data\\Car-Repair-Receipt-wash", "data\\corolla-2020-toyota-owners-manual", "data\\dodge-challenger-auto-body-repair-after", "data\\dodge-challenger-auto-body-repair-before", "data\\How to change engine oil and filter on TOYOTA Corolla", "data\\How to change front brake pads on TOYOTA Corolla", "How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]", "How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]", "data\\How to change rear windshield wipers on TOYOTA Corolla", "data\\How to change spark plugs on TOYOTA COROLLA", "data\\hyundai-sonata-auto-body-repair-after", "data\\hyundai-sonata-auto-body-repair-before", "data\\IMG_1436", "data\\IMG_1437", "data\\IMG_1438", "data\\IMG_1440", "data\\IMG_1441", "data\\IMG_1442", "data\\IMG_1443", "data\\IMG_1444", "data\\pontiac-vibe-auto-body-repair-after", "data\\pontiac-vibe-auto-body-repair-before", "test_rec", "data\\toyota-tacoma-auto-body-repair-after", "data\\toyota-tacoma-auto-body-repair-before"], "docs_id": ["e7280b3ec313491f8ce5c5d59b52788e", "880e0ad1d2ce43c39716e8d45d584000", "3b1c312fe9d0490ba3b7a841ce3fb136", "a6f5d1b0ba8d4fa7a828fdad8ae17bb7", "41f450a6b2c24af6a45065b2d3eba6e5", "d7af8ae82db341bf818e8f16420a9570", "eb95b3ced46548ba9d18eacba3c3e00c", "806b676fecfd47339c506ece2af3122d", "fcdc8661e11541c0a56c825626f2467e", "5ab12ad4f0ce4ce98e27e56fe5663ebd", "adf95aa758254b069cac03f52f0993b8", "1c3ef8dd242f40418dd34bfd3ce95b8a", "47d4abbea3da4121930fc33b3c472fdf", "10963a5e3985497888ce30cc66ae6314", "111c4a4e07534f47b1ed671ee2b684df", "2aa09a5f3fb54728b8370875ddc3d1fa", "7f902694851e41218ef8498b6c5c1553", "e1b0b06f6dad4c56bf976f6c2135df35", "00564531a33c4896b0d329170ce1f04f", "3aa1149c564c47c4b7b1f6a641e37770", "34a3e18b638b454ba283ba5741bad3a7", "0c7b7bf52d684f198dee19e87a5000bc", 
"9f1fc0cdcf104e4d948321f0ee5875b9", "113fab4522f94b7b956236b769e391c7", "8a75069c830640b5bddd661adcd612a9", "428d5aa38b9148ef95cf0662d50e0e18", "4cbb1c9f8d8a44feb44b554f9a39bd59", "04fe5073479c462aa819b0131fc9bc9b", "037fd77b66f045a0ac0682a07d57885b", "299625168b8e492baafdbb2a871cd061"], "num_pages": [7, 1, 2, 2, 2, 1, 588, 1, 1, 6, 7, 21, 12, 6, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
|
|
||||||
Binary file not shown.
Binary file not shown.
@@ -533,3 +533,131 @@ To avoid injury, hold up the wheel when unscrewing the bolts.
|
|||||||
2024-08-15 12:50:08,732 - INFO - Vector store created
|
2024-08-15 12:50:08,732 - INFO - Vector store created
|
||||||
2024-08-15 12:50:08,732 - INFO - Saving the vector store
|
2024-08-15 12:50:08,732 - INFO - Saving the vector store
|
||||||
2024-08-15 12:50:08,745 - INFO - Vector store saved
|
2024-08-15 12:50:08,745 - INFO - Vector store saved
|
||||||
|
2024-08-15 22:27:53,640 - INFO - Loading data from ./data
|
||||||
|
2024-08-15 22:28:26,454 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:15,773 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:18,312 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:21,194 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:23,563 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:25,707 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:28,079 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:30,046 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:32,244 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:33,863 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:34,996 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:37,244 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:39,593 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:41,416 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:42,381 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:43,283 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:45,605 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:46,734 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:50,144 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:53,144 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:54,149 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:29:56,683 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:01,059 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:04,416 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:07,578 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:08,304 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:12,904 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:16,304 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:19,593 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:20,690 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:22,950 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:27,134 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:29,830 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 503 Service Unavailable"
|
||||||
|
2024-08-15 22:30:31,234 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:44,310 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:53,513 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:57,355 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:57,849 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:58,324 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:58,763 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:59,189 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:30:59,624 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:31:00,064 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:31:00,434 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:31:00,843 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:31:01,323 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:31:01,834 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:38:18,873 - INFO - Loading data from ./data
|
||||||
|
2024-08-15 22:38:45,628 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:39:49,550 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:39:54,348 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:39:57,845 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:01,631 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:04,163 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:06,901 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:07,582 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:08,282 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:09,997 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:12,182 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:12,765 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:15,845 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:17,389 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:18,451 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:21,247 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:21,334 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:23,360 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:24,218 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:24,552 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:28,755 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:33,043 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:35,045 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:37,390 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:41,155 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:45,315 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:49,518 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:50,481 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:53,073 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:40:56,877 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:01,323 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:02,802 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:05,165 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:09,849 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:13,516 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:13,872 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:16,590 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:20,432 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:23,509 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:25,284 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:25,999 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:32,085 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:32,567 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:32,987 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:33,440 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:33,916 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:34,331 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:34,824 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:35,191 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:35,573 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:36,235 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:36,731 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:37,163 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:37,556 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:38,165 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:38,601 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:38,987 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:39,409 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:39,901 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:40,290 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:40,640 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:41,031 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:41,393 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:41,835 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:42,190 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:42,604 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:42,974 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:43,339 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:43,773 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:44,140 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:44,574 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
|
||||||
|
2024-08-15 22:41:44,574 - INFO - Data loaded
|
||||||
|
2024-08-15 22:41:44,574 - INFO - Creating vector store
|
||||||
|
2024-08-15 22:41:46,104 - WARNING - C:\Users\timmy_3aupohg\anaconda3\envs\smog_env\Lib\site-packages\transformers\models\bert\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\cb\pytorch_1000000000000\work\aten\src\ATen\native\transformers\cuda\sdp_utils.cpp:555.)
|
||||||
|
attn_output = torch.nn.functional.scaled_dot_product_attention(
|
||||||
|
|
||||||
|
2024-08-15 22:41:56,183 - INFO - Vector store created
|
||||||
|
2024-08-15 22:41:56,198 - INFO - Saving the vector store
|
||||||
|
2024-08-15 22:41:56,198 - INFO - Vector store saved
|
||||||
|
|||||||
+2
-1
@@ -18,4 +18,5 @@ groq
|
|||||||
python-dotenv
|
python-dotenv
|
||||||
pydub
|
pydub
|
||||||
moviepy
|
moviepy
|
||||||
ffmpeg-python
|
ffmpeg-python
|
||||||
|
langchain-groq
|
||||||
+278
-9
@@ -6,30 +6,299 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from utils import search\n",
|
"# !pip install langchain-groq"
|
||||||
"import sys, os"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 2,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
"source": []
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from utils import search\n",
|
||||||
|
"import sys, os\n",
|
||||||
|
"from dotenv import load_dotenv\n",
|
||||||
|
"from langchain_groq import ChatGroq\n",
|
||||||
|
"from langchain_core.prompts.prompt import PromptTemplate\n",
|
||||||
|
"from langchain_core.output_parsers import StrOutputParser\n",
|
||||||
|
"load_dotenv()"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": [
|
||||||
|
"# setting up groq api key\n",
|
||||||
|
"os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 4,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"# chat set up\n",
|
||||||
|
"GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"### Chains #####\n",
|
||||||
|
"# Initiator\n",
|
||||||
|
"def doc_summarizer(document_page: list) -> str:\n",
|
||||||
|
" initiator_prompt = PromptTemplate(\n",
|
||||||
|
" template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
|
||||||
|
" Create a short summary of the document based on the provided text. \n",
|
||||||
|
" \n",
|
||||||
|
" Start with: This document is about...\n",
|
||||||
|
" \n",
|
||||||
|
" <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
|
||||||
|
" DOCUMENT: {document_page} \\n\n",
|
||||||
|
" \n",
|
||||||
|
" <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
|
||||||
|
" input_variables=[\"document_page\"],\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
|
||||||
|
" output = initiator_router.invoke({\"document_page\":document_page})\n",
|
||||||
|
" return output\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"document_page = 'How to change the engine oil of a toyota corrolla.'\n",
|
||||||
|
"# testing the function\n",
|
||||||
|
"summary = doc_summarizer(document_page)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'This document is about providing a step-by-step guide on how to change the engine oil of a Toyota Corolla.'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"summary"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = search(document_page)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[{'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||||
|
" 'page': 1,\n",
|
||||||
|
" 'file_type': 'text'},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 438},\n",
|
||||||
|
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||||
|
" 'page': 3,\n",
|
||||||
|
" 'file_type': 'text'},\n",
|
||||||
|
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||||
|
" 'page': 2,\n",
|
||||||
|
" 'file_type': 'text'},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 525},\n",
|
||||||
|
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||||
|
" 'page': 2,\n",
|
||||||
|
" 'file_type': 'text'},\n",
|
||||||
|
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||||
|
" 'page': 3,\n",
|
||||||
|
" 'file_type': 'text'},\n",
|
||||||
|
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||||
|
" 'page': 0,\n",
|
||||||
|
" 'file_type': 'text'},\n",
|
||||||
|
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||||
|
" 'page': 5,\n",
|
||||||
|
" 'file_type': 'text'},\n",
|
||||||
|
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||||
|
" 'page': 6,\n",
|
||||||
|
" 'file_type': 'text'},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 526},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 514},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 153},\n",
|
||||||
|
" {'filename': 'audio-2', 'duration': '0-3 minutes', 'file_type': 'audio'},\n",
|
||||||
|
" {'filename': 'audio-2', 'duration': '3-6 minutes', 'file_type': 'audio'},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 149},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 513},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 436},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 148}]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"docs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from collections import defaultdict\n",
|
||||||
|
"\n",
|
||||||
|
"def transform_file_data(input_data):\n",
|
||||||
|
" # Create a dictionary to aggregate data by filename\n",
|
||||||
|
" aggregated_data = defaultdict(lambda: {\n",
|
||||||
|
" 'filename': '',\n",
|
||||||
|
" 'pages': [],\n",
|
||||||
|
" 'timestamps': [],\n",
|
||||||
|
" 'description': 'lorem ipsum',\n",
|
||||||
|
" 'filetype': '',\n",
|
||||||
|
" 'thumbnail': '',\n",
|
||||||
|
" 'track_id': 123\n",
|
||||||
|
" })\n",
|
||||||
|
"\n",
|
||||||
|
" for item in input_data:\n",
|
||||||
|
" if 'source' in item:\n",
|
||||||
|
" file_path = item['source']\n",
|
||||||
|
" filename = file_path.split('\\\\')[-1]\n",
|
||||||
|
" extension = filename.split('.')[-1]\n",
|
||||||
|
"\n",
|
||||||
|
" aggregated_data[filename]['filename'] = filename\n",
|
||||||
|
" aggregated_data[filename]['filetype'] = extension\n",
|
||||||
|
" aggregated_data[filename]['thumbnail'] = f\"{filename.split('.')[0]}.jpg\"\n",
|
||||||
|
"\n",
|
||||||
|
" if extension in ['pdf', 'txt', 'docx']:\n",
|
||||||
|
" aggregated_data[filename]['pages'].append(item['page'])\n",
|
||||||
|
" elif extension in ['mp4', 'mkv', 'flv']:\n",
|
||||||
|
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
|
||||||
|
" elif extension in ['mp3', 'wav', 'flac']:\n",
|
||||||
|
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
|
||||||
|
" elif extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:\n",
|
||||||
|
" aggregated_data[filename].pop('pages', None) # Remove pages if it's an image\n",
|
||||||
|
" aggregated_data[filename].pop('timestamps', None) # Remove timestamps if it's an image\n",
|
||||||
|
"\n",
|
||||||
|
" elif 'filename' in item:\n",
|
||||||
|
" filename = item['filename']\n",
|
||||||
|
" extension = item['file_type']\n",
|
||||||
|
" aggregated_data[filename]['filename'] = f\"{filename}.{extension}\"\n",
|
||||||
|
" aggregated_data[filename]['filetype'] = extension\n",
|
||||||
|
" aggregated_data[filename]['thumbnail'] = f\"{filename}.jpg\"\n",
|
||||||
|
" if 'duration' in item:\n",
|
||||||
|
" start_time, end_time = item['duration'].split(' minutes')[0].split('-')\n",
|
||||||
|
" aggregated_data[filename]['timestamps'].append((int(start_time), int(end_time)))\n",
|
||||||
|
"\n",
|
||||||
|
" # Convert aggregated data to the desired output format\n",
|
||||||
|
" output_data = []\n",
|
||||||
|
" for filename, data in aggregated_data.items():\n",
|
||||||
|
" # Remove empty lists for pages and timestamps\n",
|
||||||
|
" if not data['pages']:\n",
|
||||||
|
" data.pop('pages', None)\n",
|
||||||
|
" if not data['timestamps']:\n",
|
||||||
|
" data.pop('timestamps', None)\n",
|
||||||
|
" output_data.append(data)\n",
|
||||||
|
"\n",
|
||||||
|
" return output_data\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt', 'pages': [1, 3, 2, 0], 'description': 'lorem ipsum', 'filetype': 'txt', 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg', 'track_id': 123}\n",
|
||||||
|
"{'filename': 'corolla-2020-toyota-owners-manual.pdf', 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148], 'description': 'lorem ipsum', 'filetype': 'pdf', 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg', 'track_id': 123}\n",
|
||||||
|
"{'filename': 'How to change spark plugs on TOYOTA COROLLA.docx', 'pages': [2, 3, 5, 6], 'description': 'lorem ipsum', 'filetype': 'docx', 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg', 'track_id': 123}\n",
|
||||||
|
"{'filename': 'audio-2.audio', 'timestamps': [(0, 3), (3, 6)], 'description': 'lorem ipsum', 'filetype': 'audio', 'thumbnail': 'audio-2.jpg', 'track_id': 123}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"output = transform_file_data(docs)\n",
|
||||||
|
"for item in output:\n",
|
||||||
|
" print(item)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||||
|
" 'pages': [1, 3, 2, 0],\n",
|
||||||
|
" 'description': 'lorem ipsum',\n",
|
||||||
|
" 'filetype': 'txt',\n",
|
||||||
|
" 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg',\n",
|
||||||
|
" 'track_id': 123},\n",
|
||||||
|
" {'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
|
||||||
|
" 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148],\n",
|
||||||
|
" 'description': 'lorem ipsum',\n",
|
||||||
|
" 'filetype': 'pdf',\n",
|
||||||
|
" 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg',\n",
|
||||||
|
" 'track_id': 123},\n",
|
||||||
|
" {'filename': 'How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||||
|
" 'pages': [2, 3, 5, 6],\n",
|
||||||
|
" 'description': 'lorem ipsum',\n",
|
||||||
|
" 'filetype': 'docx',\n",
|
||||||
|
" 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg',\n",
|
||||||
|
" 'track_id': 123},\n",
|
||||||
|
" {'filename': 'audio-2.audio',\n",
|
||||||
|
" 'timestamps': [(0, 3), (3, 6)],\n",
|
||||||
|
" 'description': 'lorem ipsum',\n",
|
||||||
|
" 'filetype': 'audio',\n",
|
||||||
|
" 'thumbnail': 'audio-2.jpg',\n",
|
||||||
|
" 'track_id': 123}]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"output"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
|||||||
@@ -6,14 +6,19 @@ from langchain_community.vectorstores import FAISS
|
|||||||
from langchain_community.document_loaders import PyPDFLoader
|
from langchain_community.document_loaders import PyPDFLoader
|
||||||
from langchain_community.document_loaders import TextLoader
|
from langchain_community.document_loaders import TextLoader
|
||||||
from langchain_community.document_loaders import Docx2txtLoader
|
from langchain_community.document_loaders import Docx2txtLoader
|
||||||
|
from langchain_groq import ChatGroq
|
||||||
|
from langchain_core.prompts.prompt import PromptTemplate
|
||||||
|
from langchain_core.output_parsers import StrOutputParser
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from text_extractor import TextExtractor
|
from text_extractor import TextExtractor
|
||||||
import os
|
import os
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
import math
|
import math
|
||||||
import json
|
import json
|
||||||
from groq import Groq
|
from groq import Groq
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
import shutil
|
import shutil
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from pydub import AudioSegment
|
from pydub import AudioSegment
|
||||||
@@ -26,10 +31,15 @@ load_dotenv()
|
|||||||
|
|
||||||
# OpenAI API Key
|
# OpenAI API Key
|
||||||
api_key = os.getenv('OPENAI_API_KEY')
|
api_key = os.getenv('OPENAI_API_KEY')
|
||||||
|
# setting up groq api key
|
||||||
|
os.environ["GROQ_API_KEY"] = os.getenv('GROQ_API_KEY')
|
||||||
client = Groq(api_key = os.getenv('GROQ_API_KEY'))
|
client = Groq(api_key = os.getenv('GROQ_API_KEY'))
|
||||||
model = 'whisper-large-v3'
|
model = 'whisper-large-v3'
|
||||||
|
|
||||||
|
|
||||||
|
# chat set up
|
||||||
|
GROQ_LLM = ChatGroq(temperature=0, model_name="llama3-8b-8192", max_tokens=100)
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
# loading the embedding model
|
# loading the embedding model
|
||||||
def load_embedding_model():
|
def load_embedding_model():
|
||||||
@@ -337,6 +347,25 @@ def preprocess_video_data(video_path: str, time_interval: int):
|
|||||||
return documents
|
return documents
|
||||||
|
|
||||||
|
|
||||||
|
#----------------------------------------------------DOC SUMMARIZER --------------------------------------------------
|
||||||
|
def doc_summarizer(document_page: list) -> str:
    """Ask the Groq chat model for a short summary of a document page.

    The page is substituted into a Llama-3 style prompt that instructs the
    model to begin its answer with "This document is about...". The prompt is
    piped through the module-level ``GROQ_LLM`` and a ``StrOutputParser``, so
    the result is the model's reply as plain text.

    Args:
        document_page: Content to summarise. NOTE(review): annotated as
            ``list``, but the caller visible in this file passes the text of
            a single page (``doc[0].page_content``) -- confirm the intended
            type.

    Returns:
        The summary text produced by the model.
    """
    summary_prompt = PromptTemplate(
        input_variables=["document_page"],
        template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    Create a short summary of the document based on the provided text.

    Start with: This document is about...

    <|eot_id|><|start_header_id|>user<|end_header_id|>
    DOCUMENT: {document_page} \n

    <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    )

    # prompt -> chat model -> plain string
    summary_chain = summary_prompt | GROQ_LLM | StrOutputParser()
    return summary_chain.invoke({"document_page": document_page})
|
||||||
|
|
||||||
|
|
||||||
#-----------------------------------------------------OTHERS--------------------------------------------------------------
|
#-----------------------------------------------------OTHERS--------------------------------------------------------------
|
||||||
|
|
||||||
@@ -348,88 +377,86 @@ def load_embedded_data(embeddings=embeddings, key="data"):
|
|||||||
embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True)
|
embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True)
|
||||||
return embed_db
|
return embed_db
|
||||||
|
|
||||||
|
#-----------------------------------------------------Data Loading Process----------------------------------------------------
|
||||||
|
|
||||||
# creating a function to load all documents from a directory.
|
# creating a function to load all documents from a directory.
|
||||||
|
def process_document(path, extension, text_doc, image_doc, audio_doc, video_doc):
    """Load a single file with the loader that matches its extension.

    Args:
        path: Path of the file to load.
        extension: File extension without the leading dot. It is matched both
            as given and lower-cased, so ``"JPG"`` is handled like ``"jpg"``.
        text_doc: Extensions handled by ``load_document``.
        image_doc: Extensions handled by ``create_image_document``.
        audio_doc: Extensions handled by ``create_audio_document``.
        video_doc: Extensions handled by ``preprocess_video_data`` (called
            with ``time_interval=30``).

    Returns:
        A ``(doc, doc_name, num_pages)`` tuple. ``(None, None, None)`` is
        returned when the extension is not handled or when the loader
        produced no content, so callers can skip the file with a single
        ``doc is not None`` check.
    """
    # Try the extension exactly as passed and in lower case, so existing
    # callers keep working while upper-case extensions are no longer ignored.
    spellings = {extension, extension.lower()}

    if not spellings.isdisjoint(text_doc):
        doc = load_document(path)
        num_pages = len(doc)
        name_from_metadata = False
    elif not spellings.isdisjoint(image_doc):
        doc = create_image_document(path)
        num_pages = 1  # an image always counts as a single page
        name_from_metadata = True
    elif not spellings.isdisjoint(audio_doc):
        doc = create_audio_document(path)
        num_pages = len(doc)
        name_from_metadata = True
    elif not spellings.isdisjoint(video_doc):
        doc = preprocess_video_data(path, time_interval=30)
        num_pages = len(doc)
        name_from_metadata = True
    else:
        return None, None, None  # Unhandled extension

    # A loader that yields nothing has no first element to read the name (or,
    # later, the first page) from; report the file as skipped instead of
    # raising IndexError.
    if not doc:
        print(f"Document {os.path.basename(path)} produced no content, skipped")
        return None, None, None

    if name_from_metadata:
        doc_name = doc[0].metadata['filename']
    else:
        # NOTE: keeps only the text before the first dot of the file name
        # ("a.b.pdf" -> "a"); left unchanged so existing names stay stable.
        doc_name = os.path.basename(path).split('.')[0]

    print(f"Document {doc_name} loaded")
    return doc, doc_name, num_pages
|
||||||
|
|
||||||
def load_documents_from_directory(directory_path: str):
    """Load every supported file in a directory and update its ``data.json`` index.

    Files are loaded concurrently through ``process_document``. For each file
    that yields content, the first page is summarised with ``doc_summarizer``
    and a fresh hex id is generated. The names, ids, page counts and
    summaries are then merged into ``<directory_path>/data.json``.

    Args:
        directory_path: Directory whose entries are scanned (not recursive).

    Returns:
        A ``(documents, docs_id, num_pages)`` tuple of parallel lists, one
        entry per loaded document.
    """
    text_doc = ['pdf', 'txt', 'docx', 'doc', 'md']
    image_doc = ['jpg', 'jpeg', 'png', 'gif', 'bmp']
    audio_doc = ['mp3', 'wav', 'flac', 'ogg', 'm4a']
    video_doc = ['mp4', 'avi', 'mkv', 'flv', 'mov']

    files = os.listdir(directory_path)
    documents = []
    doc_names = []
    num_pages = []
    doc_summary = []

    def process_with_delay(file):
        result = process_document(os.path.join(directory_path, file), file.split('.')[-1], text_doc, image_doc, audio_doc, video_doc)
        time.sleep(0.1)  # Introduce a 0.1s delay between processing each document
        return result

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in the order of `files`.
        results = list(executor.map(process_with_delay, files))

    for doc, doc_name, pages in results:
        # Skip unhandled extensions (None) and loaders that returned nothing;
        # an empty document has no first page to summarise.
        if not doc:
            continue
        documents.append(doc)
        doc_names.append(doc_name)
        num_pages.append(pages)

        # creating doc summary
        first_page = doc[0].page_content
        doc_summary.append(doc_summarizer(first_page))

    docs_id = [uuid4().hex for _ in documents]

    json_file = os.path.join(directory_path, 'data.json')
    # NOTE(review): the key is spelled 'doc_summaary'; kept as-is because the
    # readers of data.json are not visible here -- rename on both sides together.
    data = {'doc_names': doc_names, 'docs_id': docs_id, 'num_pages': num_pages, 'doc_summaary': doc_summary}

    if os.path.exists(json_file):
        with open(json_file, 'r') as f:
            existing_data = json.load(f)
        existing_data.update(data)
        data = existing_data

    # Rewrite the whole file: dumping over an 'r+' handle without truncating
    # leaves stale trailing bytes whenever the new JSON is shorter than the old.
    with open(json_file, 'w') as f:
        json.dump(data, f)

    return documents, docs_id, num_pages
|
||||||
|
|
||||||
|
|
||||||
@@ -475,6 +502,6 @@ def search(query, k=20):
|
|||||||
all = []
|
all = []
|
||||||
info = []
|
info = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
all.append({doc.page_content})
|
# all.append({doc.page_content})
|
||||||
info.append(dict(doc.metadata))
|
info.append(dict(doc.metadata))
|
||||||
return docs[0].page_content, all, info
|
return info
|
||||||
|
|||||||
Reference in New Issue
Block a user