parallel processing added

This commit is contained in:
timothyafolami
2024-08-15 23:17:17 +01:00
parent 179e51070a
commit 713354371e
8 changed files with 500 additions and 76 deletions
Binary file not shown.
-1
View File
@@ -1 +0,0 @@
{"doc_names": ["audio-2", "data\\Car-Repair-Receipt-repair", "data\\Car-Repair-Receipt-service", "data\\Car-Repair-Receipt-tire", "data\\Car-Repair-Receipt-tuning", "data\\Car-Repair-Receipt-wash", "data\\corolla-2020-toyota-owners-manual", "data\\dodge-challenger-auto-body-repair-after", "data\\dodge-challenger-auto-body-repair-before", "data\\How to change engine oil and filter on TOYOTA Corolla", "data\\How to change front brake pads on TOYOTA Corolla", "How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]", "How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]", "data\\How to change rear windshield wipers on TOYOTA Corolla", "data\\How to change spark plugs on TOYOTA COROLLA", "data\\hyundai-sonata-auto-body-repair-after", "data\\hyundai-sonata-auto-body-repair-before", "data\\IMG_1436", "data\\IMG_1437", "data\\IMG_1438", "data\\IMG_1440", "data\\IMG_1441", "data\\IMG_1442", "data\\IMG_1443", "data\\IMG_1444", "data\\pontiac-vibe-auto-body-repair-after", "data\\pontiac-vibe-auto-body-repair-before", "test_rec", "data\\toyota-tacoma-auto-body-repair-after", "data\\toyota-tacoma-auto-body-repair-before"], "docs_id": ["e7280b3ec313491f8ce5c5d59b52788e", "880e0ad1d2ce43c39716e8d45d584000", "3b1c312fe9d0490ba3b7a841ce3fb136", "a6f5d1b0ba8d4fa7a828fdad8ae17bb7", "41f450a6b2c24af6a45065b2d3eba6e5", "d7af8ae82db341bf818e8f16420a9570", "eb95b3ced46548ba9d18eacba3c3e00c", "806b676fecfd47339c506ece2af3122d", "fcdc8661e11541c0a56c825626f2467e", "5ab12ad4f0ce4ce98e27e56fe5663ebd", "adf95aa758254b069cac03f52f0993b8", "1c3ef8dd242f40418dd34bfd3ce95b8a", "47d4abbea3da4121930fc33b3c472fdf", "10963a5e3985497888ce30cc66ae6314", "111c4a4e07534f47b1ed671ee2b684df", "2aa09a5f3fb54728b8370875ddc3d1fa", "7f902694851e41218ef8498b6c5c1553", "e1b0b06f6dad4c56bf976f6c2135df35", "00564531a33c4896b0d329170ce1f04f", "3aa1149c564c47c4b7b1f6a641e37770", "34a3e18b638b454ba283ba5741bad3a7", "0c7b7bf52d684f198dee19e87a5000bc", 
"9f1fc0cdcf104e4d948321f0ee5875b9", "113fab4522f94b7b956236b769e391c7", "8a75069c830640b5bddd661adcd612a9", "428d5aa38b9148ef95cf0662d50e0e18", "4cbb1c9f8d8a44feb44b554f9a39bd59", "04fe5073479c462aa819b0131fc9bc9b", "037fd77b66f045a0ac0682a07d57885b", "299625168b8e492baafdbb2a871cd061"], "num_pages": [7, 1, 2, 2, 2, 1, 588, 1, 1, 6, 7, 21, 12, 6, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Binary file not shown.
Binary file not shown.
+128
View File
@@ -533,3 +533,131 @@ To avoid injury, hold up the wheel when unscrewing the bolts.
2024-08-15 12:50:08,732 - INFO - Vector store created
2024-08-15 12:50:08,732 - INFO - Saving the vector store
2024-08-15 12:50:08,745 - INFO - Vector store saved
2024-08-15 22:27:53,640 - INFO - Loading data from ./data
2024-08-15 22:28:26,454 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:15,773 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:18,312 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:21,194 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:23,563 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:25,707 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:28,079 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:30,046 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:32,244 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:33,863 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:34,996 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:37,244 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:39,593 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:41,416 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:42,381 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:43,283 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:45,605 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:46,734 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:50,144 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:53,144 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:54,149 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:29:56,683 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:01,059 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:04,416 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:07,578 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:08,304 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:12,904 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:16,304 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:19,593 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:20,690 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:22,950 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:27,134 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:29,830 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 503 Service Unavailable"
2024-08-15 22:30:31,234 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:44,310 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:53,513 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:30:57,355 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:30:57,849 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:30:58,324 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:30:58,763 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:30:59,189 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:30:59,624 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:31:00,064 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:31:00,434 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:31:00,843 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:31:01,323 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:31:01,834 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:38:18,873 - INFO - Loading data from ./data
2024-08-15 22:38:45,628 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:39:49,550 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:39:54,348 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:39:57,845 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:01,631 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:04,163 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:06,901 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:07,582 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:08,282 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:09,997 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:12,182 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:12,765 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:15,845 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:17,389 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:18,451 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:21,247 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:21,334 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:23,360 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:24,218 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:24,552 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:28,755 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:33,043 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:35,045 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:37,390 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:41,155 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:45,315 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:49,518 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:50,481 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:53,073 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:40:56,877 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:01,323 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:02,802 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:05,165 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:09,849 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:13,516 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:13,872 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:16,590 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:20,432 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:23,509 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:25,284 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:25,999 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/audio/translations "HTTP/1.1 200 OK"
2024-08-15 22:41:32,085 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:32,567 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:32,987 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:33,440 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:33,916 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:34,331 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:34,824 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:35,191 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:35,573 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:36,235 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:36,731 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:37,163 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:37,556 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:38,165 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:38,601 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:38,987 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:39,409 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:39,901 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:40,290 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:40,640 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:41,031 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:41,393 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:41,835 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:42,190 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:42,604 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:42,974 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:43,339 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:43,773 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:44,140 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:44,574 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-15 22:41:44,574 - INFO - Data loaded
2024-08-15 22:41:44,574 - INFO - Creating vector store
2024-08-15 22:41:46,104 - WARNING - C:\Users\timmy_3aupohg\anaconda3\envs\smog_env\Lib\site-packages\transformers\models\bert\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\cb\pytorch_1000000000000\work\aten\src\ATen\native\transformers\cuda\sdp_utils.cpp:555.)
attn_output = torch.nn.functional.scaled_dot_product_attention(
2024-08-15 22:41:56,183 - INFO - Vector store created
2024-08-15 22:41:56,198 - INFO - Saving the vector store
2024-08-15 22:41:56,198 - INFO - Vector store saved
+2 -1
View File
@@ -18,4 +18,5 @@ groq
python-dotenv
pydub
moviepy
ffmpeg-python
ffmpeg-python
langchain-groq
+278 -9
View File
@@ -6,30 +6,299 @@
"metadata": {},
"outputs": [],
"source": [
"from utils import search\n",
"import sys, os"
"# !pip install langchain-groq"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from utils import search\n",
"import sys, os\n",
"from dotenv import load_dotenv\n",
"from langchain_groq import ChatGroq\n",
"from langchain_core.prompts.prompt import PromptTemplate\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# setting up groq api key\n",
"os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": []
"source": [
"\n",
"# chat set up\n",
"GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n",
"\n",
"\n",
"### Chains #####\n",
"# Initiator\n",
"def doc_summarizer(document_page: list) -> str:\n",
" initiator_prompt = PromptTemplate(\n",
" template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
" Create a short summary of the document based on the provided text. \n",
" \n",
" Start with: This document is about...\n",
" \n",
" <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
" DOCUMENT: {document_page} \\n\n",
" \n",
" <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
" input_variables=[\"document_page\"],\n",
" )\n",
"\n",
" initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
" output = initiator_router.invoke({\"document_page\":document_page})\n",
" return output\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"document_page = 'How to change the engine oil of a toyota corrolla.'\n",
"# testing the function\n",
"summary = doc_summarizer(document_page)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'This document is about providing a step-by-step guide on how to change the engine oil of a Toyota Corolla.'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summary"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"docs = search(document_page)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
" 'page': 1,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 438},\n",
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
" 'page': 3,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
" 'page': 2,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 525},\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
" 'page': 2,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
" 'page': 3,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
" 'page': 0,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
" 'page': 5,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
" 'page': 6,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 526},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 514},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 153},\n",
" {'filename': 'audio-2', 'duration': '0-3 minutes', 'file_type': 'audio'},\n",
" {'filename': 'audio-2', 'duration': '3-6 minutes', 'file_type': 'audio'},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 149},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 513},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 436},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 148}]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"\n",
"def transform_file_data(input_data):\n",
" # Create a dictionary to aggregate data by filename\n",
" aggregated_data = defaultdict(lambda: {\n",
" 'filename': '',\n",
" 'pages': [],\n",
" 'timestamps': [],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': '',\n",
" 'thumbnail': '',\n",
" 'track_id': 123\n",
" })\n",
"\n",
" for item in input_data:\n",
" if 'source' in item:\n",
" file_path = item['source']\n",
" filename = file_path.split('\\\\')[-1]\n",
" extension = filename.split('.')[-1]\n",
"\n",
" aggregated_data[filename]['filename'] = filename\n",
" aggregated_data[filename]['filetype'] = extension\n",
" aggregated_data[filename]['thumbnail'] = f\"{filename.split('.')[0]}.jpg\"\n",
"\n",
" if extension in ['pdf', 'txt', 'docx']:\n",
" aggregated_data[filename]['pages'].append(item['page'])\n",
" elif extension in ['mp4', 'mkv', 'flv']:\n",
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
" elif extension in ['mp3', 'wav', 'flac']:\n",
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
" elif extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:\n",
" aggregated_data[filename].pop('pages', None) # Remove pages if it's an image\n",
" aggregated_data[filename].pop('timestamps', None) # Remove timestamps if it's an image\n",
"\n",
" elif 'filename' in item:\n",
" filename = item['filename']\n",
" extension = item['file_type']\n",
" aggregated_data[filename]['filename'] = f\"{filename}.{extension}\"\n",
" aggregated_data[filename]['filetype'] = extension\n",
" aggregated_data[filename]['thumbnail'] = f\"{filename}.jpg\"\n",
" if 'duration' in item:\n",
" start_time, end_time = item['duration'].split(' minutes')[0].split('-')\n",
" aggregated_data[filename]['timestamps'].append((int(start_time), int(end_time)))\n",
"\n",
" # Convert aggregated data to the desired output format\n",
" output_data = []\n",
" for filename, data in aggregated_data.items():\n",
" # Remove empty lists for pages and timestamps\n",
" if not data['pages']:\n",
" data.pop('pages', None)\n",
" if not data['timestamps']:\n",
" data.pop('timestamps', None)\n",
" output_data.append(data)\n",
"\n",
" return output_data\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt', 'pages': [1, 3, 2, 0], 'description': 'lorem ipsum', 'filetype': 'txt', 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg', 'track_id': 123}\n",
"{'filename': 'corolla-2020-toyota-owners-manual.pdf', 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148], 'description': 'lorem ipsum', 'filetype': 'pdf', 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg', 'track_id': 123}\n",
"{'filename': 'How to change spark plugs on TOYOTA COROLLA.docx', 'pages': [2, 3, 5, 6], 'description': 'lorem ipsum', 'filetype': 'docx', 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg', 'track_id': 123}\n",
"{'filename': 'audio-2.audio', 'timestamps': [(0, 3), (3, 6)], 'description': 'lorem ipsum', 'filetype': 'audio', 'thumbnail': 'audio-2.jpg', 'track_id': 123}\n"
]
}
],
"source": [
"output = transform_file_data(docs)\n",
"for item in output:\n",
" print(item)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt',\n",
" 'pages': [1, 3, 2, 0],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'txt',\n",
" 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg',\n",
" 'track_id': 123},\n",
" {'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
" 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'pdf',\n",
" 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg',\n",
" 'track_id': 123},\n",
" {'filename': 'How to change spark plugs on TOYOTA COROLLA.docx',\n",
" 'pages': [2, 3, 5, 6],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'docx',\n",
" 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg',\n",
" 'track_id': 123},\n",
" {'filename': 'audio-2.audio',\n",
" 'timestamps': [(0, 3), (3, 6)],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'audio',\n",
" 'thumbnail': 'audio-2.jpg',\n",
" 'track_id': 123}]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output"
]
},
{
"cell_type": "code",
+92 -65
View File
@@ -6,14 +6,19 @@ from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_groq import ChatGroq
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from uuid import uuid4
from langchain_core.documents import Document
from text_extractor import TextExtractor
import os
from concurrent.futures import ThreadPoolExecutor
import math
import json
from groq import Groq
import re
import time
import shutil
import numpy as np
from pydub import AudioSegment
@@ -26,10 +31,15 @@ load_dotenv()
# OpenAI API key, read from the environment (None when the variable is unset).
api_key = os.getenv('OPENAI_API_KEY')

# Groq API key. Assigning None into os.environ raises an opaque
# "TypeError: str expected, not NoneType", so check for a missing key first
# and fail with a message that says what to fix.
_groq_api_key = os.getenv('GROQ_API_KEY')
if _groq_api_key is None:
    raise RuntimeError(
        "GROQ_API_KEY is not set; define it in the environment or in the .env file"
    )
os.environ["GROQ_API_KEY"] = _groq_api_key
client = Groq(api_key=_groq_api_key)
# NOTE(review): presumably the speech model name passed to `client` audio
# requests elsewhere in this module -- confirm against the callers.
model = 'whisper-large-v3'

# Chat model used by the summariser chain: deterministic (temperature 0) and
# capped at 100 output tokens.
GROQ_LLM = ChatGroq(temperature=0, model_name="llama3-8b-8192", max_tokens=100)
# ----------------------------------------------------------------------------------------------------
# loading the embedding model
def load_embedding_model():
@@ -337,6 +347,25 @@ def preprocess_video_data(video_path: str, time_interval: int):
return documents
#----------------------------------------------------DOC SUMMARIZER --------------------------------------------------
def doc_summarizer(document_page: str) -> str:
    """Ask the Groq chat model for a short summary of one page of text.

    The text is substituted into a Llama-3 style chat prompt whose system
    message requests a short summary beginning with "This document is
    about...". The prompt is piped through the module-level ``GROQ_LLM`` and
    a ``StrOutputParser`` so the reply comes back as plain text.

    Args:
        document_page: The text to summarise. Callers pass a single string
            (for example the ``page_content`` of a document's first page),
            so the parameter is annotated ``str`` rather than ``list``.

    Returns:
        The model's reply as a string.
    """
    # The template text is runtime data sent to the model; keep it verbatim.
    initiator_prompt = PromptTemplate(
        template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Create a short summary of the document based on the provided text.
Start with: This document is about...
<|eot_id|><|start_header_id|>user<|end_header_id|>
DOCUMENT: {document_page} \n
<|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
        input_variables=["document_page"],
    )
    # prompt -> chat model -> plain string
    initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()
    output = initiator_router.invoke({"document_page": document_page})
    return output
#-----------------------------------------------------OTHERS--------------------------------------------------------------
@@ -348,88 +377,86 @@ def load_embedded_data(embeddings=embeddings, key="data"):
embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True)
return embed_db
#-----------------------------------------------------Data Loading Process----------------------------------------------------
# creating a function to load all documents from a directory.
def process_document(path, extension, text_doc, image_doc, audio_doc, video_doc):
    """Load one file with the loader that matches its extension.

    Args:
        path: Path of the file to load.
        extension: File extension without the leading dot. It is compared
            as-is (case-sensitively) against the four extension collections.
        text_doc: Extensions handled by ``load_document``.
        image_doc: Extensions handled by ``create_image_document``.
        audio_doc: Extensions handled by ``create_audio_document``.
        video_doc: Extensions handled by ``preprocess_video_data`` (called
            with ``time_interval=30``).

    Returns:
        A ``(doc, doc_name, num_pages)`` tuple, or ``(None, None, None)``
        when ``extension`` is in none of the collections. ``num_pages`` is
        ``len(doc)`` for text, audio and video, and always 1 for images.
        ``doc_name`` is the file name without its final extension for text
        documents; for media it is the ``filename`` metadata of the first
        returned item.
    """
    # Strip only the final extension, so "report.v2.pdf" -> "report.v2"
    # (splitting on the first dot would truncate it to "report").
    fallback_name = os.path.splitext(os.path.basename(path))[0]
    name_from_metadata = True

    if extension in text_doc:
        doc = load_document(path)
        num_pages = len(doc)
        name_from_metadata = False
    elif extension in image_doc:
        doc = create_image_document(path)
        num_pages = 1
    elif extension in audio_doc:
        doc = create_audio_document(path)
        num_pages = len(doc)
    elif extension in video_doc:
        doc = preprocess_video_data(path, time_interval=30)
        num_pages = len(doc)
    else:
        return None, None, None  # Unhandled extension

    # A media loader may return no items; fall back to the path-derived name
    # instead of raising IndexError on doc[0].
    if name_from_metadata and doc:
        doc_name = doc[0].metadata['filename']
    else:
        doc_name = fallback_name

    print(f"Document {doc_name} loaded")
    return doc, doc_name, num_pages
def load_documents_from_directory(directory_path: str):
    """Load every supported file in a directory and record metadata in ``data.json``.

    Files are loaded concurrently through ``process_document``; results keep
    the order returned by ``os.listdir``. For each loaded document the first
    page is summarised with ``doc_summarizer``. Names, generated ids, page
    counts and summaries are then merged into ``<directory_path>/data.json``,
    preserving any other keys already stored there.

    Args:
        directory_path: Directory whose files should be loaded.

    Returns:
        A ``(documents, docs_id, num_pages)`` tuple of parallel lists: the
        loaded documents, a fresh hex UUID per document, and the page count
        per document.
    """
    text_doc = ['pdf', 'txt', 'docx', 'doc', 'md']
    image_doc = ['jpg', 'jpeg', 'png', 'gif', 'bmp']
    audio_doc = ['mp3', 'wav', 'flac', 'ogg', 'm4a']
    video_doc = ['mp4', 'avi', 'mkv', 'flv', 'mov']
    supported = set(text_doc + image_doc + audio_doc + video_doc)

    # Only dispatch files some loader can handle. This skips the metadata
    # file itself and avoids a worker (and its delay) for irrelevant files.
    candidates = []
    for file in os.listdir(directory_path):
        extension = os.path.splitext(file)[1][1:]
        if extension in supported:
            candidates.append((file, extension))

    def process_with_delay(entry):
        file, extension = entry
        result = process_document(
            os.path.join(directory_path, file),
            extension, text_doc, image_doc, audio_doc, video_doc,
        )
        time.sleep(0.1)  # Introduce a 0.1s delay after processing each document
        return result

    with ThreadPoolExecutor() as executor:
        # map() yields results in submission order, not completion order.
        results = list(executor.map(process_with_delay, candidates))

    documents = []
    doc_names = []
    num_pages = []
    doc_summary = []
    for doc, doc_name, pages in results:
        # Skip unhandled files (None) and loaders that produced no pages;
        # the latter would otherwise fail on doc[0] below.
        if not doc:
            continue
        documents.append(doc)
        doc_names.append(doc_name)
        num_pages.append(pages)
        # Summarise each document from its first page only.
        doc_summary.append(doc_summarizer(doc[0].page_content))

    docs_id = [uuid4().hex for _ in documents]
    json_file = os.path.join(directory_path, 'data.json')
    data = {
        'doc_names': doc_names,
        'docs_id': docs_id,
        'num_pages': num_pages,
        'doc_summary': doc_summary,
        # Legacy misspelled key, kept so existing readers keep working.
        'doc_summaary': doc_summary,
    }

    # Read any existing metadata first, then rewrite the file from scratch.
    # Overwriting in place with 'r+' and seek(0) leaves trailing bytes behind
    # whenever the new JSON is shorter than the old one, corrupting the file.
    existing_data = {}
    if os.path.exists(json_file):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except json.JSONDecodeError:
            loaded = None  # unreadable file: start from a clean slate
        if isinstance(loaded, dict):
            existing_data = loaded
    existing_data.update(data)
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f)

    return documents, docs_id, num_pages
@@ -475,6 +502,6 @@ def search(query, k=20):
all = []
info = []
for doc in docs:
all.append({doc.page_content})
# all.append({doc.page_content})
info.append(dict(doc.metadata))
return docs[0].page_content, all, info
return info