parallel processing added
This commit is contained in:
+278
-9
@@ -6,30 +6,299 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from utils import search\n",
|
||||
"import sys, os"
|
||||
"# !pip install langchain-groq"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from utils import search\n",
|
||||
"import sys, os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from langchain_groq import ChatGroq\n",
|
||||
"from langchain_core.prompts.prompt import PromptTemplate\n",
|
||||
"from langchain_core.output_parsers import StrOutputParser\n",
|
||||
"load_dotenv()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"# setting up groq api key\n",
|
||||
"os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"\n",
|
||||
"# chat set up\n",
|
||||
"GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### Chains #####\n",
|
||||
"# Initiator\n",
|
||||
"def doc_summarizer(document_page: list) -> str:\n",
|
||||
" initiator_prompt = PromptTemplate(\n",
|
||||
" template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
|
||||
" Create a short summary of the document based on the provided text. \n",
|
||||
" \n",
|
||||
" Start with: This document is about...\n",
|
||||
" \n",
|
||||
" <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
|
||||
" DOCUMENT: {document_page} \\n\n",
|
||||
" \n",
|
||||
" <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
|
||||
" input_variables=[\"document_page\"],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
|
||||
" output = initiator_router.invoke({\"document_page\":document_page})\n",
|
||||
" return output\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"document_page = 'How to change the engine oil of a toyota corrolla.'\n",
|
||||
"# testing the function\n",
|
||||
"summary = doc_summarizer(document_page)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'This document is about providing a step-by-step guide on how to change the engine oil of a Toyota Corolla.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = search(document_page)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||
" 'page': 1,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 438},\n",
|
||||
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||
" 'page': 3,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||
" 'page': 2,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 525},\n",
|
||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||
" 'page': 2,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||
" 'page': 3,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||
" 'page': 0,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||
" 'page': 5,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||
" 'page': 6,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 526},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 514},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 153},\n",
|
||||
" {'filename': 'audio-2', 'duration': '0-3 minutes', 'file_type': 'audio'},\n",
|
||||
" {'filename': 'audio-2', 'duration': '3-6 minutes', 'file_type': 'audio'},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 149},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 513},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 436},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 148}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from collections import defaultdict\n",
|
||||
"\n",
|
def transform_file_data(input_data):
    """Group raw search hits into one summary record per file.

    Each hit is a dict in one of two shapes:

    * ``{'source': <path>, 'page': <n>, ...}`` -- a hit inside a file on
      disk.  The record is keyed by the path's base name and the file type
      is taken from the text after the last dot.
    * ``{'filename': <name>, 'file_type': <type>, 'duration': 'A-B minutes'}``
      -- a hit inside a media track.  ``duration`` is optional; when present
      it is stored as an ``(A, B)`` tuple of ints.

    Args:
        input_data: Iterable of hit dicts as described above.  Hits with
            neither a ``'source'`` nor a ``'filename'`` key are ignored.

    Returns:
        A list with one dict per file, in first-seen order, holding
        ``filename``, ``description``, ``filetype``, ``thumbnail`` and
        ``track_id``, plus ``pages`` and/or ``timestamps`` only when they
        are non-empty.

    Raises:
        KeyError: If a document/audio/video ``'source'`` hit has no
            ``'page'`` key, or a ``'filename'`` hit has no ``'file_type'``.
    """
    document_exts = {'pdf', 'txt', 'docx'}
    timed_exts = {'mp4', 'mkv', 'flv', 'mp3', 'wav', 'flac'}
    image_exts = {'jpg', 'jpeg', 'png', 'gif', 'bmp'}

    # One record per file; description and track_id are placeholder values.
    aggregated_data = defaultdict(lambda: {
        'filename': '',
        'pages': [],
        'timestamps': [],
        'description': 'lorem ipsum',
        'filetype': '',
        'thumbnail': '',
        'track_id': 123,
    })

    for item in input_data:
        if 'source' in item:
            file_path = item['source']
            # Accept both Windows and POSIX separators so the base name is
            # extracted correctly regardless of where the index was built.
            filename = file_path.replace('\\', '/').split('/')[-1]
            extension = filename.split('.')[-1]

            record = aggregated_data[filename]
            record['filename'] = filename
            record['filetype'] = extension
            record['thumbnail'] = f"{filename.split('.')[0]}.jpg"

            # Match the extension case-insensitively ('PDF' == 'pdf') while
            # reporting the file type exactly as it appears in the name.
            kind = extension.lower()
            if kind in document_exts:
                record['pages'].append(item['page'])
            elif kind in timed_exts:
                # For audio/video sources the 'page' value is the position.
                record['timestamps'].append(item['page'])
            elif kind in image_exts:
                # Images have neither pages nor timestamps.
                record.pop('pages', None)
                record.pop('timestamps', None)

        elif 'filename' in item:
            filename = item['filename']
            extension = item['file_type']

            record = aggregated_data[filename]
            record['filename'] = f"{filename}.{extension}"
            record['filetype'] = extension
            record['thumbnail'] = f"{filename}.jpg"
            if 'duration' in item:
                # 'A-B minutes' -> (A, B)
                start_time, end_time = item['duration'].split(' minutes')[0].split('-')
                record['timestamps'].append((int(start_time), int(end_time)))

    # Drop empty page/timestamp lists.  .get() is required because image
    # records have already had these keys removed above.
    output_data = []
    for data in aggregated_data.values():
        for key in ('pages', 'timestamps'):
            if not data.get(key):
                data.pop(key, None)
        output_data.append(data)

    return output_data
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt', 'pages': [1, 3, 2, 0], 'description': 'lorem ipsum', 'filetype': 'txt', 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg', 'track_id': 123}\n",
|
||||
"{'filename': 'corolla-2020-toyota-owners-manual.pdf', 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148], 'description': 'lorem ipsum', 'filetype': 'pdf', 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg', 'track_id': 123}\n",
|
||||
"{'filename': 'How to change spark plugs on TOYOTA COROLLA.docx', 'pages': [2, 3, 5, 6], 'description': 'lorem ipsum', 'filetype': 'docx', 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg', 'track_id': 123}\n",
|
||||
"{'filename': 'audio-2.audio', 'timestamps': [(0, 3), (3, 6)], 'description': 'lorem ipsum', 'filetype': 'audio', 'thumbnail': 'audio-2.jpg', 'track_id': 123}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"output = transform_file_data(docs)\n",
|
||||
"for item in output:\n",
|
||||
" print(item)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||
" 'pages': [1, 3, 2, 0],\n",
|
||||
" 'description': 'lorem ipsum',\n",
|
||||
" 'filetype': 'txt',\n",
|
||||
" 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg',\n",
|
||||
" 'track_id': 123},\n",
|
||||
" {'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
|
||||
" 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148],\n",
|
||||
" 'description': 'lorem ipsum',\n",
|
||||
" 'filetype': 'pdf',\n",
|
||||
" 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg',\n",
|
||||
" 'track_id': 123},\n",
|
||||
" {'filename': 'How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||
" 'pages': [2, 3, 5, 6],\n",
|
||||
" 'description': 'lorem ipsum',\n",
|
||||
" 'filetype': 'docx',\n",
|
||||
" 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg',\n",
|
||||
" 'track_id': 123},\n",
|
||||
" {'filename': 'audio-2.audio',\n",
|
||||
" 'timestamps': [(0, 3), (3, 6)],\n",
|
||||
" 'description': 'lorem ipsum',\n",
|
||||
" 'filetype': 'audio',\n",
|
||||
" 'thumbnail': 'audio-2.jpg',\n",
|
||||
" 'track_id': 123}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"output"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
|
||||
Reference in New Issue
Block a user