Files
ds_fire_fighter/search_note.ipynb
T

333 lines
11 KiB
Plaintext
Raw Normal View History

2024-08-14 23:09:10 +01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
2024-08-15 23:17:17 +01:00
"source": [
"# !pip install langchain-groq"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
2024-08-14 23:09:10 +01:00
"source": [
"from utils import search\n",
2024-08-15 23:17:17 +01:00
"import sys, os\n",
"from dotenv import load_dotenv\n",
"from langchain_groq import ChatGroq\n",
"from langchain_core.prompts.prompt import PromptTemplate\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"load_dotenv()"
2024-08-14 23:09:10 +01:00
]
},
{
"cell_type": "code",
2024-08-15 23:17:17 +01:00
"execution_count": 3,
2024-08-14 23:09:10 +01:00
"metadata": {},
"outputs": [],
2024-08-15 23:17:17 +01:00
"source": [
"# setting up groq api key\n",
"os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')"
]
2024-08-14 23:09:10 +01:00
},
{
"cell_type": "code",
2024-08-15 23:17:17 +01:00
"execution_count": 4,
2024-08-14 23:09:10 +01:00
"metadata": {},
"outputs": [],
2024-08-15 23:17:17 +01:00
"source": [
"\n",
"# chat set up\n",
"GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n",
"\n",
"\n",
"### Chains #####\n",
"# Initiator\n",
"def doc_summarizer(document_page: list) -> str:\n",
" initiator_prompt = PromptTemplate(\n",
" template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
" Create a short summary of the document based on the provided text. \n",
" \n",
" Start with: This document is about...\n",
" \n",
" <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
" DOCUMENT: {document_page} \\n\n",
" \n",
" <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
" input_variables=[\"document_page\"],\n",
" )\n",
"\n",
" initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
" output = initiator_router.invoke({\"document_page\":document_page})\n",
" return output\n"
]
2024-08-14 23:09:10 +01:00
},
{
"cell_type": "code",
2024-08-15 23:17:17 +01:00
"execution_count": 5,
2024-08-14 23:09:10 +01:00
"metadata": {},
"outputs": [],
2024-08-15 23:17:17 +01:00
"source": [
"document_page = 'How to change the engine oil of a toyota corrolla.'\n",
"# testing the function\n",
"summary = doc_summarizer(document_page)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'This document is about providing a step-by-step guide on how to change the engine oil of a Toyota Corolla.'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summary"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"docs = search(document_page)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
" 'page': 1,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 438},\n",
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
" 'page': 3,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
" 'page': 2,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 525},\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
" 'page': 2,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
" 'page': 3,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
" 'page': 0,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
" 'page': 5,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
" 'page': 6,\n",
" 'file_type': 'text'},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 526},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 514},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 153},\n",
" {'filename': 'audio-2', 'duration': '0-3 minutes', 'file_type': 'audio'},\n",
" {'filename': 'audio-2', 'duration': '3-6 minutes', 'file_type': 'audio'},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 149},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 513},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 436},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 148}]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"\n",
"def transform_file_data(input_data):\n",
" # Create a dictionary to aggregate data by filename\n",
" aggregated_data = defaultdict(lambda: {\n",
" 'filename': '',\n",
" 'pages': [],\n",
" 'timestamps': [],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': '',\n",
" 'thumbnail': '',\n",
" 'track_id': 123\n",
" })\n",
"\n",
" for item in input_data:\n",
" if 'source' in item:\n",
" file_path = item['source']\n",
" filename = file_path.split('\\\\')[-1]\n",
" extension = filename.split('.')[-1]\n",
"\n",
" aggregated_data[filename]['filename'] = filename\n",
" aggregated_data[filename]['filetype'] = extension\n",
" aggregated_data[filename]['thumbnail'] = f\"{filename.split('.')[0]}.jpg\"\n",
"\n",
" if extension in ['pdf', 'txt', 'docx']:\n",
" aggregated_data[filename]['pages'].append(item['page'])\n",
" elif extension in ['mp4', 'mkv', 'flv']:\n",
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
" elif extension in ['mp3', 'wav', 'flac']:\n",
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
" elif extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:\n",
" aggregated_data[filename].pop('pages', None) # Remove pages if it's an image\n",
" aggregated_data[filename].pop('timestamps', None) # Remove timestamps if it's an image\n",
"\n",
" elif 'filename' in item:\n",
" filename = item['filename']\n",
" extension = item['file_type']\n",
" aggregated_data[filename]['filename'] = f\"{filename}.{extension}\"\n",
" aggregated_data[filename]['filetype'] = extension\n",
" aggregated_data[filename]['thumbnail'] = f\"{filename}.jpg\"\n",
" if 'duration' in item:\n",
" start_time, end_time = item['duration'].split(' minutes')[0].split('-')\n",
" aggregated_data[filename]['timestamps'].append((int(start_time), int(end_time)))\n",
"\n",
" # Convert aggregated data to the desired output format\n",
" output_data = []\n",
" for filename, data in aggregated_data.items():\n",
" # Remove empty lists for pages and timestamps\n",
" if not data['pages']:\n",
" data.pop('pages', None)\n",
" if not data['timestamps']:\n",
" data.pop('timestamps', None)\n",
" output_data.append(data)\n",
"\n",
" return output_data\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt', 'pages': [1, 3, 2, 0], 'description': 'lorem ipsum', 'filetype': 'txt', 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg', 'track_id': 123}\n",
"{'filename': 'corolla-2020-toyota-owners-manual.pdf', 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148], 'description': 'lorem ipsum', 'filetype': 'pdf', 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg', 'track_id': 123}\n",
"{'filename': 'How to change spark plugs on TOYOTA COROLLA.docx', 'pages': [2, 3, 5, 6], 'description': 'lorem ipsum', 'filetype': 'docx', 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg', 'track_id': 123}\n",
"{'filename': 'audio-2.audio', 'timestamps': [(0, 3), (3, 6)], 'description': 'lorem ipsum', 'filetype': 'audio', 'thumbnail': 'audio-2.jpg', 'track_id': 123}\n"
]
}
],
"source": [
"output = transform_file_data(docs)\n",
"for item in output:\n",
" print(item)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt',\n",
" 'pages': [1, 3, 2, 0],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'txt',\n",
" 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg',\n",
" 'track_id': 123},\n",
" {'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
" 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'pdf',\n",
" 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg',\n",
" 'track_id': 123},\n",
" {'filename': 'How to change spark plugs on TOYOTA COROLLA.docx',\n",
" 'pages': [2, 3, 5, 6],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'docx',\n",
" 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg',\n",
" 'track_id': 123},\n",
" {'filename': 'audio-2.audio',\n",
" 'timestamps': [(0, 3), (3, 6)],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'audio',\n",
" 'thumbnail': 'audio-2.jpg',\n",
" 'track_id': 123}]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output"
]
2024-08-14 23:09:10 +01:00
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "smog_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}