333 lines
11 KiB
Plaintext
333 lines
11 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# !pip install langchain-groq"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"True"
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from utils import search\n",
|
|
"import sys, os\n",
|
|
"from dotenv import load_dotenv\n",
|
|
"from langchain_groq import ChatGroq\n",
|
|
"from langchain_core.prompts.prompt import PromptTemplate\n",
|
|
"from langchain_core.output_parsers import StrOutputParser\n",
|
|
"load_dotenv()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# setting up groq api key
# The key is read from the process environment (load_dotenv() is called in the
# imports cell). os.environ only accepts str values, so blindly re-assigning a
# missing key (None) fails with an opaque "str expected, not NoneType"
# TypeError; check explicitly and fail with an actionable message instead.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )
os.environ["GROQ_API_KEY"] = groq_api_key
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
# chat set up
# Model settings are named so they can be tuned in one place.
GROQ_MODEL_NAME = "llama3-8b-8192"
# Cap on generated tokens per reply; the summarizer prompt asks for a short
# summary. NOTE(review): a longer reply would presumably be cut off at this
# limit - confirm 100 is enough for the documents being summarized.
GROQ_MAX_TOKENS = 100
GROQ_LLM = ChatGroq(
    temperature=0,
    model_name=GROQ_MODEL_NAME,
    max_tokens=GROQ_MAX_TOKENS,
)
|
|
"\n",
|
|
"\n",
|
|
"### Chains #####\n",
|
|
"# Initiator\n",
|
def doc_summarizer(document_page: str) -> str:
    """Ask the Groq-hosted LLM for a short summary of the given document text.

    The text is inserted into a fixed prompt that instructs the model to begin
    its answer with "This document is about...". The prompt is piped through
    the module-level ``GROQ_LLM`` and a ``StrOutputParser``.

    Args:
        document_page: The document text to summarize. It is substituted for
            the ``{document_page}`` placeholder of the prompt template.

    Returns:
        The output of the prompt -> LLM -> ``StrOutputParser`` chain.
    """
    # NOTE(review): the template embeds raw Llama-3 header tokens while being
    # sent through a chat-model wrapper - confirm these tokens are intended
    # rather than being treated as literal message text.
    initiator_prompt = PromptTemplate(
        template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
        Create a short summary of the document based on the provided text. 
        
        Start with: This document is about...
        
        <|eot_id|><|start_header_id|>user<|end_header_id|>
        DOCUMENT: {document_page} \n
        
        <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
        input_variables=["document_page"],
    )

    # Chain: fill the prompt, call the model, unwrap the reply to a string.
    initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()
    return initiator_router.invoke({"document_page": document_page})
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"document_page = 'How to change the engine oil of a toyota corrolla.'\n",
|
|
"# testing the function\n",
|
|
"summary = doc_summarizer(document_page)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'This document is about providing a step-by-step guide on how to change the engine oil of a Toyota Corolla.'"
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"summary"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"docs = search(document_page)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[{'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
|
" 'page': 1,\n",
|
|
" 'file_type': 'text'},\n",
|
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 438},\n",
|
|
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
|
" 'page': 3,\n",
|
|
" 'file_type': 'text'},\n",
|
|
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
|
" 'page': 2,\n",
|
|
" 'file_type': 'text'},\n",
|
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 525},\n",
|
|
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
|
" 'page': 2,\n",
|
|
" 'file_type': 'text'},\n",
|
|
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
|
" 'page': 3,\n",
|
|
" 'file_type': 'text'},\n",
|
|
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
|
" 'page': 0,\n",
|
|
" 'file_type': 'text'},\n",
|
|
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
|
" 'page': 5,\n",
|
|
" 'file_type': 'text'},\n",
|
|
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
|
" 'page': 6,\n",
|
|
" 'file_type': 'text'},\n",
|
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 526},\n",
|
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422},\n",
|
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 514},\n",
|
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 153},\n",
|
|
" {'filename': 'audio-2', 'duration': '0-3 minutes', 'file_type': 'audio'},\n",
|
|
" {'filename': 'audio-2', 'duration': '3-6 minutes', 'file_type': 'audio'},\n",
|
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 149},\n",
|
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 513},\n",
|
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 436},\n",
|
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 148}]"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"docs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from collections import defaultdict\n",
|
|
"\n",
|
# Extension groups (compared case-insensitively) that decide where an item's
# 'page' value is stored.
_DOCUMENT_EXTENSIONS = frozenset({'pdf', 'txt', 'docx'})
_TIMED_MEDIA_EXTENSIONS = frozenset({'mp4', 'mkv', 'flv', 'mp3', 'wav', 'flac'})
_IMAGE_EXTENSIONS = frozenset({'jpg', 'jpeg', 'png', 'gif', 'bmp'})


def _split_source_path(file_path):
    """Return ``(filename, stem, extension)`` for a '\\'- or '/'-separated path."""
    # Normalise separators first so both Windows-style and POSIX-style paths
    # reduce to their final component.
    filename = file_path.replace('\\', '/').rsplit('/', 1)[-1]
    # Split on the LAST dot so multi-dot names keep their full stem.
    stem, dot, extension = filename.rpartition('.')
    if not dot:
        # No dot at all: the whole name is the stem and there is no extension.
        return filename, filename, ''
    return filename, stem, extension


def transform_file_data(input_data):
    """Group per-hit search metadata into one record per file.

    Two item shapes are recognised:

    * items with a ``'source'`` path (and a ``'page'`` value): grouped by the
      path's final component; the ``'page'`` value is collected under
      ``'pages'`` for document extensions and under ``'timestamps'`` for
      audio/video extensions. Image extensions carry neither list.
    * items with a ``'filename'`` and ``'file_type'`` (and optionally a
      ``'duration'`` such as ``'0-3 minutes'``): grouped by ``'filename'``;
      each duration is parsed into an ``(int, int)`` tuple and collected
      under ``'timestamps'``.

    Items with neither key are ignored.

    Args:
        input_data: Iterable of metadata dicts as described above.

    Returns:
        A list of dicts, one per file in first-seen order, with the keys
        ``'filename'``, ``'description'``, ``'filetype'``, ``'thumbnail'``,
        ``'track_id'`` and, only when non-empty, ``'pages'`` and/or
        ``'timestamps'``. ``'description'`` and ``'track_id'`` are fixed
        placeholder values.

    Raises:
        KeyError: If a document/audio/video ``'source'`` item has no
            ``'page'``, or a ``'filename'`` item has no ``'file_type'``.
        ValueError: If a ``'duration'`` is not of the form
            ``'<int>-<int> minutes'``.
    """
    # Create a dictionary to aggregate data by filename
    aggregated_data = defaultdict(lambda: {
        'filename': '',
        'pages': [],
        'timestamps': [],
        'description': 'lorem ipsum',
        'filetype': '',
        'thumbnail': '',
        'track_id': 123
    })

    for item in input_data:
        if 'source' in item:
            filename, stem, extension = _split_source_path(item['source'])

            entry = aggregated_data[filename]
            entry['filename'] = filename
            entry['filetype'] = extension
            entry['thumbnail'] = f"{stem}.jpg"

            # Match case-insensitively so e.g. 'SCAN.PDF' is still a document.
            kind = extension.lower()
            if kind in _DOCUMENT_EXTENSIONS:
                entry['pages'].append(item['page'])
            elif kind in _TIMED_MEDIA_EXTENSIONS:
                entry['timestamps'].append(item['page'])
            elif kind in _IMAGE_EXTENSIONS:
                # Images have neither pages nor timestamps.
                entry.pop('pages', None)
                entry.pop('timestamps', None)

        elif 'filename' in item:
            filename = item['filename']
            extension = item['file_type']

            entry = aggregated_data[filename]
            entry['filename'] = f"{filename}.{extension}"
            entry['filetype'] = extension
            entry['thumbnail'] = f"{filename}.jpg"
            if 'duration' in item:
                # '0-3 minutes' -> ('0', '3')
                start_time, end_time = item['duration'].split(' minutes')[0].split('-')
                entry['timestamps'].append((int(start_time), int(end_time)))

    # Convert aggregated data to the desired output format
    output_data = []
    for data in aggregated_data.values():
        # Drop empty 'pages'/'timestamps'. .get() is required because image
        # entries have already had these keys removed above; indexing them
        # directly would raise KeyError.
        for optional_key in ('pages', 'timestamps'):
            if not data.get(optional_key):
                data.pop(optional_key, None)
        output_data.append(data)

    return output_data
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt', 'pages': [1, 3, 2, 0], 'description': 'lorem ipsum', 'filetype': 'txt', 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg', 'track_id': 123}\n",
|
|
"{'filename': 'corolla-2020-toyota-owners-manual.pdf', 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148], 'description': 'lorem ipsum', 'filetype': 'pdf', 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg', 'track_id': 123}\n",
|
|
"{'filename': 'How to change spark plugs on TOYOTA COROLLA.docx', 'pages': [2, 3, 5, 6], 'description': 'lorem ipsum', 'filetype': 'docx', 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg', 'track_id': 123}\n",
|
|
"{'filename': 'audio-2.audio', 'timestamps': [(0, 3), (3, 6)], 'description': 'lorem ipsum', 'filetype': 'audio', 'thumbnail': 'audio-2.jpg', 'track_id': 123}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"output = transform_file_data(docs)\n",
|
|
"for item in output:\n",
|
|
" print(item)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
|
" 'pages': [1, 3, 2, 0],\n",
|
|
" 'description': 'lorem ipsum',\n",
|
|
" 'filetype': 'txt',\n",
|
|
" 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg',\n",
|
|
" 'track_id': 123},\n",
|
|
" {'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
|
|
" 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148],\n",
|
|
" 'description': 'lorem ipsum',\n",
|
|
" 'filetype': 'pdf',\n",
|
|
" 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg',\n",
|
|
" 'track_id': 123},\n",
|
|
" {'filename': 'How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
|
" 'pages': [2, 3, 5, 6],\n",
|
|
" 'description': 'lorem ipsum',\n",
|
|
" 'filetype': 'docx',\n",
|
|
" 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg',\n",
|
|
" 'track_id': 123},\n",
|
|
" {'filename': 'audio-2.audio',\n",
|
|
" 'timestamps': [(0, 3), (3, 6)],\n",
|
|
" 'description': 'lorem ipsum',\n",
|
|
" 'filetype': 'audio',\n",
|
|
" 'thumbnail': 'audio-2.jpg',\n",
|
|
" 'track_id': 123}]"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"output"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "smog_env",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|