{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# !pip install langchain-groq" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from utils import search\n", "import sys, os\n", "from dotenv import load_dotenv\n", "from langchain_groq import ChatGroq\n", "from langchain_core.prompts.prompt import PromptTemplate\n", "from langchain_core.output_parsers import StrOutputParser\n", "load_dotenv()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# setting up groq api key\n", "os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "\n", "# chat set up\n", "GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n", "\n", "\n", "### Chains #####\n", "# Initiator\n", "def doc_summarizer(document_page: list) -> str:\n", " initiator_prompt = PromptTemplate(\n", " template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n", " Create a short summary of the document based on the provided text. \n", " \n", " Start with: This document is about...\n", " \n", " <|eot_id|><|start_header_id|>user<|end_header_id|>\n", " DOCUMENT: {document_page} \\n\n", " \n", " <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n", " input_variables=[\"document_page\"],\n", " )\n", "\n", " initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n", " output = initiator_router.invoke({\"document_page\":document_page})\n", " return output\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "document_page = 'How to change the engine oil of a toyota corrolla.'\n", "# testing the function\n", "summary = doc_summarizer(document_page)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'This document is about providing a step-by-step guide on how to change the engine oil of a Toyota Corolla.'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "docs = search(document_page)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", " 'page': 1,\n", " 'file_type': 'text'},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 438},\n", " {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", " 'page': 3,\n", " 'file_type': 'text'},\n", " {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", " 'page': 2,\n", " 'file_type': 'text'},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 525},\n", " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", " 'page': 2,\n", " 'file_type': 'text'},\n", " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", " 'page': 3,\n", " 'file_type': 'text'},\n", " {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", " 'page': 0,\n", " 'file_type': 'text'},\n", " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", " 'page': 5,\n", " 'file_type': 'text'},\n", " {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", " 'page': 6,\n", " 'file_type': 'text'},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 526},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 514},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 153},\n", " {'filename': 'audio-2', 'duration': '0-3 minutes', 'file_type': 'audio'},\n", " {'filename': 'audio-2', 'duration': '3-6 minutes', 'file_type': 'audio'},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 149},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 513},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 436},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 148}]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "docs" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", "\n", "def transform_file_data(input_data):\n", " # Create a dictionary to aggregate data by filename\n", " aggregated_data = defaultdict(lambda: {\n", " 'filename': '',\n", " 'pages': [],\n", " 'timestamps': [],\n", " 'description': 'lorem ipsum',\n", " 'filetype': '',\n", " 'thumbnail': '',\n", " 'track_id': 123\n", " })\n", "\n", " for item in input_data:\n", " if 'source' in item:\n", " file_path = item['source']\n", " filename = file_path.split('\\\\')[-1]\n", " extension = filename.split('.')[-1]\n", "\n", " aggregated_data[filename]['filename'] = filename\n", " aggregated_data[filename]['filetype'] = extension\n", " aggregated_data[filename]['thumbnail'] = f\"{filename.split('.')[0]}.jpg\"\n", "\n", " if extension in ['pdf', 'txt', 'docx']:\n", " aggregated_data[filename]['pages'].append(item['page'])\n", " elif extension in ['mp4', 'mkv', 'flv']:\n", " aggregated_data[filename]['timestamps'].append(item['page'])\n", " elif extension in ['mp3', 'wav', 'flac']:\n", " aggregated_data[filename]['timestamps'].append(item['page'])\n", " elif extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:\n", " aggregated_data[filename].pop('pages', None) # Remove pages if it's an image\n", " aggregated_data[filename].pop('timestamps', None) # Remove timestamps if it's an image\n", "\n", " elif 'filename' in item:\n", " filename = item['filename']\n", " extension = item['file_type']\n", " aggregated_data[filename]['filename'] = f\"{filename}.{extension}\"\n", " aggregated_data[filename]['filetype'] = extension\n", " aggregated_data[filename]['thumbnail'] = f\"{filename}.jpg\"\n", " if 'duration' in item:\n", " start_time, end_time = item['duration'].split(' minutes')[0].split('-')\n", " aggregated_data[filename]['timestamps'].append((int(start_time), int(end_time)))\n", "\n", " # Convert aggregated data to the desired output format\n", " output_data = []\n", " for filename, data in aggregated_data.items():\n", " # Remove empty lists for pages and timestamps\n", " if not data['pages']:\n", " data.pop('pages', None)\n", " if not data['timestamps']:\n", " data.pop('timestamps', None)\n", " output_data.append(data)\n", "\n", " return output_data\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt', 'pages': [1, 3, 2, 0], 'description': 'lorem ipsum', 'filetype': 'txt', 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg', 'track_id': 123}\n", "{'filename': 'corolla-2020-toyota-owners-manual.pdf', 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148], 'description': 'lorem ipsum', 'filetype': 'pdf', 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg', 'track_id': 123}\n", "{'filename': 'How to change spark plugs on TOYOTA COROLLA.docx', 'pages': [2, 3, 5, 6], 'description': 'lorem ipsum', 'filetype': 'docx', 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg', 'track_id': 123}\n", "{'filename': 'audio-2.audio', 'timestamps': [(0, 3), (3, 6)], 'description': 'lorem ipsum', 'filetype': 'audio', 'thumbnail': 'audio-2.jpg', 'track_id': 123}\n" ] } ], "source": [ "output = transform_file_data(docs)\n", "for item in output:\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt',\n", " 'pages': [1, 3, 2, 0],\n", " 'description': 'lorem ipsum',\n", " 'filetype': 'txt',\n", " 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg',\n", " 'track_id': 123},\n", " {'filename': 'corolla-2020-toyota-owners-manual.pdf',\n", " 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148],\n", " 'description': 'lorem ipsum',\n", " 'filetype': 'pdf',\n", " 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg',\n", " 'track_id': 123},\n", " {'filename': 'How to change spark plugs on TOYOTA COROLLA.docx',\n", " 'pages': [2, 3, 5, 6],\n", " 'description': 'lorem ipsum',\n", " 'filetype': 'docx',\n", " 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg',\n", " 'track_id': 123},\n", " {'filename': 'audio-2.audio',\n", " 'timestamps': [(0, 3), (3, 6)],\n", " 'description': 'lorem ipsum',\n", " 'filetype': 'audio',\n", " 'thumbnail': 'audio-2.jpg',\n", " 'track_id': 123}]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "smog_env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }