AI indexing completed
This commit is contained in:
+311
-202
@@ -11,16 +11,29 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2024-08-16 16:06:32,880 - INFO - Loading the embedding model\n",
|
||||
"2024-08-16 16:06:38,758 - WARNING - c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
|
||||
" from tqdm.autonotebook import tqdm, trange\n",
|
||||
"\n",
|
||||
"2024-08-16 16:06:47,268 - INFO - PyTorch version 2.4.0+cu124 available.\n",
|
||||
"2024-08-16 16:06:47,868 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en\n",
|
||||
"2024-08-16 16:06:55,638 - INFO - Embedding model loaded\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -32,17 +45,51 @@
|
||||
"from langchain_groq import ChatGroq\n",
|
||||
"from langchain_core.prompts.prompt import PromptTemplate\n",
|
||||
"from langchain_core.output_parsers import StrOutputParser\n",
|
||||
"from collections import defaultdict\n",
|
||||
"import json\n",
|
||||
"load_dotenv()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# setting up groq api key\n",
|
||||
"# os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# setting up groq api key\n",
|
||||
"os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')"
|
||||
"\n",
|
||||
"# # chat set up\n",
|
||||
"# GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# ### Chains #####\n",
|
||||
"# # Initiator\n",
|
||||
"# def doc_summarizer(document_page: list) -> str:\n",
|
||||
"# initiator_prompt = PromptTemplate(\n",
|
||||
"# template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
|
||||
"# Create a short summary of the document based on the provided text. \n",
|
||||
" \n",
|
||||
"# Start with: This document is about...\n",
|
||||
" \n",
|
||||
"# <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
|
||||
"# DOCUMENT: {document_page} \\n\n",
|
||||
" \n",
|
||||
"# <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
|
||||
"# input_variables=[\"document_page\"],\n",
|
||||
"# )\n",
|
||||
"\n",
|
||||
"# initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
|
||||
"# output = initiator_router.invoke({\"document_page\":document_page})\n",
|
||||
"# return output\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -51,41 +98,31 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# chat set up\n",
|
||||
"GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### Chains #####\n",
|
||||
"# Initiator\n",
|
||||
"def doc_summarizer(document_page: list) -> str:\n",
|
||||
" initiator_prompt = PromptTemplate(\n",
|
||||
" template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
|
||||
" Create a short summary of the document based on the provided text. \n",
|
||||
" \n",
|
||||
" Start with: This document is about...\n",
|
||||
" \n",
|
||||
" <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
|
||||
" DOCUMENT: {document_page} \\n\n",
|
||||
" \n",
|
||||
" <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
|
||||
" input_variables=[\"document_page\"],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
|
||||
" output = initiator_router.invoke({\"document_page\":document_page})\n",
|
||||
" return output\n"
|
||||
"document_page = 'Wirebrush WD 40'\n",
|
||||
"# testing the function\n",
|
||||
"# summary = doc_summarizer(document_page)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2024-08-16 16:06:55,717 - INFO - Searching for Wirebrush WD 40\n",
|
||||
"2024-08-16 16:06:55,717 - INFO - Loading embedded data\n",
|
||||
"2024-08-16 16:06:56,487 - WARNING - c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
|
||||
" attn_output = torch.nn.functional.scaled_dot_product_attention(\n",
|
||||
"\n",
|
||||
"2024-08-16 16:06:56,628 - INFO - Search completed\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"document_page = 'How to change the engine oil of a toyota corrolla.'\n",
|
||||
"# testing the function\n",
|
||||
"summary = doc_summarizer(document_page)"
|
||||
"docs = search(document_page)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -96,7 +133,46 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'This document is about providing a step-by-step guide on how to change the engine oil of a Toyota Corolla.'"
|
||||
"[{'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 424},\n",
|
||||
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'timestamp': '0:30-1:0',\n",
|
||||
" 'file_type': 'video'},\n",
|
||||
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'timestamp': '2:00-2:00',\n",
|
||||
" 'file_type': 'video'},\n",
|
||||
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'timestamp': '2:30-3:0',\n",
|
||||
" 'file_type': 'video'},\n",
|
||||
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'timestamp': '4:00-4:00',\n",
|
||||
" 'file_type': 'video'},\n",
|
||||
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'timestamp': '5:30-6:0',\n",
|
||||
" 'file_type': 'video'},\n",
|
||||
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'timestamp': '8:00-8:00',\n",
|
||||
" 'file_type': 'video'},\n",
|
||||
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'timestamp': '8:30-9:0',\n",
|
||||
" 'file_type': 'video'},\n",
|
||||
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'timestamp': '10:00-10:00',\n",
|
||||
" 'file_type': 'video'},\n",
|
||||
" {'source': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'timestamp': '3:30-4:0',\n",
|
||||
" 'file_type': 'video'},\n",
|
||||
" {'source': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'timestamp': '5:30-6:0',\n",
|
||||
" 'file_type': 'video'},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 329},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 264},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 290},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 201},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 326},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 315},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 317},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 325},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
@@ -104,202 +180,235 @@
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = search(document_page)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||
" 'page': 1,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 438},\n",
|
||||
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||
" 'page': 3,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||
" 'page': 2,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 525},\n",
|
||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||
" 'page': 2,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||
" 'page': 3,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||
" 'page': 0,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||
" 'page': 5,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||
" 'page': 6,\n",
|
||||
" 'file_type': 'text'},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 526},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 514},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 153},\n",
|
||||
" {'filename': 'audio-2', 'duration': '0-3 minutes', 'file_type': 'audio'},\n",
|
||||
" {'filename': 'audio-2', 'duration': '3-6 minutes', 'file_type': 'audio'},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 149},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 513},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 436},\n",
|
||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 148}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from collections import defaultdict\n",
|
||||
"\n",
|
||||
"def transform_file_data(input_data):\n",
|
||||
" # Create a dictionary to aggregate data by filename\n",
|
||||
" aggregated_data = defaultdict(lambda: {\n",
|
||||
" 'filename': '',\n",
|
||||
" 'pages': [],\n",
|
||||
" 'timestamps': [],\n",
|
||||
" 'description': 'lorem ipsum',\n",
|
||||
" 'filetype': '',\n",
|
||||
" 'thumbnail': '',\n",
|
||||
" 'track_id': 123\n",
|
||||
" })\n",
|
||||
"\n",
|
||||
" for item in input_data:\n",
|
||||
" if 'source' in item:\n",
|
||||
" file_path = item['source']\n",
|
||||
" filename = file_path.split('\\\\')[-1]\n",
|
||||
" extension = filename.split('.')[-1]\n",
|
||||
"\n",
|
||||
" aggregated_data[filename]['filename'] = filename\n",
|
||||
" aggregated_data[filename]['filetype'] = extension\n",
|
||||
" aggregated_data[filename]['thumbnail'] = f\"{filename.split('.')[0]}.jpg\"\n",
|
||||
"\n",
|
||||
" if extension in ['pdf', 'txt', 'docx']:\n",
|
||||
" aggregated_data[filename]['pages'].append(item['page'])\n",
|
||||
" elif extension in ['mp4', 'mkv', 'flv']:\n",
|
||||
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
|
||||
" elif extension in ['mp3', 'wav', 'flac']:\n",
|
||||
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
|
||||
" elif extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:\n",
|
||||
" aggregated_data[filename].pop('pages', None) # Remove pages if it's an image\n",
|
||||
" aggregated_data[filename].pop('timestamps', None) # Remove timestamps if it's an image\n",
|
||||
"\n",
|
||||
" elif 'filename' in item:\n",
|
||||
" filename = item['filename']\n",
|
||||
" extension = item['file_type']\n",
|
||||
" aggregated_data[filename]['filename'] = f\"{filename}.{extension}\"\n",
|
||||
" aggregated_data[filename]['filetype'] = extension\n",
|
||||
" aggregated_data[filename]['thumbnail'] = f\"{filename}.jpg\"\n",
|
||||
" if 'duration' in item:\n",
|
||||
" start_time, end_time = item['duration'].split(' minutes')[0].split('-')\n",
|
||||
" aggregated_data[filename]['timestamps'].append((int(start_time), int(end_time)))\n",
|
||||
"\n",
|
||||
" # Convert aggregated data to the desired output format\n",
|
||||
" output_data = []\n",
|
||||
" for filename, data in aggregated_data.items():\n",
|
||||
" # Remove empty lists for pages and timestamps\n",
|
||||
" if not data['pages']:\n",
|
||||
" data.pop('pages', None)\n",
|
||||
" if not data['timestamps']:\n",
|
||||
" data.pop('timestamps', None)\n",
|
||||
" output_data.append(data)\n",
|
||||
"\n",
|
||||
" return output_data\n"
|
||||
"# a function to get data description\n",
|
||||
"def get_data_description(data_path):\n",
|
||||
" # ensuring no // or / or extension is present\n",
|
||||
" data_name = data_path.split('/')[-1].split('\\\\')[-1].split('.')[0]\n",
|
||||
" # print(data_name)\n",
|
||||
" # open the data.json file\n",
|
||||
" with open('data/data.json') as f:\n",
|
||||
" data = json.load(f)\n",
|
||||
" existing_data = data.keys()\n",
|
||||
" if data_name in existing_data:\n",
|
||||
" return data[data_name]['doc_summary']\n",
|
||||
" else:\n",
|
||||
" return 'No description available'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt', 'pages': [1, 3, 2, 0], 'description': 'lorem ipsum', 'filetype': 'txt', 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg', 'track_id': 123}\n",
|
||||
"{'filename': 'corolla-2020-toyota-owners-manual.pdf', 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148], 'description': 'lorem ipsum', 'filetype': 'pdf', 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg', 'track_id': 123}\n",
|
||||
"{'filename': 'How to change spark plugs on TOYOTA COROLLA.docx', 'pages': [2, 3, 5, 6], 'description': 'lorem ipsum', 'filetype': 'docx', 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg', 'track_id': 123}\n",
|
||||
"{'filename': 'audio-2.audio', 'timestamps': [(0, 3), (3, 6)], 'description': 'lorem ipsum', 'filetype': 'audio', 'thumbnail': 'audio-2.jpg', 'track_id': 123}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"output = transform_file_data(docs)\n",
|
||||
"for item in output:\n",
|
||||
" print(item)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
||||
" 'pages': [1, 3, 2, 0],\n",
|
||||
" 'description': 'lorem ipsum',\n",
|
||||
" 'filetype': 'txt',\n",
|
||||
" 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg',\n",
|
||||
" 'track_id': 123},\n",
|
||||
" {'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
|
||||
" 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148],\n",
|
||||
" 'description': 'lorem ipsum',\n",
|
||||
" 'filetype': 'pdf',\n",
|
||||
" 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg',\n",
|
||||
" 'track_id': 123},\n",
|
||||
" {'filename': 'How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
||||
" 'pages': [2, 3, 5, 6],\n",
|
||||
" 'description': 'lorem ipsum',\n",
|
||||
" 'filetype': 'docx',\n",
|
||||
" 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg',\n",
|
||||
" 'track_id': 123},\n",
|
||||
" {'filename': 'audio-2.audio',\n",
|
||||
" 'timestamps': [(0, 3), (3, 6)],\n",
|
||||
" 'description': 'lorem ipsum',\n",
|
||||
" 'filetype': 'audio',\n",
|
||||
" 'thumbnail': 'audio-2.jpg',\n",
|
||||
" 'track_id': 123}]"
|
||||
"\"This document is about a video tutorial series on replacing car parts, specifically the latest installment of AutoDoc's video tutorials.\""
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"output"
|
||||
"get_data_description('How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# getting data thumbnails. \n",
|
||||
"def get_data_thumbnail(data_path, timestamp = None):\n",
|
||||
" # ensuring no // or / or extension is present\n",
|
||||
" file_name = data_path.split('/')[-1].split('\\\\')[-1].split('.')[0]\n",
|
||||
" # first check is to see if the file_name has a .png image in the thumbnail folder\n",
|
||||
" if os.path.exists(f'data/thumbnails/{file_name}.png'):\n",
|
||||
" return f'data/thumbnails/{file_name}.png'\n",
|
||||
" # the second check is to see if we have a folder with this file_name\n",
|
||||
" elif os.path.exists(f'data/{file_name}'):\n",
|
||||
" # so now we want to access the first timestamp\n",
|
||||
" if timestamp:\n",
|
||||
" first = timestamp[0]\n",
|
||||
" # split by -\n",
|
||||
" start, end = first.split('-')\n",
|
||||
" # we want to convert something like 03:00, 04:00, 03:30 which is in min:sec to seconds\n",
|
||||
" start = int(start.split(':')[0])*60 + int(start.split(':')[1])\n",
|
||||
" end = int(end.split(':')[0])*60 + int(end.split(':')[1])\n",
|
||||
" # bringing them together\n",
|
||||
" image_file = f\"{start}-{end}s.png\"\n",
|
||||
" # now checking if the file exists\n",
|
||||
" if os.path.exists(f'data/{file_name}/{image_file}'):\n",
|
||||
" return f'data/{file_name}/{image_file}'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'data/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/210-240s.png'"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"get_data_thumbnail('How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]', timestamp=['3:30-4:0', '5:30-6:0'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'data/thumbnails/corolla-2020-toyota-owners-manual.png'"
|
||||
]
|
||||
},
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"get_data_thumbnail(\"./data\\\\corolla-2020-toyota-owners-manual.pdf'\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarize_doc_search(data):\n",
|
||||
" summary = {}\n",
|
||||
"\n",
|
||||
" for item in data:\n",
|
||||
" source = item['source']\n",
|
||||
" if source not in summary:\n",
|
||||
" summary[source] = {'pages': [], 'timestamps': [], 'file_type': item.get('file_type', 'pdf')}\n",
|
||||
" \n",
|
||||
" if 'page' in item:\n",
|
||||
" summary[source]['pages'].append(item['page'])\n",
|
||||
" if 'timestamp' in item:\n",
|
||||
" summary[source]['timestamps'].append(item['timestamp'])\n",
|
||||
" \n",
|
||||
" # Formatting the summary as a list of dictionaries\n",
|
||||
" summarized_list = [\n",
|
||||
" {'filename': key.split(\"\\\\\")[-1], \n",
|
||||
" 'pages': value['pages'], \n",
|
||||
" 'timestamps': value['timestamps'], \n",
|
||||
" 'file_type': value['file_type']}\n",
|
||||
" for key, value in summary.items()\n",
|
||||
" ]\n",
|
||||
" \n",
|
||||
" # getting the file description and thumbnail\n",
|
||||
" for item in summarized_list:\n",
|
||||
" item['description'] = get_data_description(item['filename'])\n",
|
||||
" # checking if we have an empty timestamp list\n",
|
||||
" if len(item['timestamps']) > 0:\n",
|
||||
" item['thumbnail'] = get_data_thumbnail(item['filename'], item['timestamps'])\n",
|
||||
" else:\n",
|
||||
" item['thumbnail'] = get_data_thumbnail(item['filename'])\n",
|
||||
" \n",
|
||||
" return summarized_list"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"doc_summary = summarize_doc_search(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
|
||||
" 'pages': [424, 329, 264, 290, 201, 326, 315, 317, 325, 422],\n",
|
||||
" 'timestamps': [],\n",
|
||||
" 'file_type': 'pdf',\n",
|
||||
" 'description': \"This document is about the user manual for a Toyota Corolla, providing information and instructions on various aspects of the vehicle, including safety and security, vehicle status, driving operations, interior features, maintenance, and troubleshooting. The manual covers topics such as child seat installation, theft deterrent systems, reading driving-related information, operating the Entune audio system, and caring for the vehicle's interior and exterior. It also includes information on reporting safety defects and provides instructions for Canadian owners on seat belt and SRS air\",\n",
|
||||
" 'thumbnail': 'data/thumbnails/corolla-2020-toyota-owners-manual.png'},\n",
|
||||
" {'filename': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'pages': [],\n",
|
||||
" 'timestamps': ['0:30-1:0',\n",
|
||||
" '2:00-2:00',\n",
|
||||
" '2:30-3:0',\n",
|
||||
" '4:00-4:00',\n",
|
||||
" '5:30-6:0',\n",
|
||||
" '8:00-8:00',\n",
|
||||
" '8:30-9:0',\n",
|
||||
" '10:00-10:00'],\n",
|
||||
" 'file_type': 'video',\n",
|
||||
" 'description': \"This document is about a video tutorial series on replacing car parts, specifically the latest installment of AutoDoc's video tutorials.\",\n",
|
||||
" 'thumbnail': 'data/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/30-60s.png'},\n",
|
||||
" {'filename': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||
" 'pages': [],\n",
|
||||
" 'timestamps': ['3:30-4:0', '5:30-6:0'],\n",
|
||||
" 'file_type': 'video',\n",
|
||||
" 'description': \"This document is about a video tutorial series on replacing car parts, specifically the latest installment of Auto-Doc's video tutorials.\",\n",
|
||||
" 'thumbnail': 'data/How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/210-240s.png'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"doc_summary"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
Reference in New Issue
Block a user