From eb50aed3b13bd5fe4a5177bddb6f83f9800fb7fb Mon Sep 17 00:00:00 2001 From: timothyafolami Date: Mon, 12 Aug 2024 22:33:39 +0100 Subject: [PATCH] video preprocessing pipeline completed --- audio_experiment.ipynb | 12 +-- loggings/app.log | 7 ++ video_experiment.ipynb | 171 ++++++++++++++++++++++++++++++++++------- 3 files changed, 157 insertions(+), 33 deletions(-) diff --git a/audio_experiment.ipynb b/audio_experiment.ipynb index d83ea4de..76d1d01b 100644 --- a/audio_experiment.ipynb +++ b/audio_experiment.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -73,25 +73,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# testing the function\n", - "path = \"data/test_rec.m4a\"\n", + "path = \"data/3.0.m4a\"\n", "transcript = audio_to_text(path)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " hello everyone so i'm timothy i'm doing this recording to talk about something i need about cars okay let's say very correct okay so what is a car the guy is a mechanical object designed to transport people from one location to the other so that's like how i would define a car for you right okay so what do i need about a car how do i fix something I kind of know little and that little is a much on its own right I have very good understanding of like service is very cool, a car, you know changing the oil, removing the oil filter, changing the oil filter like that like that right so I think um yeah yeah yeah yeah so what else you don't see what else should I say? alright this is good it's good enough yeah thanks bye stop\n" + " Alright, so now it's time to talk about the model training part. So I used the same code for the three of them. Alright, so I'm not going to go through one of them, okay? 
So here's what happened. Remember, after we flimmed the files for each of the datasets, they were all saved with a unique name. Right? So immediately when you get to the training phase, the dataset was loaded. The clean dataset for the corresponding data was loaded right then there was only four practically no need for some extra processing but if there's a need probably to convert from numerical to categorical as expected all right so then the next thing was to select because at every point in time you are selecting an attack as zeroed in so the thing is that we are going to um actually i'm going to select a part of the normal case as a as an addition to the zero day okay so let's say for example we're working with analysis right i'm working on analysis so i'm going to say that okay uh if analysis initially add let's say 200 000 rows in the data set all right i'm going to select corresponding 200 000 rows in the data set for the normal case so that makes it for my zero d attack i have 200 000 normal keys and also seven thousand notes sorry and also 200 000 attack case which is the zero d and in this case analysis so the other ones that were not selected from the first one that is the other the remaining normal ones from the 600k rows right they will be used in training the in the model right while the remaining attacks all right will be used as a target variable to train the model so what happens is that you have to split our data into the train set and zero ticked set so definitely this train set is going to have its own test set from its own self right but we initially want to like separate this data so that you know you know that we have a different data for train sets you also have a different data for what for for testing the zero attacks in this case I pick one of the attack as zero D all right so then we go on to do that to proceed with zero D normal with yeah I just described I just described how that went right so in this case we have analysis and letting 
turn red key as as my as my 0d normal right then I'm using all of the other attacks excluding sorry and I'm using only 0d only the analysis attack as my 0d attack all right so so on that key normal and analysis as my zero day attack okay all right then um so the many ones are used right the meaning no more are used for training why do you mean your tax I use for training so that we can only have analysis as our zero day as expected here right Alright, so the next thing to do here is to transform our data right? We use standard scalar to process data to ensure that our data is much more distributed Alright, so that's what we do, we use standard scalar So we scale on the train set and we infer on the zero day set Alright, so the train set is itself split in two now you should notice that the train sets okay we only have two categories in the train sets you only have the normal attack because we know this are the we are assuming that the other attacks are attacks so we are categories we are categorizing them as just a single attack so we have normal and attack and in the zero day data so we have the normal and the attack each with two unique um two unique um target variables which is either normal attack okay so We created a metric which is called the calculateZDR metric. So this metric is responsible to calculate the 0D rates. So I can share the code to that if you need it. 
So then after we've processed our data, we've trained a machine learning model which is random forest um classifier model on the attack now this first training here is to dust off the data set in view all right so we trained with the model with the thought of wanting to get the top features so automatically after training the model you perform feature importance to extract the top 30 features all right 30 features special top 10 features for the UNSW data center shot it up 30 features but for the CI CDS we're starting the top 40 features because it has more attacks than the like it has more more more features rather okay and this one so we're dealing with okay so we train the machine learning model after the model is done training we infer immediately right we infer immediately to get the top 30 features then we now train a new machine learning model again so that's gonna give us a model that is much more robust in the sense right to only some to only the top important features we don need the lesser ones all right so then apparently the model kind of like you know it performed well in some cases in some other cases like for attacks like fuzza attacks the model didn't perform so well right that is the machine learning model so the next one wants to create a deep learning model all right so in this case we are using pytosh as our as our programming language or selected as our deep learning framework all right then we have uh we have five forwards so looking at talking about the layout of this model right we have um we have five layers we have five layers which is the patch normal yeah so this is the mlp the multi-layer uh perception model okay built with pythons so we did python rather so we have five layers and for each layer there's a dropout it's a 50 dropout all right so then that's for the data that's for the model for the python so this model apparently appears to be one of the best model we have so far because when it comes to zero zero zero zero zero zero 
zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero zero rate detection it's performed very well right so the other one that we worked with is the cnn module and this model is also similar to that model it also has five layers but now it has um it has a patch normalization layer and also a dropout layer added to it so we are using the convolutional 1d layer right then immediately after each of the layer we have the patch normalization 1d layer and also the dropout layer so that's for the models and lastly we have the voting classifier to combine these models together and depending on the kind of aggregating voting method that was used in this case we used soft all right to ensure that uh we are not too strict on coming up with a final uh a final approach all right so that's all that's all this is a bit longer already that's all\n" ] } ], @@ -101,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ diff --git a/loggings/app.log b/loggings/app.log index f7bc1441..d2247fe7 100644 --- a/loggings/app.log +++ b/loggings/app.log @@ -134,3 +134,10 @@ 2024-08-09 16:32:44,664 - INFO - Search completed 2024-08-09 16:32:44,665 - INFO - Page content: Oilchange pouring new oil in the engine +2024-08-09 16:52:52,869 - INFO - Receiving the search query +2024-08-09 16:53:04,624 - INFO - Searching for Anti-squeal paste +2024-08-09 16:53:05,451 - INFO - Search completed +2024-08-09 16:54:31,010 - INFO - Receiving the search query +2024-08-09 16:54:37,491 - INFO - Searching for Welcome back to Toyota Maintenance YouTube channel +2024-08-09 16:54:38,251 - INFO - Search completed +2024-08-09 16:54:38,251 - INFO - Page content: this without any hesitation buy one of these to just get to the work and back and save 
a lot of money on the gas that's my feeling you please share underneath the video your experience with others because that's why we filming this creating community of Toyota enthusiasts who share their own own experience if you like the video give it a thumb up and be subscribed i will always have a lot of new stuff coming your way thank you for watching and have a great day my friend diff --git a/video_experiment.ipynb b/video_experiment.ipynb index 62f01dbc..afce66e0 100644 --- a/video_experiment.ipynb +++ b/video_experiment.ipynb @@ -2,52 +2,155 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# !pip install moviepy\n", + "# !pip install ffmpeg-python" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from moviepy.editor import VideoFileClip\n", + "import os\n", + "import ffmpeg\n", + "# importing module that preprocesses the audio file \n", + "from data_ingestion.utils import create_audio_document\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Video Preprocessing Pipeline" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "video_path = 'data/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC].mp4'\n", + "\n", + "\n", + "\n", + "# creating a function to preprocess the videos\n", + "def preprocess_video_data(video_path: str, time_interval: int):\n", + " \"\"\"Write the video's audio to an .mp3 next to it, save a snapshot every time_interval seconds, and return what create_audio_document produces for that audio.\"\"\"\n", + " # Load the video file\n", + " video = VideoFileClip(video_path)\n", + " \n", + " # Get the duration of the video\n", + " duration = video.duration\n", + " \n", + " # create an audio version of the video\n", + " audio_path = os.path.splitext(video_path)[0] + '.mp3'\n", + " _ = video.audio.write_audiofile(audio_path)\n", + " video.close() # release the reader's file handles; the clip is not used after this point\n", + " # creating a snapshot of the videos at the time 
interval\n", + " # Extract the video filename without extension\n", + " video_name = os.path.splitext(os.path.basename(video_path))[0]\n", + "\n", + " # Create a directory for snapshots using the video name\n", + " snapshot_dir = os.path.join(os.path.dirname(video_path), f\"{video_name}_snapshots\")\n", + " os.makedirs(snapshot_dir, exist_ok=True)\n", + "\n", + " # Use the caller-supplied snapshot interval (in seconds)\n", + " interval = time_interval\n", + "\n", + " # Get the duration of the video using ffmpeg\n", + " probe = ffmpeg.probe(video_path)\n", + " duration = float(probe['format']['duration'])\n", + "\n", + " # Loop through the video and take a snapshot every interval seconds, starting at 0s\n", + " for i in range(0, int(duration), interval):\n", + " # Calculate the time for the current frame\n", + " frame_time = i\n", + " # Snapshot file is named by whole minutes, so intervals shorter than 60s would reuse a name\n", + " frame_img = os.path.join(snapshot_dir, f\"frame_at_{frame_time//60}min.png\")\n", + " \n", + " # Extract the frame using ffmpeg\n", + " (\n", + " ffmpeg\n", + " .input(video_path, ss=frame_time)\n", + " .output(frame_img, vframes=1)\n", + " .run()\n", + " )\n", + "\n", + " print(f\"Snapshots saved in {snapshot_dir}.\")\n", + " \n", + " \n", + " # now creating document from the audio file\n", + " documents = create_audio_document(audio_path)\n", + " return documents\n", + "\n" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MoviePy - Writing audio in data/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC].mp3\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MoviePy - Done.\n", + "Snapshots saved in data\\How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]_snapshots.\n", + "Exporting How to change front 
wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]_chunks/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]_chunk1.mp3\n", + "Exporting How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]_chunks/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]_chunk2.mp3\n", + "Exporting How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]_chunks/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]_chunk3.mp3\n", + "Exporting How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]_chunks/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]_chunk4.mp3\n" + ] + } + ], + "source": [ + "# Implementing the function\n", + "documents = preprocess_video_data(video_path, 180)" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={'filename': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]', 'duration': '0-3 minutes'}, page_content=\" Hi everybody! Here's the latest installment of AutoDoc's video tutorials on replacing car parts. The channel so you never miss a video. We post new ones every week!\"),\n", + " Document(metadata={'filename': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]', 'duration': '3-6 minutes'}, page_content=\" How to make a You can buy spare parts from us on our website or in the Autodoc app. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. 
I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. 15. Remove the rear window cover. 16. Remove the rear window cover. 17. Remove the rear window cover. 18. Remove the rear window cover. 19. Remove the rear window cover. I'll see you next time.\"),\n", + " Document(metadata={'filename': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]', 'duration': '6-9 minutes'}, page_content=\" How to make a Are you interested in this product? All links can be found in the description. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. I'm going to make a hole in the bottom of the box. Add the so I going to make a fire with a fire extinguisher I going to make a fire with a fire extinguisher I'm going to make a fire with a fire extinguisher. I'm going to make a fire with a fire extinguisher. I'm going to make a fire with a fire extinguisher. The\"),\n", + " Document(metadata={'filename': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]', 'duration': '9-12 minutes'}, page_content=\" I'm going to make a new one. I'm going to make a new one. I'm going to make a new one. I'm going to make a new one. I'm going to make a new one. I'm going to make a new one. I'm going to make a new one. I'm going to make a new one. I'm going to make a new one. I'm going to make a new one. I'm going to make a new one. I'm going to make a new one. 
I'm going to make a new one. I'm going to make a new one. Thank you for watching our video tutorials. If you enjoyed watching, click thumbs up and share it with your friends. Have a nice day! Follow us on social media. Find us on Instagram and TikTok!\")]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents" + ] }, { "cell_type": "code", @@ -93,8 +196,22 @@ } ], "metadata": { + "kernelspec": { + "display_name": "smog_env", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" } }, "nbformat": 4,