Indexing completed.

2024-08-13 21:30:01 +01:00
parent eb50aed3b1
commit 8150b0a132
9 changed files with 195 additions and 100 deletions
@@ -18,6 +18,8 @@ import numpy as np
 from pydub import AudioSegment
 import base64
 import requests
+from moviepy.editor import VideoFileClip
+import ffmpeg
 from dotenv import load_dotenv
 load_dotenv()

@@ -44,7 +46,7 @@ embeddings = load_embedding_model()


 # --------------------------------------------------------TEXT PREPROCESSING--------------------------------------------
-def create_documents(doc):
+def create_documents(doc, file_type='text'):
    text = doc[0].page_content
    metadata = doc[0].metadata
    text_splitter = RecursiveCharacterTextSplitter(
@@ -60,6 +62,7 @@ def create_documents(doc):
        # Increment page number based on the chunk index
        doc_metadata = metadata.copy()
        doc_metadata['page'] = i  # Assign page number based on chunk index
+        doc_metadata['file_type'] = file_type
        document = Document(page_content=chunk.page_content, metadata=doc_metadata)
        documents.append(document)
    return documents
@@ -159,11 +162,11 @@ def process_image(image_path):


 # create image document
-def create_image_document(image_path):
+def create_image_document(image_path, file_type='image'):
    # getting the image name from the image path
    image_name = image_path.split('/')[-1].split('.')[0]
    # setting image name as metadata
-    metadata = {'filename': image_name}
+    metadata = {'filename': image_name, 'file_type': file_type}
    text_extractor = TextExtractor()
    text = text_extractor.read_text_from_image(image_path)
    # removing special characters and line breaks
@@ -232,7 +235,7 @@ def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_outpu

    return chunk_folder, chunk_paths

-def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes):
+def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='audio'):
    # Split the audio file into chunks
    chunk_folder, chunk_paths = split_audio_by_duration(audio_file_path, chunk_duration_minutes)

@@ -260,7 +263,8 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes):
        # Create a document with the transcript and metadata
        metadata = {
                "filename": base_filename,
-                "duration": f"{start_min}-{end_min} minutes"
+                "duration": f"{start_min}-{end_min} minutes", 
+                "file_type": file_type,
            }
        document = Document(page_content=transcript, metadata=metadata)
        documents.append(document)
@@ -271,11 +275,63 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes):
    return documents


+
 # creating a function to create audio document
-def create_audio_document(audio_file_path, chunk_duration_minutes=3):
-    documents = transcribe_audio_chunks(audio_file_path, chunk_duration_minutes)
+def create_audio_document(audio_file_path, chunk_duration_minutes=3, file_type='audio'):
+    documents = transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type)
    return documents

+# ------------------------------------------------VIDEO PROCESSING-----------------------------------------------------
+def preprocess_video_data(video_path: str, time_interval: int):
+    """Extract a video's audio track and periodic snapshots, then transcribe it.
+
+    Args:
+        video_path: Path to the video file.
+        time_interval: Seconds between snapshots; must be positive. Snapshots
+            are labelled by whole minute, so use a multiple of 60 to keep
+            every frame (same-minute frames overwrite each other).
+
+    Returns:
+        The documents built by ``create_audio_document`` from the extracted
+        audio, with ``file_type='video'``.
+
+    Raises:
+        ValueError: If ``time_interval`` is not positive or the video has no
+            audio track.
+    """
+    if time_interval <= 0:
+        raise ValueError("time_interval must be a positive number of seconds")
+
+    # Derive paths with splitext: str.replace('.mp4', '.mp3') left non-mp4
+    # paths unchanged, so the audio export would overwrite the video itself.
+    base_path = os.path.splitext(video_path)[0]
+    audio_path = f"{base_path}.mp3"
+    snapshot_dir = f"{base_path}_snapshots"
+    os.makedirs(snapshot_dir, exist_ok=True)
+
+    # Export the audio track, always closing the clip afterwards.
+    video = VideoFileClip(video_path)
+    try:
+        if video.audio is None:
+            raise ValueError(f"{video_path} has no audio track to transcribe")
+        video.audio.write_audiofile(audio_path)
+    finally:
+        video.close()
+
+    # Snapshot one frame every `time_interval` seconds, starting at 0s.
+    duration = float(ffmpeg.probe(video_path)['format']['duration'])
+    for frame_time in range(0, int(duration), time_interval):
+        frame_img = os.path.join(snapshot_dir, f"frame_at_{frame_time//60}min.png")
+        # overwrite_output avoids ffmpeg's interactive prompt on re-indexing.
+        ffmpeg.input(video_path, ss=frame_time).output(frame_img, vframes=1).run(
+            overwrite_output=True
+        )
+    print(f"Snapshots saved in {snapshot_dir}.")
+
+    # The transcript of the extracted audio is what gets indexed.
+    return create_audio_document(audio_path, file_type='video')
+
+

 #-----------------------------------------------------OTHERS--------------------------------------------------------------

@@ -340,6 +396,16 @@ def load_documents_from_directory(directory_path: str):
            # adding the document name to the doc_names list
            doc_names.append(doc[0].metadata['filename'])
            print(f"Document {doc[0].metadata['filename']} loaded")
+        elif extension in video_doc:
+            # creating a video document
+            doc = preprocess_video_data(path, time_interval=180)
+            # appending the document to the documents list
+            documents.append(doc)
+            # appending the number of pages in the document
+            num_pages.append(len(doc))
+            # adding the document name to the doc_names list
+            doc_names.append(doc[0].metadata['filename'])
+            print(f"Document {doc[0].metadata['filename']} loaded")
            
    # so we need to create a document id for each document
    docs_id = [uuid4().hex for i in range(len(documents))]