Indexing completed.
This commit is contained in:
Binary file not shown.
+73
-7
@@ -18,6 +18,8 @@ import numpy as np
|
||||
from pydub import AudioSegment
|
||||
import base64
|
||||
import requests
|
||||
from moviepy.editor import VideoFileClip
|
||||
import ffmpeg
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
@@ -44,7 +46,7 @@ embeddings = load_embedding_model()
|
||||
|
||||
|
||||
# --------------------------------------------------------TEXT PREPROCESSING--------------------------------------------
|
||||
def create_documents(doc):
|
||||
def create_documents(doc, file_type='text'):
|
||||
text = doc[0].page_content
|
||||
metadata = doc[0].metadata
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
@@ -60,6 +62,7 @@ def create_documents(doc):
|
||||
# Increment page number based on the chunk index
|
||||
doc_metadata = metadata.copy()
|
||||
doc_metadata['page'] = i # Assign page number based on chunk index
|
||||
doc_metadata['file_type'] = file_type
|
||||
document = Document(page_content=chunk.page_content, metadata=doc_metadata)
|
||||
documents.append(document)
|
||||
return documents
|
||||
@@ -159,11 +162,11 @@ def process_image(image_path):
|
||||
|
||||
|
||||
# create image document
|
||||
def create_image_document(image_path):
|
||||
def create_image_document(image_path, file_type='image'):
|
||||
# getting the image name from the image path
|
||||
image_name = image_path.split('/')[-1].split('.')[0]
|
||||
# setting image name as metadata
|
||||
metadata = {'filename': image_name}
|
||||
metadata = {'filename': image_name, 'file_type': file_type}
|
||||
text_extractor = TextExtractor()
|
||||
text = text_extractor.read_text_from_image(image_path)
|
||||
# removing special characters and line breaks
|
||||
@@ -232,7 +235,7 @@ def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_outpu
|
||||
|
||||
return chunk_folder, chunk_paths
|
||||
|
||||
def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes):
|
||||
def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='audio'):
|
||||
# Split the audio file into chunks
|
||||
chunk_folder, chunk_paths = split_audio_by_duration(audio_file_path, chunk_duration_minutes)
|
||||
|
||||
@@ -260,7 +263,8 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes):
|
||||
# Create a document with the transcript and metadata
|
||||
metadata = {
|
||||
"filename": base_filename,
|
||||
"duration": f"{start_min}-{end_min} minutes"
|
||||
"duration": f"{start_min}-{end_min} minutes",
|
||||
"file_type": file_type,
|
||||
}
|
||||
document = Document(page_content=transcript, metadata=metadata)
|
||||
documents.append(document)
|
||||
@@ -271,11 +275,63 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes):
|
||||
return documents
|
||||
|
||||
|
||||
|
||||
# creating a function to create audio document
|
||||
def create_audio_document(audio_file_path, chunk_duration_minutes=3):
|
||||
documents = transcribe_audio_chunks(audio_file_path, chunk_duration_minutes)
|
||||
def create_audio_document(audio_file_path, chunk_duration_minutes=3, file_type='audio'):
    """Build the documents for one audio file.

    Thin wrapper that forwards all three arguments, positionally and
    unchanged, to ``transcribe_audio_chunks`` and hands back its result.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        chunk_duration_minutes: Chunk length, in minutes, passed on to the
            transcription step.
        file_type: Value forwarded to the transcription step as the
            ``file_type`` tag.

    Returns:
        Whatever ``transcribe_audio_chunks`` returns for these arguments.
    """
    return transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type)
|
||||
|
||||
# ------------------------------------------------VIDEO PROCESSING-----------------------------------------------------
|
||||
def preprocess_video_data(video_path: str, time_interval: int):
    """Extract audio and periodic snapshots from a video, then transcribe it.

    The audio track is written next to the video as an ``.mp3`` file, one
    PNG frame is saved every ``time_interval`` seconds into a sibling
    ``<video name>_snapshots`` directory, and the extracted audio is passed
    to ``create_audio_document`` with ``file_type='video'``.

    Args:
        video_path: Path to the video file.
        time_interval: Seconds between consecutive snapshots (first one at 0s).

    Returns:
        The documents returned by ``create_audio_document`` for the audio track.

    Raises:
        ValueError: If ``time_interval`` is not positive, or the video has no
            audio track to transcribe.
    """
    # range() needs an integer step; reject zero/negative values up front
    # instead of failing with an obscure range() error (or silently taking
    # no snapshots at all).
    interval = int(time_interval)
    if interval <= 0:
        raise ValueError(
            f"time_interval must be a positive number of seconds, got {time_interval!r}"
        )

    # Swap only the real file extension. A plain str.replace('.mp4', '.mp3')
    # leaves other containers (.mkv, .mov, ...) untouched, which would make
    # the audio export overwrite the source video, and it also rewrites
    # '.mp4' occurring inside directory names.
    audio_path = os.path.splitext(video_path)[0] + '.mp3'

    # Load the video only for the audio export and always release the
    # underlying reader afterwards.
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"Video {video_path} has no audio track to transcribe")
        video.audio.write_audiofile(audio_path)
    finally:
        video.close()

    # Create a directory for snapshots named after the video (no extension)
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    snapshot_dir = os.path.join(os.path.dirname(video_path), f"{video_name}_snapshots")
    os.makedirs(snapshot_dir, exist_ok=True)

    # Get the duration of the video using ffmpeg
    probe = ffmpeg.probe(video_path)
    duration = float(probe['format']['duration'])

    # Take snapshots at 0s, interval, 2*interval, ... up to the video length
    for frame_time in range(0, int(duration), interval):
        # Whole-minute timestamps keep the "<N>min" name; other timestamps
        # are labelled in seconds so intervals that are not a multiple of 60
        # cannot collide on the same file name.
        if frame_time % 60 == 0:
            label = f"{frame_time // 60}min"
        else:
            label = f"{frame_time}s"
        frame_img = os.path.join(snapshot_dir, f"frame_at_{label}.png")

        # Extract a single frame; overwrite snapshots left by a previous run
        # so re-indexing the same video does not fail on existing files.
        (
            ffmpeg
            .input(video_path, ss=frame_time)
            .output(frame_img, vframes=1)
            .run(overwrite_output=True)
        )

    print(f"Snapshots saved in {snapshot_dir}.")

    # NOTE(review): audio is chunked with create_audio_document's default
    # chunk duration, independent of time_interval -- confirm this is intended.
    documents = create_audio_document(audio_path, file_type='video')
    return documents
|
||||
|
||||
|
||||
|
||||
#-----------------------------------------------------OTHERS--------------------------------------------------------------
|
||||
|
||||
@@ -340,6 +396,16 @@ def load_documents_from_directory(directory_path: str):
|
||||
# adding the document name to the doc_names list
|
||||
doc_names.append(doc[0].metadata['filename'])
|
||||
print(f"Document {doc[0].metadata['filename']} loaded")
|
||||
elif extension in video_doc:
|
||||
# creating a video document
|
||||
doc = preprocess_video_data(path, time_interval=180)
|
||||
# appending the document to the documents list
|
||||
documents.append(doc)
|
||||
# appending the number of pages in the document
|
||||
num_pages.append(len(doc))
|
||||
# adding the document name to the doc_names list
|
||||
doc_names.append(doc[0].metadata['filename'])
|
||||
print(f"Document {doc[0].metadata['filename']} loaded")
|
||||
|
||||
# so we need to create a document id for each document
|
||||
docs_id = [uuid4().hex for i in range(len(documents))]
|
||||
|
||||
Reference in New Issue
Block a user