Indexing completed.

This commit is contained in:
timothyafolami
2024-08-13 21:30:01 +01:00
parent eb50aed3b1
commit 8150b0a132
9 changed files with 195 additions and 100 deletions
Binary file not shown.
+73 -7
View File
@@ -18,6 +18,8 @@ import numpy as np
from pydub import AudioSegment
import base64
import requests
from moviepy.editor import VideoFileClip
import ffmpeg
from dotenv import load_dotenv
load_dotenv()
@@ -44,7 +46,7 @@ embeddings = load_embedding_model()
# --------------------------------------------------------TEXT PREPROCESSING--------------------------------------------
def create_documents(doc):
def create_documents(doc, file_type='text'):
text = doc[0].page_content
metadata = doc[0].metadata
text_splitter = RecursiveCharacterTextSplitter(
@@ -60,6 +62,7 @@ def create_documents(doc):
# Increment page number based on the chunk index
doc_metadata = metadata.copy()
doc_metadata['page'] = i # Assign page number based on chunk index
doc_metadata['file_type'] = file_type
document = Document(page_content=chunk.page_content, metadata=doc_metadata)
documents.append(document)
return documents
@@ -159,11 +162,11 @@ def process_image(image_path):
# create image document
def create_image_document(image_path):
def create_image_document(image_path, file_type='image'):
# getting the image name from the image path
image_name = image_path.split('/')[-1].split('.')[0]
# setting image name as metadata
metadata = {'filename': image_name}
metadata = {'filename': image_name, 'file_type': file_type}
text_extractor = TextExtractor()
text = text_extractor.read_text_from_image(image_path)
# removing special characters and line breaks
@@ -232,7 +235,7 @@ def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_outpu
return chunk_folder, chunk_paths
def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes):
def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='audio'):
# Split the audio file into chunks
chunk_folder, chunk_paths = split_audio_by_duration(audio_file_path, chunk_duration_minutes)
@@ -260,7 +263,8 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes):
# Create a document with the transcript and metadata
metadata = {
"filename": base_filename,
"duration": f"{start_min}-{end_min} minutes"
"duration": f"{start_min}-{end_min} minutes",
"file_type": file_type,
}
document = Document(page_content=transcript, metadata=metadata)
documents.append(document)
@@ -271,11 +275,63 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes):
return documents
# creating a function to create audio document
def create_audio_document(audio_file_path, chunk_duration_minutes=3):
documents = transcribe_audio_chunks(audio_file_path, chunk_duration_minutes)
def create_audio_document(audio_file_path, chunk_duration_minutes=3, file_type='audio'):
    """Build documents from an audio file by transcribing it in chunks.

    Thin wrapper that forwards all three arguments to
    ``transcribe_audio_chunks`` and returns whatever it returns.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        chunk_duration_minutes: Length of each chunk, in minutes.
        file_type: Value passed through for the documents' ``file_type``
            metadata.
    """
    return transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type)
# ------------------------------------------------VIDEO PROCESSING-----------------------------------------------------
def preprocess_video_data(video_path: str, time_interval: int):
    """Extract audio and periodic snapshots from a video, then transcribe it.

    The audio track is written next to the video as ``<name>.mp3``, one PNG
    snapshot is taken every ``time_interval`` seconds into a sibling
    ``<name>_snapshots`` directory, and the audio is handed to
    ``create_audio_document`` with ``file_type='video'``.

    Args:
        video_path: Path to the video file.
        time_interval: Seconds between snapshots; must be a positive integer.

    Returns:
        Whatever ``create_audio_document`` returns for the extracted audio.

    Raises:
        ValueError: If ``time_interval`` is not positive, or the video has no
            audio track.
    """
    # Validate first: range() with a zero step would raise a less helpful
    # error, and a negative step would silently produce no snapshots.
    if time_interval <= 0:
        raise ValueError("time_interval must be a positive number of seconds")

    # Derive sibling paths from the extension-less path. A plain
    # str.replace('.mp4', '.mp3') leaves other extensions (or '.MP4')
    # untouched, which would make the audio export overwrite the video.
    base_path, _ = os.path.splitext(video_path)
    video_name = os.path.basename(base_path)
    audio_path = f"{base_path}.mp3"

    # Export the audio track, always releasing the clip's file handles.
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"Video {video_path} has no audio track to transcribe")
        video.audio.write_audiofile(audio_path)
    finally:
        video.close()

    # Create a directory for snapshots using the video name
    snapshot_dir = os.path.join(os.path.dirname(video_path), f"{video_name}_snapshots")
    os.makedirs(snapshot_dir, exist_ok=True)

    # Get the duration of the video using ffmpeg
    probe = ffmpeg.probe(video_path)
    duration = float(probe['format']['duration'])

    # Whole-minute intervals keep the "frame_at_<N>min.png" naming; other
    # intervals are named in seconds so two frames in the same minute do not
    # overwrite each other.
    whole_minutes = time_interval % 60 == 0

    # Take snapshots at 0s, time_interval, 2 * time_interval, ...
    for frame_time in range(0, int(duration), time_interval):
        if whole_minutes:
            frame_name = f"frame_at_{frame_time//60}min.png"
        else:
            frame_name = f"frame_at_{frame_time}s.png"
        frame_img = os.path.join(snapshot_dir, frame_name)
        # Extract a single frame; overwrite so re-indexing the same video
        # does not fail on snapshots left by a previous run.
        (
            ffmpeg
            .input(video_path, ss=frame_time)
            .output(frame_img, vframes=1)
            .run(overwrite_output=True)
        )

    print(f"Snapshots saved in {snapshot_dir}.")

    # now creating document from the audio file
    documents = create_audio_document(audio_path, file_type='video')
    return documents
#-----------------------------------------------------OTHERS--------------------------------------------------------------
@@ -340,6 +396,16 @@ def load_documents_from_directory(directory_path: str):
# adding the document name to the doc_names list
doc_names.append(doc[0].metadata['filename'])
print(f"Document {doc[0].metadata['filename']} loaded")
elif extension in video_doc:
# creating a video document
doc = preprocess_video_data(path, time_interval=180)
# appending the document to the documents list
documents.append(doc)
# appending the number of pages in the document
num_pages.append(len(doc))
# adding the document name to the doc_names list
doc_names.append(doc[0].metadata['filename'])
print(f"Document {doc[0].metadata['filename']} loaded")
# so we need to create a document id for each document
docs_id = [uuid4().hex for i in range(len(documents))]