AI indexing completed

This commit is contained in:
timothyafolami
2024-08-16 17:37:28 +01:00
parent 713354371e
commit cff9511d86
13 changed files with 2843 additions and 257 deletions
+130 -30
View File
@@ -12,7 +12,11 @@ from langchain_core.output_parsers import StrOutputParser
from uuid import uuid4
from langchain_core.documents import Document
from text_extractor import TextExtractor
import os
import os, sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from loggings.logging_config import logger
import random
from PIL import Image, ImageDraw, ImageFont
from concurrent.futures import ThreadPoolExecutor
import math
import json
@@ -29,6 +33,7 @@ import ffmpeg
from dotenv import load_dotenv
load_dotenv()
# OpenAI API Key
api_key = os.getenv('OPENAI_API_KEY')
# setting up groq api key
@@ -53,11 +58,14 @@ def load_embedding_model():
# ----------------------------------------------------------------------------------------------------
# loading the embedding model
logger.info("Loading the embedding model")
embeddings = load_embedding_model()
logger.info("Embedding model loaded")
# --------------------------------------------------------TEXT PREPROCESSING--------------------------------------------
def create_documents(doc, file_type='text'):
logger.info(f"Creating documents from text")
text = doc[0].page_content
metadata = doc[0].metadata
text_splitter = RecursiveCharacterTextSplitter(
@@ -80,6 +88,7 @@ def create_documents(doc, file_type='text'):
def load_txt_document(document_path):
logger.info(f"Loading text document from {document_path}")
try:
txt_doc = TextLoader(document_path)
text = txt_doc.load()
@@ -91,6 +100,7 @@ def load_txt_document(document_path):
def load_docx_document(document_path):
logger.info(f"Loading docx document from {document_path}")
try:
docx_doc = Docx2txtLoader(document_path)
text = docx_doc.load()
@@ -103,6 +113,7 @@ def load_docx_document(document_path):
# loads a PDF document and splits it into pages
def load_pdf_document(document_path):
logger.info(f"Loading pdf document from {document_path}")
try:
pdf_doc = PyPDFLoader(document_path)
pages = pdf_doc.load_and_split()
@@ -125,11 +136,13 @@ def load_document(document_path):
# ----------------------------------------------------IMAGE PROCESSING------------------------------------------------
# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to a ``str``.
    """
    logger.info(f"Encoding image {image_path}")
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
# Vision API to process the image
def process_image(image_path):
logger.info(f"Processing image {image_path}")
global api_key
# Getting the base64 string
@@ -174,10 +187,11 @@ def process_image(image_path):
# create image document
def create_image_document(image_path, file_type='image'):
logger.info(f"Creating image document from {image_path}")
# getting the image name from the image path
image_name = image_path.split('/')[-1].split('.')[0]
image_name = image_path.split('\\')[-1].split('.')[0]
# setting image name as metadata
metadata = {'filename': image_name, 'file_type': file_type}
metadata = {'source': image_name, 'file_type': file_type}
text_extractor = TextExtractor()
text = text_extractor.read_text_from_image(image_path)
# removing special characters and line breaks
@@ -199,6 +213,7 @@ def create_image_document(image_path, file_type='image'):
# -----------------------------------------------AUDIO PROCESSING-----------------------------------------------------
# Audio to Text
def audio_to_text(filepath):
logger.info(f"Transcribing audio file {filepath}")
with open(filepath, "rb") as file:
translation = client.audio.translations.create(
file=(filepath, file.read()),
@@ -208,6 +223,7 @@ def audio_to_text(filepath):
def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_output=True):
logger.info(f"Splitting audio file {audio_file_path} by duration")
# Convert chunk duration to milliseconds
chunk_length_ms = chunk_duration_minutes * 60 * 1000
@@ -247,6 +263,7 @@ def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_outpu
return chunk_folder, chunk_paths
def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='audio'):
logger.info(f"Transcribing audio chunks from {audio_file_path}")
# Split the audio file into chunks
chunk_folder, chunk_paths = split_audio_by_duration(audio_file_path, chunk_duration_minutes)
@@ -270,11 +287,25 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='
start_min = (chunk_index - 1) * chunk_duration_minutes
end_min = chunk_index * chunk_duration_minutes
actual_end_min = min(end_min, (len(AudioSegment.from_file(audio_file_path)) // 60000)) # To handle the last chunk's actual duration
# format the start and end minutes as timestamps, also handling fractional minutes by converting the decimal part to seconds
if start_min % 1 == 0:
start_min = f"{int(start_min)}:00"
end_min = f"{int(end_min)}:00"
else:
# splitting the decimal part of the start and end min
start_min_int, start_min_dec = str(start_min).split('.')
end_min_int, end_min_dec = str(end_min).split('.')
# converting the decimal part to seconds
start_sec = int(start_min_dec) * 6
end_sec = int(end_min_dec) * 6
start_min = f"{start_min_int}:{start_sec}"
end_min = f"{end_min_int}:{end_sec}"
# Create a document with the transcript and metadata
metadata = {
"filename": base_filename,
"duration": f"{start_min}-{end_min} minutes",
"source": base_filename,
"timestamp": f"{start_min}-{end_min}",
"file_type": file_type,
}
document = Document(page_content=transcript, metadata=metadata)
@@ -282,6 +313,9 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='
# Delete the chunk folder after processing
shutil.rmtree(chunk_folder)
# adding a delay
time.sleep(0.2)
return documents
@@ -294,7 +328,7 @@ def create_audio_document(audio_file_path, chunk_duration_minutes=3, file_type='
# ------------------------------------------------VIDEO PROCESSING-----------------------------------------------------
def preprocess_video_data(video_path: str, time_interval: int):
logger.info(f"Preprocessing video data from {video_path}")
# Load the video file
video = VideoFileClip(video_path)
@@ -341,6 +375,7 @@ def preprocess_video_data(video_path: str, time_interval: int):
# now creating document from the audio file
documents = create_audio_document(audio_path, chunk_duration_minutes=0.5, file_type='video')
logger.info(f"Documents created from video {video_path}")
# deleting the audio file
os.remove(audio_path)
@@ -349,6 +384,7 @@ def preprocess_video_data(video_path: str, time_interval: int):
#----------------------------------------------------DOC SUMMARIZER --------------------------------------------------
def doc_summarizer(document_page: list) -> str:
logger.info(f"Summarizing document")
initiator_prompt = PromptTemplate(
template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Create a short summary of the document based on the provided text.
@@ -370,12 +406,15 @@ def doc_summarizer(document_page: list) -> str:
#-----------------------------------------------------OTHERS--------------------------------------------------------------
def save_embedded_data(embeddings, key="data"):
    """Persist a vector store under ``index/faiss_index_<key>``.

    Args:
        embeddings: Object exposing ``save_local(path)``; the only method
            this function calls on it.
        key: Suffix that selects the index folder name.

    Returns:
        The string ``'saved'`` once ``save_local`` has returned.
    """
    logger.info(f"Saving embeddings")
    index_path = f"index/faiss_index_{key}"
    embeddings.save_local(index_path)
    print("Embeddings saved")
    return 'saved'
def load_embedded_data(embeddings=embeddings, key="data"):
    """Load the FAISS index stored under ``index/faiss_index_<key>``.

    Args:
        embeddings: Embedding model handed to ``FAISS.load_local``; defaults
            to the module-level ``embeddings`` object.
        key: Suffix that selects the index folder name.

    Returns:
        Whatever ``FAISS.load_local`` returns for that folder.
    """
    logger.info(f"Loading embedded data")
    index_path = f"index/faiss_index_{key}"
    # NOTE(review): allow_dangerous_deserialization=True opts in to
    # pickle-based loading (per the LangChain FAISS docs) - only point this
    # at index folders written by this application. TODO confirm.
    return FAISS.load_local(
        index_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
#-----------------------------------------------------Data Loading Process----------------------------------------------------
@@ -396,15 +435,15 @@ def process_document(path, extension, text_doc, image_doc, audio_doc, video_doc)
elif extension in image_doc:
doc = process_map["image"](path)
num_pages = 1
doc_name = doc[0].metadata['filename']
doc_name = doc[0].metadata['source'].split('\\')[-1]
elif extension in audio_doc:
doc = process_map["audio"](path)
num_pages = len(doc)
doc_name = doc[0].metadata['filename']
doc_name = doc[0].metadata['source']
elif extension in video_doc:
doc = process_map["video"](path, time_interval=30)
num_pages = len(doc)
doc_name = doc[0].metadata['filename']
doc_name = doc[0].metadata['source']
else:
return None, None, None # Unhandled extension
@@ -425,7 +464,7 @@ def load_documents_from_directory(directory_path: str):
def process_with_delay(file):
result = process_document(os.path.join(directory_path, file), file.split('.')[-1], text_doc, image_doc, audio_doc, video_doc)
time.sleep(0.1) # Introduce a 0.1s delay between processing each document
time.sleep(0.4) # Introduce a 0.4s delay between processing each document
return result
with ThreadPoolExecutor() as executor:
@@ -441,27 +480,31 @@ def load_documents_from_directory(directory_path: str):
first_page = doc[0].page_content
summary = doc_summarizer(first_page)
doc_summary.append(summary)
# adding some delay
time.sleep(0.5)
docs_id = [uuid4().hex for _ in range(len(documents))]
json_file = os.path.join(directory_path, 'data.json')
data = {'doc_names': doc_names, 'docs_id': docs_id, 'num_pages': num_pages, 'doc_summaary': doc_summary}
if os.path.exists(json_file):
with open(json_file, 'r+') as f:
existing_data = json.load(f)
existing_data.update(data)
f.seek(0)
json.dump(existing_data, f)
else:
with open(json_file, 'w') as f:
json.dump(data, f)
# creating a dictionary for each document in the json file
for i in range(len(documents)):
data = {doc_names[i].split("\\")[-1]: {'doc_id':docs_id[i], 'num_pages': num_pages[i], 'doc_summary': doc_summary[i]}}
if os.path.exists(json_file):
with open(json_file, 'r+') as f:
existing_data = json.load(f)
existing_data.update(data)
f.seek(0)
json.dump(existing_data, f)
else:
with open(json_file, 'w') as f:
json.dump(data, f)
return documents, docs_id, num_pages
# A function to create vector store
def create_vector_store(documents: list, docs_id: list, num_pages: list):
logger.info(f"Creating vector store")
# index set up with the embedding dimension
index = faiss.IndexFlatL2(384)
# Initialize the FAISS vector store
@@ -476,10 +519,11 @@ def create_vector_store(documents: list, docs_id: list, num_pages: list):
doc_id = docs_id[i]
page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
vector_store.add_documents(documents=documents[i], ids=page_ids)
logger.info(f"Vector store created")
logger.info(f"Saving the vector store")
# saving the vector store automatically
save_embedded_data(vector_store, key="data")
logger.info(f"Vector store saved")
return vector_store
# creating a function to add documents to the vector store
@@ -491,14 +535,70 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
vector_store.add_documents(documents=documents[i], ids=page_ids)
print ("Documents added to the vector store")
#----------------------------------------------------------Thumbnail Generator-----------------------------------------------------
def create_text_thumbnail(file_path):
    """Render a PNG thumbnail showing the file's name on a random colour.

    The image is written to a ``thumbnails`` folder next to ``file_path``
    (created if missing) as ``<file name without extension>.png``. Only the
    path is used; the file's contents are never read.

    Args:
        file_path: Path of the file the thumbnail is generated for.

    Returns:
        The path of the saved thumbnail image.
    """
    logger.info(f"Creating thumbnail for {file_path}")

    # Create a folder for thumbnails if it doesn't exist
    thumbnail_folder = os.path.join(os.path.dirname(file_path), 'thumbnails')
    os.makedirs(thumbnail_folder, exist_ok=True)

    # Extract file name (without extension)
    file_name = os.path.splitext(os.path.basename(file_path))[0]

    # Create an image with a random background color
    background_color = tuple(random.randint(0, 255) for _ in range(3))
    img = Image.new('RGB', (800, 400), color=background_color)
    d = ImageDraw.Draw(img)

    # Load a font, falling back to Pillow's built-in one when Arial is missing
    try:
        font = ImageFont.truetype("arial.ttf", 25)  # Adjust the font size as needed
    except IOError:
        font = ImageFont.load_default()

    # The bounding box origin is not necessarily (0, 0): subtract its
    # left/top offset so the drawn glyphs, not the anchor, end up centered.
    left, top, right, bottom = d.textbbox((0, 0), file_name, font=font)
    text_x = (img.width - (right - left)) / 2 - left
    text_y = (img.height - (bottom - top)) / 2 - top

    # The background is random, so a fixed white fill is unreadable on light
    # colours; choose black or white from the background's perceived brightness.
    red, green, blue = background_color
    brightness = 0.299 * red + 0.587 * green + 0.114 * blue
    text_color = (0, 0, 0) if brightness > 186 else (255, 255, 255)

    # Draw the text onto the image
    d.text((text_x, text_y), file_name, font=font, fill=text_color)

    # Save the image
    thumbnail_path = os.path.join(thumbnail_folder, f"{file_name}.png")
    img.save(thumbnail_path)
    print(f"Thumbnail created: {thumbnail_path}")
    return thumbnail_path
def process_directory(directory_path, supported_extensions=None):
    """Create a text thumbnail for each supported file directly in a directory.

    Only regular files located directly inside ``directory_path`` are
    considered; sub-directories are skipped and not descended into.

    Args:
        directory_path: Directory whose entries are scanned.
        supported_extensions: Optional iterable of extensions (with leading
            dot, matched case-insensitively) that should get a thumbnail.
            Defaults to ``.txt``, ``.pdf``, ``.docx``, ``.mp3`` and ``.m4a``.

    Returns:
        The string ``"Done"`` after every entry has been examined.
    """
    # Compare against None so an explicitly empty collection means
    # "no extensions" rather than silently falling back to the defaults.
    if supported_extensions is None:
        supported_extensions = ('.txt', '.pdf', '.docx', '.mp3', '.m4a')
    wanted = {extension.lower() for extension in supported_extensions}

    for file in os.listdir(directory_path):
        file_path = os.path.join(directory_path, file)
        if not os.path.isfile(file_path):
            continue
        file_extension = os.path.splitext(file)[1].lower()
        if file_extension in wanted:
            create_text_thumbnail(file_path)
    return "Done"
#-----------------------------------------------------------SEARCH-------------------------------------------------------
# A document search function
def search(query, k=20):
logger.info(f"Searching for {query}")
# loading the embedded data
embed_db = load_embedded_data()
db = embed_db
docs = db.similarity_search(query, k)
logger.info(f"Search completed")
all = []
info = []
for doc in docs: