AI indexing completed
This commit is contained in:
@@ -12,7 +12,11 @@ from langchain_core.output_parsers import StrOutputParser
|
||||
from uuid import uuid4
|
||||
from langchain_core.documents import Document
|
||||
from text_extractor import TextExtractor
|
||||
import os
|
||||
import os, sys
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
from loggings.logging_config import logger
|
||||
import random
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import math
|
||||
import json
|
||||
@@ -29,6 +33,7 @@ import ffmpeg
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
|
||||
# OpenAI API Key
|
||||
api_key = os.getenv('OPENAI_API_KEY')
|
||||
# setting up groq api key
|
||||
@@ -53,11 +58,14 @@ def load_embedding_model():
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
# loading the embedding model
|
||||
logger.info("Loading the embedding model")
|
||||
embeddings = load_embedding_model()
|
||||
logger.info("Embedding model loaded")
|
||||
|
||||
|
||||
# --------------------------------------------------------TEXT PREPROCESSING--------------------------------------------
|
||||
def create_documents(doc, file_type='text'):
|
||||
logger.info(f"Creating documents from text")
|
||||
text = doc[0].page_content
|
||||
metadata = doc[0].metadata
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
@@ -80,6 +88,7 @@ def create_documents(doc, file_type='text'):
|
||||
|
||||
|
||||
def load_txt_document(document_path):
|
||||
logger.info(f"Loading text document from {document_path}")
|
||||
try:
|
||||
txt_doc = TextLoader(document_path)
|
||||
text = txt_doc.load()
|
||||
@@ -91,6 +100,7 @@ def load_txt_document(document_path):
|
||||
|
||||
|
||||
def load_docx_document(document_path):
|
||||
logger.info(f"Loading docx document from {document_path}")
|
||||
try:
|
||||
docx_doc = Docx2txtLoader(document_path)
|
||||
text = docx_doc.load()
|
||||
@@ -103,6 +113,7 @@ def load_docx_document(document_path):
|
||||
|
||||
# creating a function that checks the document type and loads the document
|
||||
def load_pdf_document(document_path):
|
||||
logger.info(f"Loading pdf document from {document_path}")
|
||||
try:
|
||||
pdf_doc = PyPDFLoader(document_path)
|
||||
pages = pdf_doc.load_and_split()
|
||||
@@ -125,11 +136,13 @@ def load_document(document_path):
|
||||
# ----------------------------------------------------IMAGE PROCESSING------------------------------------------------
|
||||
# Function to encode the image
|
||||
def encode_image(image_path):
|
||||
with open(image_path, "rb") as image_file:
|
||||
return base64.b64encode(image_file.read()).decode('utf-8')
|
||||
logger.info(f"Encoding image {image_path}")
|
||||
with open(image_path, "rb") as image_file:
|
||||
return base64.b64encode(image_file.read()).decode('utf-8')
|
||||
|
||||
# Vision API to process the image
|
||||
def process_image(image_path):
|
||||
logger.info(f"Processing image {image_path}")
|
||||
global api_key
|
||||
|
||||
# Getting the base64 string
|
||||
@@ -174,10 +187,11 @@ def process_image(image_path):
|
||||
|
||||
# create image document
|
||||
def create_image_document(image_path, file_type='image'):
|
||||
logger.info(f"Creating image document from {image_path}")
|
||||
# getting the image name from the image path
|
||||
image_name = image_path.split('/')[-1].split('.')[0]
|
||||
image_name = image_path.split('\\')[-1].split('.')[0]
|
||||
# setting image name as metadata
|
||||
metadata = {'filename': image_name, 'file_type': file_type}
|
||||
metadata = {'source': image_name, 'file_type': file_type}
|
||||
text_extractor = TextExtractor()
|
||||
text = text_extractor.read_text_from_image(image_path)
|
||||
# removing special characters and line breaks
|
||||
@@ -199,6 +213,7 @@ def create_image_document(image_path, file_type='image'):
|
||||
# -----------------------------------------------AUDIO PROCESSING-----------------------------------------------------
|
||||
# Audio to Text
|
||||
def audio_to_text(filepath):
|
||||
logger.info(f"Transcribing audio file {filepath}")
|
||||
with open(filepath, "rb") as file:
|
||||
translation = client.audio.translations.create(
|
||||
file=(filepath, file.read()),
|
||||
@@ -208,6 +223,7 @@ def audio_to_text(filepath):
|
||||
|
||||
|
||||
def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_output=True):
|
||||
logger.info(f"Splitting audio file {audio_file_path} by duration")
|
||||
# Convert chunk duration to milliseconds
|
||||
chunk_length_ms = chunk_duration_minutes * 60 * 1000
|
||||
|
||||
@@ -247,6 +263,7 @@ def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_outpu
|
||||
return chunk_folder, chunk_paths
|
||||
|
||||
def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='audio'):
|
||||
logger.info(f"Transcribing audio chunks from {audio_file_path}")
|
||||
# Split the audio file into chunks
|
||||
chunk_folder, chunk_paths = split_audio_by_duration(audio_file_path, chunk_duration_minutes)
|
||||
|
||||
@@ -270,11 +287,25 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='
|
||||
start_min = (chunk_index - 1) * chunk_duration_minutes
|
||||
end_min = chunk_index * chunk_duration_minutes
|
||||
actual_end_min = min(end_min, (len(AudioSegment.from_file(audio_file_path)) // 60000)) # To handle the last chunk's actual duration
|
||||
|
||||
# preparing the start and end minutes in a timestamp format, also handling fractional minutes by converting the decimal part to seconds
|
||||
if start_min % 1 == 0:
|
||||
start_min = f"{int(start_min)}:00"
|
||||
end_min = f"{int(end_min)}:00"
|
||||
else:
|
||||
# splitting the decimal part of the start and end min
|
||||
start_min_int, start_min_dec = str(start_min).split('.')
|
||||
end_min_int, end_min_dec = str(end_min).split('.')
|
||||
# converting the decimal part to seconds
|
||||
start_sec = int(start_min_dec) * 6
|
||||
end_sec = int(end_min_dec) * 6
|
||||
start_min = f"{start_min_int}:{start_sec}"
|
||||
end_min = f"{end_min_int}:{end_sec}"
|
||||
|
||||
# Create a document with the transcript and metadata
|
||||
metadata = {
|
||||
"filename": base_filename,
|
||||
"duration": f"{start_min}-{end_min} minutes",
|
||||
"source": base_filename,
|
||||
"timestamp": f"{start_min}-{end_min}",
|
||||
"file_type": file_type,
|
||||
}
|
||||
document = Document(page_content=transcript, metadata=metadata)
|
||||
@@ -282,6 +313,9 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='
|
||||
|
||||
# Delete the chunk folder after processing
|
||||
shutil.rmtree(chunk_folder)
|
||||
|
||||
# adding a delay
|
||||
time.sleep(0.2)
|
||||
|
||||
return documents
|
||||
|
||||
@@ -294,7 +328,7 @@ def create_audio_document(audio_file_path, chunk_duration_minutes=3, file_type='
|
||||
|
||||
# ------------------------------------------------VIDEO PROCESSING-----------------------------------------------------
|
||||
def preprocess_video_data(video_path: str, time_interval: int):
|
||||
|
||||
logger.info(f"Preprocessing video data from {video_path}")
|
||||
# Load the video file
|
||||
video = VideoFileClip(video_path)
|
||||
|
||||
@@ -341,6 +375,7 @@ def preprocess_video_data(video_path: str, time_interval: int):
|
||||
|
||||
# now creating document from the audio file
|
||||
documents = create_audio_document(audio_path, chunk_duration_minutes=0.5, file_type='video')
|
||||
logger.info(f"Documents created from video {video_path}")
|
||||
|
||||
# deleting the audio file
|
||||
os.remove(audio_path)
|
||||
@@ -349,6 +384,7 @@ def preprocess_video_data(video_path: str, time_interval: int):
|
||||
|
||||
#----------------------------------------------------DOC SUMMARIZER --------------------------------------------------
|
||||
def doc_summarizer(document_page: list) -> str:
|
||||
logger.info(f"Summarizing document")
|
||||
initiator_prompt = PromptTemplate(
|
||||
template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
||||
Create a short summary of the document based on the provided text.
|
||||
@@ -370,12 +406,15 @@ def doc_summarizer(document_page: list) -> str:
|
||||
#-----------------------------------------------------OTHERS--------------------------------------------------------------
|
||||
|
||||
def save_embedded_data(embeddings, key="data"):
|
||||
embeddings.save_local(f"index/faiss_index_{key}")
|
||||
print("Embeddings saved")
|
||||
logger.info(f"Saving embeddings")
|
||||
embeddings.save_local(f"index/faiss_index_{key}")
|
||||
print("Embeddings saved")
|
||||
return 'saved'
|
||||
|
||||
def load_embedded_data(embeddings=embeddings, key="data"):
|
||||
embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True)
|
||||
return embed_db
|
||||
logger.info(f"Loading embedded data")
|
||||
embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True)
|
||||
return embed_db
|
||||
|
||||
#-----------------------------------------------------Data Loading Process----------------------------------------------------
|
||||
|
||||
@@ -396,15 +435,15 @@ def process_document(path, extension, text_doc, image_doc, audio_doc, video_doc)
|
||||
elif extension in image_doc:
|
||||
doc = process_map["image"](path)
|
||||
num_pages = 1
|
||||
doc_name = doc[0].metadata['filename']
|
||||
doc_name = doc[0].metadata['source'].split('\\')[-1]
|
||||
elif extension in audio_doc:
|
||||
doc = process_map["audio"](path)
|
||||
num_pages = len(doc)
|
||||
doc_name = doc[0].metadata['filename']
|
||||
doc_name = doc[0].metadata['source']
|
||||
elif extension in video_doc:
|
||||
doc = process_map["video"](path, time_interval=30)
|
||||
num_pages = len(doc)
|
||||
doc_name = doc[0].metadata['filename']
|
||||
doc_name = doc[0].metadata['source']
|
||||
else:
|
||||
return None, None, None # Unhandled extension
|
||||
|
||||
@@ -425,7 +464,7 @@ def load_documents_from_directory(directory_path: str):
|
||||
|
||||
def process_with_delay(file):
|
||||
result = process_document(os.path.join(directory_path, file), file.split('.')[-1], text_doc, image_doc, audio_doc, video_doc)
|
||||
time.sleep(0.1) # Introduce a 0.1s delay between processing each document
|
||||
time.sleep(0.4) # Introduce a 0.4s delay between processing each document
|
||||
return result
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
@@ -441,27 +480,31 @@ def load_documents_from_directory(directory_path: str):
|
||||
first_page = doc[0].page_content
|
||||
summary = doc_summarizer(first_page)
|
||||
doc_summary.append(summary)
|
||||
# adding some delay
|
||||
time.sleep(0.5)
|
||||
|
||||
docs_id = [uuid4().hex for _ in range(len(documents))]
|
||||
|
||||
json_file = os.path.join(directory_path, 'data.json')
|
||||
data = {'doc_names': doc_names, 'docs_id': docs_id, 'num_pages': num_pages, 'doc_summaary': doc_summary}
|
||||
|
||||
if os.path.exists(json_file):
|
||||
with open(json_file, 'r+') as f:
|
||||
existing_data = json.load(f)
|
||||
existing_data.update(data)
|
||||
f.seek(0)
|
||||
json.dump(existing_data, f)
|
||||
else:
|
||||
with open(json_file, 'w') as f:
|
||||
json.dump(data, f)
|
||||
# creating a dictionary for each document in the json file
|
||||
for i in range(len(documents)):
|
||||
data = {doc_names[i].split("\\")[-1]: {'doc_id':docs_id[i], 'num_pages': num_pages[i], 'doc_summary': doc_summary[i]}}
|
||||
if os.path.exists(json_file):
|
||||
with open(json_file, 'r+') as f:
|
||||
existing_data = json.load(f)
|
||||
existing_data.update(data)
|
||||
f.seek(0)
|
||||
json.dump(existing_data, f)
|
||||
else:
|
||||
with open(json_file, 'w') as f:
|
||||
json.dump(data, f)
|
||||
|
||||
return documents, docs_id, num_pages
|
||||
|
||||
|
||||
# A function to create vector store
|
||||
def create_vector_store(documents: list, docs_id: list, num_pages: list):
|
||||
logger.info(f"Creating vector store")
|
||||
# index set up with the embedding dimension
|
||||
index = faiss.IndexFlatL2(384)
|
||||
# Initialize the FAISS vector store
|
||||
@@ -476,10 +519,11 @@ def create_vector_store(documents: list, docs_id: list, num_pages: list):
|
||||
doc_id = docs_id[i]
|
||||
page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
|
||||
vector_store.add_documents(documents=documents[i], ids=page_ids)
|
||||
|
||||
logger.info(f"Vector store created")
|
||||
logger.info(f"Saving the vector store")
|
||||
# saving the vector store automatically
|
||||
save_embedded_data(vector_store, key="data")
|
||||
|
||||
logger.info(f"Vector store saved")
|
||||
return vector_store
|
||||
|
||||
# creating a function to add documents to the vector store
|
||||
@@ -491,14 +535,70 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
|
||||
page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
|
||||
vector_store.add_documents(documents=documents[i], ids=page_ids)
|
||||
print ("Documents added to the vector store")
|
||||
|
||||
|
||||
#----------------------------------------------------------Thumbnail Generator-----------------------------------------------------
|
||||
def create_text_thumbnail(file_path):
    """Render a PNG placeholder thumbnail showing the file's base name.

    An 800x400 RGB image is created with a randomly chosen background
    colour, and the file name (extension stripped) is drawn centred in
    white. The image is saved as ``<name>.png`` inside a ``thumbnails``
    folder located next to ``file_path``; the folder is created when it
    does not exist yet.

    Args:
        file_path: Path of the file a thumbnail should be generated for.

    Returns:
        None. The path of the saved thumbnail is printed to stdout.
    """
    logger.info(f"Creating thumbnail for {file_path}")

    # Thumbnails live in a sibling "thumbnails" folder of the source file.
    out_dir = os.path.join(os.path.dirname(file_path), 'thumbnails')
    os.makedirs(out_dir, exist_ok=True)

    # Base name without extension doubles as the caption and the PNG name.
    label, _ext = os.path.splitext(os.path.basename(file_path))

    # Random RGB background, one channel at a time.
    rgb = tuple(random.randint(0, 255) for _ in range(3))
    canvas = Image.new('RGB', (800, 400), color=rgb)
    painter = ImageDraw.Draw(canvas)

    # Fall back to Pillow's built-in font when Arial cannot be loaded.
    try:
        typeface = ImageFont.truetype("arial.ttf", 25)  # Adjust the font size as needed
    except IOError:
        typeface = ImageFont.load_default()

    # Measure the caption so it can be centred on the canvas.
    left, top, right, bottom = painter.textbbox((0, 0), label, font=typeface)
    origin_x = (canvas.width - (right - left)) / 2
    origin_y = (canvas.height - (bottom - top)) / 2

    # White caption text.
    painter.text((origin_x, origin_y), label, font=typeface, fill=(255, 255, 255))

    thumbnail_path = os.path.join(out_dir, f"{label}.png")
    canvas.save(thumbnail_path)

    print(f"Thumbnail created: {thumbnail_path}")
|
||||
|
||||
def process_directory(directory_path):
    """Create a text thumbnail for each supported file in a directory.

    Only regular files directly inside ``directory_path`` are considered
    (no recursion). A file qualifies when its lower-cased extension is one
    of ``.txt``, ``.pdf``, ``.docx``, ``.mp3`` or ``.m4a``; each qualifying
    file is passed to ``create_text_thumbnail``.

    Args:
        directory_path: Directory whose files should receive thumbnails.

    Returns:
        The string ``"Done"`` once every entry has been examined.
    """
    supported_extensions = ['.txt', '.pdf', '.docx', '.mp3', '.m4a']

    for entry in os.listdir(directory_path):
        candidate = os.path.join(directory_path, entry)

        # Skip sub-directories and anything else that is not a file.
        if not os.path.isfile(candidate):
            continue

        # Extension check is case-insensitive.
        _, suffix = os.path.splitext(entry)
        if suffix.lower() not in supported_extensions:
            continue

        create_text_thumbnail(candidate)

    return "Done"
|
||||
|
||||
#-----------------------------------------------------------SEARCH-------------------------------------------------------
|
||||
# A document search function
|
||||
def search(query, k=20):
|
||||
logger.info(f"Searching for {query}")
|
||||
# loading the embedded data
|
||||
embed_db = load_embedded_data()
|
||||
db = embed_db
|
||||
docs = db.similarity_search(query, k)
|
||||
logger.info(f"Search completed")
|
||||
all = []
|
||||
info = []
|
||||
for doc in docs:
|
||||
|
||||
Reference in New Issue
Block a user