AI indexing completed

2024-08-16 17:37:28 +01:00
parent 713354371e
commit cff9511d86
13 changed files with 2843 additions and 257 deletions
@@ -12,7 +12,11 @@ from langchain_core.output_parsers import StrOutputParser
 from uuid import uuid4
 from langchain_core.documents  import Document
 from text_extractor import TextExtractor
-import os
+import os, sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from loggings.logging_config import logger
+import random
+from PIL import Image, ImageDraw, ImageFont
 from concurrent.futures import ThreadPoolExecutor
 import math
 import json
@@ -29,6 +33,7 @@ import ffmpeg
 from dotenv import load_dotenv
 load_dotenv()

+
 # OpenAI API Key
 api_key = os.getenv('OPENAI_API_KEY')
 # setting up groq api key
@@ -53,11 +58,14 @@ def load_embedding_model():

 # ----------------------------------------------------------------------------------------------------
 # loading the embedding model
+logger.info("Loading the embedding model")
 embeddings = load_embedding_model()
+logger.info("Embedding model loaded")


 # --------------------------------------------------------TEXT PREPROCESSING--------------------------------------------
 def create_documents(doc, file_type='text'):
+    logger.info(f"Creating documents from text")
    text = doc[0].page_content
    metadata = doc[0].metadata
    text_splitter = RecursiveCharacterTextSplitter(
@@ -80,6 +88,7 @@ def create_documents(doc, file_type='text'):


 def load_txt_document(document_path):
+    logger.info(f"Loading text document from {document_path}")
    try:
        txt_doc = TextLoader(document_path)
        text = txt_doc.load()
@@ -91,6 +100,7 @@ def load_txt_document(document_path):
  
  
 def load_docx_document(document_path):
+    logger.info(f"Loading docx document from {document_path}")
    try:
        docx_doc = Docx2txtLoader(document_path)
        text = docx_doc.load()
@@ -103,6 +113,7 @@ def load_docx_document(document_path):
    
 # creating a function that checks the document type and loads the document
 def load_pdf_document(document_path):
+    logger.info(f"Loading pdf document from {document_path}")
    try:
        pdf_doc = PyPDFLoader(document_path)
        pages = pdf_doc.load_and_split()
@@ -125,11 +136,13 @@ def load_document(document_path):
 # ----------------------------------------------------IMAGE PROCESSING------------------------------------------------
 # Function to encode the image
 def encode_image(image_path):
-  with open(image_path, "rb") as image_file:
-    return base64.b64encode(image_file.read()).decode('utf-8')
+    logger.info(f"Encoding image {image_path}")
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')

 # Vision API to process the image
 def process_image(image_path):
+    logger.info(f"Processing image {image_path}")
    global api_key

    # Getting the base64 string
@@ -174,10 +187,11 @@ def process_image(image_path):

 # create image document
 def create_image_document(image_path, file_type='image'):
+    logger.info(f"Creating image document from {image_path}")
    # getting the image name from the image path
-    image_name = image_path.split('/')[-1].split('.')[0]
+    image_name = image_path.split('\\')[-1].split('.')[0]
    # setting image name as metadata
-    metadata = {'filename': image_name, 'file_type': file_type}
+    metadata = {'source': image_name, 'file_type': file_type}
    text_extractor = TextExtractor()
    text = text_extractor.read_text_from_image(image_path)
    # removing special characters and line breaks
@@ -199,6 +213,7 @@ def create_image_document(image_path, file_type='image'):
 # -----------------------------------------------AUDIO PROCESSING-----------------------------------------------------
 # Audio to Text
 def audio_to_text(filepath):
+    logger.info(f"Transcribing audio file {filepath}")
    with open(filepath, "rb") as file:
        translation = client.audio.translations.create(
            file=(filepath, file.read()),
@@ -208,6 +223,7 @@ def audio_to_text(filepath):


 def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_output=True):
+    logger.info(f"Splitting audio file {audio_file_path} by duration")
    # Convert chunk duration to milliseconds
    chunk_length_ms = chunk_duration_minutes * 60 * 1000

@@ -247,6 +263,7 @@ def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_outpu
    return chunk_folder, chunk_paths

 def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='audio'):
+    logger.info(f"Transcribing audio chunks from {audio_file_path}")
    # Split the audio file into chunks
    chunk_folder, chunk_paths = split_audio_by_duration(audio_file_path, chunk_duration_minutes)

@@ -270,11 +287,25 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='
        start_min = (chunk_index - 1) * chunk_duration_minutes
        end_min = chunk_index * chunk_duration_minutes
        actual_end_min = min(end_min, (len(AudioSegment.from_file(audio_file_path)) // 60000))  # To handle the last chunk's actual duration
+        
+        # formatting the start and end minutes as timestamps, also handling fractional minutes by converting the decimal part to seconds
+        if start_min % 1 == 0:
+            start_min = f"{int(start_min)}:00"
+            end_min = f"{int(end_min)}:00"
+        else:
+            # splitting the decimal part of the start and end min
+            start_min_int, start_min_dec = str(start_min).split('.')
+            end_min_int, end_min_dec = str(end_min).split('.')
+            # converting the decimal part to seconds
+            start_sec = int(start_min_dec) * 6
+            end_sec = int(end_min_dec) * 6
+            start_min = f"{start_min_int}:{start_sec}"
+            end_min = f"{end_min_int}:{end_sec}"

        # Create a document with the transcript and metadata
        metadata = {
-                "filename": base_filename,
-                "duration": f"{start_min}-{end_min} minutes", 
+                "source": base_filename,
+                "timestamp": f"{start_min}-{end_min}", 
                "file_type": file_type,
            }
        document = Document(page_content=transcript, metadata=metadata)
@@ -282,6 +313,9 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='

    # Delete the chunk folder after processing
    shutil.rmtree(chunk_folder)
+    
+    # adding a delay
+    time.sleep(0.2)

    return documents

@@ -294,7 +328,7 @@ def create_audio_document(audio_file_path, chunk_duration_minutes=3, file_type='

 # ------------------------------------------------VIDEO PROCESSING-----------------------------------------------------
 def preprocess_video_data(video_path: str, time_interval: int):
-    
+    logger.info(f"Preprocessing video data from {video_path}")
    # Load the video file
    video = VideoFileClip(video_path)
    
@@ -341,6 +375,7 @@ def preprocess_video_data(video_path: str, time_interval: int):
    
    # now creating document from the audio file
    documents = create_audio_document(audio_path, chunk_duration_minutes=0.5, file_type='video')
+    logger.info(f"Documents created from video {video_path}")
    
    # deleting the audio file
    os.remove(audio_path)
@@ -349,6 +384,7 @@ def preprocess_video_data(video_path: str, time_interval: int):

 #----------------------------------------------------DOC SUMMARIZER --------------------------------------------------
 def doc_summarizer(document_page: list) -> str:
+    logger.info(f"Summarizing document")
    initiator_prompt = PromptTemplate(
        template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
        Create a short summary of the document based on the provided text. 
@@ -370,12 +406,15 @@ def doc_summarizer(document_page: list) -> str:
 #-----------------------------------------------------OTHERS--------------------------------------------------------------

 def save_embedded_data(embeddings, key="data"):
-  embeddings.save_local(f"index/faiss_index_{key}")
-  print("Embeddings saved")
+    logger.info(f"Saving embeddings")
+    embeddings.save_local(f"index/faiss_index_{key}")
+    print("Embeddings saved")
+    return 'saved'

 def load_embedded_data(embeddings=embeddings, key="data"):
-  embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True)
-  return embed_db
+    logger.info(f"Loading embedded data")
+    embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True)
+    return embed_db

 #-----------------------------------------------------Data Loading Process----------------------------------------------------

@@ -396,15 +435,15 @@ def process_document(path, extension, text_doc, image_doc, audio_doc, video_doc)
    elif extension in image_doc:
        doc = process_map["image"](path)
        num_pages = 1
-        doc_name = doc[0].metadata['filename']
+        doc_name = doc[0].metadata['source'].split('\\')[-1]
    elif extension in audio_doc:
        doc = process_map["audio"](path)
        num_pages = len(doc)
-        doc_name = doc[0].metadata['filename']
+        doc_name = doc[0].metadata['source']
    elif extension in video_doc:
        doc = process_map["video"](path, time_interval=30)
        num_pages = len(doc)
-        doc_name = doc[0].metadata['filename']
+        doc_name = doc[0].metadata['source']
    else:
        return None, None, None  # Unhandled extension
    
@@ -425,7 +464,7 @@ def load_documents_from_directory(directory_path: str):
    
    def process_with_delay(file):
        result = process_document(os.path.join(directory_path, file), file.split('.')[-1], text_doc, image_doc, audio_doc, video_doc)
-        time.sleep(0.1)  # Introduce a 0.1s delay between processing each document
+        time.sleep(0.4)  # Introduce a 0.4s delay between processing each document
        return result
    
    with ThreadPoolExecutor() as executor:
@@ -441,27 +480,31 @@ def load_documents_from_directory(directory_path: str):
            first_page = doc[0].page_content
            summary = doc_summarizer(first_page)
            doc_summary.append(summary)
+            # adding some delay
+            time.sleep(0.5)
    
    docs_id = [uuid4().hex for _ in range(len(documents))]
    
    json_file = os.path.join(directory_path, 'data.json')
-    data = {'doc_names': doc_names, 'docs_id': docs_id, 'num_pages': num_pages, 'doc_summaary': doc_summary}
-    
-    if os.path.exists(json_file):
-        with open(json_file, 'r+') as f:
-            existing_data = json.load(f)
-            existing_data.update(data)
-            f.seek(0)
-            json.dump(existing_data, f)
-    else:
-        with open(json_file, 'w') as f:
-            json.dump(data, f)
+    # creating a dictionary for each document in the json file
+    for i in range(len(documents)):
+        data = {doc_names[i].split("\\")[-1]: {'doc_id':docs_id[i], 'num_pages': num_pages[i], 'doc_summary': doc_summary[i]}}
+        if os.path.exists(json_file):
+            with open(json_file, 'r+') as f:
+                existing_data = json.load(f)
+                existing_data.update(data)
+                f.seek(0)
+                json.dump(existing_data, f)
+        else:
+            with open(json_file, 'w') as f:
+                json.dump(data, f)
    
    return documents, docs_id, num_pages


 # A function to create vector store
 def create_vector_store(documents: list, docs_id: list, num_pages: list):
+    logger.info(f"Creating vector store")
    # index set up with the embedding dimension
    index = faiss.IndexFlatL2(384)
    # Initialize the FAISS vector store
@@ -476,10 +519,11 @@ def create_vector_store(documents: list, docs_id: list, num_pages: list):
        doc_id = docs_id[i]
        page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
        vector_store.add_documents(documents=documents[i], ids=page_ids)
-        
+    logger.info(f"Vector store created")
+    logger.info(f"Saving the vector store")
    # saving the vector store automatically
    save_embedded_data(vector_store, key="data")
-    
+    logger.info(f"Vector store saved")
    return vector_store

 # creating a function to add documents to the vector store
@@ -491,14 +535,70 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
        page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
        vector_store.add_documents(documents=documents[i], ids=page_ids)
    print ("Documents added to the vector store")
-    

+#----------------------------------------------------------Thumbnail Generator-----------------------------------------------------
+def create_text_thumbnail(file_path):
+    logger.info(f"Creating thumbnail for {file_path}")
+    # Create a folder for thumbnails if it doesn't exist
+    thumbnail_folder = os.path.join(os.path.dirname(file_path), 'thumbnails')
+    os.makedirs(thumbnail_folder, exist_ok=True)
+    
+    # Extract file name (without extension)
+    file_name = os.path.splitext(os.path.basename(file_path))[0]
+    
+    # Create a random background color
+    background_color = tuple(random.randint(0, 255) for _ in range(3))
+    
+    # Create an image with the random background color
+    img = Image.new('RGB', (800, 400), color=background_color)
+    
+    # Initialize drawing context
+    d = ImageDraw.Draw(img)
+    
+    # Load a font
+    try:
+        font = ImageFont.truetype("arial.ttf", 25)  # Adjust the font size as needed
+    except IOError:
+        font = ImageFont.load_default()
+    
+    # Get the bounding box of the text
+    text_bbox = d.textbbox((0, 0), file_name, font=font)
+    text_width = text_bbox[2] - text_bbox[0]
+    text_height = text_bbox[3] - text_bbox[1]
+    
+    # Calculate the position to center the text
+    text_x = (img.width - text_width) / 2
+    text_y = (img.height - text_height) / 2
+    
+    # Draw the text onto the image
+    d.text((text_x, text_y), file_name, font=font, fill=(255, 255, 255))  # White text
+    
+    # Save the image
+    thumbnail_path = os.path.join(thumbnail_folder, f"{file_name}.png")
+    img.save(thumbnail_path)
+    
+    print(f"Thumbnail created: {thumbnail_path}")
+
+def process_directory(directory_path):
+    supported_extensions = ['.txt', '.pdf', '.docx', '.mp3', '.m4a']
+    
+    for file in os.listdir(directory_path):
+        file_path = os.path.join(directory_path, file)
+        if os.path.isfile(file_path):
+            file_extension = os.path.splitext(file)[1].lower()
+            if file_extension in supported_extensions:
+                create_text_thumbnail(file_path)
+    return "Done"
+
+#-----------------------------------------------------------SEARCH-------------------------------------------------------
 # A document search function
 def search(query, k=20):
+    logger.info(f"Searching for {query}")
    # loading the embedded data
    embed_db = load_embedded_data()
    db = embed_db
    docs = db.similarity_search(query, k)
+    logger.info(f"Search completed")
    all = []
    info = []
    for doc in docs: