AI indexing completed

2024-08-16 17:37:28 +01:00
parent 713354371e
commit cff9511d86
13 changed files with 2843 additions and 257 deletions
@@ -1,21 +1,98 @@
 from utils import search
 import sys, os
+import json

 #  Add the root directory to sys.path
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 from loggings.logging_config import logger

+# a function to get data description
+def get_data_description(data_path):
+    """Return the stored summary for the document referenced by ``data_path``.
+
+    The lookup key is the base name of ``data_path``: directories are removed
+    for both forward- and back-slash separators, and everything from the
+    first ``.`` onwards is dropped. The key is looked up in
+    ``data/data.json``, resolved against the current working directory.
+
+    Args:
+        data_path: Path or bare file name of the document.
+
+    Returns:
+        The entry's ``doc_summary`` value, or ``'No description available'``
+        when the document has no entry or its entry carries no summary.
+
+    Raises:
+        FileNotFoundError: If ``data/data.json`` does not exist.
+        json.JSONDecodeError: If ``data/data.json`` is not valid JSON.
+    """
+    # ensuring no // or / or extension is present
+    data_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]
+
+    # JSON is UTF-8 by specification; do not depend on the platform default.
+    with open('data/data.json', encoding='utf-8') as f:
+        data = json.load(f)
+
+    entry = data.get(data_name)
+    # An entry that is missing, or that has no summary, is not an error.
+    if not isinstance(entry, dict):
+        return 'No description available'
+    return entry.get('doc_summary', 'No description available')

+# getting data thumbnails
+def get_data_thumbnail(data_path, timestamp=None):
+    """Find a thumbnail image for the document referenced by ``data_path``.
+
+    The base name of ``data_path`` (directories removed, everything from the
+    first ``.`` onwards dropped) is tried against two locations, both
+    relative to the current working directory:
+
+    1. ``data/thumbnails/<name>.png``
+    2. ``data/<name>/<start>-<end>s.png``, where ``start`` and ``end`` are the
+       bounds of the first entry of ``timestamp`` converted to seconds.
+
+    Args:
+        data_path: Path or bare file name of the document.
+        timestamp: Optional sequence of ``'MM:SS-MM:SS'`` range strings. Only
+            the first entry is used, and only the first two ``:``-separated
+            fields of each bound are read.
+
+    Returns:
+        The relative path of the image that exists, or ``None`` when neither
+        location yields a file or the first timestamp cannot be parsed.
+    """
+    def to_seconds(clock):
+        # Convert a min:sec string such as 03:30 into seconds.
+        minutes, seconds = clock.split(':')[:2]
+        return int(minutes) * 60 + int(seconds)
+
+    # ensuring no // or / or extension is present
+    file_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]
+
+    # First choice: a dedicated .png in the thumbnails folder.
+    thumbnail = f'data/thumbnails/{file_name}.png'
+    if os.path.exists(thumbnail):
+        return thumbnail
+
+    # Second choice: a frame inside a folder named after the file, which can
+    # only be selected when a timestamp is available.
+    if not timestamp or not os.path.exists(f'data/{file_name}'):
+        return None
+
+    try:
+        start, end = (to_seconds(bound) for bound in timestamp[0].split('-'))
+    except (AttributeError, ValueError):
+        # A timestamp that is not a well-formed range cannot name a frame;
+        # report "no thumbnail" instead of failing the whole search.
+        return None
+
+    frame = f'data/{file_name}/{start}-{end}s.png'
+    return frame if os.path.exists(frame) else None
+
+def summarize_doc_search(data):
+    """Group search hits by source document and attach display metadata.
+
+    Args:
+        data: Iterable of hit dictionaries. Each must have a ``'source'`` key
+            and may have ``'page'``, ``'timestamp'`` and ``'file_type'`` keys.
+
+    Returns:
+        A list with one dictionary per distinct source, in first-seen order,
+        holding ``'filename'`` (the base name of the source), ``'pages'``,
+        ``'timestamps'``, ``'file_type'`` (taken from the first hit for that
+        source, ``'pdf'`` when absent), ``'description'`` and ``'thumbnail'``.
+    """
+    summary = {}
+
+    for item in data:
+        entry = summary.setdefault(
+            item['source'],
+            {'pages': [], 'timestamps': [], 'file_type': item.get('file_type', 'pdf')},
+        )
+        if 'page' in item:
+            entry['pages'].append(item['page'])
+        if 'timestamp' in item:
+            entry['timestamps'].append(item['timestamp'])
+
+    summarized_list = []
+    for source, value in summary.items():
+        # Strip directories for both separator styles so the reported
+        # filename matches what the description/thumbnail lookups use.
+        filename = source.split('/')[-1].split('\\')[-1]
+        summarized_list.append({
+            'filename': filename,
+            'pages': value['pages'],
+            'timestamps': value['timestamps'],
+            'file_type': value['file_type'],
+            'description': get_data_description(filename),
+            # An empty timestamp list is treated like "no timestamp" by
+            # get_data_thumbnail, so no separate branch is needed.
+            'thumbnail': get_data_thumbnail(filename, value['timestamps']),
+        })
+
+    return summarized_list
+
+# a function that performs the search and the summary together
+def search_and_summarize(query):
+    """Run ``search`` on ``query`` and condense the hits per document.
+
+    Progress is reported through the module logger before and after each of
+    the two stages.
+
+    Args:
+        query: The search query, passed unchanged to ``search``.
+
+    Returns:
+        Whatever ``summarize_doc_search`` produces for the search hits.
+    """
+    # Stage 1: retrieve the raw hits.
+    logger.info("Searching for the query")
+    hits = search(query)
+    logger.info("Search completed")
+
+    # Stage 2: collapse the hits into one record per source document.
+    logger.info("Summarizing search results")
+    condensed = summarize_doc_search(hits)
+    logger.info("Search results summarized")
+
+    return condensed



 if __name__ == "__main__":
    logger.info("Receiving the search query")
    query = input("Enter the search query: ")
-    logger.info(f"Searching for {query}")
-    page_content, all, pages = search(query)
-    logger.info("Search completed")
-    logger.info(f"Page content: {page_content}")
-    print(f"Page content: {all}")
-    print(f"Pages: {pages}")
-    print("Search completed")
+    # Script entry point: read one query from stdin, run the combined
+    # search-and-summarize step, and print the resulting list.
+    logger.info(f"Search query received: {query}")
+    logger.info("Searching and summarizing the search results")
+    search_results = search_and_summarize(query)
+    logger.info("Search results summarized")
+    print(search_results)