"""Run a document search and summarize the hits per source file.

Each summarized entry carries the pages/timestamps that matched, plus a
description read from ``uploads/data.json`` and a thumbnail path looked up
under ``uploads/``. All ``uploads/...`` paths are relative to the current
working directory.
"""

import json
import os
import sys

# Add the parent of this file's directory to sys.path so that the
# project-level `loggings` package imported below can be resolved.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from loggings.logging_config import logger  # noqa: E402
from utils import search  # noqa: E402

# Fallback returned when no description is stored for a document.
_NO_DESCRIPTION = 'No description available'


def _stem(path):
    """Return the bare name of *path*: no directories and no extension.

    Both ``/`` and ``\\`` are treated as directory separators. Everything
    from the FIRST dot onwards is dropped, so ``a.b.pdf`` becomes ``a``.
    """
    return path.split('/')[-1].split('\\')[-1].split('.')[0]


def _clock_to_seconds(clock):
    """Convert a ``min:sec`` string such as ``03:30`` to seconds.

    Only the first two ``:``-separated fields are read. Raises ``ValueError``
    or ``IndexError`` when the string does not have that shape.
    """
    fields = clock.split(':')
    return int(fields[0]) * 60 + int(fields[1])


def _span_to_seconds(span):
    """Convert ``"MM:SS-MM:SS"`` to ``(start_seconds, end_seconds)``.

    Returns ``None`` when *span* is not a string of that shape, so callers
    can treat a malformed timestamp as "no thumbnail" instead of crashing.
    """
    try:
        start, end = span.split('-')
        return _clock_to_seconds(start), _clock_to_seconds(end)
    except (AttributeError, ValueError, IndexError):
        return None


def get_data_description(data_path):
    """Return the stored ``doc_summary`` for the document at *data_path*.

    The document is looked up in ``uploads/data.json`` by its bare name
    (directories and extension removed).

    Args:
        data_path: File name or path of the document.

    Returns:
        The entry's ``doc_summary`` value, or ``'No description available'``
        when the document has no entry or its entry has no ``doc_summary``.

    Raises:
        FileNotFoundError: If ``uploads/data.json`` does not exist.
        json.JSONDecodeError: If ``uploads/data.json`` is not valid JSON.
    """
    data_name = _stem(data_path)

    # NOTE(review): the file is read with the platform default encoding;
    # confirm it matches whatever writes data.json.
    with open('uploads/data.json') as f:
        data = json.load(f)

    entry = data.get(data_name)
    if not isinstance(entry, dict):
        return _NO_DESCRIPTION
    return entry.get('doc_summary', _NO_DESCRIPTION)


def get_data_thumbnail(data_path, timestamp=None):
    """Return the path of a thumbnail image for *data_path*, if one exists.

    Two locations are tried, in order:

    1. ``uploads/thumbnails/<name>.png``
    2. ``uploads/<name>/<start>-<end>s.png``, where ``<start>`` and ``<end>``
       are the first entry of *timestamp* converted from ``MM:SS-MM:SS`` to
       whole seconds. Only tried when *timestamp* is non-empty and the
       ``uploads/<name>`` folder exists.

    Args:
        data_path: File name or path of the document.
        timestamp: Optional sequence of ``"MM:SS-MM:SS"`` strings; only the
            first one is used.

    Returns:
        The thumbnail path as a string, or ``None`` when no thumbnail is
        found or the first timestamp cannot be parsed.
    """
    file_name = _stem(data_path)

    # First choice: a dedicated image in the thumbnails folder.
    static_thumbnail = f'uploads/thumbnails/{file_name}.png'
    if os.path.exists(static_thumbnail):
        return static_thumbnail

    # Second choice needs both a timestamp and a per-file folder.
    if not timestamp or not os.path.exists(f'uploads/{file_name}'):
        return None

    seconds = _span_to_seconds(timestamp[0])
    if seconds is None:
        # A malformed timestamp only costs us the thumbnail, not the search.
        logger.warning(f"Could not parse timestamp for thumbnail: {timestamp[0]!r}")
        return None

    start, end = seconds
    frame_path = f'uploads/{file_name}/{start}-{end}s.png'
    if os.path.exists(frame_path):
        return frame_path
    return None


def summarize_doc_search(data):
    """Group search hits by source and attach description and thumbnail.

    Args:
        data: Iterable of hit dicts. Each must have a ``'source'`` key and
            may have ``'page'``, ``'timestamp'`` and ``'file_type'`` keys.

    Returns:
        A list with one dict per distinct source, in first-seen order, with
        the keys ``filename``, ``pages``, ``timestamps``, ``file_type``,
        ``description`` and ``thumbnail``. ``file_type`` comes from the first
        hit of that source and defaults to ``'pdf'``.
    """
    summary = {}
    for item in data:
        source = item['source']
        if source not in summary:
            summary[source] = {
                'pages': [],
                'timestamps': [],
                'file_type': item.get('file_type', 'pdf'),
            }
        if 'page' in item:
            summary[source]['pages'].append(item['page'])
        if 'timestamp' in item:
            summary[source]['timestamps'].append(item['timestamp'])

    # Formatting the summary as a list of dictionaries.
    summarized_list = [
        {
            'filename': key.split("\\")[-1],
            'pages': value['pages'],
            'timestamps': value['timestamps'],
            'file_type': value['file_type'],
        }
        for key, value in summary.items()
    ]

    # Getting the file description and thumbnail. An empty timestamp list is
    # treated by get_data_thumbnail exactly like a missing one.
    for item in summarized_list:
        item['description'] = get_data_description(item['filename'])
        item['thumbnail'] = get_data_thumbnail(item['filename'], item['timestamps'])

    return summarized_list


def search_and_summarize(query):
    """Run ``search(query)`` and return the summarized results.

    Args:
        query: The search query passed through to ``utils.search``.

    Returns:
        The list produced by ``summarize_doc_search`` for the search hits.
    """
    logger.info("Searching for the query")
    docs = search(query)
    logger.info("Search completed")

    logger.info("Summarizing search results")
    summary = summarize_doc_search(docs)
    logger.info("Search results summarized")

    return summary


if __name__ == "__main__":
    logger.info("Receiving the search query")
    query = input("Enter the search query: ")
    logger.info(f"Search query received: {query}")

    logger.info("Searching and summarizing the search results")
    search_results = search_and_summarize(query)
    logger.info("Search results summarized")

    print(search_results)