Files
timothyafolami ddd0dda276 last commi
2024-08-16 21:39:28 +01:00

98 lines
3.7 KiB
Python

from utils import search
import sys, os
import json
# Add the root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from loggings.logging_config import logger
# a function to get data description
def get_data_description(data_path):
    """Return the stored summary for the document behind *data_path*.

    The lookup key is derived from *data_path* by dropping everything up to
    the last forward slash or backslash and everything from the first dot
    onwards. That key is looked up in ``uploads/data.json`` (resolved
    against the current working directory) and the matching entry's
    ``doc_summary`` value is returned.

    Args:
        data_path: Path or bare file name of the document.

    Returns:
        The entry's ``doc_summary`` value, or ``'No description available'``
        when the index file does not exist, holds no entry for the document,
        or the entry carries no ``doc_summary``.
    """
    fallback = 'No description available'

    # ensuring no // or / or extension is present
    data_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    try:
        # Decode explicitly as UTF-8: without it open() falls back to the
        # locale encoding, which mangles non-ASCII summaries on Windows.
        with open('uploads/data.json', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # No index has been written yet, so nothing can be described.
        return fallback

    entry = data.get(data_name)
    if not isinstance(entry, dict):
        return fallback
    return entry.get('doc_summary', fallback)
# getting data thumbnails.
def get_data_thumbnail(data_path, timestamp=None):
    """Locate a thumbnail image for the document behind *data_path*.

    The document name is *data_path* with any directory part (forward slash
    or backslash) and everything from the first dot onwards removed. Two
    locations, relative to the current working directory, are probed in
    order:

    1. ``uploads/thumbnails/<name>.png``
    2. ``uploads/<name>/<start>-<end>s.png``, tried only when *timestamp*
       is non-empty and the ``uploads/<name>`` folder exists. ``<start>``
       and ``<end>`` are the two halves of the first timestamp converted
       from ``min:sec`` to whole seconds.

    Args:
        data_path: Path or bare file name of the document.
        timestamp: Optional sequence of ``"MM:SS-MM:SS"`` strings. Only the
            first element is used.

    Returns:
        The relative path of the first image found, or ``None`` when no
        image exists or the first timestamp cannot be parsed.
    """
    # ensuring no // or / or extension is present
    file_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    # first check: a dedicated .png in the thumbnail folder
    static_thumb = f'uploads/thumbnails/{file_name}.png'
    if os.path.exists(static_thumb):
        return static_thumb

    # second check: a per-document folder holding one image per time range
    if not timestamp or not os.path.exists(f'uploads/{file_name}'):
        return None

    try:
        start, end = (
            _min_sec_to_seconds(part) for part in timestamp[0].split('-')
        )
    except (AttributeError, IndexError, TypeError, ValueError):
        # A malformed timestamp only means "no thumbnail"; it must not
        # abort the caller's whole summary.
        return None

    frame_path = f'uploads/{file_name}/{start}-{end}s.png'
    if os.path.exists(frame_path):
        return frame_path
    return None


def _min_sec_to_seconds(clock):
    """Convert a ``min:sec`` string such as ``"03:30"`` to whole seconds."""
    # Only the first two colon-separated fields are read.
    minutes, seconds = clock.split(':')[:2]
    return int(minutes) * 60 + int(seconds)
def summarize_doc_search(data):
    """Group raw search hits by source document and enrich each group.

    Args:
        data: Iterable of hit dictionaries. Each must have a ``source`` key
            and may have ``page``, ``timestamp`` and ``file_type`` keys.

    Returns:
        A list with one dictionary per distinct ``source``, in first-seen
        order, holding:

        * ``filename``: the source with any directory part removed
        * ``pages``: every ``page`` value seen for that source
        * ``timestamps``: every ``timestamp`` value seen for that source
        * ``file_type``: the ``file_type`` of the first hit for that source,
          ``'pdf'`` when that hit has none
        * ``description``: result of ``get_data_description``
        * ``thumbnail``: result of ``get_data_thumbnail``
    """
    summary = {}
    for item in data:
        source = item['source']
        if source not in summary:
            summary[source] = {
                'pages': [],
                'timestamps': [],
                'file_type': item.get('file_type', 'pdf'),
            }
        if 'page' in item:
            summary[source]['pages'].append(item['page'])
        if 'timestamp' in item:
            summary[source]['timestamps'].append(item['timestamp'])

    # Formatting the summary as a list of dictionaries
    summarized_list = [
        {
            # Strip directories written with either separator, so the
            # filename matches what the description/thumbnail helpers
            # derive from the same path.
            'filename': key.replace('\\', '/').rsplit('/', 1)[-1],
            'pages': value['pages'],
            'timestamps': value['timestamps'],
            'file_type': value['file_type'],
        }
        for key, value in summary.items()
    ]

    # getting the file description and thumbnail
    for item in summarized_list:
        item['description'] = get_data_description(item['filename'])
        # An empty timestamp list is passed as None, i.e. "no timestamps".
        item['thumbnail'] = get_data_thumbnail(
            item['filename'], item['timestamps'] or None
        )
    return summarized_list
# a function that performs the search and the summary together
def search_and_summarize(query):
    """Run a search for *query* and return its per-document summary.

    The query is handed to ``search``; the hits it returns are passed to
    ``summarize_doc_search``. Progress is logged before and after each of
    the two steps.

    Args:
        query: The search query, forwarded unchanged to ``search``.

    Returns:
        Whatever ``summarize_doc_search`` returns for the search hits.
    """
    # step 1: retrieval
    logger.info("Searching for the query")
    hits = search(query)
    logger.info("Search completed")

    # step 2: condensing the hits
    logger.info("Summarizing search results")
    digest = summarize_doc_search(hits)
    logger.info("Search results summarized")

    return digest
if __name__ == "__main__":
    # Command-line entry point: read one query from stdin, run the
    # search-and-summarize pipeline on it, and print the result.
    logger.info("Receiving the search query")
    query = input("Enter the search query: ")
    logger.info(f"Search query received: {query}")

    logger.info("Searching and summarizing the search results")
    search_results = search_and_summarize(query)
    logger.info("Search results summarized")

    print(search_results)