98 lines
3.7 KiB
Python
98 lines
3.7 KiB
Python
from utils import search
|
|
import sys, os
|
|
import json
|
|
|
|
# Add the root directory to sys.path
|
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
|
from loggings.logging_config import logger
|
|
|
|
# a function to get data description
|
|
def get_data_description(data_path):
    """Return the stored summary for the document behind *data_path*.

    The lookup key is the bare file name: leading directories (separated by
    ``/`` or by a backslash) are dropped, as is everything from the first
    ``.`` onwards.  Summaries are read from ``data/data.json`` (a path
    relative to the current working directory), under each entry's
    ``doc_summary`` key.

    Args:
        data_path: Path or file name of the document.

    Returns:
        The entry's ``doc_summary`` value, or ``'No description available'``
        when the document has no entry or its entry has no ``doc_summary``.

    Raises:
        FileNotFoundError: If ``data/data.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Strip directories (both separator styles) and the extension.
    data_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # encoding, which would garble or reject non-ASCII summaries.
    with open('data/data.json', encoding='utf-8') as f:
        data = json.load(f)

    # An entry that exists but lacks a summary is treated like a missing
    # entry instead of raising KeyError.
    entry = data.get(data_name)
    if isinstance(entry, dict) and 'doc_summary' in entry:
        return entry['doc_summary']
    return 'No description available'
|
|
|
|
# getting data thumbnails.
|
|
def get_data_thumbnail(data_path, timestamp=None):
    """Locate a thumbnail image for the document behind *data_path*.

    The bare file name is derived by dropping leading directories (``/`` or
    backslash separated) and everything from the first ``.`` onwards.  Two
    locations, relative to the current working directory, are tried in order:

    1. ``data/thumbnails/<name>.png``
    2. ``data/<name>/<start>-<end>s.png``, where ``<start>`` and ``<end>``
       are the bounds of the first entry of *timestamp* converted to seconds.

    Args:
        data_path: Path or file name of the document.
        timestamp: Optional sequence of ``"start-end"`` strings such as
            ``"03:00-04:00"``.  Only the first entry is used.  Each bound may
            be plain seconds, ``mm:ss`` or ``hh:mm:ss``.

    Returns:
        The relative path of the thumbnail, or ``None`` when no thumbnail
        exists or the first timestamp cannot be parsed.
    """
    # Strip directories (both separator styles) and the extension.
    file_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    # First choice: a dedicated image in the thumbnails folder.
    static_thumbnail = f'data/thumbnails/{file_name}.png'
    if os.path.exists(static_thumbnail):
        return static_thumbnail

    # Second choice: a per-document folder holding one image per time range.
    # Without a timestamp there is no way to pick an image from it.
    if not timestamp or not os.path.exists(f'data/{file_name}'):
        return None

    try:
        start, end = (
            _timestamp_to_seconds(bound) for bound in timestamp[0].split('-')
        )
    except ValueError:
        # Wrong number of bounds or a non-numeric component: the lookup is
        # best effort, so report "no thumbnail" instead of failing the caller.
        return None

    candidate = f'data/{file_name}/{start}-{end}s.png'
    if os.path.exists(candidate):
        return candidate
    return None


def _timestamp_to_seconds(stamp):
    """Convert colon-separated time text (``ss``, ``mm:ss``, ``hh:mm:ss``) to seconds."""
    total = 0
    for component in stamp.split(':'):
        # Each further component is one base-60 position to the right.
        total = total * 60 + int(component)
    return total
|
|
|
|
def summarize_doc_search(data):
    """Collapse per-hit search results into one entry per source document.

    Args:
        data: Iterable of mappings.  Each must have a ``'source'`` key and
            may have ``'page'``, ``'timestamp'`` and ``'file_type'`` keys.

    Returns:
        A list with one dict per distinct source, in first-seen order, with
        the keys ``'filename'`` (the part of the source after its last
        backslash), ``'pages'``, ``'timestamps'``, ``'file_type'`` (taken
        from the first hit for that source, ``'pdf'`` when absent),
        ``'description'`` and ``'thumbnail'``.
    """
    grouped = {}

    for hit in data:
        origin = hit['source']
        try:
            bucket = grouped[origin]
        except KeyError:
            # First hit for this source: its file type applies to the group.
            bucket = grouped[origin] = {
                'pages': [],
                'timestamps': [],
                'file_type': hit.get('file_type', 'pdf'),
            }

        for hit_key, bucket_key in (('page', 'pages'), ('timestamp', 'timestamps')):
            if hit_key in hit:
                bucket[bucket_key].append(hit[hit_key])

    # Flatten the grouping into a list of dictionaries.
    results = []
    for origin, bucket in grouped.items():
        results.append({
            'filename': origin.split("\\")[-1],
            'pages': bucket['pages'],
            'timestamps': bucket['timestamps'],
            'file_type': bucket['file_type'],
        })

    # Attach the description and thumbnail of every document.
    for entry in results:
        name = entry['filename']
        stamps = entry['timestamps']
        entry['description'] = get_data_description(name)
        # Timestamps are only forwarded when at least one was collected.
        entry['thumbnail'] = (
            get_data_thumbnail(name, stamps) if stamps else get_data_thumbnail(name)
        )

    return results
|
|
|
|
# a function that performs the search and summary together
|
|
def search_and_summarize(query):
    """Run ``search`` on *query* and summarize the hits per document.

    Progress is reported through the module logger before and after each of
    the two steps.

    Args:
        query: The search query, passed unchanged to ``search``.

    Returns:
        Whatever ``summarize_doc_search`` returns for the search hits.
    """
    # Step 1: retrieve the matching documents.
    logger.info("Searching for the query")
    hits = search(query)
    logger.info("Search completed")

    # Step 2: group the hits by source document.
    logger.info("Summarizing search results")
    digest = summarize_doc_search(hits)
    logger.info("Search results summarized")

    return digest
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Interactive entry point: read one query from standard input, run the
    # search-and-summarize pipeline and print the resulting list.
    logger.info("Receiving the search query")
    query = input("Enter the search query: ")
    logger.info(f"Search query received: {query}")

    logger.info("Searching and summarizing the search results")
    outcome = search_and_summarize(query)
    logger.info("Search results summarized")

    print(outcome)