AI indexing completed
This commit is contained in:
@@ -1,21 +1,98 @@
|
||||
from utils import search
|
||||
import sys, os
|
||||
import json
|
||||
|
||||
# Add the root directory to sys.path
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
from loggings.logging_config import logger
|
||||
|
||||
# a function to get data description
|
||||
def get_data_description(data_path):
    """Return the stored summary for a data file, or a placeholder.

    The lookup key is the file's base name: the text after the last forward
    slash or backslash and before the first dot. That key is looked up in
    ``data/data.json`` (a path relative to the current working directory).

    Args:
        data_path: Path or bare name of the data file.

    Returns:
        The ``doc_summary`` value of the matching entry, or
        ``'No description available'`` when there is no entry for the file
        or the entry has no ``doc_summary``.

    Raises:
        FileNotFoundError: If ``data/data.json`` does not exist.
        json.JSONDecodeError: If ``data/data.json`` is not valid JSON.
    """
    # Strip directories (both separator styles) and everything from the
    # first dot onwards.
    data_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    # JSON is UTF-8 by specification; be explicit so the read does not
    # depend on the platform's locale encoding.
    with open('data/data.json', encoding='utf-8') as f:
        data = json.load(f)

    # An entry without a 'doc_summary' is treated like a missing entry
    # instead of raising KeyError.
    entry = data.get(data_name)
    if isinstance(entry, dict) and 'doc_summary' in entry:
        return entry['doc_summary']
    return 'No description available'
|
||||
|
||||
# getting data thumbnails.
|
||||
def get_data_thumbnail(data_path, timestamp = None):
    """Locate a thumbnail image for a data file.

    The file's base name (text after the last forward slash or backslash
    and before the first dot) is used for two lookups, in order:

    1. ``data/thumbnails/<name>.png`` -- a dedicated thumbnail.
    2. ``data/<name>/<start>-<end>s.png`` -- tried only when a ``data/<name>``
       folder exists and ``timestamp`` is non-empty. ``<start>`` and ``<end>``
       are the first timestamp's bounds converted from ``min:sec`` to seconds.

    All paths are relative to the current working directory.

    Args:
        data_path: Path or bare name of the data file.
        timestamp: Optional sequence of ``'MM:SS-MM:SS'`` strings; only the
            first element is used.

    Returns:
        The relative path of the image that was found, or ``None`` when no
        image exists or the first timestamp cannot be parsed.
    """
    # Strip directories (both separator styles) and everything from the
    # first dot onwards.
    file_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    # A dedicated thumbnail takes precedence over per-timestamp images.
    thumbnail = f'data/thumbnails/{file_name}.png'
    if os.path.exists(thumbnail):
        return thumbnail

    # Timestamp images are only looked up inside the file's own folder.
    if not os.path.exists(f'data/{file_name}') or not timestamp:
        return None

    try:
        start, end = (_min_sec_to_seconds(bound) for bound in timestamp[0].split('-'))
    except (ValueError, IndexError):
        # Malformed timestamp (wrong number of '-' parts, missing ':' or
        # non-numeric fields): report "no thumbnail" rather than failing
        # the caller.
        return None

    candidate = f'data/{file_name}/{start}-{end}s.png'
    if os.path.exists(candidate):
        return candidate
    return None


def _min_sec_to_seconds(text):
    """Convert a ``min:sec`` string such as ``'03:30'`` to whole seconds."""
    fields = text.split(':')
    return int(fields[0]) * 60 + int(fields[1])
|
||||
|
||||
def summarize_doc_search(data):
    """Group search hits by source and attach a description and thumbnail.

    Args:
        data: Iterable of hit mappings. Each hit must have a ``'source'``
            key and may have ``'page'``, ``'timestamp'`` and ``'file_type'``.

    Returns:
        A list with one dict per distinct source, in first-seen order, with
        the keys ``'filename'``, ``'pages'``, ``'timestamps'``,
        ``'file_type'``, ``'description'`` and ``'thumbnail'``.
    """
    grouped = {}
    for hit in data:
        # The first hit seen for a source decides its file type
        # (defaulting to 'pdf'); later hits only contribute pages/timestamps.
        bucket = grouped.setdefault(
            hit['source'],
            {'pages': [], 'timestamps': [], 'file_type': hit.get('file_type', 'pdf')},
        )
        if 'page' in hit:
            bucket['pages'].append(hit['page'])
        if 'timestamp' in hit:
            bucket['timestamps'].append(hit['timestamp'])

    # Flatten the grouping into a list of dictionaries.
    # NOTE(review): only backslashes are split here, so a source written
    # with forward slashes keeps its directory part in 'filename'.
    results = []
    for source, info in grouped.items():
        results.append({
            'filename': source.split("\\")[-1],
            'pages': info['pages'],
            'timestamps': info['timestamps'],
            'file_type': info['file_type'],
        })

    # Attach the stored description and a thumbnail to every entry.
    for entry in results:
        name = entry['filename']
        stamps = entry['timestamps']
        entry['description'] = get_data_description(name)
        # Timestamps are only passed along when at least one was collected.
        entry['thumbnail'] = (
            get_data_thumbnail(name, stamps) if stamps else get_data_thumbnail(name)
        )

    return results
|
||||
|
||||
# a function that performs the search and summary together
|
||||
def search_and_summarize(query):
    """Run a search for ``query`` and return its per-source summary.

    Passes ``query`` to ``search`` and hands the result to
    ``summarize_doc_search``, logging before and after each step.

    Args:
        query: The search query forwarded to ``search``.

    Returns:
        Whatever ``summarize_doc_search`` returns for the search results.
    """
    # Step 1: retrieve the raw hits.
    logger.info("Searching for the query")
    hits = search(query)
    logger.info("Search completed")

    # Step 2: condense the hits per source.
    logger.info("Summarizing search results")
    digest = summarize_doc_search(hits)
    logger.info("Search results summarized")

    return digest
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: prompt for a query, run a raw search and print its
    # parts, then run the combined search-and-summarize path and print that.
    logger.info("Receiving the search query")
    query = input("Enter the search query: ")
    logger.info(f"Searching for {query}")

    # NOTE(review): search() is unpacked as a 3-tuple here, while
    # search_and_summarize() passes the same return value to
    # summarize_doc_search(), which iterates it and reads item['source'].
    # Confirm which shape search() actually returns.
    # The middle value was previously bound to `all`, shadowing the builtin.
    page_content, all_content, pages = search(query)
    logger.info("Search completed")
    logger.info(f"Page content: {page_content}")
    print(f"Page content: {all_content}")
    print(f"Pages: {pages}")
    print("Search completed")

    logger.info(f"Search query received: {query}")
    logger.info("Searching and summarizing the search results")
    search_results = search_and_summarize(query)
    logger.info("Search results summarized")
    print(search_results)
|
||||
Reference in New Issue
Block a user