# ds_erp_ai/search.py

from utils import search
import sys, os
import json

#  Add the root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from loggings.logging_config import logger

# a function to get data description
def get_data_description(data_path):
    """Return the stored summary for the document that ``data_path`` names.

    The lookup key is derived from ``data_path`` by dropping everything up to
    the last forward slash or backslash and then cutting the remaining name at
    its first dot. That key is looked up in ``uploads/data.json``, which is
    opened relative to the current working directory.

    Args:
        data_path: Path or bare file name of the document.

    Returns:
        The ``doc_summary`` value of the matching registry entry, or
        ``'No description available'`` when the registry file does not exist,
        has no entry for the key, or the entry carries no ``doc_summary``.

    Raises:
        json.JSONDecodeError: If ``uploads/data.json`` exists but is not valid
            JSON.
    """
    fallback = 'No description available'

    # Strip directories (either separator style) and the extension.
    data_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    try:
        with open('uploads/data.json') as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing registry is treated like a missing entry, so a lookup
        # never fails just because the file has not been created.
        return fallback

    entry = data.get(data_name)
    if not isinstance(entry, dict):
        return fallback
    return entry.get('doc_summary', fallback)

# getting data thumbnails.
def get_data_thumbnail(data_path, timestamp=None):
    """Locate the thumbnail image that belongs to ``data_path``.

    The base name of ``data_path`` (directories and everything from the first
    dot removed) is tried in two places, both relative to the current working
    directory:

    1. ``uploads/thumbnails/<name>.png``
    2. ``uploads/<name>/<start>-<end>s.png``, where ``<start>`` and ``<end>``
       are the first entry of ``timestamp`` converted from ``min:sec`` to
       whole seconds. This is only tried when the ``uploads/<name>`` folder
       exists and ``timestamp`` is non-empty.

    Args:
        data_path: Path or bare file name of the data file.
        timestamp: Optional sequence of ``"MM:SS-MM:SS"`` strings; only the
            first element is used.

    Returns:
        The relative path of the thumbnail as a string, or ``None`` when no
        matching image exists.
    """
    def as_seconds(clock):
        # "03:30" -> 3 * 60 + 30
        pieces = clock.split(':')
        return int(pieces[0]) * 60 + int(pieces[1])

    stem = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    # Preferred location: a single image in the shared thumbnails folder.
    flat_image = f'uploads/thumbnails/{stem}.png'
    if os.path.exists(flat_image):
        return flat_image

    # Otherwise a per-file folder holding one image per time range.
    if not os.path.exists(f'uploads/{stem}'):
        return None
    if not timestamp:
        return None

    begin, finish = timestamp[0].split('-')
    begin_s = as_seconds(begin)
    finish_s = as_seconds(finish)
    ranged_image = f'uploads/{stem}/{begin_s}-{finish_s}s.png'
    if os.path.exists(ranged_image):
        return ranged_image
    return None

def summarize_doc_search(data):
    """Collapse raw search hits into one summary record per source file.

    Hits are grouped by their ``'source'`` value in first-seen order. For each
    group the ``'page'`` and ``'timestamp'`` values of its hits are collected
    (hits lacking a key simply contribute nothing to that list), and the
    ``'file_type'`` is taken from the first hit of the group, defaulting to
    ``'pdf'``.

    Args:
        data: Iterable of dict-like hits; each must have a ``'source'`` key.

    Returns:
        A list of dicts with the keys ``'filename'`` (the source with any
        backslash-separated directories removed), ``'pages'``,
        ``'timestamps'``, ``'file_type'``, ``'description'`` and
        ``'thumbnail'``.
    """
    grouped = {}
    for hit in data:
        origin = hit['source']
        bucket = grouped.get(origin)
        if bucket is None:
            bucket = {
                'pages': [],
                'timestamps': [],
                'file_type': hit.get('file_type', 'pdf'),
            }
            grouped[origin] = bucket

        if 'page' in hit:
            bucket['pages'].append(hit['page'])
        if 'timestamp' in hit:
            bucket['timestamps'].append(hit['timestamp'])

    # First pass: turn the grouping into the list-of-dicts output shape.
    records = []
    for origin, bucket in grouped.items():
        records.append({
            'filename': origin.split("\\")[-1],
            'pages': bucket['pages'],
            'timestamps': bucket['timestamps'],
            'file_type': bucket['file_type'],
        })

    # Second pass: attach the description and thumbnail for every file.
    for record in records:
        name = record['filename']
        stamps = record['timestamps']
        record['description'] = get_data_description(name)
        # Timestamps are only forwarded when at least one was collected.
        if stamps:
            record['thumbnail'] = get_data_thumbnail(name, stamps)
        else:
            record['thumbnail'] = get_data_thumbnail(name)

    return records

# a function that performs the search and summary together
def search_and_summarize(query):
    """Search for ``query`` and return the per-source summary of the hits.

    The query is handed to ``search`` (imported from ``utils``) and the result
    is passed straight to ``summarize_doc_search``; progress is logged before
    and after each step.

    Args:
        query: The search query, forwarded unchanged to ``search``.

    Returns:
        Whatever ``summarize_doc_search`` produces for the search hits.
    """
    # Step 1: retrieve the raw hits.
    logger.info("Searching for the query")
    hits = search(query)
    logger.info("Search completed")

    # Step 2: condense them into one record per source.
    logger.info("Summarizing search results")
    digest = summarize_doc_search(hits)
    logger.info("Search results summarized")

    return digest


if __name__ == "__main__":
    # Interactive entry point: read one query from stdin, run the combined
    # search-and-summarize step, and print the resulting list.
    logger.info("Receiving the search query")
    user_query = input("Enter the search query: ")
    logger.info(f"Search query received: {user_query}")

    logger.info("Searching and summarizing the search results")
    results = search_and_summarize(user_query)
    logger.info("Search results summarized")

    print(results)