AI indexing completed
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
+14
-6
@@ -1,19 +1,18 @@
|
|||||||
import sys, os
|
import sys, os
|
||||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||||
from utils import create_vector_store, save_embedded_data, load_documents_from_directory, load_embedding_model
|
from utils import create_vector_store, save_embedded_data, load_documents_from_directory, process_directory
|
||||||
from loggings.logging_config import logger
|
from loggings.logging_config import logger
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
# This module will load in the data, you only need to add the data path to it.
|
# This module will load in the data, you only need to add the data path to it.
|
||||||
data_path = './data'
|
data_path = './data'
|
||||||
|
|
||||||
# # loading the embeddings
|
|
||||||
# logger.info(f"Loading the embeddings")
|
|
||||||
# embeddings = load_embedding_model()
|
|
||||||
# logger.info(f"Embeddings loaded")
|
|
||||||
|
|
||||||
def load_data(data_path: str):
|
def load_data(data_path: str):
|
||||||
logger.info(f"Loading data from {data_path}")
|
logger.info(f"Loading data from {data_path}")
|
||||||
|
start_time = time.time()
|
||||||
|
# logging the start time
|
||||||
|
logger.info(f"Start time: {start_time}")
|
||||||
documents, docs_id, num_pages = load_documents_from_directory(data_path)
|
documents, docs_id, num_pages = load_documents_from_directory(data_path)
|
||||||
logger.info(f"Data loaded")
|
logger.info(f"Data loaded")
|
||||||
logger.info(f"Creating vector store")
|
logger.info(f"Creating vector store")
|
||||||
@@ -23,8 +22,17 @@ def load_data(data_path: str):
|
|||||||
# saving the embedded data
|
# saving the embedded data
|
||||||
save_embedded_data(embed_db)
|
save_embedded_data(embed_db)
|
||||||
logger.info(f"Vector store saved")
|
logger.info(f"Vector store saved")
|
||||||
|
end_time = time.time()
|
||||||
|
logger.info(f"End time: {end_time}")
|
||||||
|
time_taken = end_time - start_time
|
||||||
|
logger.info(f"Time taken: {time_taken}")
|
||||||
|
|
||||||
print("Vector store created and saved")
|
print("Vector store created and saved")
|
||||||
|
# creating the thumbnails
|
||||||
|
logger.info(f"Creating thumbnails")
|
||||||
|
status = process_directory(data_path)
|
||||||
|
print(f"{status}: Thumbnails created.")
|
||||||
|
logger.info(f"Thumbnails created")
|
||||||
return embed_db
|
return embed_db
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
+2147
File diff suppressed because it is too large
Load Diff
@@ -4,16 +4,12 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
|||||||
from fastapi import FastAPI, HTTPException
|
from fastapi import FastAPI, HTTPException
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from utils import search, load_embedded_data
|
from utils import load_embedded_data, load_documents_from_directory, create_vector_store, save_embedded_data
|
||||||
|
from search import search_and_summarize
|
||||||
from data_ingest import load_data
|
from data_ingest import load_data
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
# Initialize global variables for FAISS index and vector store
|
|
||||||
try:
|
|
||||||
vector_store = load_embedded_data()
|
|
||||||
except Exception as e:
|
|
||||||
vector_store = None
|
|
||||||
|
|
||||||
# Define allowed origins for CORS
|
# Define allowed origins for CORS
|
||||||
origins = [
|
origins = [
|
||||||
@@ -37,19 +33,21 @@ class SearchRequest(BaseModel):
|
|||||||
|
|
||||||
@app.get("/load_documents")
|
@app.get("/load_documents")
|
||||||
def load_documents(directory: str):
|
def load_documents(directory: str):
|
||||||
global vector_store
|
|
||||||
|
|
||||||
# Load documents using the utility function
|
# loading the documents from the directory
|
||||||
vector_store = load_data(directory)
|
documents, docs_id, num_pages = load_documents_from_directory(directory)
|
||||||
|
# embedding the documents
|
||||||
|
embed_db = create_vector_store(documents, docs_id, num_pages)
|
||||||
|
# saving the embedded data
|
||||||
|
status = save_embedded_data(embed_db)
|
||||||
|
|
||||||
return {"status": "Documents loaded successfully"}
|
return {"status": "Documents loaded successfully"}
|
||||||
|
|
||||||
@app.get("/search")
|
@app.post("/search")
|
||||||
def search(request: SearchRequest):
|
def search(request: SearchRequest):
|
||||||
global vector_store
|
|
||||||
|
|
||||||
# Perform search using the utility function
|
# Perform search using the utility function
|
||||||
results = search(vector_store, request.query)
|
results = search_and_summarize(request.query)
|
||||||
|
|
||||||
return {"results": results}
|
return {"results": results}
|
||||||
|
|
||||||
|
|||||||
@@ -13,6 +13,8 @@ docx2txt
|
|||||||
docx
|
docx
|
||||||
fastapi[standard]
|
fastapi[standard]
|
||||||
pdfplumber
|
pdfplumber
|
||||||
|
pypdf
|
||||||
|
python-docx
|
||||||
pytesseract
|
pytesseract
|
||||||
groq
|
groq
|
||||||
python-dotenv
|
python-dotenv
|
||||||
|
|||||||
@@ -1,21 +1,98 @@
|
|||||||
from utils import search
|
from utils import search
|
||||||
import sys, os
|
import sys, os
|
||||||
|
import json
|
||||||
|
|
||||||
# Add the root directory to sys.path
|
# Add the root directory to sys.path
|
||||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||||
from loggings.logging_config import logger
|
from loggings.logging_config import logger
|
||||||
|
|
||||||
|
# a function to get data description
|
||||||
|
def get_data_description(data_path, metadata_path='data/data.json'):
    """Return the stored summary for a document, or a placeholder string.

    The lookup key is the last path component of ``data_path`` (both ``/``
    and the backslash are treated as separators), truncated at its first
    ``.`` so that the extension is dropped.

    Args:
        data_path: Path or bare name of the document.
        metadata_path: JSON file mapping document names to records that
            carry a ``doc_summary`` field. Defaults to ``data/data.json``.

    Returns:
        The ``doc_summary`` value for the document, or
        ``'No description available'`` when the metadata file does not
        exist, the document has no record, or the record has no
        ``doc_summary`` field.
    """
    # Strip any directory part and everything from the first dot onwards.
    data_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    try:
        # JSON is UTF-8 by specification; do not rely on the locale encoding
        # (cp1252 on Windows would garble non-ASCII summaries).
        with open(metadata_path, encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # A description is optional enrichment: no metadata file simply
        # means no description, it should not abort the search.
        return 'No description available'

    entry = data.get(data_name)
    if isinstance(entry, dict) and 'doc_summary' in entry:
        return entry['doc_summary']
    return 'No description available'
|
||||||
|
|
||||||
|
# getting data thumbnails.
|
||||||
|
def get_data_thumbnail(data_path, timestamp=None, data_root='data'):
    """Locate a thumbnail image for a document or for a timestamped clip.

    Two locations are checked, in order:

    1. ``<data_root>/thumbnails/<name>.png``.
    2. ``<data_root>/<name>/<start>-<end>s.png``, where ``start`` and ``end``
       are the first timestamp range converted from ``min:sec`` to seconds
       (``'3:30-4:0'`` becomes ``210-240s.png``).

    ``<name>`` is the last path component of ``data_path`` truncated at its
    first ``.``.

    Args:
        data_path: Path or bare name of the document.
        timestamp: Optional list of ``'m:s-m:s'`` range strings; only the
            first entry is used. A single range string is also accepted.
        data_root: Directory holding the thumbnails. Defaults to ``data``.

    Returns:
        The path of the thumbnail as a string, or ``None`` when no image is
        found or the first timestamp cannot be parsed.
    """
    # Strip any directory part and everything from the first dot onwards.
    file_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    static_thumb = f'{data_root}/thumbnails/{file_name}.png'
    if os.path.exists(static_thumb):
        return static_thumb

    # Frame lookup needs both a per-document folder and a timestamp.
    if not timestamp or not os.path.exists(f'{data_root}/{file_name}'):
        return None

    # Indexing a bare string would yield its first character, not a range.
    first = timestamp if isinstance(timestamp, str) else timestamp[0]
    try:
        start_text, end_text = first.split('-')
        start = int(start_text.split(':')[0]) * 60 + int(start_text.split(':')[1])
        end = int(end_text.split(':')[0]) * 60 + int(end_text.split(':')[1])
    except (ValueError, IndexError):
        # Not a 'min:sec-min:sec' range; a missing thumbnail is better than
        # failing the whole search.
        return None

    frame = f'{data_root}/{file_name}/{start}-{end}s.png'
    if os.path.exists(frame):
        return frame
    return None
|
||||||
|
|
||||||
|
def summarize_doc_search(data):
    """Group raw search hits by document and attach description/thumbnail.

    Args:
        data: Iterable of hit dictionaries. Each hit is identified by its
            ``source`` key (falling back to ``filename`` when ``source`` is
            absent) and may carry ``page``, ``timestamp`` and ``file_type``.

    Returns:
        A list with one dictionary per distinct document, in first-seen
        order, with the keys ``filename``, ``pages``, ``timestamps``,
        ``file_type``, ``description`` and ``thumbnail``. ``file_type``
        comes from the first hit of the document and defaults to ``'pdf'``.
    """
    summary = {}

    for item in data:
        # Some hits are keyed by 'filename' rather than 'source'; indexing
        # item['source'] directly would raise KeyError on those.
        source = item.get('source') or item.get('filename')
        if not source:
            # Nothing to group this hit under.
            continue

        entry = summary.setdefault(
            source,
            {'pages': [], 'timestamps': [], 'file_type': item.get('file_type', 'pdf')},
        )
        if 'page' in item:
            entry['pages'].append(item['page'])
        if 'timestamp' in item:
            entry['timestamps'].append(item['timestamp'])

    summarized_list = []
    for source, entry in summary.items():
        # Drop the directory part for both POSIX and Windows style paths.
        filename = source.split('/')[-1].split('\\')[-1]
        timestamps = entry['timestamps']
        summarized_list.append({
            'filename': filename,
            'pages': entry['pages'],
            'timestamps': timestamps,
            'file_type': entry['file_type'],
            'description': get_data_description(filename),
            # An empty list carries no timestamp to look up, so pass None.
            'thumbnail': get_data_thumbnail(filename, timestamps or None),
        })

    return summarized_list
|
||||||
|
|
||||||
|
# a function that performs the search and the summary together
|
||||||
|
def search_and_summarize(query):
    """Run ``search`` for ``query`` and summarize the hits per document.

    Args:
        query: The search query, passed unchanged to ``search``.

    Returns:
        Whatever ``summarize_doc_search`` returns for the search hits.
    """
    # Stage 1: retrieval.
    logger.info("Searching for the query")
    hits = search(query)
    logger.info("Search completed")

    # Stage 2: per-document aggregation.
    logger.info("Summarizing search results")
    digest = summarize_doc_search(hits)
    logger.info("Search results summarized")

    return digest
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logger.info("Receiving the search query")
|
logger.info("Receiving the search query")
|
||||||
query = input("Enter the search query: ")
|
query = input("Enter the search query: ")
|
||||||
logger.info(f"Searching for {query}")
|
logger.info(f"Search query received: {query}")
|
||||||
page_content, all, pages = search(query)
|
logger.info("Searching and summarizing the search results")
|
||||||
logger.info("Search completed")
|
search_results = search_and_summarize(query)
|
||||||
logger.info(f"Page content: {page_content}")
|
logger.info("Search results summarized")
|
||||||
print(f"Page content: {all}")
|
print(search_results)
|
||||||
print(f"Pages: {pages}")
|
|
||||||
print("Search completed")
|
|
||||||
+311
-202
@@ -11,16 +11,29 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"2024-08-16 16:06:32,880 - INFO - Loading the embedding model\n",
|
||||||
|
"2024-08-16 16:06:38,758 - WARNING - c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
|
||||||
|
" from tqdm.autonotebook import tqdm, trange\n",
|
||||||
|
"\n",
|
||||||
|
"2024-08-16 16:06:47,268 - INFO - PyTorch version 2.4.0+cu124 available.\n",
|
||||||
|
"2024-08-16 16:06:47,868 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en\n",
|
||||||
|
"2024-08-16 16:06:55,638 - INFO - Embedding model loaded\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"True"
|
"True"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 2,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@@ -32,17 +45,51 @@
|
|||||||
"from langchain_groq import ChatGroq\n",
|
"from langchain_groq import ChatGroq\n",
|
||||||
"from langchain_core.prompts.prompt import PromptTemplate\n",
|
"from langchain_core.prompts.prompt import PromptTemplate\n",
|
||||||
"from langchain_core.output_parsers import StrOutputParser\n",
|
"from langchain_core.output_parsers import StrOutputParser\n",
|
||||||
|
"from collections import defaultdict\n",
|
||||||
|
"import json\n",
|
||||||
"load_dotenv()"
|
"load_dotenv()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# setting up groq api key\n",
|
||||||
|
"# os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# setting up groq api key\n",
|
"\n",
|
||||||
"os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')"
|
"# # chat set up\n",
|
||||||
|
"# GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# ### Chains #####\n",
|
||||||
|
"# # Initiator\n",
|
||||||
|
"# def doc_summarizer(document_page: list) -> str:\n",
|
||||||
|
"# initiator_prompt = PromptTemplate(\n",
|
||||||
|
"# template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
|
||||||
|
"# Create a short summary of the document based on the provided text. \n",
|
||||||
|
" \n",
|
||||||
|
"# Start with: This document is about...\n",
|
||||||
|
" \n",
|
||||||
|
"# <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
|
||||||
|
"# DOCUMENT: {document_page} \\n\n",
|
||||||
|
" \n",
|
||||||
|
"# <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
|
||||||
|
"# input_variables=[\"document_page\"],\n",
|
||||||
|
"# )\n",
|
||||||
|
"\n",
|
||||||
|
"# initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
|
||||||
|
"# output = initiator_router.invoke({\"document_page\":document_page})\n",
|
||||||
|
"# return output\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -51,41 +98,31 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"\n",
|
"document_page = 'Wirebrush WD 40'\n",
|
||||||
"# chat set up\n",
|
"# testing the function\n",
|
||||||
"GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n",
|
"# summary = doc_summarizer(document_page)"
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"### Chains #####\n",
|
|
||||||
"# Initiator\n",
|
|
||||||
"def doc_summarizer(document_page: list) -> str:\n",
|
|
||||||
" initiator_prompt = PromptTemplate(\n",
|
|
||||||
" template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
|
|
||||||
" Create a short summary of the document based on the provided text. \n",
|
|
||||||
" \n",
|
|
||||||
" Start with: This document is about...\n",
|
|
||||||
" \n",
|
|
||||||
" <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
|
|
||||||
" DOCUMENT: {document_page} \\n\n",
|
|
||||||
" \n",
|
|
||||||
" <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
|
|
||||||
" input_variables=[\"document_page\"],\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
|
|
||||||
" output = initiator_router.invoke({\"document_page\":document_page})\n",
|
|
||||||
" return output\n"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"2024-08-16 16:06:55,717 - INFO - Searching for Wirebrush WD 40\n",
|
||||||
|
"2024-08-16 16:06:55,717 - INFO - Loading embedded data\n",
|
||||||
|
"2024-08-16 16:06:56,487 - WARNING - c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
|
||||||
|
" attn_output = torch.nn.functional.scaled_dot_product_attention(\n",
|
||||||
|
"\n",
|
||||||
|
"2024-08-16 16:06:56,628 - INFO - Search completed\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"document_page = 'How to change the engine oil of a toyota corrolla.'\n",
|
"docs = search(document_page)"
|
||||||
"# testing the function\n",
|
|
||||||
"summary = doc_summarizer(document_page)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -96,7 +133,46 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"'This document is about providing a step-by-step guide on how to change the engine oil of a Toyota Corolla.'"
|
"[{'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 424},\n",
|
||||||
|
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'timestamp': '0:30-1:0',\n",
|
||||||
|
" 'file_type': 'video'},\n",
|
||||||
|
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'timestamp': '2:00-2:00',\n",
|
||||||
|
" 'file_type': 'video'},\n",
|
||||||
|
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'timestamp': '2:30-3:0',\n",
|
||||||
|
" 'file_type': 'video'},\n",
|
||||||
|
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'timestamp': '4:00-4:00',\n",
|
||||||
|
" 'file_type': 'video'},\n",
|
||||||
|
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'timestamp': '5:30-6:0',\n",
|
||||||
|
" 'file_type': 'video'},\n",
|
||||||
|
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'timestamp': '8:00-8:00',\n",
|
||||||
|
" 'file_type': 'video'},\n",
|
||||||
|
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'timestamp': '8:30-9:0',\n",
|
||||||
|
" 'file_type': 'video'},\n",
|
||||||
|
" {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'timestamp': '10:00-10:00',\n",
|
||||||
|
" 'file_type': 'video'},\n",
|
||||||
|
" {'source': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'timestamp': '3:30-4:0',\n",
|
||||||
|
" 'file_type': 'video'},\n",
|
||||||
|
" {'source': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'timestamp': '5:30-6:0',\n",
|
||||||
|
" 'file_type': 'video'},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 329},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 264},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 290},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 201},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 326},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 315},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 317},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 325},\n",
|
||||||
|
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422}]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 6,
|
"execution_count": 6,
|
||||||
@@ -104,202 +180,235 @@
|
|||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
|
||||||
"summary"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"docs = search(document_page)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[{'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
|
||||||
" 'page': 1,\n",
|
|
||||||
" 'file_type': 'text'},\n",
|
|
||||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 438},\n",
|
|
||||||
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
|
||||||
" 'page': 3,\n",
|
|
||||||
" 'file_type': 'text'},\n",
|
|
||||||
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
|
||||||
" 'page': 2,\n",
|
|
||||||
" 'file_type': 'text'},\n",
|
|
||||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 525},\n",
|
|
||||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
|
||||||
" 'page': 2,\n",
|
|
||||||
" 'file_type': 'text'},\n",
|
|
||||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
|
||||||
" 'page': 3,\n",
|
|
||||||
" 'file_type': 'text'},\n",
|
|
||||||
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
|
||||||
" 'page': 0,\n",
|
|
||||||
" 'file_type': 'text'},\n",
|
|
||||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
|
||||||
" 'page': 5,\n",
|
|
||||||
" 'file_type': 'text'},\n",
|
|
||||||
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
|
||||||
" 'page': 6,\n",
|
|
||||||
" 'file_type': 'text'},\n",
|
|
||||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 526},\n",
|
|
||||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422},\n",
|
|
||||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 514},\n",
|
|
||||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 153},\n",
|
|
||||||
" {'filename': 'audio-2', 'duration': '0-3 minutes', 'file_type': 'audio'},\n",
|
|
||||||
" {'filename': 'audio-2', 'duration': '3-6 minutes', 'file_type': 'audio'},\n",
|
|
||||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 149},\n",
|
|
||||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 513},\n",
|
|
||||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 436},\n",
|
|
||||||
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 148}]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 8,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"docs"
|
"docs"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 20,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from collections import defaultdict\n",
|
"# a function to get data description\n",
|
||||||
"\n",
|
"def get_data_description(data_path):\n",
|
||||||
"def transform_file_data(input_data):\n",
|
" # ensuring no // or / or extension is present\n",
|
||||||
" # Create a dictionary to aggregate data by filename\n",
|
" data_name = data_path.split('/')[-1].split('\\\\')[-1].split('.')[0]\n",
|
||||||
" aggregated_data = defaultdict(lambda: {\n",
|
" # print(data_name)\n",
|
||||||
" 'filename': '',\n",
|
" # open the data.json file\n",
|
||||||
" 'pages': [],\n",
|
" with open('data/data.json') as f:\n",
|
||||||
" 'timestamps': [],\n",
|
" data = json.load(f)\n",
|
||||||
" 'description': 'lorem ipsum',\n",
|
" existing_data = data.keys()\n",
|
||||||
" 'filetype': '',\n",
|
" if data_name in existing_data:\n",
|
||||||
" 'thumbnail': '',\n",
|
" return data[data_name]['doc_summary']\n",
|
||||||
" 'track_id': 123\n",
|
" else:\n",
|
||||||
" })\n",
|
" return 'No description available'"
|
||||||
"\n",
|
|
||||||
" for item in input_data:\n",
|
|
||||||
" if 'source' in item:\n",
|
|
||||||
" file_path = item['source']\n",
|
|
||||||
" filename = file_path.split('\\\\')[-1]\n",
|
|
||||||
" extension = filename.split('.')[-1]\n",
|
|
||||||
"\n",
|
|
||||||
" aggregated_data[filename]['filename'] = filename\n",
|
|
||||||
" aggregated_data[filename]['filetype'] = extension\n",
|
|
||||||
" aggregated_data[filename]['thumbnail'] = f\"{filename.split('.')[0]}.jpg\"\n",
|
|
||||||
"\n",
|
|
||||||
" if extension in ['pdf', 'txt', 'docx']:\n",
|
|
||||||
" aggregated_data[filename]['pages'].append(item['page'])\n",
|
|
||||||
" elif extension in ['mp4', 'mkv', 'flv']:\n",
|
|
||||||
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
|
|
||||||
" elif extension in ['mp3', 'wav', 'flac']:\n",
|
|
||||||
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
|
|
||||||
" elif extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:\n",
|
|
||||||
" aggregated_data[filename].pop('pages', None) # Remove pages if it's an image\n",
|
|
||||||
" aggregated_data[filename].pop('timestamps', None) # Remove timestamps if it's an image\n",
|
|
||||||
"\n",
|
|
||||||
" elif 'filename' in item:\n",
|
|
||||||
" filename = item['filename']\n",
|
|
||||||
" extension = item['file_type']\n",
|
|
||||||
" aggregated_data[filename]['filename'] = f\"{filename}.{extension}\"\n",
|
|
||||||
" aggregated_data[filename]['filetype'] = extension\n",
|
|
||||||
" aggregated_data[filename]['thumbnail'] = f\"{filename}.jpg\"\n",
|
|
||||||
" if 'duration' in item:\n",
|
|
||||||
" start_time, end_time = item['duration'].split(' minutes')[0].split('-')\n",
|
|
||||||
" aggregated_data[filename]['timestamps'].append((int(start_time), int(end_time)))\n",
|
|
||||||
"\n",
|
|
||||||
" # Convert aggregated data to the desired output format\n",
|
|
||||||
" output_data = []\n",
|
|
||||||
" for filename, data in aggregated_data.items():\n",
|
|
||||||
" # Remove empty lists for pages and timestamps\n",
|
|
||||||
" if not data['pages']:\n",
|
|
||||||
" data.pop('pages', None)\n",
|
|
||||||
" if not data['timestamps']:\n",
|
|
||||||
" data.pop('timestamps', None)\n",
|
|
||||||
" output_data.append(data)\n",
|
|
||||||
"\n",
|
|
||||||
" return output_data\n"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 21,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt', 'pages': [1, 3, 2, 0], 'description': 'lorem ipsum', 'filetype': 'txt', 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg', 'track_id': 123}\n",
|
|
||||||
"{'filename': 'corolla-2020-toyota-owners-manual.pdf', 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148], 'description': 'lorem ipsum', 'filetype': 'pdf', 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg', 'track_id': 123}\n",
|
|
||||||
"{'filename': 'How to change spark plugs on TOYOTA COROLLA.docx', 'pages': [2, 3, 5, 6], 'description': 'lorem ipsum', 'filetype': 'docx', 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg', 'track_id': 123}\n",
|
|
||||||
"{'filename': 'audio-2.audio', 'timestamps': [(0, 3), (3, 6)], 'description': 'lorem ipsum', 'filetype': 'audio', 'thumbnail': 'audio-2.jpg', 'track_id': 123}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"output = transform_file_data(docs)\n",
|
|
||||||
"for item in output:\n",
|
|
||||||
" print(item)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"[{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt',\n",
|
"\"This document is about a video tutorial series on replacing car parts, specifically the latest installment of AutoDoc's video tutorials.\""
|
||||||
" 'pages': [1, 3, 2, 0],\n",
|
|
||||||
" 'description': 'lorem ipsum',\n",
|
|
||||||
" 'filetype': 'txt',\n",
|
|
||||||
" 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg',\n",
|
|
||||||
" 'track_id': 123},\n",
|
|
||||||
" {'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
|
|
||||||
" 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148],\n",
|
|
||||||
" 'description': 'lorem ipsum',\n",
|
|
||||||
" 'filetype': 'pdf',\n",
|
|
||||||
" 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg',\n",
|
|
||||||
" 'track_id': 123},\n",
|
|
||||||
" {'filename': 'How to change spark plugs on TOYOTA COROLLA.docx',\n",
|
|
||||||
" 'pages': [2, 3, 5, 6],\n",
|
|
||||||
" 'description': 'lorem ipsum',\n",
|
|
||||||
" 'filetype': 'docx',\n",
|
|
||||||
" 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg',\n",
|
|
||||||
" 'track_id': 123},\n",
|
|
||||||
" {'filename': 'audio-2.audio',\n",
|
|
||||||
" 'timestamps': [(0, 3), (3, 6)],\n",
|
|
||||||
" 'description': 'lorem ipsum',\n",
|
|
||||||
" 'filetype': 'audio',\n",
|
|
||||||
" 'thumbnail': 'audio-2.jpg',\n",
|
|
||||||
" 'track_id': 123}]"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 12,
|
"execution_count": 21,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"output"
|
"get_data_description('How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 28,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# getting data thumbnais. \n",
|
||||||
|
"def get_data_thumbnail(data_path, timestamp = None):\n",
|
||||||
|
" # ensuring no // or / or extension is present\n",
|
||||||
|
" file_name = data_path.split('/')[-1].split('\\\\')[-1].split('.')[0]\n",
|
||||||
|
" # first check is to see if the file_name has a .png image in the thumbnail folder\n",
|
||||||
|
" if os.path.exists(f'data/thumbnails/{file_name}.png'):\n",
|
||||||
|
" return f'data/thumbnails/{file_name}.png'\n",
|
||||||
|
" # the second check is to see if we have a folder with this file_name\n",
|
||||||
|
" elif os.path.exists(f'data/{file_name}'):\n",
|
||||||
|
" # so now we want to access the first timestamp\n",
|
||||||
|
" if timestamp:\n",
|
||||||
|
" first = timestamp[0]\n",
|
||||||
|
" # split by -\n",
|
||||||
|
" start, end = first.split('-')\n",
|
||||||
|
" # we want to convert something like 03:00, 04:00, 03:30 which is in min:sec to seconds\n",
|
||||||
|
" start = int(start.split(':')[0])*60 + int(start.split(':')[1])\n",
|
||||||
|
" end = int(end.split(':')[0])*60 + int(end.split(':')[1])\n",
|
||||||
|
" # bringing them together\n",
|
||||||
|
" image_file = f\"{start}-{end}s.png\"\n",
|
||||||
|
" # niw checkin if the file exists\n",
|
||||||
|
" if os.path.exists(f'data/{file_name}/{image_file}'):\n",
|
||||||
|
" return f'data/{file_name}/{image_file}'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 29,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'data/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/210-240s.png'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 29,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"get_data_thumbnail('How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]', timestamp=['3:30-4:0', '5:30-6:0'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 30,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'data/thumbnails/corolla-2020-toyota-owners-manual.png'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 30,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"get_data_thumbnail(\"./data\\\\corolla-2020-toyota-owners-manual.pdf'\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 34,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def summarize_doc_search(data):\n",
|
||||||
|
" summary = {}\n",
|
||||||
|
"\n",
|
||||||
|
" for item in data:\n",
|
||||||
|
" source = item['source']\n",
|
||||||
|
" if source not in summary:\n",
|
||||||
|
" summary[source] = {'pages': [], 'timestamps': [], 'file_type': item.get('file_type', 'pdf')}\n",
|
||||||
|
" \n",
|
||||||
|
" if 'page' in item:\n",
|
||||||
|
" summary[source]['pages'].append(item['page'])\n",
|
||||||
|
" if 'timestamp' in item:\n",
|
||||||
|
" summary[source]['timestamps'].append(item['timestamp'])\n",
|
||||||
|
" \n",
|
||||||
|
" # Formatting the summary as a list of dictionaries\n",
|
||||||
|
" summarized_list = [\n",
|
||||||
|
" {'filename': key.split(\"\\\\\")[-1], \n",
|
||||||
|
" 'pages': value['pages'], \n",
|
||||||
|
" 'timestamps': value['timestamps'], \n",
|
||||||
|
" 'file_type': value['file_type']}\n",
|
||||||
|
" for key, value in summary.items()\n",
|
||||||
|
" ]\n",
|
||||||
|
" \n",
|
||||||
|
" # getting the file description and thumbnail\n",
|
||||||
|
" for item in summarized_list:\n",
|
||||||
|
" item['description'] = get_data_description(item['filename'])\n",
|
||||||
|
" # ehcking if we have an empty timestamp list\n",
|
||||||
|
" if len(item['timestamps']) > 0:\n",
|
||||||
|
" item['thumbnail'] = get_data_thumbnail(item['filename'], item['timestamps'])\n",
|
||||||
|
" else:\n",
|
||||||
|
" item['thumbnail'] = get_data_thumbnail(item['filename'])\n",
|
||||||
|
" \n",
|
||||||
|
" return summarized_list"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 35,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc_summary = summarize_doc_search(docs)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 33,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[{'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
|
||||||
|
" 'pages': [424, 329, 264, 290, 201, 326, 315, 317, 325, 422],\n",
|
||||||
|
" 'timestamps': [],\n",
|
||||||
|
" 'file_type': 'pdf',\n",
|
||||||
|
" 'description': \"This document is about the user manual for a Toyota Corolla, providing information and instructions on various aspects of the vehicle, including safety and security, vehicle status, driving operations, interior features, maintenance, and troubleshooting. The manual covers topics such as child seat installation, theft deterrent systems, reading driving-related information, operating the Entune audio system, and caring for the vehicle's interior and exterior. It also includes information on reporting safety defects and provides instructions for Canadian owners on seat belt and SRS air\",\n",
|
||||||
|
" 'thumbnail': 'data/thumbnails/corolla-2020-toyota-owners-manual.png'},\n",
|
||||||
|
" {'filename': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'pages': [],\n",
|
||||||
|
" 'timestamps': ['0:30-1:0',\n",
|
||||||
|
" '2:00-2:00',\n",
|
||||||
|
" '2:30-3:0',\n",
|
||||||
|
" '4:00-4:00',\n",
|
||||||
|
" '5:30-6:0',\n",
|
||||||
|
" '8:00-8:00',\n",
|
||||||
|
" '8:30-9:0',\n",
|
||||||
|
" '10:00-10:00'],\n",
|
||||||
|
" 'file_type': 'video',\n",
|
||||||
|
" 'description': \"This document is about a video tutorial series on replacing car parts, specifically the latest installment of AutoDoc's video tutorials.\",\n",
|
||||||
|
" 'thumbnail': 'data/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/30-60s.png'},\n",
|
||||||
|
" {'filename': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
|
||||||
|
" 'pages': [],\n",
|
||||||
|
" 'timestamps': ['3:30-4:0', '5:30-6:0'],\n",
|
||||||
|
" 'file_type': 'video',\n",
|
||||||
|
" 'description': \"This document is about a video tutorial series on replacing car parts, specifically the latest installment of Auto-Doc's video tutorials.\",\n",
|
||||||
|
" 'thumbnail': 'data/How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/210-240s.png'}]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 33,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc_summary"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
|||||||
@@ -0,0 +1,145 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 28,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import random\n",
|
||||||
|
"from PIL import Image, ImageDraw, ImageFont\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 35,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def create_text_thumbnail(file_path):\n",
|
||||||
|
" # Create a folder for thumbnails if it doesn't exist\n",
|
||||||
|
" thumbnail_folder = os.path.join(os.path.dirname(file_path), 'thumbnails')\n",
|
||||||
|
" os.makedirs(thumbnail_folder, exist_ok=True)\n",
|
||||||
|
" \n",
|
||||||
|
" # Extract file name (without extension)\n",
|
||||||
|
" file_name = os.path.splitext(os.path.basename(file_path))[0]\n",
|
||||||
|
" \n",
|
||||||
|
" # Create a random background color\n",
|
||||||
|
" background_color = tuple(random.randint(0, 255) for _ in range(3))\n",
|
||||||
|
" \n",
|
||||||
|
" # Create an image with the random background color\n",
|
||||||
|
" img = Image.new('RGB', (800, 400), color=background_color)\n",
|
||||||
|
" \n",
|
||||||
|
" # Initialize drawing context\n",
|
||||||
|
" d = ImageDraw.Draw(img)\n",
|
||||||
|
" \n",
|
||||||
|
" # Load a font\n",
|
||||||
|
" try:\n",
|
||||||
|
" font = ImageFont.truetype(\"arial.ttf\", 25) # Adjust the font size as needed\n",
|
||||||
|
" except IOError:\n",
|
||||||
|
" font = ImageFont.load_default()\n",
|
||||||
|
" \n",
|
||||||
|
" # Get the bounding box of the text\n",
|
||||||
|
" text_bbox = d.textbbox((0, 0), file_name, font=font)\n",
|
||||||
|
" text_width = text_bbox[2] - text_bbox[0]\n",
|
||||||
|
" text_height = text_bbox[3] - text_bbox[1]\n",
|
||||||
|
" \n",
|
||||||
|
" # Calculate the position to center the text\n",
|
||||||
|
" text_x = (img.width - text_width) / 2\n",
|
||||||
|
" text_y = (img.height - text_height) / 2\n",
|
||||||
|
" \n",
|
||||||
|
" # Draw the text onto the image\n",
|
||||||
|
" d.text((text_x, text_y), file_name, font=font, fill=(255, 255, 255)) # White text\n",
|
||||||
|
" \n",
|
||||||
|
" # Save the image\n",
|
||||||
|
" thumbnail_path = os.path.join(thumbnail_folder, f\"{file_name}.png\")\n",
|
||||||
|
" img.save(thumbnail_path)\n",
|
||||||
|
" \n",
|
||||||
|
" print(f\"Thumbnail created: {thumbnail_path}\")\n",
|
||||||
|
"\n",
|
||||||
|
"def process_directory(directory_path):\n",
|
||||||
|
" supported_extensions = ['.txt', '.pdf', '.docx', '.mp3', '.m4a']\n",
|
||||||
|
" \n",
|
||||||
|
" for file in os.listdir(directory_path):\n",
|
||||||
|
" file_path = os.path.join(directory_path, file)\n",
|
||||||
|
" if os.path.isfile(file_path):\n",
|
||||||
|
" file_extension = os.path.splitext(file)[1].lower()\n",
|
||||||
|
" if file_extension in supported_extensions:\n",
|
||||||
|
" create_text_thumbnail(file_path)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 36,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Thumbnail created: data\\thumbnails\\audio-2.png\n",
|
||||||
|
"Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-repair.png\n",
|
||||||
|
"Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-service.png\n",
|
||||||
|
"Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-tire.png\n",
|
||||||
|
"Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-tuning.png\n",
|
||||||
|
"Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-wash.png\n",
|
||||||
|
"Thumbnail created: data\\thumbnails\\corolla-2020-toyota-owners-manual.png\n",
|
||||||
|
"Thumbnail created: data\\thumbnails\\How to change engine oil and filter on TOYOTA Corolla.png\n",
|
||||||
|
"Thumbnail created: data\\thumbnails\\How to change front brake pads on TOYOTA Corolla.png\n",
|
||||||
|
"Thumbnail created: data\\thumbnails\\How to change rear windshield wipers on TOYOTA Corolla.png\n",
|
||||||
|
"Thumbnail created: data\\thumbnails\\How to change spark plugs on TOYOTA COROLLA.png\n",
|
||||||
|
"Thumbnail created: data\\thumbnails\\test_rec.png\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Example usage:\n",
|
||||||
|
"directory_path = 'data'\n",
|
||||||
|
"process_directory(directory_path)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "smog_env",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@@ -12,7 +12,11 @@ from langchain_core.output_parsers import StrOutputParser
|
|||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from text_extractor import TextExtractor
|
from text_extractor import TextExtractor
|
||||||
import os
|
import os, sys
|
||||||
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||||
|
from loggings.logging_config import logger
|
||||||
|
import random
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
import math
|
import math
|
||||||
import json
|
import json
|
||||||
@@ -29,6 +33,7 @@ import ffmpeg
|
|||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
# OpenAI API Key
|
# OpenAI API Key
|
||||||
api_key = os.getenv('OPENAI_API_KEY')
|
api_key = os.getenv('OPENAI_API_KEY')
|
||||||
# setting up groq api key
|
# setting up groq api key
|
||||||
@@ -53,11 +58,14 @@ def load_embedding_model():
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
# loading the embedding model
|
# loading the embedding model
|
||||||
|
logger.info("Loading the embedding model")
|
||||||
embeddings = load_embedding_model()
|
embeddings = load_embedding_model()
|
||||||
|
logger.info("Embedding model loaded")
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------TEXT PREPROCESSING--------------------------------------------
|
# --------------------------------------------------------TEXT PREPROCESSING--------------------------------------------
|
||||||
def create_documents(doc, file_type='text'):
|
def create_documents(doc, file_type='text'):
|
||||||
|
logger.info(f"Creating documents from text")
|
||||||
text = doc[0].page_content
|
text = doc[0].page_content
|
||||||
metadata = doc[0].metadata
|
metadata = doc[0].metadata
|
||||||
text_splitter = RecursiveCharacterTextSplitter(
|
text_splitter = RecursiveCharacterTextSplitter(
|
||||||
@@ -80,6 +88,7 @@ def create_documents(doc, file_type='text'):
|
|||||||
|
|
||||||
|
|
||||||
def load_txt_document(document_path):
|
def load_txt_document(document_path):
|
||||||
|
logger.info(f"Loading text document from {document_path}")
|
||||||
try:
|
try:
|
||||||
txt_doc = TextLoader(document_path)
|
txt_doc = TextLoader(document_path)
|
||||||
text = txt_doc.load()
|
text = txt_doc.load()
|
||||||
@@ -91,6 +100,7 @@ def load_txt_document(document_path):
|
|||||||
|
|
||||||
|
|
||||||
def load_docx_document(document_path):
|
def load_docx_document(document_path):
|
||||||
|
logger.info(f"Loading docx document from {document_path}")
|
||||||
try:
|
try:
|
||||||
docx_doc = Docx2txtLoader(document_path)
|
docx_doc = Docx2txtLoader(document_path)
|
||||||
text = docx_doc.load()
|
text = docx_doc.load()
|
||||||
@@ -103,6 +113,7 @@ def load_docx_document(document_path):
|
|||||||
|
|
||||||
# creating a function that checks the document type and loads the document
|
# creating a function that checks the document type and loads the document
|
||||||
def load_pdf_document(document_path):
|
def load_pdf_document(document_path):
|
||||||
|
logger.info(f"Loading pdf document from {document_path}")
|
||||||
try:
|
try:
|
||||||
pdf_doc = PyPDFLoader(document_path)
|
pdf_doc = PyPDFLoader(document_path)
|
||||||
pages = pdf_doc.load_and_split()
|
pages = pdf_doc.load_and_split()
|
||||||
@@ -125,11 +136,13 @@ def load_document(document_path):
|
|||||||
# ----------------------------------------------------IMAGE PROCESSING------------------------------------------------
|
# ----------------------------------------------------IMAGE PROCESSING------------------------------------------------
|
||||||
# Function to encode the image
|
# Function to encode the image
|
||||||
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the image file to read in binary mode.

    Returns:
        The base64 encoding of the file's contents, decoded to a UTF-8 ``str``.
    """
    logger.info(f"Encoding image {image_path}")
    # Read the raw bytes first, then encode once the file is closed.
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode('utf-8')
|
||||||
|
|
||||||
# Vision API to process the image
|
# Vision API to process the image
|
||||||
def process_image(image_path):
|
def process_image(image_path):
|
||||||
|
logger.info(f"Processing image {image_path}")
|
||||||
global api_key
|
global api_key
|
||||||
|
|
||||||
# Getting the base64 string
|
# Getting the base64 string
|
||||||
@@ -174,10 +187,11 @@ def process_image(image_path):
|
|||||||
|
|
||||||
# create image document
|
# create image document
|
||||||
def create_image_document(image_path, file_type='image'):
|
def create_image_document(image_path, file_type='image'):
|
||||||
|
logger.info(f"Creating image document from {image_path}")
|
||||||
# getting the image name from the image path
|
# getting the image name from the image path
|
||||||
image_name = image_path.split('/')[-1].split('.')[0]
|
image_name = image_path.split('\\')[-1].split('.')[0]
|
||||||
# setting image name as metadata
|
# setting image name as metadata
|
||||||
metadata = {'filename': image_name, 'file_type': file_type}
|
metadata = {'source': image_name, 'file_type': file_type}
|
||||||
text_extractor = TextExtractor()
|
text_extractor = TextExtractor()
|
||||||
text = text_extractor.read_text_from_image(image_path)
|
text = text_extractor.read_text_from_image(image_path)
|
||||||
# removing special characters and line breaks
|
# removing special characters and line breaks
|
||||||
@@ -199,6 +213,7 @@ def create_image_document(image_path, file_type='image'):
|
|||||||
# -----------------------------------------------AUDIO PROCESSING-----------------------------------------------------
|
# -----------------------------------------------AUDIO PROCESSING-----------------------------------------------------
|
||||||
# Audio to Text
|
# Audio to Text
|
||||||
def audio_to_text(filepath):
|
def audio_to_text(filepath):
|
||||||
|
logger.info(f"Transcribing audio file {filepath}")
|
||||||
with open(filepath, "rb") as file:
|
with open(filepath, "rb") as file:
|
||||||
translation = client.audio.translations.create(
|
translation = client.audio.translations.create(
|
||||||
file=(filepath, file.read()),
|
file=(filepath, file.read()),
|
||||||
@@ -208,6 +223,7 @@ def audio_to_text(filepath):
|
|||||||
|
|
||||||
|
|
||||||
def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_output=True):
|
def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_output=True):
|
||||||
|
logger.info(f"Splitting audio file {audio_file_path} by duration")
|
||||||
# Convert chunk duration to milliseconds
|
# Convert chunk duration to milliseconds
|
||||||
chunk_length_ms = chunk_duration_minutes * 60 * 1000
|
chunk_length_ms = chunk_duration_minutes * 60 * 1000
|
||||||
|
|
||||||
@@ -247,6 +263,7 @@ def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_outpu
|
|||||||
return chunk_folder, chunk_paths
|
return chunk_folder, chunk_paths
|
||||||
|
|
||||||
def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='audio'):
|
def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='audio'):
|
||||||
|
logger.info(f"Transcribing audio chunks from {audio_file_path}")
|
||||||
# Split the audio file into chunks
|
# Split the audio file into chunks
|
||||||
chunk_folder, chunk_paths = split_audio_by_duration(audio_file_path, chunk_duration_minutes)
|
chunk_folder, chunk_paths = split_audio_by_duration(audio_file_path, chunk_duration_minutes)
|
||||||
|
|
||||||
@@ -270,11 +287,25 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='
|
|||||||
start_min = (chunk_index - 1) * chunk_duration_minutes
|
start_min = (chunk_index - 1) * chunk_duration_minutes
|
||||||
end_min = chunk_index * chunk_duration_minutes
|
end_min = chunk_index * chunk_duration_minutes
|
||||||
actual_end_min = min(end_min, (len(AudioSegment.from_file(audio_file_path)) // 60000)) # To handle the last chunk's actual duration
|
actual_end_min = min(end_min, (len(AudioSegment.from_file(audio_file_path)) // 60000)) # To handle the last chunk's actual duration
|
||||||
|
|
||||||
|
# preparing the start and end minutes in a timestamp format, also catching decimal (fractional-minute) values and converting them to min:sec
|
||||||
|
if start_min % 1 == 0:
|
||||||
|
start_min = f"{int(start_min)}:00"
|
||||||
|
end_min = f"{int(end_min)}:00"
|
||||||
|
else:
|
||||||
|
# splitting the decimal part of the start and end min
|
||||||
|
start_min_int, start_min_dec = str(start_min).split('.')
|
||||||
|
end_min_int, end_min_dec = str(end_min).split('.')
|
||||||
|
# converting the decimal part to seconds
|
||||||
|
start_sec = int(start_min_dec) * 6
|
||||||
|
end_sec = int(end_min_dec) * 6
|
||||||
|
start_min = f"{start_min_int}:{start_sec}"
|
||||||
|
end_min = f"{end_min_int}:{end_sec}"
|
||||||
|
|
||||||
# Create a document with the transcript and metadata
|
# Create a document with the transcript and metadata
|
||||||
metadata = {
|
metadata = {
|
||||||
"filename": base_filename,
|
"source": base_filename,
|
||||||
"duration": f"{start_min}-{end_min} minutes",
|
"timestamp": f"{start_min}-{end_min}",
|
||||||
"file_type": file_type,
|
"file_type": file_type,
|
||||||
}
|
}
|
||||||
document = Document(page_content=transcript, metadata=metadata)
|
document = Document(page_content=transcript, metadata=metadata)
|
||||||
@@ -282,6 +313,9 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='
|
|||||||
|
|
||||||
# Delete the chunk folder after processing
|
# Delete the chunk folder after processing
|
||||||
shutil.rmtree(chunk_folder)
|
shutil.rmtree(chunk_folder)
|
||||||
|
|
||||||
|
# adding a delay
|
||||||
|
time.sleep(0.2)
|
||||||
|
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
@@ -294,7 +328,7 @@ def create_audio_document(audio_file_path, chunk_duration_minutes=3, file_type='
|
|||||||
|
|
||||||
# ------------------------------------------------VIDEO PROCESSING-----------------------------------------------------
|
# ------------------------------------------------VIDEO PROCESSING-----------------------------------------------------
|
||||||
def preprocess_video_data(video_path: str, time_interval: int):
|
def preprocess_video_data(video_path: str, time_interval: int):
|
||||||
|
logger.info(f"Preprocessing video data from {video_path}")
|
||||||
# Load the video file
|
# Load the video file
|
||||||
video = VideoFileClip(video_path)
|
video = VideoFileClip(video_path)
|
||||||
|
|
||||||
@@ -341,6 +375,7 @@ def preprocess_video_data(video_path: str, time_interval: int):
|
|||||||
|
|
||||||
# now creating document from the audio file
|
# now creating document from the audio file
|
||||||
documents = create_audio_document(audio_path, chunk_duration_minutes=0.5, file_type='video')
|
documents = create_audio_document(audio_path, chunk_duration_minutes=0.5, file_type='video')
|
||||||
|
logger.info(f"Documents created from video {video_path}")
|
||||||
|
|
||||||
# deleting the audio file
|
# deleting the audio file
|
||||||
os.remove(audio_path)
|
os.remove(audio_path)
|
||||||
@@ -349,6 +384,7 @@ def preprocess_video_data(video_path: str, time_interval: int):
|
|||||||
|
|
||||||
#----------------------------------------------------DOC SUMMARIZER --------------------------------------------------
|
#----------------------------------------------------DOC SUMMARIZER --------------------------------------------------
|
||||||
def doc_summarizer(document_page: list) -> str:
|
def doc_summarizer(document_page: list) -> str:
|
||||||
|
logger.info(f"Summarizing document")
|
||||||
initiator_prompt = PromptTemplate(
|
initiator_prompt = PromptTemplate(
|
||||||
template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
||||||
Create a short summary of the document based on the provided text.
|
Create a short summary of the document based on the provided text.
|
||||||
@@ -370,12 +406,15 @@ def doc_summarizer(document_page: list) -> str:
|
|||||||
#-----------------------------------------------------OTHERS--------------------------------------------------------------
|
#-----------------------------------------------------OTHERS--------------------------------------------------------------
|
||||||
|
|
||||||
def save_embedded_data(embeddings, key="data"):
    """Persist a vector store to ``index/faiss_index_<key>``.

    Args:
        embeddings: Object exposing ``save_local(path)``; despite the name,
            callers in this file pass the vector store itself.
        key: Suffix used to build the index folder name.

    Returns:
        The literal string ``'saved'`` once ``save_local`` has returned.
    """
    logger.info(f"Saving embeddings")
    index_dir = f"index/faiss_index_{key}"
    embeddings.save_local(index_dir)
    print("Embeddings saved")
    return 'saved'
|
||||||
|
|
||||||
def load_embedded_data(embeddings=embeddings, key="data"):
    """Load the FAISS vector store saved under ``index/faiss_index_<key>``.

    Args:
        embeddings: Embedding model handed to ``FAISS.load_local``; defaults
            to the module-level ``embeddings`` object bound at definition time.
        key: Suffix of the index folder to load.

    Returns:
        The vector store returned by ``FAISS.load_local``.
    """
    logger.info(f"Loading embedded data")
    index_dir = f"index/faiss_index_{key}"
    # NOTE(review): allow_dangerous_deserialization=True opts in to
    # pickle-based loading; only point this at index folders this
    # application wrote itself.
    return FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)
|
||||||
|
|
||||||
#-----------------------------------------------------Data Loading Process----------------------------------------------------
|
#-----------------------------------------------------Data Loading Process----------------------------------------------------
|
||||||
|
|
||||||
@@ -396,15 +435,15 @@ def process_document(path, extension, text_doc, image_doc, audio_doc, video_doc)
|
|||||||
elif extension in image_doc:
|
elif extension in image_doc:
|
||||||
doc = process_map["image"](path)
|
doc = process_map["image"](path)
|
||||||
num_pages = 1
|
num_pages = 1
|
||||||
doc_name = doc[0].metadata['filename']
|
doc_name = doc[0].metadata['source'].split('\\')[-1]
|
||||||
elif extension in audio_doc:
|
elif extension in audio_doc:
|
||||||
doc = process_map["audio"](path)
|
doc = process_map["audio"](path)
|
||||||
num_pages = len(doc)
|
num_pages = len(doc)
|
||||||
doc_name = doc[0].metadata['filename']
|
doc_name = doc[0].metadata['source']
|
||||||
elif extension in video_doc:
|
elif extension in video_doc:
|
||||||
doc = process_map["video"](path, time_interval=30)
|
doc = process_map["video"](path, time_interval=30)
|
||||||
num_pages = len(doc)
|
num_pages = len(doc)
|
||||||
doc_name = doc[0].metadata['filename']
|
doc_name = doc[0].metadata['source']
|
||||||
else:
|
else:
|
||||||
return None, None, None # Unhandled extension
|
return None, None, None # Unhandled extension
|
||||||
|
|
||||||
@@ -425,7 +464,7 @@ def load_documents_from_directory(directory_path: str):
|
|||||||
|
|
||||||
def process_with_delay(file):
|
def process_with_delay(file):
|
||||||
result = process_document(os.path.join(directory_path, file), file.split('.')[-1], text_doc, image_doc, audio_doc, video_doc)
|
result = process_document(os.path.join(directory_path, file), file.split('.')[-1], text_doc, image_doc, audio_doc, video_doc)
|
||||||
time.sleep(0.1) # Introduce a 0.1s delay between processing each document
|
time.sleep(0.4) # Introduce a 0.4s delay between processing each document
|
||||||
return result
|
return result
|
||||||
|
|
||||||
with ThreadPoolExecutor() as executor:
|
with ThreadPoolExecutor() as executor:
|
||||||
@@ -441,27 +480,31 @@ def load_documents_from_directory(directory_path: str):
|
|||||||
first_page = doc[0].page_content
|
first_page = doc[0].page_content
|
||||||
summary = doc_summarizer(first_page)
|
summary = doc_summarizer(first_page)
|
||||||
doc_summary.append(summary)
|
doc_summary.append(summary)
|
||||||
|
# adding some delay
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
docs_id = [uuid4().hex for _ in range(len(documents))]
|
docs_id = [uuid4().hex for _ in range(len(documents))]
|
||||||
|
|
||||||
json_file = os.path.join(directory_path, 'data.json')
|
json_file = os.path.join(directory_path, 'data.json')
|
||||||
data = {'doc_names': doc_names, 'docs_id': docs_id, 'num_pages': num_pages, 'doc_summaary': doc_summary}
|
# creating a dictionary for each document in the json file
|
||||||
|
for i in range(len(documents)):
|
||||||
if os.path.exists(json_file):
|
data = {doc_names[i].split("\\")[-1]: {'doc_id':docs_id[i], 'num_pages': num_pages[i], 'doc_summary': doc_summary[i]}}
|
||||||
with open(json_file, 'r+') as f:
|
if os.path.exists(json_file):
|
||||||
existing_data = json.load(f)
|
with open(json_file, 'r+') as f:
|
||||||
existing_data.update(data)
|
existing_data = json.load(f)
|
||||||
f.seek(0)
|
existing_data.update(data)
|
||||||
json.dump(existing_data, f)
|
f.seek(0)
|
||||||
else:
|
json.dump(existing_data, f)
|
||||||
with open(json_file, 'w') as f:
|
else:
|
||||||
json.dump(data, f)
|
with open(json_file, 'w') as f:
|
||||||
|
json.dump(data, f)
|
||||||
|
|
||||||
return documents, docs_id, num_pages
|
return documents, docs_id, num_pages
|
||||||
|
|
||||||
|
|
||||||
# A function to create vector store
|
# A function to create vector store
|
||||||
def create_vector_store(documents: list, docs_id: list, num_pages: list):
|
def create_vector_store(documents: list, docs_id: list, num_pages: list):
|
||||||
|
logger.info(f"Creating vector store")
|
||||||
# index set up with the embedding dimension
|
# index set up with the embedding dimension
|
||||||
index = faiss.IndexFlatL2(384)
|
index = faiss.IndexFlatL2(384)
|
||||||
# Initialize the FAISS vector store
|
# Initialize the FAISS vector store
|
||||||
@@ -476,10 +519,11 @@ def create_vector_store(documents: list, docs_id: list, num_pages: list):
|
|||||||
doc_id = docs_id[i]
|
doc_id = docs_id[i]
|
||||||
page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
|
page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
|
||||||
vector_store.add_documents(documents=documents[i], ids=page_ids)
|
vector_store.add_documents(documents=documents[i], ids=page_ids)
|
||||||
|
logger.info(f"Vector store created")
|
||||||
|
logger.info(f"Saving the vector store")
|
||||||
# saving the vector store automatically
|
# saving the vector store automatically
|
||||||
save_embedded_data(vector_store, key="data")
|
save_embedded_data(vector_store, key="data")
|
||||||
|
logger.info(f"Vector store saved")
|
||||||
return vector_store
|
return vector_store
|
||||||
|
|
||||||
# creating a function to add documents to the vector store
|
# creating a function to add documents to the vector store
|
||||||
@@ -491,14 +535,70 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
|
|||||||
page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
|
page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
|
||||||
vector_store.add_documents(documents=documents[i], ids=page_ids)
|
vector_store.add_documents(documents=documents[i], ids=page_ids)
|
||||||
print ("Documents added to the vector store")
|
print ("Documents added to the vector store")
|
||||||
|
|
||||||
|
|
||||||
|
#----------------------------------------------------------Thumbnail Generator-----------------------------------------------------
|
||||||
|
def create_text_thumbnail(file_path):
    """Create a PNG thumbnail showing the file's base name on a random colour.

    The image is written into a ``thumbnails`` folder located next to
    ``file_path`` (the folder is created when missing) and is named after the
    file without its extension.

    Args:
        file_path: Path of the file to create a thumbnail for. Only the path
            string is used to derive names; the file itself is not opened.

    Returns:
        The path of the PNG file that was written.
    """
    logger.info(f"Creating thumbnail for {file_path}")

    # Create a folder for thumbnails if it doesn't exist
    thumbnail_folder = os.path.join(os.path.dirname(file_path), 'thumbnails')
    os.makedirs(thumbnail_folder, exist_ok=True)

    # Extract file name (without extension)
    file_name = os.path.splitext(os.path.basename(file_path))[0]

    # Create an image with a random background color
    background_color = tuple(random.randint(0, 255) for _ in range(3))
    img = Image.new('RGB', (800, 400), color=background_color)

    # Initialize drawing context
    d = ImageDraw.Draw(img)

    # Load a font, falling back to Pillow's built-in one when Arial is missing
    try:
        font = ImageFont.truetype("arial.ttf", 25)  # Adjust the font size as needed
    except IOError:
        font = ImageFont.load_default()

    # Get the bounding box of the text when drawn at the origin
    left, top, right, bottom = d.textbbox((0, 0), file_name, font=font)
    text_width = right - left
    text_height = bottom - top

    # The bounding box normally does not start at the drawing origin (there is
    # a gap above/left of the glyphs), so shift by its offsets; otherwise the
    # text ends up below and to the right of the true centre.
    text_x = (img.width - text_width) / 2 - left
    text_y = (img.height - text_height) / 2 - top

    # White text disappears on light random backgrounds, so switch to black
    # when the perceived brightness of the background is high.
    red, green, blue = background_color
    brightness = 0.299 * red + 0.587 * green + 0.114 * blue
    text_color = (0, 0, 0) if brightness > 186 else (255, 255, 255)

    # Draw the text onto the image
    d.text((text_x, text_y), file_name, font=font, fill=text_color)

    # Save the image
    thumbnail_path = os.path.join(thumbnail_folder, f"{file_name}.png")
    img.save(thumbnail_path)

    # Report through the module logger, like the start-of-work message above
    logger.info(f"Thumbnail created: {thumbnail_path}")
    return thumbnail_path
|
||||||
|
|
||||||
|
def process_directory(directory_path):
    """Create a thumbnail for every supported file directly in a directory.

    Entries that are not regular files, and files whose lower-cased extension
    is not one of ``.txt``, ``.pdf``, ``.docx``, ``.mp3`` or ``.m4a``, are
    skipped. Sub-directories are not descended into.

    Args:
        directory_path: Directory whose entries are examined.

    Returns:
        The string ``"Done"`` once every entry has been examined.
    """
    supported_extensions = ['.txt', '.pdf', '.docx', '.mp3', '.m4a']

    for entry_name in os.listdir(directory_path):
        candidate_path = os.path.join(directory_path, entry_name)

        # Only regular files get a thumbnail
        if not os.path.isfile(candidate_path):
            continue

        # Compare extensions case-insensitively
        _, suffix = os.path.splitext(entry_name)
        if suffix.lower() not in supported_extensions:
            continue

        create_text_thumbnail(candidate_path)

    return "Done"
|
||||||
|
|
||||||
|
#-----------------------------------------------------------SEARCH-------------------------------------------------------
|
||||||
# A document search function
|
# A document search function
|
||||||
def search(query, k=20):
|
def search(query, k=20):
|
||||||
|
logger.info(f"Searching for {query}")
|
||||||
# loading the embedded data
|
# loading the embedded data
|
||||||
embed_db = load_embedded_data()
|
embed_db = load_embedded_data()
|
||||||
db = embed_db
|
db = embed_db
|
||||||
docs = db.similarity_search(query, k)
|
docs = db.similarity_search(query, k)
|
||||||
|
logger.info(f"Search completed")
|
||||||
all = []
|
all = []
|
||||||
info = []
|
info = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
|
|||||||
Reference in New Issue
Block a user