AI indexing completed

This commit is contained in:
timothyafolami
2024-08-16 17:37:28 +01:00
parent 713354371e
commit cff9511d86
13 changed files with 2843 additions and 257 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
+14 -6
View File
@@ -1,19 +1,18 @@
import sys, os import sys, os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from utils import create_vector_store, save_embedded_data, load_documents_from_directory, load_embedding_model from utils import create_vector_store, save_embedded_data, load_documents_from_directory, process_directory
from loggings.logging_config import logger from loggings.logging_config import logger
import time
# This module will load in the data, you only need to add the data path to it. # This module will load in the data, you only need to add the data path to it.
data_path = './data' data_path = './data'
# # loading the embeddings
# logger.info(f"Loading the embeddings")
# embeddings = load_embedding_model()
# logger.info(f"Embeddings loaded")
def load_data(data_path: str): def load_data(data_path: str):
logger.info(f"Loading data from {data_path}") logger.info(f"Loading data from {data_path}")
start_time = time.time()
# logging the start time
logger.info(f"Start time: {start_time}")
documents, docs_id, num_pages = load_documents_from_directory(data_path) documents, docs_id, num_pages = load_documents_from_directory(data_path)
logger.info(f"Data loaded") logger.info(f"Data loaded")
logger.info(f"Creating vector store") logger.info(f"Creating vector store")
@@ -23,8 +22,17 @@ def load_data(data_path: str):
# saving the embedded data # saving the embedded data
save_embedded_data(embed_db) save_embedded_data(embed_db)
logger.info(f"Vector store saved") logger.info(f"Vector store saved")
end_time = time.time()
logger.info(f"End time: {end_time}")
time_taken = end_time - start_time
logger.info(f"Time taken: {time_taken}")
print("Vector store created and saved") print("Vector store created and saved")
# creating the thumbnails
logger.info(f"Creating thumbnails")
status = process_directory(data_path)
print(f"{status}: Thumbnails created.")
logger.info(f"Thumbnails created")
return embed_db return embed_db
Binary file not shown.
Binary file not shown.
+2147
View File
File diff suppressed because it is too large Load Diff
+10 -12
View File
@@ -4,16 +4,12 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel from pydantic import BaseModel
from utils import search, load_embedded_data from utils import load_embedded_data, load_documents_from_directory, create_vector_store, save_embedded_data
from search import search_and_summarize
from data_ingest import load_data from data_ingest import load_data
app = FastAPI() app = FastAPI()
# Initialize global variables for FAISS index and vector store
try:
vector_store = load_embedded_data()
except Exception as e:
vector_store = None
# Define allowed origins for CORS # Define allowed origins for CORS
origins = [ origins = [
@@ -37,19 +33,21 @@ class SearchRequest(BaseModel):
@app.get("/load_documents") @app.get("/load_documents")
def load_documents(directory: str): def load_documents(directory: str):
global vector_store
# Load documents using the utility function # loading the documents from the directory
vector_store = load_data(directory) documents, docs_id, num_pages = load_documents_from_directory(directory)
# embedding the documents
embed_db = create_vector_store(documents, docs_id, num_pages)
# saving the embedded data
status = save_embedded_data(embed_db)
return {"status": "Documents loaded successfully"} return {"status": "Documents loaded successfully"}
@app.get("/search") @app.post("/search")
def search(request: SearchRequest): def search(request: SearchRequest):
global vector_store
# Perform search using the utility function # Perform search using the utility function
results = search(vector_store, request.query) results = search_and_summarize(request.query)
return {"results": results} return {"results": results}
+2
View File
@@ -13,6 +13,8 @@ docx2txt
docx docx
fastapi[standard] fastapi[standard]
pdfplumber pdfplumber
pypdf
python-docx
pytesseract pytesseract
groq groq
python-dotenv python-dotenv
+84 -7
View File
@@ -1,21 +1,98 @@
from utils import search from utils import search
import sys, os import sys, os
import json
# Add the root directory to sys.path # Add the root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from loggings.logging_config import logger from loggings.logging_config import logger
def get_data_description(data_path):
    """Return the stored summary for a document, or a fallback message.

    The document name is derived from ``data_path`` by dropping any directory
    prefix (forward or backward slashes) and everything from the first dot
    onwards. That name is looked up in ``data/data.json``, resolved against
    the current working directory.

    Args:
        data_path: File path or bare file name of the document.

    Returns:
        The entry's ``doc_summary`` value, or ``'No description available'``
        when the index file does not exist, the document is not listed, or
        its entry carries no ``doc_summary``.
    """
    fallback = 'No description available'

    # NOTE(review): the name is cut at the FIRST dot, so "a.b.pdf" is looked
    # up as "a"; presumably the index keys are built the same way - confirm.
    data_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    try:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open('data/data.json', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # Nothing has been indexed yet: a missing description is not fatal.
        return fallback

    entry = data.get(data_name)
    if not isinstance(entry, dict):
        return fallback
    return entry.get('doc_summary', fallback)
def _clock_to_seconds(clock):
    """Convert ``m:s`` (or ``h:m:s``) clock text into whole seconds."""
    seconds = 0
    for part in clock.split(':'):
        seconds = seconds * 60 + int(part)
    return seconds


def get_data_thumbnail(data_path, timestamp = None):
    """Return the path of a thumbnail image for a document, if one exists.

    The document name is derived from ``data_path`` by dropping any directory
    prefix (forward or backward slashes) and everything from the first dot
    onwards. Two locations are checked, relative to the current working
    directory:

    1. ``data/thumbnails/<name>.png``.
    2. ``data/<name>/<start>-<end>s.png``, where ``<start>`` and ``<end>``
       are the bounds of the FIRST entry of ``timestamp`` in seconds.

    Args:
        data_path: File path or bare file name of the document.
        timestamp: Optional list of ``"start-end"`` strings whose bounds are
            clock values such as ``"3:30"``; only the first entry is used.

    Returns:
        The thumbnail path as a string, or ``None`` when no image is found
        or the first timestamp cannot be parsed.
    """
    file_name = data_path.split('/')[-1].split('\\')[-1].split('.')[0]

    static_thumbnail = f'data/thumbnails/{file_name}.png'
    if os.path.exists(static_thumbnail):
        return static_thumbnail

    # Frame images are only looked up inside a folder named after the document.
    if not timestamp or not os.path.exists(f'data/{file_name}'):
        return None

    try:
        start_text, end_text = timestamp[0].split('-')
        start = _clock_to_seconds(start_text)
        end = _clock_to_seconds(end_text)
    except (ValueError, AttributeError):
        # A thumbnail is optional: a malformed timestamp must not abort
        # the whole search response.
        return None

    frame_image = f'data/{file_name}/{start}-{end}s.png'
    if os.path.exists(frame_image):
        return frame_image
    return None
def summarize_doc_search(data):
    """Group raw search hits by source and attach description and thumbnail.

    Args:
        data: Iterable of hit dictionaries. Each must contain ``'source'``
            and may contain ``'page'``, ``'timestamp'`` and ``'file_type'``.

    Returns:
        A list with one dictionary per distinct source, in first-seen order,
        holding ``'filename'``, ``'pages'``, ``'timestamps'``,
        ``'file_type'``, ``'description'`` and ``'thumbnail'``.

    Raises:
        KeyError: If a hit has no ``'source'`` key.
    """
    summary = {}
    for item in data:
        source = item['source']
        if source not in summary:
            # The file type is taken from the first hit seen for a source;
            # hits without one are treated as 'pdf'.
            summary[source] = {
                'pages': [],
                'timestamps': [],
                'file_type': item.get('file_type', 'pdf'),
            }
        grouped = summary[source]
        if 'page' in item:
            grouped['pages'].append(item['page'])
        if 'timestamp' in item:
            grouped['timestamps'].append(item['timestamp'])

    summarized_list = []
    for source, grouped in summary.items():
        # Strip the directory prefix for both separator styles, matching how
        # get_data_description / get_data_thumbnail derive the name.
        filename = source.split('/')[-1].split('\\')[-1]
        timestamps = grouped['timestamps']
        summarized_list.append({
            'filename': filename,
            'pages': grouped['pages'],
            'timestamps': timestamps,
            'file_type': grouped['file_type'],
            'description': get_data_description(filename),
            # An empty list is passed on as "no timestamp".
            'thumbnail': get_data_thumbnail(filename, timestamps or None),
        })
    return summarized_list
def search_and_summarize(query):
    """Run a search for ``query`` and return the per-source summary.

    Passes ``query`` to ``search`` and the resulting hits to
    ``summarize_doc_search``, logging before and after each step.
    """
    logger.info("Searching for the query")
    hits = search(query)
    logger.info("Search completed")

    logger.info("Summarizing search results")
    grouped_results = summarize_doc_search(hits)
    logger.info("Search results summarized")

    return grouped_results
if __name__ == "__main__": if __name__ == "__main__":
logger.info("Receiving the search query") logger.info("Receiving the search query")
query = input("Enter the search query: ") query = input("Enter the search query: ")
logger.info(f"Searching for {query}") logger.info(f"Search query received: {query}")
page_content, all, pages = search(query) logger.info("Searching and summarizing the search results")
logger.info("Search completed") search_results = search_and_summarize(query)
logger.info(f"Page content: {page_content}") logger.info("Search results summarized")
print(f"Page content: {all}") print(search_results)
print(f"Pages: {pages}")
print("Search completed")
+307 -198
View File
@@ -11,16 +11,29 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-08-16 16:06:32,880 - INFO - Loading the embedding model\n",
"2024-08-16 16:06:38,758 - WARNING - c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" from tqdm.autonotebook import tqdm, trange\n",
"\n",
"2024-08-16 16:06:47,268 - INFO - PyTorch version 2.4.0+cu124 available.\n",
"2024-08-16 16:06:47,868 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en\n",
"2024-08-16 16:06:55,638 - INFO - Embedding model loaded\n"
]
},
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"True" "True"
] ]
}, },
"execution_count": 2, "execution_count": 1,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@@ -32,17 +45,51 @@
"from langchain_groq import ChatGroq\n", "from langchain_groq import ChatGroq\n",
"from langchain_core.prompts.prompt import PromptTemplate\n", "from langchain_core.prompts.prompt import PromptTemplate\n",
"from langchain_core.output_parsers import StrOutputParser\n", "from langchain_core.output_parsers import StrOutputParser\n",
"from collections import defaultdict\n",
"import json\n",
"load_dotenv()" "load_dotenv()"
] ]
}, },
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# setting up groq api key\n",
"# os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# setting up groq api key\n", "\n",
"os.environ[\"GROQ_API_KEY\"] = os.getenv('GROQ_API_KEY')" "# # chat set up\n",
"# GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n",
"\n",
"\n",
"# ### Chains #####\n",
"# # Initiator\n",
"# def doc_summarizer(document_page: list) -> str:\n",
"# initiator_prompt = PromptTemplate(\n",
"# template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
"# Create a short summary of the document based on the provided text. \n",
" \n",
"# Start with: This document is about...\n",
" \n",
"# <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
"# DOCUMENT: {document_page} \\n\n",
" \n",
"# <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
"# input_variables=[\"document_page\"],\n",
"# )\n",
"\n",
"# initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
"# output = initiator_router.invoke({\"document_page\":document_page})\n",
"# return output\n"
] ]
}, },
{ {
@@ -51,119 +98,84 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"\n", "document_page = 'Wirebrush WD 40'\n",
"# chat set up\n", "# testing the function\n",
"GROQ_LLM = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\", max_tokens=100)\n", "# summary = doc_summarizer(document_page)"
"\n",
"\n",
"### Chains #####\n",
"# Initiator\n",
"def doc_summarizer(document_page: list) -> str:\n",
" initiator_prompt = PromptTemplate(\n",
" template=\"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
" Create a short summary of the document based on the provided text. \n",
" \n",
" Start with: This document is about...\n",
" \n",
" <|eot_id|><|start_header_id|>user<|end_header_id|>\n",
" DOCUMENT: {document_page} \\n\n",
" \n",
" <|eot_id|><|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
" input_variables=[\"document_page\"],\n",
" )\n",
"\n",
" initiator_router = initiator_prompt | GROQ_LLM | StrOutputParser()\n",
" output = initiator_router.invoke({\"document_page\":document_page})\n",
" return output\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [],
"source": [
"document_page = 'How to change the engine oil of a toyota corrolla.'\n",
"# testing the function\n",
"summary = doc_summarizer(document_page)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "name": "stderr",
"text/plain": [ "output_type": "stream",
"'This document is about providing a step-by-step guide on how to change the engine oil of a Toyota Corolla.'" "text": [
"2024-08-16 16:06:55,717 - INFO - Searching for Wirebrush WD 40\n",
"2024-08-16 16:06:55,717 - INFO - Loading embedded data\n",
"2024-08-16 16:06:56,487 - WARNING - c:\\Users\\timmy_3aupohg\\anaconda3\\envs\\smog_env\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:439: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
" attn_output = torch.nn.functional.scaled_dot_product_attention(\n",
"\n",
"2024-08-16 16:06:56,628 - INFO - Search completed\n"
] ]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
} }
], ],
"source": [
"summary"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [ "source": [
"docs = search(document_page)" "docs = search(document_page)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[{'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", "[{'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 424},\n",
" 'page': 1,\n", " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" 'file_type': 'text'},\n", " 'timestamp': '0:30-1:0',\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 438},\n", " 'file_type': 'video'},\n",
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" 'page': 3,\n", " 'timestamp': '2:00-2:00',\n",
" 'file_type': 'text'},\n", " 'file_type': 'video'},\n",
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" 'page': 2,\n", " 'timestamp': '2:30-3:0',\n",
" 'file_type': 'text'},\n", " 'file_type': 'video'},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 525},\n", " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", " 'timestamp': '4:00-4:00',\n",
" 'page': 2,\n", " 'file_type': 'video'},\n",
" 'file_type': 'text'},\n", " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", " 'timestamp': '5:30-6:0',\n",
" 'page': 3,\n", " 'file_type': 'video'},\n",
" 'file_type': 'text'},\n", " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" {'source': './data\\\\How to change engine oil and filter on TOYOTA Corolla.txt',\n", " 'timestamp': '8:00-8:00',\n",
" 'page': 0,\n", " 'file_type': 'video'},\n",
" 'file_type': 'text'},\n", " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", " 'timestamp': '8:30-9:0',\n",
" 'page': 5,\n", " 'file_type': 'video'},\n",
" 'file_type': 'text'},\n", " {'source': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" {'source': './data\\\\How to change spark plugs on TOYOTA COROLLA.docx',\n", " 'timestamp': '10:00-10:00',\n",
" 'page': 6,\n", " 'file_type': 'video'},\n",
" 'file_type': 'text'},\n", " {'source': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 526},\n", " 'timestamp': '3:30-4:0',\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422},\n", " 'file_type': 'video'},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 514},\n", " {'source': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 153},\n", " 'timestamp': '5:30-6:0',\n",
" {'filename': 'audio-2', 'duration': '0-3 minutes', 'file_type': 'audio'},\n", " 'file_type': 'video'},\n",
" {'filename': 'audio-2', 'duration': '3-6 minutes', 'file_type': 'audio'},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 329},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 149},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 264},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 513},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 290},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 436},\n", " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 201},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 148}]" " {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 326},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 315},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 317},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 325},\n",
" {'source': './data\\\\corolla-2020-toyota-owners-manual.pdf', 'page': 422}]"
] ]
}, },
"execution_count": 8, "execution_count": 6,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@@ -174,132 +186,229 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 20,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from collections import defaultdict\n", "# a function to get data description\n",
"\n", "def get_data_description(data_path):\n",
"def transform_file_data(input_data):\n", " # ensuring no // or / or extension is present\n",
" # Create a dictionary to aggregate data by filename\n", " data_name = data_path.split('/')[-1].split('\\\\')[-1].split('.')[0]\n",
" aggregated_data = defaultdict(lambda: {\n", " # print(data_name)\n",
" 'filename': '',\n", " # open the data.json file\n",
" 'pages': [],\n", " with open('data/data.json') as f:\n",
" 'timestamps': [],\n", " data = json.load(f)\n",
" 'description': 'lorem ipsum',\n", " existing_data = data.keys()\n",
" 'filetype': '',\n", " if data_name in existing_data:\n",
" 'thumbnail': '',\n", " return data[data_name]['doc_summary']\n",
" 'track_id': 123\n", " else:\n",
" })\n", " return 'No description available'"
"\n",
" for item in input_data:\n",
" if 'source' in item:\n",
" file_path = item['source']\n",
" filename = file_path.split('\\\\')[-1]\n",
" extension = filename.split('.')[-1]\n",
"\n",
" aggregated_data[filename]['filename'] = filename\n",
" aggregated_data[filename]['filetype'] = extension\n",
" aggregated_data[filename]['thumbnail'] = f\"{filename.split('.')[0]}.jpg\"\n",
"\n",
" if extension in ['pdf', 'txt', 'docx']:\n",
" aggregated_data[filename]['pages'].append(item['page'])\n",
" elif extension in ['mp4', 'mkv', 'flv']:\n",
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
" elif extension in ['mp3', 'wav', 'flac']:\n",
" aggregated_data[filename]['timestamps'].append(item['page'])\n",
" elif extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:\n",
" aggregated_data[filename].pop('pages', None) # Remove pages if it's an image\n",
" aggregated_data[filename].pop('timestamps', None) # Remove timestamps if it's an image\n",
"\n",
" elif 'filename' in item:\n",
" filename = item['filename']\n",
" extension = item['file_type']\n",
" aggregated_data[filename]['filename'] = f\"{filename}.{extension}\"\n",
" aggregated_data[filename]['filetype'] = extension\n",
" aggregated_data[filename]['thumbnail'] = f\"{filename}.jpg\"\n",
" if 'duration' in item:\n",
" start_time, end_time = item['duration'].split(' minutes')[0].split('-')\n",
" aggregated_data[filename]['timestamps'].append((int(start_time), int(end_time)))\n",
"\n",
" # Convert aggregated data to the desired output format\n",
" output_data = []\n",
" for filename, data in aggregated_data.items():\n",
" # Remove empty lists for pages and timestamps\n",
" if not data['pages']:\n",
" data.pop('pages', None)\n",
" if not data['timestamps']:\n",
" data.pop('timestamps', None)\n",
" output_data.append(data)\n",
"\n",
" return output_data\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt', 'pages': [1, 3, 2, 0], 'description': 'lorem ipsum', 'filetype': 'txt', 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg', 'track_id': 123}\n",
"{'filename': 'corolla-2020-toyota-owners-manual.pdf', 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148], 'description': 'lorem ipsum', 'filetype': 'pdf', 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg', 'track_id': 123}\n",
"{'filename': 'How to change spark plugs on TOYOTA COROLLA.docx', 'pages': [2, 3, 5, 6], 'description': 'lorem ipsum', 'filetype': 'docx', 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg', 'track_id': 123}\n",
"{'filename': 'audio-2.audio', 'timestamps': [(0, 3), (3, 6)], 'description': 'lorem ipsum', 'filetype': 'audio', 'thumbnail': 'audio-2.jpg', 'track_id': 123}\n"
]
}
],
"source": [
"output = transform_file_data(docs)\n",
"for item in output:\n",
" print(item)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[{'filename': 'How to change engine oil and filter on TOYOTA Corolla.txt',\n", "\"This document is about a video tutorial series on replacing car parts, specifically the latest installment of AutoDoc's video tutorials.\""
" 'pages': [1, 3, 2, 0],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'txt',\n",
" 'thumbnail': 'How to change engine oil and filter on TOYOTA Corolla.jpg',\n",
" 'track_id': 123},\n",
" {'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
" 'pages': [438, 525, 526, 422, 514, 153, 149, 513, 436, 148],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'pdf',\n",
" 'thumbnail': 'corolla-2020-toyota-owners-manual.jpg',\n",
" 'track_id': 123},\n",
" {'filename': 'How to change spark plugs on TOYOTA COROLLA.docx',\n",
" 'pages': [2, 3, 5, 6],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'docx',\n",
" 'thumbnail': 'How to change spark plugs on TOYOTA COROLLA.jpg',\n",
" 'track_id': 123},\n",
" {'filename': 'audio-2.audio',\n",
" 'timestamps': [(0, 3), (3, 6)],\n",
" 'description': 'lorem ipsum',\n",
" 'filetype': 'audio',\n",
" 'thumbnail': 'audio-2.jpg',\n",
" 'track_id': 123}]"
] ]
}, },
"execution_count": 12, "execution_count": 21,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"output" "get_data_description('How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]')"
] ]
}, },
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# getting data thumbnais. \n",
"def get_data_thumbnail(data_path, timestamp = None):\n",
" # ensuring no // or / or extension is present\n",
" file_name = data_path.split('/')[-1].split('\\\\')[-1].split('.')[0]\n",
" # first check is to see if the file_name has a .png image in the thumbnail folder\n",
" if os.path.exists(f'data/thumbnails/{file_name}.png'):\n",
" return f'data/thumbnails/{file_name}.png'\n",
" # the second check is to see if we have a folder with this file_name\n",
" elif os.path.exists(f'data/{file_name}'):\n",
" # so now we want to access the first timestamp\n",
" if timestamp:\n",
" first = timestamp[0]\n",
" # split by -\n",
" start, end = first.split('-')\n",
" # we want to convert something like 03:00, 04:00, 03:30 which is in min:sec to seconds\n",
" start = int(start.split(':')[0])*60 + int(start.split(':')[1])\n",
" end = int(end.split(':')[0])*60 + int(end.split(':')[1])\n",
" # bringing them together\n",
" image_file = f\"{start}-{end}s.png\"\n",
" # niw checkin if the file exists\n",
" if os.path.exists(f'data/{file_name}/{image_file}'):\n",
" return f'data/{file_name}/{image_file}'"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'data/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/210-240s.png'"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_data_thumbnail('How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]', timestamp=['3:30-4:0', '5:30-6:0'])"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'data/thumbnails/corolla-2020-toyota-owners-manual.png'"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_data_thumbnail(\"./data\\\\corolla-2020-toyota-owners-manual.pdf'\")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"def summarize_doc_search(data):\n",
" summary = {}\n",
"\n",
" for item in data:\n",
" source = item['source']\n",
" if source not in summary:\n",
" summary[source] = {'pages': [], 'timestamps': [], 'file_type': item.get('file_type', 'pdf')}\n",
" \n",
" if 'page' in item:\n",
" summary[source]['pages'].append(item['page'])\n",
" if 'timestamp' in item:\n",
" summary[source]['timestamps'].append(item['timestamp'])\n",
" \n",
" # Formatting the summary as a list of dictionaries\n",
" summarized_list = [\n",
" {'filename': key.split(\"\\\\\")[-1], \n",
" 'pages': value['pages'], \n",
" 'timestamps': value['timestamps'], \n",
" 'file_type': value['file_type']}\n",
" for key, value in summary.items()\n",
" ]\n",
" \n",
" # getting the file description and thumbnail\n",
" for item in summarized_list:\n",
" item['description'] = get_data_description(item['filename'])\n",
" # ehcking if we have an empty timestamp list\n",
" if len(item['timestamps']) > 0:\n",
" item['thumbnail'] = get_data_thumbnail(item['filename'], item['timestamps'])\n",
" else:\n",
" item['thumbnail'] = get_data_thumbnail(item['filename'])\n",
" \n",
" return summarized_list"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"doc_summary = summarize_doc_search(docs)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'filename': 'corolla-2020-toyota-owners-manual.pdf',\n",
" 'pages': [424, 329, 264, 290, 201, 326, 315, 317, 325, 422],\n",
" 'timestamps': [],\n",
" 'file_type': 'pdf',\n",
" 'description': \"This document is about the user manual for a Toyota Corolla, providing information and instructions on various aspects of the vehicle, including safety and security, vehicle status, driving operations, interior features, maintenance, and troubleshooting. The manual covers topics such as child seat installation, theft deterrent systems, reading driving-related information, operating the Entune audio system, and caring for the vehicle's interior and exterior. It also includes information on reporting safety defects and provides instructions for Canadian owners on seat belt and SRS air\",\n",
" 'thumbnail': 'data/thumbnails/corolla-2020-toyota-owners-manual.png'},\n",
" {'filename': 'How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" 'pages': [],\n",
" 'timestamps': ['0:30-1:0',\n",
" '2:00-2:00',\n",
" '2:30-3:0',\n",
" '4:00-4:00',\n",
" '5:30-6:0',\n",
" '8:00-8:00',\n",
" '8:30-9:0',\n",
" '10:00-10:00'],\n",
" 'file_type': 'video',\n",
" 'description': \"This document is about a video tutorial series on replacing car parts, specifically the latest installment of AutoDoc's video tutorials.\",\n",
" 'thumbnail': 'data/How to change front wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/30-60s.png'},\n",
" {'filename': 'How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]',\n",
" 'pages': [],\n",
" 'timestamps': ['3:30-4:0', '5:30-6:0'],\n",
" 'file_type': 'video',\n",
" 'description': \"This document is about a video tutorial series on replacing car parts, specifically the latest installment of Auto-Doc's video tutorials.\",\n",
" 'thumbnail': 'data/How to change rear wheel bearing on TOYOTA RAV4 II [TUTORIAL AUTODOC]/210-240s.png'}]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc_summary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
+145
View File
@@ -0,0 +1,145 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import random\n",
"from PIL import Image, ImageDraw, ImageFont\n"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"def create_text_thumbnail(file_path):\n",
" # Create a folder for thumbnails if it doesn't exist\n",
" thumbnail_folder = os.path.join(os.path.dirname(file_path), 'thumbnails')\n",
" os.makedirs(thumbnail_folder, exist_ok=True)\n",
" \n",
" # Extract file name (without extension)\n",
" file_name = os.path.splitext(os.path.basename(file_path))[0]\n",
" \n",
" # Create a random background color\n",
" background_color = tuple(random.randint(0, 255) for _ in range(3))\n",
" \n",
" # Create an image with the random background color\n",
" img = Image.new('RGB', (800, 400), color=background_color)\n",
" \n",
" # Initialize drawing context\n",
" d = ImageDraw.Draw(img)\n",
" \n",
" # Load a font\n",
" try:\n",
" font = ImageFont.truetype(\"arial.ttf\", 25) # Adjust the font size as needed\n",
" except IOError:\n",
" font = ImageFont.load_default()\n",
" \n",
" # Get the bounding box of the text\n",
" text_bbox = d.textbbox((0, 0), file_name, font=font)\n",
" text_width = text_bbox[2] - text_bbox[0]\n",
" text_height = text_bbox[3] - text_bbox[1]\n",
" \n",
" # Calculate the position to center the text\n",
" text_x = (img.width - text_width) / 2\n",
" text_y = (img.height - text_height) / 2\n",
" \n",
" # Draw the text onto the image\n",
" d.text((text_x, text_y), file_name, font=font, fill=(255, 255, 255)) # White text\n",
" \n",
" # Save the image\n",
" thumbnail_path = os.path.join(thumbnail_folder, f\"{file_name}.png\")\n",
" img.save(thumbnail_path)\n",
" \n",
" print(f\"Thumbnail created: {thumbnail_path}\")\n",
"\n",
"def process_directory(directory_path):\n",
" supported_extensions = ['.txt', '.pdf', '.docx', '.mp3', '.m4a']\n",
" \n",
" for file in os.listdir(directory_path):\n",
" file_path = os.path.join(directory_path, file)\n",
" if os.path.isfile(file_path):\n",
" file_extension = os.path.splitext(file)[1].lower()\n",
" if file_extension in supported_extensions:\n",
" create_text_thumbnail(file_path)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Thumbnail created: data\\thumbnails\\audio-2.png\n",
"Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-repair.png\n",
"Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-service.png\n",
"Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-tire.png\n",
"Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-tuning.png\n",
"Thumbnail created: data\\thumbnails\\Car-Repair-Receipt-wash.png\n",
"Thumbnail created: data\\thumbnails\\corolla-2020-toyota-owners-manual.png\n",
"Thumbnail created: data\\thumbnails\\How to change engine oil and filter on TOYOTA Corolla.png\n",
"Thumbnail created: data\\thumbnails\\How to change front brake pads on TOYOTA Corolla.png\n",
"Thumbnail created: data\\thumbnails\\How to change rear windshield wipers on TOYOTA Corolla.png\n",
"Thumbnail created: data\\thumbnails\\How to change spark plugs on TOYOTA COROLLA.png\n",
"Thumbnail created: data\\thumbnails\\test_rec.png\n"
]
}
],
"source": [
"# Example usage:\n",
"directory_path = 'data'\n",
"process_directory(directory_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "smog_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
+114 -14
View File
@@ -12,7 +12,11 @@ from langchain_core.output_parsers import StrOutputParser
from uuid import uuid4 from uuid import uuid4
from langchain_core.documents import Document from langchain_core.documents import Document
from text_extractor import TextExtractor from text_extractor import TextExtractor
import os import os, sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from loggings.logging_config import logger
import random
from PIL import Image, ImageDraw, ImageFont
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
import math import math
import json import json
@@ -29,6 +33,7 @@ import ffmpeg
from dotenv import load_dotenv from dotenv import load_dotenv
load_dotenv() load_dotenv()
# OpenAI API Key # OpenAI API Key
api_key = os.getenv('OPENAI_API_KEY') api_key = os.getenv('OPENAI_API_KEY')
# setting up groq api key # setting up groq api key
@@ -53,11 +58,14 @@ def load_embedding_model():
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
# loading the embedding model # loading the embedding model
logger.info("Loading the embedding model")
embeddings = load_embedding_model() embeddings = load_embedding_model()
logger.info("Embedding model loaded")
# --------------------------------------------------------TEXT PREPROCESSING-------------------------------------------- # --------------------------------------------------------TEXT PREPROCESSING--------------------------------------------
def create_documents(doc, file_type='text'): def create_documents(doc, file_type='text'):
logger.info(f"Creating documents from text")
text = doc[0].page_content text = doc[0].page_content
metadata = doc[0].metadata metadata = doc[0].metadata
text_splitter = RecursiveCharacterTextSplitter( text_splitter = RecursiveCharacterTextSplitter(
@@ -80,6 +88,7 @@ def create_documents(doc, file_type='text'):
def load_txt_document(document_path): def load_txt_document(document_path):
logger.info(f"Loading text document from {document_path}")
try: try:
txt_doc = TextLoader(document_path) txt_doc = TextLoader(document_path)
text = txt_doc.load() text = txt_doc.load()
@@ -91,6 +100,7 @@ def load_txt_document(document_path):
def load_docx_document(document_path): def load_docx_document(document_path):
logger.info(f"Loading docx document from {document_path}")
try: try:
docx_doc = Docx2txtLoader(document_path) docx_doc = Docx2txtLoader(document_path)
text = docx_doc.load() text = docx_doc.load()
@@ -103,6 +113,7 @@ def load_docx_document(document_path):
# creating a function that checks the document type and loads the document # creating a function that checks the document type and loads the document
def load_pdf_document(document_path): def load_pdf_document(document_path):
logger.info(f"Loading pdf document from {document_path}")
try: try:
pdf_doc = PyPDFLoader(document_path) pdf_doc = PyPDFLoader(document_path)
pages = pdf_doc.load_and_split() pages = pdf_doc.load_and_split()
@@ -125,11 +136,13 @@ def load_document(document_path):
# ----------------------------------------------------IMAGE PROCESSING------------------------------------------------ # ----------------------------------------------------IMAGE PROCESSING------------------------------------------------
# Function to encode the image # Function to encode the image
def encode_image(image_path): def encode_image(image_path):
logger.info(f"Encoding image {image_path}")
with open(image_path, "rb") as image_file: with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8') return base64.b64encode(image_file.read()).decode('utf-8')
# Vision API to process the image # Vision API to process the image
def process_image(image_path): def process_image(image_path):
logger.info(f"Processing image {image_path}")
global api_key global api_key
# Getting the base64 string # Getting the base64 string
@@ -174,10 +187,11 @@ def process_image(image_path):
# create image document # create image document
def create_image_document(image_path, file_type='image'): def create_image_document(image_path, file_type='image'):
logger.info(f"Creating image document from {image_path}")
# getting the image name from the image path # getting the image name from the image path
image_name = image_path.split('/')[-1].split('.')[0] image_name = image_path.split('\\')[-1].split('.')[0]
# setting image name as metadata # setting image name as metadata
metadata = {'filename': image_name, 'file_type': file_type} metadata = {'source': image_name, 'file_type': file_type}
text_extractor = TextExtractor() text_extractor = TextExtractor()
text = text_extractor.read_text_from_image(image_path) text = text_extractor.read_text_from_image(image_path)
# removing special characters and line breaks # removing special characters and line breaks
@@ -199,6 +213,7 @@ def create_image_document(image_path, file_type='image'):
# -----------------------------------------------AUDIO PROCESSING----------------------------------------------------- # -----------------------------------------------AUDIO PROCESSING-----------------------------------------------------
# Audio to Text # Audio to Text
def audio_to_text(filepath): def audio_to_text(filepath):
logger.info(f"Transcribing audio file {filepath}")
with open(filepath, "rb") as file: with open(filepath, "rb") as file:
translation = client.audio.translations.create( translation = client.audio.translations.create(
file=(filepath, file.read()), file=(filepath, file.read()),
@@ -208,6 +223,7 @@ def audio_to_text(filepath):
def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_output=True): def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_output=True):
logger.info(f"Splitting audio file {audio_file_path} by duration")
# Convert chunk duration to milliseconds # Convert chunk duration to milliseconds
chunk_length_ms = chunk_duration_minutes * 60 * 1000 chunk_length_ms = chunk_duration_minutes * 60 * 1000
@@ -247,6 +263,7 @@ def split_audio_by_duration(audio_file_path, chunk_duration_minutes, print_outpu
return chunk_folder, chunk_paths return chunk_folder, chunk_paths
def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='audio'): def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='audio'):
logger.info(f"Transcribing audio chunks from {audio_file_path}")
# Split the audio file into chunks # Split the audio file into chunks
chunk_folder, chunk_paths = split_audio_by_duration(audio_file_path, chunk_duration_minutes) chunk_folder, chunk_paths = split_audio_by_duration(audio_file_path, chunk_duration_minutes)
@@ -271,10 +288,24 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='
end_min = chunk_index * chunk_duration_minutes end_min = chunk_index * chunk_duration_minutes
actual_end_min = min(end_min, (len(AudioSegment.from_file(audio_file_path)) // 60000)) # To handle the last chunk's actual duration actual_end_min = min(end_min, (len(AudioSegment.from_file(audio_file_path)) // 60000)) # To handle the last chunk's actual duration
# preparing the start and end min in a timestamp format, also catching the decimal (fractional-minute) case and converting it to minutes:seconds
if start_min % 1 == 0:
start_min = f"{int(start_min)}:00"
end_min = f"{int(end_min)}:00"
else:
# splitting the decimal part of the start and end min
start_min_int, start_min_dec = str(start_min).split('.')
end_min_int, end_min_dec = str(end_min).split('.')
# converting the decimal part to seconds
start_sec = int(start_min_dec) * 6
end_sec = int(end_min_dec) * 6
start_min = f"{start_min_int}:{start_sec}"
end_min = f"{end_min_int}:{end_sec}"
# Create a document with the transcript and metadata # Create a document with the transcript and metadata
metadata = { metadata = {
"filename": base_filename, "source": base_filename,
"duration": f"{start_min}-{end_min} minutes", "timestamp": f"{start_min}-{end_min}",
"file_type": file_type, "file_type": file_type,
} }
document = Document(page_content=transcript, metadata=metadata) document = Document(page_content=transcript, metadata=metadata)
@@ -283,6 +314,9 @@ def transcribe_audio_chunks(audio_file_path, chunk_duration_minutes, file_type='
# Delete the chunk folder after processing # Delete the chunk folder after processing
shutil.rmtree(chunk_folder) shutil.rmtree(chunk_folder)
# adding a delay
time.sleep(0.2)
return documents return documents
@@ -294,7 +328,7 @@ def create_audio_document(audio_file_path, chunk_duration_minutes=3, file_type='
# ------------------------------------------------VIDEO PROCESSING----------------------------------------------------- # ------------------------------------------------VIDEO PROCESSING-----------------------------------------------------
def preprocess_video_data(video_path: str, time_interval: int): def preprocess_video_data(video_path: str, time_interval: int):
logger.info(f"Preprocessing video data from {video_path}")
# Load the video file # Load the video file
video = VideoFileClip(video_path) video = VideoFileClip(video_path)
@@ -341,6 +375,7 @@ def preprocess_video_data(video_path: str, time_interval: int):
# now creating document from the audio file # now creating document from the audio file
documents = create_audio_document(audio_path, chunk_duration_minutes=0.5, file_type='video') documents = create_audio_document(audio_path, chunk_duration_minutes=0.5, file_type='video')
logger.info(f"Documents created from video {video_path}")
# deleting the audio file # deleting the audio file
os.remove(audio_path) os.remove(audio_path)
@@ -349,6 +384,7 @@ def preprocess_video_data(video_path: str, time_interval: int):
#----------------------------------------------------DOC SUMMARIZER -------------------------------------------------- #----------------------------------------------------DOC SUMMARIZER --------------------------------------------------
def doc_summarizer(document_page: list) -> str: def doc_summarizer(document_page: list) -> str:
logger.info(f"Summarizing document")
initiator_prompt = PromptTemplate( initiator_prompt = PromptTemplate(
template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Create a short summary of the document based on the provided text. Create a short summary of the document based on the provided text.
@@ -370,10 +406,13 @@ def doc_summarizer(document_page: list) -> str:
#-----------------------------------------------------OTHERS-------------------------------------------------------------- #-----------------------------------------------------OTHERS--------------------------------------------------------------
def save_embedded_data(embeddings, key="data"): def save_embedded_data(embeddings, key="data"):
logger.info(f"Saving embeddings")
embeddings.save_local(f"index/faiss_index_{key}") embeddings.save_local(f"index/faiss_index_{key}")
print("Embeddings saved") print("Embeddings saved")
return 'saved'
def load_embedded_data(embeddings=embeddings, key="data"): def load_embedded_data(embeddings=embeddings, key="data"):
logger.info(f"Loading embedded data")
embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True) embed_db = FAISS.load_local(f"index/faiss_index_{key}", embeddings, allow_dangerous_deserialization=True)
return embed_db return embed_db
@@ -396,15 +435,15 @@ def process_document(path, extension, text_doc, image_doc, audio_doc, video_doc)
elif extension in image_doc: elif extension in image_doc:
doc = process_map["image"](path) doc = process_map["image"](path)
num_pages = 1 num_pages = 1
doc_name = doc[0].metadata['filename'] doc_name = doc[0].metadata['source'].split('\\')[-1]
elif extension in audio_doc: elif extension in audio_doc:
doc = process_map["audio"](path) doc = process_map["audio"](path)
num_pages = len(doc) num_pages = len(doc)
doc_name = doc[0].metadata['filename'] doc_name = doc[0].metadata['source']
elif extension in video_doc: elif extension in video_doc:
doc = process_map["video"](path, time_interval=30) doc = process_map["video"](path, time_interval=30)
num_pages = len(doc) num_pages = len(doc)
doc_name = doc[0].metadata['filename'] doc_name = doc[0].metadata['source']
else: else:
return None, None, None # Unhandled extension return None, None, None # Unhandled extension
@@ -425,7 +464,7 @@ def load_documents_from_directory(directory_path: str):
def process_with_delay(file): def process_with_delay(file):
result = process_document(os.path.join(directory_path, file), file.split('.')[-1], text_doc, image_doc, audio_doc, video_doc) result = process_document(os.path.join(directory_path, file), file.split('.')[-1], text_doc, image_doc, audio_doc, video_doc)
time.sleep(0.1) # Introduce a 0.1s delay between processing each document time.sleep(0.4) # Introduce a 0.4s delay between processing each document
return result return result
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
@@ -441,12 +480,15 @@ def load_documents_from_directory(directory_path: str):
first_page = doc[0].page_content first_page = doc[0].page_content
summary = doc_summarizer(first_page) summary = doc_summarizer(first_page)
doc_summary.append(summary) doc_summary.append(summary)
# adding some delay
time.sleep(0.5)
docs_id = [uuid4().hex for _ in range(len(documents))] docs_id = [uuid4().hex for _ in range(len(documents))]
json_file = os.path.join(directory_path, 'data.json') json_file = os.path.join(directory_path, 'data.json')
data = {'doc_names': doc_names, 'docs_id': docs_id, 'num_pages': num_pages, 'doc_summaary': doc_summary} # creating a dictionary for each document in the json file
for i in range(len(documents)):
data = {doc_names[i].split("\\")[-1]: {'doc_id':docs_id[i], 'num_pages': num_pages[i], 'doc_summary': doc_summary[i]}}
if os.path.exists(json_file): if os.path.exists(json_file):
with open(json_file, 'r+') as f: with open(json_file, 'r+') as f:
existing_data = json.load(f) existing_data = json.load(f)
@@ -462,6 +504,7 @@ def load_documents_from_directory(directory_path: str):
# A function to create vector store # A function to create vector store
def create_vector_store(documents: list, docs_id: list, num_pages: list): def create_vector_store(documents: list, docs_id: list, num_pages: list):
logger.info(f"Creating vector store")
# index set up with the embedding dimension # index set up with the embedding dimension
index = faiss.IndexFlatL2(384) index = faiss.IndexFlatL2(384)
# Initialize the FAISS vector store # Initialize the FAISS vector store
@@ -476,10 +519,11 @@ def create_vector_store(documents: list, docs_id: list, num_pages: list):
doc_id = docs_id[i] doc_id = docs_id[i]
page_ids = [doc_id+ str(i) for i in range(num_pages[i])] page_ids = [doc_id+ str(i) for i in range(num_pages[i])]
vector_store.add_documents(documents=documents[i], ids=page_ids) vector_store.add_documents(documents=documents[i], ids=page_ids)
logger.info(f"Vector store created")
logger.info(f"Saving the vector store")
# saving the vector store automatically # saving the vector store automatically
save_embedded_data(vector_store, key="data") save_embedded_data(vector_store, key="data")
logger.info(f"Vector store saved")
return vector_store return vector_store
# creating a function to add documents to the vector store # creating a function to add documents to the vector store
@@ -492,13 +536,69 @@ def add_documents_to_vector_store(embeddings, documents: list, docs_id: list, nu
vector_store.add_documents(documents=documents[i], ids=page_ids) vector_store.add_documents(documents=documents[i], ids=page_ids)
print ("Documents added to the vector store") print ("Documents added to the vector store")
#----------------------------------------------------------Thumbnail Generator-----------------------------------------------------
def create_text_thumbnail(file_path):
    """Render a PNG thumbnail showing the file's base name on a random colour.

    The image is 800x400 pixels and is written to a ``thumbnails`` folder
    located next to ``file_path`` (created if missing), named
    ``<base name>.png``.

    Args:
        file_path: Path of the file the thumbnail represents. Only the path
            string is used; the file itself is never opened here.

    Returns:
        The path of the PNG file that was written.
    """
    logger.info(f"Creating thumbnail for {file_path}")
    # Create a folder for thumbnails if it doesn't exist
    thumbnail_folder = os.path.join(os.path.dirname(file_path), 'thumbnails')
    os.makedirs(thumbnail_folder, exist_ok=True)
    # Extract file name (without extension); it doubles as the label text
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    # Create an image with a random background color
    background_color = tuple(random.randint(0, 255) for _ in range(3))
    img = Image.new('RGB', (800, 400), color=background_color)
    # Initialize drawing context
    d = ImageDraw.Draw(img)
    # Load a font, falling back to Pillow's built-in one when Arial is missing
    try:
        font = ImageFont.truetype("arial.ttf", 25)  # Adjust the font size as needed
    except OSError:
        font = ImageFont.load_default()
    # Bounding box of the text as it would be drawn at the origin
    left, top, right, bottom = d.textbbox((0, 0), file_name, font=font)
    text_width = right - left
    text_height = bottom - top
    # Centre the text. The bounding box does not necessarily start at (0, 0),
    # so its top-left offset is subtracted; otherwise the label ends up
    # shifted down/right by that offset.
    text_x = (img.width - text_width) / 2 - left
    text_y = (img.height - text_height) / 2 - top
    # Draw the text onto the image
    d.text((text_x, text_y), file_name, font=font, fill=(255, 255, 255))  # White text
    # Save the image
    thumbnail_path = os.path.join(thumbnail_folder, f"{file_name}.png")
    img.save(thumbnail_path)
    print(f"Thumbnail created: {thumbnail_path}")
    # Hand the location back so callers can record it
    return thumbnail_path
def process_directory(directory_path):
    """Create a text thumbnail for each supported file directly inside a directory.

    Only regular files whose extension (case-insensitive) is one of
    ``.txt``, ``.pdf``, ``.docx``, ``.mp3`` or ``.m4a`` are handled;
    sub-directories and other file types are skipped.

    Args:
        directory_path: Directory whose entries are scanned (not recursive).

    Returns:
        The string ``"Done"`` once every entry has been examined.
    """
    supported_extensions = ['.txt', '.pdf', '.docx', '.mp3', '.m4a']
    for entry in os.listdir(directory_path):
        candidate = os.path.join(directory_path, entry)
        # Skip anything that is not a regular file (e.g. sub-folders)
        if not os.path.isfile(candidate):
            continue
        _, suffix = os.path.splitext(entry)
        if suffix.lower() not in supported_extensions:
            continue
        create_text_thumbnail(candidate)
    return "Done"
#-----------------------------------------------------------SEARCH-------------------------------------------------------
# A document search function # A document search function
def search(query, k=20): def search(query, k=20):
logger.info(f"Searching for {query}")
# loading the embedded data # loading the embedded data
embed_db = load_embedded_data() embed_db = load_embedded_data()
db = embed_db db = embed_db
docs = db.similarity_search(query, k) docs = db.similarity_search(query, k)
logger.info(f"Search completed")
all = [] all = []
info = [] info = []
for doc in docs: for doc in docs: