diff --git a/data/documents.json b/data/documents.json index f7608a2f..c700f924 100644 --- a/data/documents.json +++ b/data/documents.json @@ -1 +1 @@ -{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["320bc9710952499baa9553d3f0d4e727", "6ba07e1cf09a4ae6b54863040f901328", "dd067c452bd146e4becd61bde8602a3c", "640493ad16b546d38851216917d3e82b", "08cf1c3c8eab4efe9f81efcf8ce770be", "d8d6a3ca9a0a44e08cd4423ee3fb979d", "2b6e45cd99ff46b08242282a423642d4", "05524682d2e9425c83c9b57693182c50", "4eb170648fbe47c3b87b2831a97f0dd8", "cec3e82f0432402e940a0299bfa086fe"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]} \ No newline at end of file +{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["911dea9b7b714adf8ecafd483a37741b", "e9308cf998a64cab8aef9bde04795fc4", "1f013bd6ac464a07acd8d60a425142d7", "3c99eade18a344d4a568cd77e58558f3", "708f7ba5121442c692dba1346097c4e4", "9e134439a0b84f26a213a288cbe45ab5", "8eb0c0f04eb44e2bafba7640ed34b26b", "c4571cec94034cf38b5d2d59a694464e", "4253d6ea5aeb43f1a65b11a2a631389f", "e2c66cfac77b4099908b1d41a66a7fe2"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]} \ No newline at end of file diff --git a/data_ingestion/__pycache__/utils.cpython-311.pyc b/data_ingestion/__pycache__/utils.cpython-311.pyc index bba3716f..403ae253 100644 Binary files a/data_ingestion/__pycache__/utils.cpython-311.pyc and b/data_ingestion/__pycache__/utils.cpython-311.pyc differ diff --git a/data_ingestion/data_ingest.py b/data_ingestion/data_ingest.py new file mode 100644 index 00000000..ac48baf7 --- /dev/null +++ b/data_ingestion/data_ingest.py @@ -0,0 +1,33 @@ +from utils import create_vector_store, save_embedded_data, load_documents_from_directory, load_embedding_model +import sys, os + +# Add the root directory to sys.path +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from loggings.logging_config import logger + + +# This module will load in the data, you only need to add the data path to it. +data_path = './data' + +# loading the embeddings +logger.info(f"Loading the embeddings") +embeddings = load_embedding_model() +logger.info(f"Embeddings loaded") + +def load_data(data_path: str): + logger.info(f"Loading data from {data_path}") + documents, docs_id, num_pages = load_documents_from_directory(data_path) + logger.info(f"Data loaded") + logger.info(f"Creating vector store") + embed_db = create_vector_store(embeddings,documents, docs_id, num_pages) + logger.info(f"Vector store created") + logger.info(f"Saving the vector store") + # saving the embedded data + save_embedded_data(embed_db) + logger.info(f"Vector store saved") + + return "Vector store created and saved" + + +if __name__ == "__main__": + load_data(data_path) \ No newline at end of file diff --git a/data_ingestion/utils.py b/data_ingestion/utils.py index d55241f7..50ce4885 100644 --- a/data_ingestion/utils.py +++ b/data_ingestion/utils.py @@ -79,8 +79,6 @@ def load_pdf_document(document_path): raise ValueError(f"Error loading -- {document_path}") - - # A general function that loads textual documents def load_document(document_path): if document_path.endswith(".pdf"): diff --git a/loggings/app.log b/loggings/app.log index 179a0c21..991e935a 100644 --- a/loggings/app.log +++ b/loggings/app.log @@ -29,3 +29,12 @@ 2024-08-07 17:49:19,962 - INFO - Receiving the search query 2024-08-07 17:49:29,498 - INFO - Searching for what is lda? 2024-08-07 17:49:29,876 - INFO - Search completed +2024-08-07 18:01:11,466 - INFO - Loading the embeddings +2024-08-07 18:01:11,468 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en +2024-08-07 18:01:14,748 - INFO - Embeddings loaded +2024-08-07 18:01:14,748 - INFO - Loading data from ./data +2024-08-07 18:01:58,703 - INFO - Data loaded +2024-08-07 18:01:58,703 - INFO - Creating vector store +2024-08-07 18:02:08,752 - INFO - Vector store created +2024-08-07 18:02:08,752 - INFO - Saving the vector store +2024-08-07 18:02:08,752 - INFO - Vector store saved diff --git a/vec-db/index/faiss_index_data/index.faiss b/vec-db/index/faiss_index_data/index.faiss index 0381e07d..cdb06fcd 100644 Binary files a/vec-db/index/faiss_index_data/index.faiss and b/vec-db/index/faiss_index_data/index.faiss differ diff --git a/vec-db/index/faiss_index_data/index.pkl b/vec-db/index/faiss_index_data/index.pkl index 0eaf1b3f..4d6c0f41 100644 Binary files a/vec-db/index/faiss_index_data/index.pkl and b/vec-db/index/faiss_index_data/index.pkl differ