Created a data ingestion module.

This commit is contained in:
timothyafolami
2024-08-07 18:03:15 +01:00
parent 8e6acc7cf8
commit 228fffefd8
7 changed files with 43 additions and 3 deletions
+1 -1
View File
@@ -1 +1 @@
{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["320bc9710952499baa9553d3f0d4e727", "6ba07e1cf09a4ae6b54863040f901328", "dd067c452bd146e4becd61bde8602a3c", "640493ad16b546d38851216917d3e82b", "08cf1c3c8eab4efe9f81efcf8ce770be", "d8d6a3ca9a0a44e08cd4423ee3fb979d", "2b6e45cd99ff46b08242282a423642d4", "05524682d2e9425c83c9b57693182c50", "4eb170648fbe47c3b87b2831a97f0dd8", "cec3e82f0432402e940a0299bfa086fe"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]} {"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["911dea9b7b714adf8ecafd483a37741b", "e9308cf998a64cab8aef9bde04795fc4", "1f013bd6ac464a07acd8d60a425142d7", "3c99eade18a344d4a568cd77e58558f3", "708f7ba5121442c692dba1346097c4e4", "9e134439a0b84f26a213a288cbe45ab5", "8eb0c0f04eb44e2bafba7640ed34b26b", "c4571cec94034cf38b5d2d59a694464e", "4253d6ea5aeb43f1a65b11a2a631389f", "e2c66cfac77b4099908b1d41a66a7fe2"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]}
Binary file not shown.
+33
View File
@@ -0,0 +1,33 @@
from utils import create_vector_store, save_embedded_data, load_documents_from_directory, load_embedding_model
import sys, os
# Add the root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from loggings.logging_config import logger
# This module ingests the documents found under `data_path`; point
# `data_path` at your own data directory to ingest a different corpus.
data_path = './data'

# Load the embedding model once, at import time; load_data() reads this
# module-level `embeddings` object when it builds the vector store.
logger.info("Loading the embeddings")
embeddings = load_embedding_model()
logger.info("Embeddings loaded")
def load_data(data_path: str) -> str:
    """Build a vector store from the documents in ``data_path`` and save it.

    The documents are read with ``load_documents_from_directory``, turned
    into a vector store by ``create_vector_store`` using the module-level
    ``embeddings`` object, and then handed to ``save_embedded_data``.
    Progress is reported through the shared ``logger`` at INFO level.

    Args:
        data_path: Directory passed to ``load_documents_from_directory``.

    Returns:
        The fixed status string ``"Vector store created and saved"``.
    """
    logger.info(f"Loading data from {data_path}")
    documents, docs_id, num_pages = load_documents_from_directory(data_path)
    logger.info("Data loaded")

    logger.info("Creating vector store")
    embed_db = create_vector_store(embeddings, documents, docs_id, num_pages)
    logger.info("Vector store created")

    logger.info("Saving the vector store")
    save_embedded_data(embed_db)
    logger.info("Vector store saved")

    return "Vector store created and saved"
# Script entry point: when this file is executed directly (not imported),
# ingest the default data directory configured in `data_path`.
if __name__ == "__main__":
    load_data(data_path)
-2
View File
@@ -79,8 +79,6 @@ def load_pdf_document(document_path):
raise ValueError(f"Error loading -- {document_path}") raise ValueError(f"Error loading -- {document_path}")
# A general function that loads textual documents # A general function that loads textual documents
def load_document(document_path): def load_document(document_path):
if document_path.endswith(".pdf"): if document_path.endswith(".pdf"):
+9
View File
@@ -29,3 +29,12 @@
2024-08-07 17:49:19,962 - INFO - Receiving the search query 2024-08-07 17:49:19,962 - INFO - Receiving the search query
2024-08-07 17:49:29,498 - INFO - Searching for what is lda? 2024-08-07 17:49:29,498 - INFO - Searching for what is lda?
2024-08-07 17:49:29,876 - INFO - Search completed 2024-08-07 17:49:29,876 - INFO - Search completed
2024-08-07 18:01:11,466 - INFO - Loading the embeddings
2024-08-07 18:01:11,468 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
2024-08-07 18:01:14,748 - INFO - Embeddings loaded
2024-08-07 18:01:14,748 - INFO - Loading data from ./data
2024-08-07 18:01:58,703 - INFO - Data loaded
2024-08-07 18:01:58,703 - INFO - Creating vector store
2024-08-07 18:02:08,752 - INFO - Vector store created
2024-08-07 18:02:08,752 - INFO - Saving the vector store
2024-08-07 18:02:08,752 - INFO - Vector store saved
Binary file not shown.
Binary file not shown.