Created a data ingestion module.
This commit is contained in:
+1
-1
@@ -1 +1 @@
|
||||
{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["320bc9710952499baa9553d3f0d4e727", "6ba07e1cf09a4ae6b54863040f901328", "dd067c452bd146e4becd61bde8602a3c", "640493ad16b546d38851216917d3e82b", "08cf1c3c8eab4efe9f81efcf8ce770be", "d8d6a3ca9a0a44e08cd4423ee3fb979d", "2b6e45cd99ff46b08242282a423642d4", "05524682d2e9425c83c9b57693182c50", "4eb170648fbe47c3b87b2831a97f0dd8", "cec3e82f0432402e940a0299bfa086fe"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]}
|
||||
{"doc_names": ["Car-Repair-Receipt-repair", "Car-Repair-Receipt-service", "Car-Repair-Receipt-tire", "Car-Repair-Receipt-tuning", "Car-Repair-Receipt-wash", "corolla-2020-toyota-owners-manual", "How to change engine oil and filter on TOYOTA Corolla", "How to change front brake pads on TOYOTA Corolla", "How to change rear windshield wipers on TOYOTA Corolla", "How to change spark plugs on TOYOTA COROLLA"], "docs_id": ["911dea9b7b714adf8ecafd483a37741b", "e9308cf998a64cab8aef9bde04795fc4", "1f013bd6ac464a07acd8d60a425142d7", "3c99eade18a344d4a568cd77e58558f3", "708f7ba5121442c692dba1346097c4e4", "9e134439a0b84f26a213a288cbe45ab5", "8eb0c0f04eb44e2bafba7640ed34b26b", "c4571cec94034cf38b5d2d59a694464e", "4253d6ea5aeb43f1a65b11a2a631389f", "e2c66cfac77b4099908b1d41a66a7fe2"], "num_pages": [1, 2, 2, 2, 1, 588, 6, 7, 6, 10]}
|
||||
Binary file not shown.
@@ -0,0 +1,33 @@
|
||||
from utils import create_vector_store, save_embedded_data, load_documents_from_directory, load_embedding_model
|
||||
import sys, os
|
||||
|
||||
# Add the root directory to sys.path
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
from loggings.logging_config import logger
|
||||
|
||||
|
||||
# Ingestion entry module: set `data_path` to the directory holding the
# documents you want to ingest.
data_path = './data'

# Load the embedding model once at import time; load_data() below reads
# the module-level `embeddings` when it builds the vector store.
logger.info("Loading the embeddings")
embeddings = load_embedding_model()
logger.info("Embeddings loaded")
|
||||
|
||||
def load_data(data_path: str):
    """Ingest the documents under *data_path* into a saved vector store.

    The steps run in order: load the documents via
    ``load_documents_from_directory``, build a vector store from them with
    the module-level ``embeddings`` via ``create_vector_store``, then
    persist it with ``save_embedded_data``. Progress is logged at INFO
    level before and after each step.

    Args:
        data_path: Location handed to ``load_documents_from_directory``.

    Returns:
        The fixed status string ``"Vector store created and saved"``.
    """
    # Step 1: read the raw documents and their metadata.
    logger.info(f"Loading data from {data_path}")
    loaded_docs, doc_ids, page_counts = load_documents_from_directory(data_path)
    logger.info("Data loaded")

    # Step 2: build the vector store from the loaded documents.
    logger.info("Creating vector store")
    vector_store = create_vector_store(
        embeddings, loaded_docs, doc_ids, page_counts
    )
    logger.info("Vector store created")

    # Step 3: persist the embedded data.
    logger.info("Saving the vector store")
    save_embedded_data(vector_store)
    logger.info("Vector store saved")

    return "Vector store created and saved"
|
||||
|
||||
|
||||
# Script entry point: ingest the default data directory when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    load_data(data_path)
|
||||
@@ -79,8 +79,6 @@ def load_pdf_document(document_path):
|
||||
raise ValueError(f"Error loading -- {document_path}")
|
||||
|
||||
|
||||
|
||||
|
||||
# A general function that loads textual documents
|
||||
def load_document(document_path):
|
||||
if document_path.endswith(".pdf"):
|
||||
|
||||
@@ -29,3 +29,12 @@
|
||||
2024-08-07 17:49:19,962 - INFO - Receiving the search query
|
||||
2024-08-07 17:49:29,498 - INFO - Searching for what is lda?
|
||||
2024-08-07 17:49:29,876 - INFO - Search completed
|
||||
2024-08-07 18:01:11,466 - INFO - Loading the embeddings
|
||||
2024-08-07 18:01:11,468 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en
|
||||
2024-08-07 18:01:14,748 - INFO - Embeddings loaded
|
||||
2024-08-07 18:01:14,748 - INFO - Loading data from ./data
|
||||
2024-08-07 18:01:58,703 - INFO - Data loaded
|
||||
2024-08-07 18:01:58,703 - INFO - Creating vector store
|
||||
2024-08-07 18:02:08,752 - INFO - Vector store created
|
||||
2024-08-07 18:02:08,752 - INFO - Saving the vector store
|
||||
2024-08-07 18:02:08,752 - INFO - Vector store saved
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user