42 lines
1.7 KiB
Python
42 lines
1.7 KiB
Python
|
|
from langchain_community.document_loaders import PyPDFLoader
|
||
|
|
from utils import create_vector_store, save_embedded_data
|
||
|
|
import sys, os
|
||
|
|
|
||
|
|
# Add the parent of this file's directory to sys.path (presumably the project root, so the `loggings` import below resolves)
|
||
|
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||
|
|
from loggings.logging_config import logger
|
||
|
|
|
||
|
|
# A function to load the pdf document
|
||
|
|
def load_pdf_document(document_path: str):
    """Load a PDF file and return the documents produced by splitting it.

    The file type is decided purely from the path's extension. The
    comparison is case-insensitive so that ``manual.PDF`` is accepted just
    like ``manual.pdf``.

    Args:
        document_path: Path to the document to load.

    Returns:
        The result of ``PyPDFLoader(document_path).load_and_split()``.

    Raises:
        ValueError: If ``document_path`` does not end in ``.pdf``
            (ignoring case).
    """
    logger.info(f"Loading document from {document_path}")
    logger.info("Checking if the document is a pdf")

    # Guard clause: reject anything that is not a PDF before touching the
    # loader, so the happy path below stays un-nested.
    if not document_path.lower().endswith(".pdf"):
        logger.error(f"Unsupported document type for {document_path}")
        raise ValueError(f"Unsupported document type for {document_path}")

    logger.info("Document is a pdf")
    logger.info("Loading and splitting the document")
    pages = PyPDFLoader(document_path).load_and_split()
    # NOTE(review): load_and_split() presumably returns text-splitter chunks
    # rather than physical pages, so this count may exceed the page count —
    # confirm against the loader documentation.
    logger.info(f"Document loaded and split into {len(pages)} pages")
    return pages
|
||
|
|
|
||
|
|
# Load a PDF document, build a vector store from it, and save the store
|
||
|
|
def load_and_create_vector_store(document_path: str):
    """Build a vector store from a PDF document and save it.

    Runs three steps in order: load the document with
    ``load_pdf_document``, pass the result to ``create_vector_store``, and
    hand the resulting store to ``save_embedded_data``. Progress is logged
    around each step.

    Args:
        document_path: Path to the PDF document to index.

    Returns:
        The fixed status string ``"Vector store created and saved"``.
    """
    status_message = "Vector store created and saved"

    logger.info(f"Loading and creating vector store for {document_path}")
    documents = load_pdf_document(document_path)

    logger.info("Creating vector store")
    vector_store = create_vector_store(documents)
    logger.info("Vector store created")

    # Save the embedded data produced in the previous step.
    logger.info("Saving the vector store")
    save_embedded_data(vector_store)
    logger.info("Vector store saved")

    return status_message
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Use the path given as the first command-line argument when there is
    # one; otherwise fall back to the default manual. The default is a
    # relative path, so it is resolved against the current working directory.
    document_path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "./data/corolla-2020-toyota-owners-manual.pdf"
    )
    load_and_create_vector_store(document_path)
|