PDF ingestion pipeline completed
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,42 @@
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
from utils import create_vector_store, save_embedded_data
|
||||
import sys, os
|
||||
|
||||
# Add the root directory to sys.path
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
from loggings.logging_config import logger
|
||||
|
||||
# A function to load the pdf document
|
||||
def load_pdf_document(document_path: str):
    """Load a PDF file and split it with ``PyPDFLoader.load_and_split``.

    Args:
        document_path: Path of the file to ingest. The ``.pdf`` extension
            check is case-insensitive, so ``manual.PDF`` is accepted too.

    Returns:
        The list produced by ``PyPDFLoader(document_path).load_and_split()``.

    Raises:
        ValueError: If ``document_path`` does not end with ``.pdf``.
    """
    logger.info(f"Loading document from {document_path}")
    logger.info("Checking if the document is a pdf")

    # Guard clause: reject anything that is not a PDF before touching the
    # loader. Lower-casing first avoids rejecting files named "*.PDF".
    if not document_path.lower().endswith(".pdf"):
        logger.error(f"Unsupported document type for {document_path}")
        raise ValueError(f"Unsupported document type for {document_path}")

    logger.info("Document is a pdf")
    logger.info("Loading and splitting the document")
    pdf_doc = PyPDFLoader(document_path)
    pages = pdf_doc.load_and_split()
    logger.info(f"Document loaded and split into {len(pages)} pages")
    return pages
|
||||
|
||||
# creating a function that loads the pdf document and creates the vector store
|
||||
def load_and_create_vector_store(document_path: str):
    """Run the ingestion pipeline for one document: load, embed, persist.

    Args:
        document_path: Path handed to ``load_pdf_document``.

    Returns:
        The status string ``"Vector store created and saved"``.
    """
    logger.info(f"Loading and creating vector store for {document_path}")
    split_docs = load_pdf_document(document_path)

    # Build the vector store from the split documents.
    logger.info("Creating vector store")
    vector_store = create_vector_store(split_docs)
    logger.info("Vector store created")

    # Persist the freshly built store.
    logger.info("Saving the vector store")
    save_embedded_data(vector_store)
    logger.info("Vector store saved")

    return "Vector store created and saved"
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use the path given as the first command-line argument when present;
    # otherwise fall back to the previously hard-coded manual so a bare
    # invocation behaves exactly as before.
    default_document_path = "./data/corolla-2020-toyota-owners-manual.pdf"
    document_path = sys.argv[1] if len(sys.argv) > 1 else default_document_path
    load_and_create_vector_store(document_path)
|
||||
@@ -0,0 +1,42 @@
|
||||
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
||||
from langchain_community.vectorstores import FAISS
|
||||
|
||||
|
||||
# loading the embedding model
|
||||
def load_embedding_model(device=None):
    """Build the ``BAAI/bge-small-en`` HuggingFace BGE embedding model.

    Args:
        device: Device string placed in ``model_kwargs`` (for example
            ``"cuda"`` or ``"cpu"``). When omitted, ``"cuda"`` is used if
            PyTorch reports CUDA as available, otherwise ``"cpu"``.

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance configured with
        ``normalize_embeddings=True``.
    """
    if device is None:
        # Hard-coding "cuda" breaks on machines without a GPU, so probe for
        # it and fall back to the CPU instead.
        try:
            import torch

            device = "cuda" if torch.cuda.is_available() else "cpu"
        except ImportError:
            device = "cpu"

    return HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},
    )
|
||||
|
||||
# loading the embedding model
|
||||
# Built once at import time; create_vector_store and load_embedded_data use
# this instance as the default value of their `embeddings` parameter.
embeddings = load_embedding_model()
|
||||
|
||||
|
||||
# A function to create the vector store
|
||||
def create_vector_store(document, embeddings=embeddings):
    """Create a FAISS vector store from the given documents.

    Args:
        document: Documents passed straight to ``FAISS.from_documents``.
        embeddings: Embedding model to use; defaults to the module-level
            ``embeddings`` instance.

    Returns:
        The store returned by ``FAISS.from_documents``.
    """
    return FAISS.from_documents(document, embeddings)
|
||||
|
||||
# A function to save the embedded data
|
||||
def save_embedded_data(docs, key="pdf"):
    """Save a vector store to its per-key index directory.

    Args:
        docs: Object exposing ``save_local(path)``; it is called with the
            target directory.
        key: Suffix appended to ``vec-db/index/faiss_index_`` to form the
            target directory.
    """
    index_dir = f"vec-db/index/faiss_index_{key}"
    docs.save_local(index_dir)
    print("Embeddings saved")
|
||||
|
||||
# A function to load the embedded data
|
||||
def load_embedded_data(embeddings=embeddings, key="pdf"):
    """Load a previously saved FAISS index from its per-key directory.

    Args:
        embeddings: Embedding model handed to ``FAISS.load_local``; defaults
            to the module-level ``embeddings`` instance.
        key: Suffix appended to ``vec-db/index/faiss_index_`` to locate the
            index directory.

    Returns:
        The store returned by ``FAISS.load_local``.
    """
    index_dir = f"vec-db/index/faiss_index_{key}"
    # NOTE(review): allow_dangerous_deserialization=True opts in to what the
    # flag name itself calls dangerous deserialization. Confirm that index
    # directories are only ever ones this project wrote itself.
    return FAISS.load_local(
        index_dir,
        embeddings,
        allow_dangerous_deserialization=True,
    )
|
||||
|
||||
# A document search function
|
||||
def search(db, query, k=4):
    """Run a similarity search and summarize the hits.

    Args:
        db: Object exposing ``similarity_search(query, k)``.
        query: Query passed through to ``similarity_search``.
        k: Number of results requested.

    Returns:
        A 3-tuple ``(top_content, combined_content, pages)``:
        ``top_content`` is the ``page_content`` of the first hit,
        ``combined_content`` is every hit's ``page_content`` each followed by
        a newline, and ``pages`` lists each hit's ``metadata["page"]``
        (``None`` where a hit has no ``page`` entry). When the search returns
        no hits the result is ``("", "", [])``.
    """
    docs = db.similarity_search(query, k)

    # An empty result would otherwise fail on docs[0] below.
    if not docs:
        return "", "", []

    # join avoids repeated string concatenation and no longer shadows the
    # builtin `all`; the trailing newline per hit is kept as before.
    combined_content = "".join(f"{doc.page_content}\n" for doc in docs)
    # .get tolerates documents whose metadata carries no page number.
    pages = [doc.metadata.get("page") for doc in docs]
    return docs[0].page_content, combined_content, pages
|
||||
Reference in New Issue
Block a user