Pdf Ingestion pipeline completed

2024-08-05 22:14:19 +01:00
parent b0c3eb8032
commit c34de21971
15 changed files with 318 additions and 90 deletions
@@ -0,0 +1,42 @@
+from langchain_community.document_loaders import PyPDFLoader
+from utils import create_vector_store, save_embedded_data
+import sys, os
+
+#  Add the root directory to sys.path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from loggings.logging_config import logger
+
+# A function to load the pdf document
+def load_pdf_document(document_path: str):
+    """Load a PDF file and split it into documents.
+
+    The extension check is case-insensitive, so paths ending in ``.PDF`` or
+    ``.Pdf`` are accepted as well as ``.pdf``.
+
+    Args:
+        document_path: Path of the PDF file to load.
+
+    Returns:
+        The list returned by ``PyPDFLoader(document_path).load_and_split()``.
+
+    Raises:
+        ValueError: If ``document_path`` does not end with a ``.pdf``
+            extension (in any letter case).
+    """
+    logger.info(f"Loading document from {document_path}")
+    logger.info("Checking if the document is a pdf")
+
+    # Guard clause: reject non-PDF paths before constructing the loader.
+    if not document_path.lower().endswith(".pdf"):
+        logger.error(f"Unsupported document type for {document_path}")
+        raise ValueError(f"Unsupported document type for {document_path}")
+
+    logger.info("Document is a pdf")
+    logger.info("Loading and splitting the document")
+    pages = PyPDFLoader(document_path).load_and_split()
+    logger.info(f"Document loaded and split into {len(pages)} pages")
+    return pages
+    
+# creating a function that loads the pdf document and creates the vector store
+def load_and_create_vector_store(document_path: str):
+    """Run the ingestion steps for one document: load, embed, save.
+
+    Args:
+        document_path: Path handed to ``load_pdf_document``.
+
+    Returns:
+        The fixed status string ``"Vector store created and saved"``.
+    """
+    logger.info(f"Loading and creating vector store for {document_path}")
+    split_docs = load_pdf_document(document_path)
+
+    # Build the vector store from the loaded documents.
+    logger.info("Creating vector store")
+    vector_store = create_vector_store(split_docs)
+    logger.info("Vector store created")
+
+    # Persist the freshly built store.
+    logger.info("Saving the vector store")
+    save_embedded_data(vector_store)
+    logger.info("Vector store saved")
+
+    return "Vector store created and saved"
+
+
+
+if __name__ == "__main__":
+    # Ingest the PDF named by the first command-line argument; when no
+    # argument is given, fall back to the previously hard-coded manual path.
+    default_path = "./data/corolla-2020-toyota-owners-manual.pdf"
+    document_path = sys.argv[1] if len(sys.argv) > 1 else default_path
+    load_and_create_vector_store(document_path)
@@ -0,0 +1,42 @@
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+from langchain_community.vectorstores import FAISS
+
+
+# loading the embedding model
+def load_embedding_model(model_name="BAAI/bge-small-en", device=None):
+    """Create the HuggingFace BGE embedding model.
+
+    Args:
+        model_name: Name of the model passed to ``HuggingFaceBgeEmbeddings``.
+        device: Device string placed in ``model_kwargs`` (e.g. ``"cuda"`` or
+            ``"cpu"``). When ``None``, ``"cuda"`` is chosen if torch reports
+            an available CUDA device, otherwise ``"cpu"``.
+
+    Returns:
+        A ``HuggingFaceBgeEmbeddings`` instance configured with
+        ``normalize_embeddings=True``.
+    """
+    if device is None:
+        # Pick the device at call time so importing this module does not
+        # fail on machines without a GPU.
+        try:
+            import torch
+        except ImportError:
+            device = "cpu"
+        else:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    return HuggingFaceBgeEmbeddings(
+        model_name=model_name,
+        model_kwargs={"device": device},
+        encode_kwargs={"normalize_embeddings": True},
+    )
+
+# Module-level embedding model: created once when this module is imported and
+# used as the default `embeddings` argument of create_vector_store() and
+# load_embedded_data().
+embeddings = load_embedding_model()
+
+
+# A function to create the vector store
+def create_vector_store(document, embeddings=embeddings):
+    """Build a FAISS vector store from the given documents.
+
+    Args:
+        document: Documents passed to ``FAISS.from_documents``.
+        embeddings: Embedding model to use; defaults to the module-level
+            ``embeddings`` object.
+
+    Returns:
+        The store returned by ``FAISS.from_documents``.
+    """
+    return FAISS.from_documents(document, embeddings)
+
+# A function to save the embedded data
+def save_embedded_data(docs,  key="pdf"):
+    """Save a vector store under ``vec-db/index/faiss_index_<key>``.
+
+    Args:
+        docs: Object exposing ``save_local(path)``.
+        key: Suffix of the index directory name.
+    """
+    target_dir = f"vec-db/index/faiss_index_{key}"
+    docs.save_local(target_dir)
+    print("Embeddings saved")
+
+# A function to load the embedded data
+def load_embedded_data(embeddings=embeddings, key="pdf"):
+    """Load the FAISS index stored under ``vec-db/index/faiss_index_<key>``.
+
+    Args:
+        embeddings: Embedding model passed to ``FAISS.load_local``; defaults
+            to the module-level ``embeddings`` object.
+        key: Suffix of the index directory name.
+
+    Returns:
+        The store returned by ``FAISS.load_local``.
+    """
+    index_dir = f"vec-db/index/faiss_index_{key}"
+    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
+    # pickled data; presumably only indexes written by this project should
+    # be loaded here -- confirm.
+    return FAISS.load_local(
+        index_dir, embeddings, allow_dangerous_deserialization=True
+    )
+
+# A document search function
+def search(db, query, k=4):
+    """Run a similarity search and summarise the hits.
+
+    Args:
+        db: Object exposing ``similarity_search(query, k)``.
+        query: Query string to search for.
+        k: Number of results requested.
+
+    Returns:
+        A 3-tuple ``(top, combined, pages)``:
+          * ``top`` -- ``page_content`` of the first hit, or ``""`` when the
+            search returns nothing;
+          * ``combined`` -- the ``page_content`` of every hit, each followed
+            by a newline;
+          * ``pages`` -- the ``"page"`` metadata value of every hit
+            (``None`` for a hit without that key).
+    """
+    docs = db.similarity_search(query, k)
+    # An empty result set yields empty values rather than an IndexError.
+    if not docs:
+        return "", "", []
+
+    combined = "".join(f"{doc.page_content}\n" for doc in docs)
+    pages = [doc.metadata.get("page") for doc in docs]
+    return docs[0].page_content, combined, pages