{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "xcccEva1WWrh"
   },
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -q sentence_transformers pinecone openai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "BYUc4Z7vY2bb"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Only fall back to the placeholder when no key is configured, so a key that is\n",
    "# already exported in the environment is not overwritten with an empty string.\n",
    "if not os.environ.get('OPENAI_API_KEY'):\n",
    "    os.environ['OPENAI_API_KEY'] = \"\"  # Todo: Fill out with the OpenAI API key (do not commit it)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "MZAy8TaKY6pI"
   },
   "outputs": [],
   "source": [
    "# This is for embedding. Here, a pretrained sentence-embedding model from Hugging Face is used.\n",
    "\n",
    "from sentence_transformers import SentenceTransformer, util\n",
    "model = SentenceTransformer('all-MiniLM-L6-v2')\n",
    "\n",
    "\n",
    "text = 'Abc'\n",
    "model.encode(text).tolist()  # Example of how to do encoding."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "gu04tON0cZvT"
   },
   "outputs": [],
   "source": [
    "# Function to split long documents into smaller parts.\n",
    "def split_text_into_chunks(plain_text, max_chars=2000):\n",
    "    \"\"\"Greedily pack the lines of ``plain_text`` into chunks for embedding.\n",
    "\n",
    "    Lines are joined with single spaces and are never cut, so every chunk has at\n",
    "    most ``max_chars`` characters unless a single line is longer than that; such\n",
    "    a line becomes an oversized chunk of its own.\n",
    "\n",
    "    Args:\n",
    "        plain_text: The document text; it is split on newline characters.\n",
    "        max_chars: Size budget of a chunk, counting one separator per line.\n",
    "\n",
    "    Returns:\n",
    "        A list of stripped, non-empty chunks (``[]`` for empty or blank input).\n",
    "    \"\"\"\n",
    "    text_chunks = []\n",
    "    current_chunk = \"\"\n",
    "    for line in plain_text.split(\"\\n\"):\n",
    "        if len(current_chunk) + len(line) + 1 <= max_chars:\n",
    "            current_chunk += line + \" \"\n",
    "        else:\n",
    "            text_chunks.append(current_chunk.strip())\n",
    "            current_chunk = line + \" \"\n",
    "    text_chunks.append(current_chunk.strip())\n",
    "    # Drop empty chunks: they appear when the very first line already exceeds\n",
    "    # max_chars or when the input is blank, and they carry nothing worth embedding.\n",
    "    return [chunk for chunk in text_chunks if chunk]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "EqCYELlQZN0m"
   },
   "outputs": [],
   "source": [
    "import pinecone\n",
    "\n",
    "# The `pinecone` package installed above no longer provides `pinecone.init(...)`;\n",
    "# the connection is made through a client object instead (no `environment` argument).\n",
    "pc = pinecone.Pinecone(api_key=\"\")  # Todo: Initialization of vector database module (fill out the API key).\n",
    "index = pc.Index(\"\")  # Todo: Fill out with index name."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "CAqSjLcQZjjJ"
   },
   "outputs": [],
   "source": [
    "def addData(corpusData, batch_size=100):\n",
    "    \"\"\"Embed every chunk of ``corpusData`` and upsert it into the vector index.\n",
    "\n",
    "    Vector IDs are consecutive integers (as strings) that continue from the\n",
    "    index's current ``total_vector_count``. The original text is stored under\n",
    "    the ``'context'`` metadata key.\n",
    "\n",
    "    Args:\n",
    "        corpusData: Sequence of text chunks to store.\n",
    "        batch_size: Maximum number of vectors sent per upsert request.\n",
    "    \"\"\"\n",
    "    chunks = list(corpusData)\n",
    "    if not chunks:\n",
    "        return\n",
    "    # NOTE(review): numbering from the vector count assumes the existing IDs are\n",
    "    # exactly '0'..'count-1' and that the reported count is up to date - confirm.\n",
    "    first_id = index.describe_index_stats()['total_vector_count']\n",
    "    # We are using the model to encode the original chunks of text (one batched call).\n",
    "    embeddings = model.encode(chunks).tolist()\n",
    "    # In metadata we are storing the original text as context.\n",
    "    vectors = [\n",
    "        (str(first_id + offset), embedding, {'context': chunk})\n",
    "        for offset, (chunk, embedding) in enumerate(zip(chunks, embeddings))\n",
    "    ]\n",
    "    # Upsert in batches instead of one request per chunk.\n",
    "    for start in range(0, len(vectors), batch_size):\n",
    "        index.upsert(vectors=vectors[start:start + batch_size])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8VIZ5_ufbRQ5"
   },
   "outputs": [],
   "source": [
    "# This function is responsible for matching the input string with already existing data in the vector database.\n",
    "\n",
    "def find_match(query, k):\n",
    "    \"\"\"Return the stored text of up to ``k`` chunks that best match ``query``.\n",
    "\n",
    "    Args:\n",
    "        query: The text to search for.\n",
    "        k: Maximum number of matches to request from the index.\n",
    "\n",
    "    Returns:\n",
    "        The ``'context'`` metadata of each returned match, in the order the\n",
    "        index returned them. The list is shorter than ``k`` when the index\n",
    "        returns fewer matches.\n",
    "    \"\"\"\n",
    "    query_em = model.encode(query).tolist()\n",
    "    result = index.query(vector=query_em, top_k=k, include_metadata=True)\n",
    "\n",
    "    # Iterate over what actually came back rather than range(k), which raised\n",
    "    # IndexError whenever fewer than k matches were returned.\n",
    "    return [match['metadata']['context'] for match in result['matches']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "AoRDzK85aF9E"
   },
   "outputs": [],
   "source": [
    "def create_prompt(context, query):\n",
    "    \"\"\"Build the prompt from the retrieved context(s) and the user's query.\n",
    "\n",
    "    Not implemented yet: currently does nothing and returns None.\n",
    "    \"\"\"\n",
    "    # Todo: Should be generated with the context/contexts we find by doing semantic search.\n",
    "    pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "IyPNrKW3aeoD"
   },
   "outputs": [],
   "source": [
    "def generate_answer(prompt):\n",
    "    \"\"\"Send the prompt to the language model and return its answer.\n",
    "\n",
    "    Not implemented yet: currently does nothing and returns None.\n",
    "    \"\"\"\n",
    "    # Todo: Pass the generated prompt to GPT-3 to get answers.\n",
    "    pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "uWM2IcOKarWz"
   },
   "outputs": [],
   "source": [
    "def user_query(query):\n",
    "    \"\"\"Answer a user question end to end.\n",
    "\n",
    "    Not implemented yet: currently does nothing and returns None.\n",
    "    \"\"\"\n",
    "    # Todo: Put all the pieces together.\n",
    "    pass\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jBds94_gbJ_G"
   },
   "outputs": [],
   "source": [
    "user_query(\"How can I do this?\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
"nbformat": 4, "nbformat_minor": 0 }