commit bfdc89cda125c416eed0f8dd5cb1ee6bbcca20b6 Author: tahsin-protik Date: Wed Feb 22 01:55:26 2023 +0600 first commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..f4da9dc --- /dev/null +++ b/README.md @@ -0,0 +1,26 @@ +The objective of this project is to get better answers from gpt-3 to user queries on a specific subject. +In some domains, the data gpt-3 was trained on is not up to date. To handle that, +we follow these steps: +- First, we read the data we want to use for the specific case. +- We divide it into chunks. +- We transform the chunks into vectors using an embedding algorithm. +- We save the vectors to a vector database. +- When a user query appears, we find the best matches. +These are the steps we do as preparation of the dataset. +Then, +when a query arrives, we do the following: +- We first take the query and find matches with the data we have in the vector database, like a semantic search. +- We take those contexts and generate a prompt appropriate to the use case, including the contexts and the user's original question. We tell gpt-3 to +answer based on the context. + +Note: The embedding model used here has 384 dimensions. + +Tasks: +1. Load the text from the given docx file and split it into chunks. (A splitter is defined; you can use that.) +2. Add all the split chunks to the vector database. (Use the addData function.) +3. Create a prompt using the process discussed above. +4. Get the answer from the gpt-3 API. +5. Put everything together so that we can pass a query to the function user_query and get a solid answer. +6. The embedding model used here is a basic embedding model; change it to OpenAI's embedding model 'text-embedding-ada-002'. +7. Can we improve something in this process? List any suggestions you can think of. +8. Do you think you have a better idea to handle the whole process? Write a summary of the alternative approach. 
\ No newline at end of file diff --git a/ds_task_1.ipynb b/ds_task_1.ipynb new file mode 100644 index 0000000..8a8f115 --- /dev/null +++ b/ds_task_1.ipynb @@ -0,0 +1,180 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xcccEva1WWrh" + }, + "outputs": [], + "source": [ + "!pip install sentence_transformers pinecone openai" + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"\"" + ], + "metadata": { + "id": "BYUc4Z7vY2bb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#This is for embedding. Here, a language model from Hugging Face is used.\n", + "\n", + "from sentence_transformers import SentenceTransformer, util\n", + "model = SentenceTransformer('all-MiniLM-L6-v2')\n", + "\n", + "\n", + "text ='Abc'\n", + "model.encode(text).tolist() #example of how to do encoding." 
+ ], + "metadata": { + "id": "MZAy8TaKY6pI" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#Function to split long documents into smaller parts\n", + "def split_text_into_chunks(plain_text, max_chars=2000):\n", + " text_chunks = []\n", + " current_chunk = \"\"\n", + " for line in plain_text.split(\"\\n\"):\n", + " if len(current_chunk) + len(line) + 1 <= max_chars:\n", + " current_chunk += line + \" \"\n", + " else:\n", + " text_chunks.append(current_chunk.strip())\n", + " current_chunk = line + \" \"\n", + " if current_chunk:\n", + " text_chunks.append(current_chunk.strip())\n", + " return text_chunks" + ], + "metadata": { + "id": "gu04tON0cZvT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import pinecone\n", + "pinecone.init(api_key=\"\", environment=\"\") #Initialization of vector database module\n", + "index = pinecone.Index(\"\") #Fill out with index name." + ], + "metadata": { + "id": "EqCYELlQZN0m" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def addData(corpusData):\n", + " id = index.describe_index_stats()['total_vector_count']\n", + " for i in range(len(corpusData)):\n", + " chunk=corpusData[i]\n", + " chunkInfo=(str(id+i),\n", + " model.encode(chunk).tolist(), #We are using the model to encode the original chunk of text.\n", + " {'context': chunk}) #In metadata we are storing the original text here as context. 
\n", + " index.upsert(vectors=[chunkInfo])" + ], + "metadata": { + "id": "CAqSjLcQZjjJ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#This function is responsible for matching the input string with already existing data in the vector database.\n", + "\n", + "def find_match(query,k):\n", + " query_em = model.encode(query).tolist()\n", + " result = index.query(query_em, top_k=k, includeMetadata=True)\n", + " \n", + " return [result['matches'][i]['metadata']['context'] for i in range(k)]" + ], + "metadata": { + "id": "8VIZ5_ufbRQ5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def create_prompt(context,query):\n", + " #Unfinished. Should be generated with the context/contexts we find by doing semantic search\n", + " pass" + ], + "metadata": { + "id": "AoRDzK85aF9E" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def generate_answer(prompt):\n", + " #Unfinished. Take the generated prompt and pass it to gpt-3 to get answers.\n", + " pass" + ], + "metadata": { + "id": "IyPNrKW3aeoD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def user_query(query):\n", + " \n", + " pass\n" + ], + "metadata": { + "id": "uWM2IcOKarWz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "user_query(\"How can I do this?\")" + ], + "metadata": { + "id": "jBds94_gbJ_G" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file