first commit

This commit is contained in:
tahsin-protik
2023-02-22 01:55:26 +06:00
commit bfdc89cda1
2 changed files with 206 additions and 0 deletions
+26
View File
@@ -0,0 +1,26 @@
The objective of this project is to get better answers from GPT-3 to user queries on a specific subject.
For some domains, the data GPT-3 was trained on is missing or out of date. To handle that,
we follow these steps:
- First, we read the data we want to use for the specific use case.
- We divide it into chunks.
- We transform the chunks into vectors using an embedding algorithm.
- We save the vectors to a vector database.
- When a user query arrives, we find the best matches.
These are the steps we perform to prepare the dataset.
Then,
when a query arrives, we do the following:
- We first take the query and find matches among the data stored in the vector database, i.e. a semantic search.
- We take those contexts and generate a prompt appropriate to the use case, including the contexts and the user's original question. We tell GPT-3 to
answer based on the context.
Note: The embedding model used here has 384 dimensions.
Tasks:
1. Load the text from the given docx file and split it into chunks. (A splitter is defined; you can use that.)
2. Add all the split chunks to the vector database. (Use the addData function.)
3. Create a prompt using the process discussed above.
4. Get the answer from gpt-3 api.
5. Put everything together so that we can pass a query to the function user_query and get a solid answer.
6. The embedding model used here is a basic one; replace it with OpenAI's embedding model 'text-embedding-ada-002'.
7. Can we improve something in this process? List any suggestions you can think of.
8. Do you think you have a better idea for handling the whole process? Write a summary of the alternative approach.
+180
View File
@@ -0,0 +1,180 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xcccEva1WWrh"
},
"outputs": [],
"source": [
"!pip install sentence_transformers pinecone openai"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"os.environ['OPENAI_API_KEY'] = \"\""
],
"metadata": {
"id": "BYUc4Z7vY2bb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#This is for embedding. In here, one LM model from huggingface used.\n",
"\n",
"from sentence_transformers import SentenceTransformer, util\n",
"model = SentenceTransformer('all-MiniLM-L6-v2')\n",
"\n",
"\n",
"text ='Abc'\n",
"model.encode(text).tolist() #exmple how to do encoding."
],
"metadata": {
"id": "MZAy8TaKY6pI"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#Function to split long documents in to smaller parts\n",
"def split_text_into_chunks(plain_text, max_chars=2000):\n",
" text_chunks = []\n",
" current_chunk = \"\"\n",
" for line in plain_text.split(\"\\n\"):\n",
" if len(current_chunk) + len(line) + 1 <= max_chars:\n",
" current_chunk += line + \" \"\n",
" else:\n",
" text_chunks.append(current_chunk.strip())\n",
" current_chunk = line + \" \"\n",
" if current_chunk:\n",
" text_chunks.append(current_chunk.strip())\n",
" return text_chunks"
],
"metadata": {
"id": "gu04tON0cZvT"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import pinecone\n",
"pinecone.init(api_key=\"\", environment=\"\") #Initialization of vector database module\n",
"index = pinecone.Index(\"\") #Fill out with index name."
],
"metadata": {
"id": "EqCYELlQZN0m"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def addData(corpusData):\n",
" id = index.describe_index_stats()['total_vector_count']\n",
" for i in range(len(corpusData)):\n",
" chunk=corpusData[i]\n",
" chunkInfo=(str(id+i),\n",
" model.encode(chunk).tolist(), #We are using the model to encode the original chunk of text.\n",
" {'context': chunk}) #In metadata we are storing the original text here as context. \n",
" index.upsert(vectors=[chunkInfo])"
],
"metadata": {
"id": "CAqSjLcQZjjJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#This function is responsible for matching the input string with alread existing data on vector database.\n",
"\n",
"def find_match(query,k):\n",
" query_em = model.encode(query).tolist()\n",
" result = index.query(query_em, top_k=k, includeMetadata=True)\n",
" \n",
" return [result['matches'][i]['metadata']['context'] for i in range(k)]"
],
"metadata": {
"id": "8VIZ5_ufbRQ5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def create_prompt(context,query):\n",
" #Unfinished, Should be generated with the context/contexts we find by doing semantaic search\n",
" pass"
],
"metadata": {
"id": "AoRDzK85aF9E"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def generate_answer(prompt):\n",
" #Unfinished, Pass the generated prompt and pass it to gpt-3 to get answers.\n",
" pass"
],
"metadata": {
"id": "IyPNrKW3aeoD"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def user_query(query):\n",
" \n",
" pass\n"
],
"metadata": {
"id": "uWM2IcOKarWz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"user_query(\"How can I do this?\")"
],
"metadata": {
"id": "jBds94_gbJ_G"
},
"execution_count": null,
"outputs": []
}
]
}