first commit
This commit is contained in:
@@ -5,16 +5,21 @@ we tried to follow the following steps:
|
||||
- We will divide the text into some chunks.
|
||||
- Transform the chunks into vectors using an embedding algorithm.
|
||||
- Save the vectors to a vector database.
|
||||
- If an user quer appears, we'll find some best matches.
|
||||
- If an user query appears, we'll find some best matches.
|
||||
So, these are the steps we do as preparation of the dataset.
|
||||
Then,
|
||||
If a query appeared, we do the following:
|
||||
- We first take the quer and find matches with the data we have on ector database, like a semantic serch.
|
||||
- We first take the query and find matches with the data we have in the vector database, like a semantic search.
|
||||
- We take those contexts, and generate a prompt appropriate to the use case, including the contexts and the user's original question. We tell gpt-3 to
|
||||
answer based on the context.
|
||||
|
||||
Note: The embedding model used here has 384 dimensions.
|
||||
|
||||
Useful Docs:
|
||||
- [Openai](https://platform.openai.com/docs)
|
||||
- [Pinecone](https://docs.pinecone.io/docs/quickstart)
|
||||
|
||||
|
||||
Tasks:
|
||||
1. Load the text from the given docx file and split it into some chunks. (A splitter is defined; you can use that.)
|
||||
2. Add all the split chunks to the vector database. (Use the addData function.)
|
||||
@@ -23,4 +28,4 @@ Tasks:
|
||||
5. Put everything together so that we can pass a query to the function user_query and get a solid answer.
|
||||
6. The embedding model we used here is a basic embedding model, change the model and use openai's embedding model 'text-embedding-ada-002'
|
||||
7. Can we improve something in this process? List any suggestions you can think of.
|
||||
8. Do you think you have a better idea to handle the whole process? Write a summarry about the alternative approach.
|
||||
8. Do you think you have a better idea to handle the whole process? Write a summary about the alternative approach.
|
||||
+78
-78
@@ -1,18 +1,4 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
@@ -27,18 +13,23 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import os\n",
|
||||
"os.environ['OPENAI_API_KEY'] = \"\""
|
||||
],
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "BYUc4Z7vY2bb"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"os.environ['OPENAI_API_KEY'] = \"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "MZAy8TaKY6pI"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#This is for embedding. In here, one LM model from huggingface used.\n",
|
||||
"\n",
|
||||
@@ -48,15 +39,15 @@
|
||||
"\n",
|
||||
"text ='Abc'\n",
|
||||
"model.encode(text).tolist() #example how to do encoding."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "MZAy8TaKY6pI"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "gu04tON0cZvT"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Function to split long documents in to smaller parts\n",
|
||||
"def split_text_into_chunks(plain_text, max_chars=2000):\n",
|
||||
@@ -71,28 +62,28 @@
|
||||
" if current_chunk:\n",
|
||||
" text_chunks.append(current_chunk.strip())\n",
|
||||
" return text_chunks"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "gu04tON0cZvT"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import pinecone\n",
|
||||
"pinecone.init(api_key=\"\", environment=\"\") #Initialization of vector database module\n",
|
||||
"index = pinecone.Index(\"\") #Fill out with index name."
|
||||
],
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "EqCYELlQZN0m"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pinecone\n",
|
||||
"pinecone.init(api_key=\"\", environment=\"\") #Todo: Initialization of vector database module\n",
|
||||
"index = pinecone.Index(\"\") #Todo: Fill out with index name."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "CAqSjLcQZjjJ"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def addData(corpusData):\n",
|
||||
" id = index.describe_index_stats()['total_vector_count']\n",
|
||||
@@ -102,15 +93,15 @@
|
||||
" model.encode(chunk).tolist(), #We are using the model to encode the original chunk of text.\n",
|
||||
" {'context': chunk}) #In metadata we are storing the original text here as context. \n",
|
||||
" index.upsert(vectors=[chunkInfo])"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "CAqSjLcQZjjJ"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "8VIZ5_ufbRQ5"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#This function is responsible for matching the input string with already existing data in the vector database.\n",
|
||||
"\n",
|
||||
@@ -119,62 +110,71 @@
|
||||
" result = index.query(query_em, top_k=k, includeMetadata=True)\n",
|
||||
" \n",
|
||||
" return [result['matches'][i]['metadata']['context'] for i in range(k)]"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "8VIZ5_ufbRQ5"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def create_prompt(context,query):\n",
|
||||
" #Unfinished, Should be generated with the context/contexts we find by doing semantaic search\n",
|
||||
" pass"
|
||||
],
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "AoRDzK85aF9E"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_prompt(context,query):\n",
|
||||
" #Todo: Should be generated with the context/contexts we find by doing semantic search\n",
|
||||
" pass"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def generate_answer(prompt):\n",
|
||||
" #Unfinished, Pass the generated prompt and pass it to gpt-3 to get answers.\n",
|
||||
" pass"
|
||||
],
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "IyPNrKW3aeoD"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def generate_answer(prompt):\n",
|
||||
" #Todo: Pass the generated prompt and pass it to gpt-3 to get answers.\n",
|
||||
" pass"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def user_query(query):\n",
|
||||
" \n",
|
||||
" pass\n"
|
||||
],
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "uWM2IcOKarWz"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_query(query):\n",
|
||||
" #Todo: Make all the things together.\n",
|
||||
" pass\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"user_query(\"How can I do this?\")"
|
||||
],
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "jBds94_gbJ_G"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"user_query(\"How can I do this?\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user