commit ffe41fb97c902dd77a2b79f8a7df515bcf0c7542 Author: ryan wong Date: Tue Feb 28 07:02:58 2023 -0500 first commit diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..b53c80a Binary files /dev/null and b/.DS_Store differ diff --git a/DataLaw.docx b/DataLaw.docx new file mode 100644 index 0000000..1a37551 Binary files /dev/null and b/DataLaw.docx differ diff --git a/README.md b/README.md new file mode 100644 index 0000000..e8b9bf5 --- /dev/null +++ b/README.md @@ -0,0 +1,83 @@ +SET A: + +The objective of this project is to get better answers from gpt-3 for user queries on a specific subject. +For some domains, gpt-3's knowledge is missing or out of date. To handle that, +we follow these steps: + +- First, we read the data we want to use for the specific case. +- We divide it into chunks. +- We transform the chunks into vectors using an embedding model. +- We save the vectors to a vector database. +- When a user query arrives, we find the best matches. + These are the steps we do as preparation of the dataset. + Then, + when a query arrives, we do the following: +- We first take the query and find matches against the data in the vector database (a semantic search). +- We take those contexts and generate a prompt appropriate to the use case, including the contexts and the user's original question. We tell gpt-3 to + answer based on the context. + +Note: The embedding model used here has 384 dimensions. + +Useful Docs: + +- [OpenAI](https://platform.openai.com/docs) +- [Pinecone](https://docs.pinecone.io/docs/quickstart) +- [HuggingFace](https://huggingface.co/models) + +Tasks: + +1. Load the text from the given docx file and split it into chunks. (A splitter is already defined; you can use that.) +2. Add all the split chunks to the vector database. (Use the addData function.) +3. Create a prompt using the process discussed above. +4. Get the answer from the gpt-3 API. +5. 
Put everything together so that we can pass a query to the function user_query and get a solid answer. +6. The embedding model used here is a basic one; replace it with OpenAI's embedding model 'text-embedding-ada-002'. +7. Can we improve something in this process? List any suggestions you can think of. +8. Do you think you have a better idea to handle the whole process? Write a summary of the alternative approach. + +SET B: + +Problem: +We have a set of rules for a specific game. Based on the rules, we need to implement a system +to predict the optimal next move of a player. + +Use this as a reference for the rules: https://gamerules.com/rules/7-wonders-duel/ + +Create several different scenarios to test the system you built. + +SET C: +Problem: +Given these rules: + +``` +We have 7 ingredients: +oranges +apples +pears +grapes +watermelon +lemon +lime + + +Questions we ask client: +1.Do you go out to party on weekends? (yes or no) +2.What flavours do you like? (cider, sweet, waterlike) +3.What texture you don't like? (smooth, slimy, rough) +4.What price range will you buy drink for? ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) + +If they party on weekends, apples, pears, grapes, watermelon are allowed. +If they like cider, show apples, oranges, lemon, lime. +If they like sweet, show watermelon, orange. +If they like waterlike, show watermelon. +If grapes is chosen, remove watermelon from the list. +If texture you don't like is smooth, remove pears. +If texture you don't like is slimy, remove watermelon, lime and grape. +If texture you don't like is waterlike, remove watermelon. +If price < $3 remove lime, watermelon. +If price > $4 and < $7 remove pears, apples. +``` + +Write a function that takes the answers to the 4 questions and builds a GPT-3 prompt from these rules that returns the list of recommended fruits. 
+ +Make a simple Flask POST API that returns the answers for the input given in the POST body with content type application/json diff --git a/ds_task_1.ipynb b/ds_task_1.ipynb new file mode 100644 index 0000000..3d07bd7 --- /dev/null +++ b/ds_task_1.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xcccEva1WWrh" + }, + "outputs": [], + "source": [ + "!pip install sentence_transformers pinecone openai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BYUc4Z7vY2bb" + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MZAy8TaKY6pI" + }, + "outputs": [], + "source": [ + "#This is for embedding. Here, a language model from Hugging Face is used.\n", + "\n", + "from sentence_transformers import SentenceTransformer, util\n", + "model = SentenceTransformer('all-MiniLM-L6-v2')\n", + "\n", + "\n", + "text ='Abc'\n", + "model.encode(text).tolist() #example of how to do encoding." 
def split_text_into_chunks(plain_text, max_chars=2000):
    """Split a long document into smaller chunks of bounded length.

    The text is split on newlines. Consecutive lines are joined with a single
    space and packed greedily into the current chunk until adding the next
    line would exceed ``max_chars``; then a new chunk is started. A single
    line longer than ``max_chars`` is cut into ``max_chars``-sized pieces so
    that no returned chunk is longer than the limit.

    Args:
        plain_text: The text to split.
        max_chars: Maximum chunk length in characters. Must be positive.

    Returns:
        The non-empty chunks, stripped of surrounding whitespace, in
        document order. Empty or whitespace-only input yields ``[]``.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    text_chunks = []
    pending = []     # lines collected for the chunk currently being built
    pending_len = 0  # size of the pending lines, counting one separator each

    for line in plain_text.split("\n"):
        # Hard-split over-long lines so the limit is honoured; the
        # `or [line]` keeps an empty line as a single (empty) piece.
        pieces = [
            line[start:start + max_chars]
            for start in range(0, len(line), max_chars)
        ] or [line]
        for piece in pieces:
            if pending_len + len(piece) + 1 > max_chars:
                chunk = " ".join(pending).strip()
                # Never emit empty chunks (e.g. when the very first line
                # does not fit, or only blank lines were collected).
                if chunk:
                    text_chunks.append(chunk)
                pending = []
                pending_len = 0
            pending.append(piece)
            pending_len += len(piece) + 1

    chunk = " ".join(pending).strip()
    if chunk:
        text_chunks.append(chunk)
    return text_chunks
def find_match(query, k):
    """Return the stored context text of the vectors closest to ``query``.

    The query is encoded with the module-level ``model`` and sent to the
    module-level vector ``index`` asking for the top ``k`` results with
    metadata included.

    Args:
        query: The text to search for.
        k: Maximum number of matches to request from the index.

    Returns:
        The ``'context'`` metadata value of every returned match, in the
        order the index returned them. The list may hold fewer than ``k``
        items when the index returns fewer matches.
    """
    query_em = model.encode(query).tolist()
    result = index.query(query_em, top_k=k, includeMetadata=True)

    # Iterate over what was actually returned instead of range(k): indexing
    # by position raised IndexError whenever fewer than k matches came back.
    return [match['metadata']['context'] for match in result['matches']]