commit ffe41fb97c902dd77a2b79f8a7df515bcf0c7542
Author: ryan wong <wongryan2001@gmail.com>
Date:   Tue Feb 28 07:02:58 2023 -0500

    first commit

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..b53c80a
Binary files /dev/null and b/.DS_Store differ
diff --git a/DataLaw.docx b/DataLaw.docx
new file mode 100644
index 0000000..1a37551
Binary files /dev/null and b/DataLaw.docx differ
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e8b9bf5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,83 @@
+SET A:
+
+The objective of this project is to get better answers from GPT-3 to user queries on a specific subject.
+In some domains, the data GPT-3 was trained on is missing or out of date. To handle that,
+we follow these steps:
+
+- First, we read the data we want to use for a specific case.
+- We divide it into chunks.
+- We transform the chunks into vectors using an embedding model.
+- We save the vectors to a vector database.
+- When a user query arrives, we find the best matches.
+  These are the steps we do as preparation of the dataset.
+  Then,
+  when a query arrives, we do the following:
+- We first take the query and find matches against the data in the vector database, i.e. a semantic search.
+- We take those contexts and generate a prompt appropriate to the use case, including the contexts and the user's original question. We tell GPT-3 to
+  answer based on the context.
+
+Note: The embedding model used here has 384 dimensions.
+
+Useful Docs:
+
+- [Openai](https://platform.openai.com/docs)
+- [Pinecone](https://docs.pinecone.io/docs/quickstart)
+- [HuggingFace](https://huggingface.co/models)
+
+Tasks:
+
+1. Load the text from the given docx file and split it into chunks. (A splitter is defined; you can use that.)
+2. Add all the split chunks to the vector database. (Use the addData function.)
+3. Create a prompt using the process discussed above.
+4. Get the answer from the GPT-3 API.
+5. Put everything together so that we can pass a query to the function user_query and get a solid answer.
+6. The embedding model used here is a basic embedding model; change it to OpenAI's embedding model 'text-embedding-ada-002'.
+7. Can we improve something in this process? List any suggestions you can think of.
+8. Do you think you have a better idea for handling the whole process? Write a summary of the alternative approach.
+
+SET B:
+
+Problem:
+We have a set of rules for a specific game. Based on the rules, we need to implement a system
+to predict the optimal next move of a player.
+
+Use this as a reference for the rules: https://gamerules.com/rules/7-wonders-duel/
+
+Make several different scenarios to test the system you built.
+
+SET C:
+Problem:
+Given these rules:
+
+```
+We have 7 ingredients:
+oranges
+apples
+pears
+grapes
+watermelon
+lemon
+lime
+
+
+Questions we ask client:
+1.Do you go out to party on weekends? (yes or no)
+2.What flavours do you like? (cider, sweet, waterlike)
+3.What texture you don't like? (smooth, slimy, rough)
+4.What price range will you buy drink for? ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
+
+If they party on weekends, apples, pears, grapes, watermelon are allowed.
+If they like cider, show apples, oranges, lemon, lime.
+If they like sweet, show watermelon, orange.
+If they like waterlike, show watermelon.
+If grapes is chosen, remove watermelon from the list.
+If texture you don't like is smooth, remove pears.
+If texture you don't like is slimy, remove watermelon, lime and grape.
+If texture you don't like is waterlike, remove watermelon.
+If price < $3 remove lime, watermelon.
+If price > $4 and < $7 remove pears, apples.
+```
+
+Make a function that takes the answers to the 4 questions and structures a GPT-3 prompt from these rules to give you the list of recommended fruits.
+
+Make a simple Flask POST API that returns the answers given the input in the POST body with content type application/json.
diff --git a/ds_task_1.ipynb b/ds_task_1.ipynb
new file mode 100644
index 0000000..3d07bd7
--- /dev/null
+++ b/ds_task_1.ipynb
@@ -0,0 +1,180 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "xcccEva1WWrh"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install sentence_transformers pinecone openai"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "BYUc4Z7vY2bb"
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "os.environ['OPENAI_API_KEY'] = \"\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "MZAy8TaKY6pI"
+      },
+      "outputs": [],
+      "source": [
+        "#This is for embedding. Here, a language model from Hugging Face is used.\n",
+        "\n",
+        "from sentence_transformers import SentenceTransformer, util\n",
+        "model = SentenceTransformer('all-MiniLM-L6-v2')\n",
+        "\n",
+        "\n",
+        "text ='Abc'\n",
+        "model.encode(text).tolist() #exmple how to do encoding."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "gu04tON0cZvT"
+      },
+      "outputs": [],
+      "source": [
+        "#Function to split long documents into smaller parts\n",
+        "def split_text_into_chunks(plain_text, max_chars=2000):\n",
+        "    text_chunks = []\n",
+        "    current_chunk = \"\"\n",
+        "    for line in plain_text.split(\"\\n\"):\n",
+        "        if len(current_chunk) + len(line) + 1 <= max_chars:\n",
+        "            current_chunk += line + \" \"\n",
+        "        else:\n",
+        "            text_chunks.append(current_chunk.strip())\n",
+        "            current_chunk = line + \" \"\n",
+        "    if current_chunk:\n",
+        "        text_chunks.append(current_chunk.strip())\n",
+        "    return text_chunks"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "EqCYELlQZN0m"
+      },
+      "outputs": [],
+      "source": [
+        "import pinecone\n",
+        "pinecone.init(api_key=\"\", environment=\"\") #Todo: Initialization of vector database module\n",
+        "index = pinecone.Index(\"\") #Todo: Fill out with index name."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CAqSjLcQZjjJ"
+      },
+      "outputs": [],
+      "source": [
+        "def addData(corpusData):\n",
+        "    id  = index.describe_index_stats()['total_vector_count']\n",
+        "    for i in range(len(corpusData)):\n",
+        "        chunk=corpusData[i]\n",
+        "        chunkInfo=(str(id+i),\n",
+        "                model.encode(chunk).tolist(), #We are using the model to encode the original chunk of text.\n",
+        "                {'context': chunk}) #In metadata we are storing the original text here as context. \n",
+        "        index.upsert(vectors=[chunkInfo])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "8VIZ5_ufbRQ5"
+      },
+      "outputs": [],
+      "source": [
+        "#This function is responsible for matching the input string with already existing data in the vector database.\n",
+        "\n",
+        "def find_match(query,k):\n",
+        "    query_em = model.encode(query).tolist()\n",
+        "    result = index.query(query_em, top_k=k, includeMetadata=True)\n",
+        "    \n",
+        "    return [result['matches'][i]['metadata']['context'] for i in range(k)]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "AoRDzK85aF9E"
+      },
+      "outputs": [],
+      "source": [
+        "def create_prompt(context,query):\n",
+        "  #Todo: Should be generated with the context/contexts we find by doing semantic search\n",
+        "  pass"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "IyPNrKW3aeoD"
+      },
+      "outputs": [],
+      "source": [
+        "def generate_answer(prompt):\n",
+        "  #Todo: Pass the generated prompt to gpt-3 to get answers.\n",
+        "  pass"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "uWM2IcOKarWz"
+      },
+      "outputs": [],
+      "source": [
+        "def user_query(query):\n",
+        "  #Todo: Make all the things together.\n",
+        "  pass\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "jBds94_gbJ_G"
+      },
+      "outputs": [],
+      "source": [
+        "user_query(\"How can I do this?\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}