first commit

2023-02-22 03:04:53 +06:00
parent 40442a9dab
commit 1247af9b8b
2 changed files with 87 additions and 82 deletions
@@ -1,18 +1,4 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
  "cells": [
    {
      "cell_type": "code",
@@ -27,18 +13,23 @@
    },
    {
      "cell_type": "code",
-      "source": [
-        "import os\n",
-        "os.environ['OPENAI_API_KEY'] = \"\""
-      ],
+      "execution_count": null,
      "metadata": {
        "id": "BYUc4Z7vY2bb"
      },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "os.environ['OPENAI_API_KEY'] = \"\""
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "MZAy8TaKY6pI"
+      },
+      "outputs": [],
      "source": [
        "#This is for embedding. In here, one LM model from huggingface used.\n",
        "\n",
@@ -48,15 +39,15 @@
        "\n",
        "text ='Abc'\n",
        "model.encode(text).tolist() #exmple how to do encoding."
-      ],
-      "metadata": {
-        "id": "MZAy8TaKY6pI"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "gu04tON0cZvT"
+      },
+      "outputs": [],
      "source": [
        "#Function to split long documents in to smaller parts\n",
        "def split_text_into_chunks(plain_text, max_chars=2000):\n",
@@ -71,28 +62,28 @@
        "    if current_chunk:\n",
        "        text_chunks.append(current_chunk.strip())\n",
        "    return text_chunks"
-      ],
-      "metadata": {
-        "id": "gu04tON0cZvT"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
-      "source": [
-        "import pinecone\n",
-        "pinecone.init(api_key=\"\", environment=\"\") #Initialization of vector database module\n",
-        "index = pinecone.Index(\"\") #Fill out with index name."
-      ],
+      "execution_count": null,
      "metadata": {
        "id": "EqCYELlQZN0m"
      },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "import pinecone\n",
+        "pinecone.init(api_key=\"\", environment=\"\") #Todo: Initialization of vector database module\n",
+        "index = pinecone.Index(\"\") #Todo: Fill out with index name."
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CAqSjLcQZjjJ"
+      },
+      "outputs": [],
      "source": [
        "def addData(corpusData):\n",
        "    id  = index.describe_index_stats()['total_vector_count']\n",
@@ -102,15 +93,15 @@
        "                model.encode(chunk).tolist(), #We are using the model to encode the original chunk of text.\n",
        "                {'context': chunk}) #In metadata we are storing the original text here as context. \n",
        "        index.upsert(vectors=[chunkInfo])"
-      ],
-      "metadata": {
-        "id": "CAqSjLcQZjjJ"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "8VIZ5_ufbRQ5"
+      },
+      "outputs": [],
      "source": [
        "#This function is responsible for matching the input string with alread existing data on vector database.\n",
        "\n",
@@ -119,62 +110,71 @@
        "    result = index.query(query_em, top_k=k, includeMetadata=True)\n",
        "    \n",
        "    return [result['matches'][i]['metadata']['context'] for i in range(k)]"
-      ],
-      "metadata": {
-        "id": "8VIZ5_ufbRQ5"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
-      "source": [
-        "def create_prompt(context,query):\n",
-        "  #Unfinished, Should be generated with the context/contexts we find by doing semantaic search\n",
-        "  pass"
-      ],
+      "execution_count": null,
      "metadata": {
        "id": "AoRDzK85aF9E"
      },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "def create_prompt(context,query):\n",
+        "  #Todo: Should be generated with the context/contexts we find by doing semantic search\n",
+        "  pass"
+      ]
    },
    {
      "cell_type": "code",
-      "source": [
-        "def generate_answer(prompt):\n",
-        "  #Unfinished, Pass the generated prompt and pass it to gpt-3 to get answers.\n",
-        "  pass"
-      ],
+      "execution_count": null,
      "metadata": {
        "id": "IyPNrKW3aeoD"
      },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "def generate_answer(prompt):\n",
+        "  #Todo: Pass the generated prompt to gpt-3 to get answers.\n",
+        "  pass"
+      ]
    },
    {
      "cell_type": "code",
-      "source": [
-        "def user_query(query):\n",
-        "  \n",
-        "  pass\n"
-      ],
+      "execution_count": null,
      "metadata": {
        "id": "uWM2IcOKarWz"
      },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "def user_query(query):\n",
+        "  #Todo: Tie all the pieces together.\n",
+        "  pass\n"
+      ]
    },
    {
      "cell_type": "code",
-      "source": [
-        "user_query(\"How can I do this?\")"
-      ],
+      "execution_count": null,
      "metadata": {
        "id": "jBds94_gbJ_G"
      },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "user_query(\"How can I do this?\")"
+      ]
    }
-  ]
-}
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}