RAG-Commercial-Embedding.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "dab6db2a-e284-4660-b431-ed2f1d643c1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -q cohere hnswlib unstructured python-dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ad62bc1a-be68-4394-9570-bf79ba3d6c92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pinned to the release this notebook was last run against, so the Pinecone\n",
    "# client calls used below behave the same on a fresh install.\n",
    "%pip install -q pinecone==5.4.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "248774c6-1e39-48b3-8a35-d8b02a2d1c27",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import uuid\n",
    "from getpass import getpass\n",
    "from typing import List, Dict\n",
    "\n",
    "import cohere\n",
    "import hnswlib\n",
    "from pinecone import Pinecone, PodSpec, ServerlessSpec\n",
    "from unstructured.partition.html import partition_html\n",
    "from unstructured.chunking.title import chunk_by_title\n",
    "\n",
    "\n",
    "def _load_secret(name: str) -> str:\n",
    "    \"\"\"\n",
    "    Returns the secret called `name`.\n",
    "\n",
    "    The value is read from the environment variable `name`; when that is unset\n",
    "    or empty the user is prompted for it with hidden input.\n",
    "\n",
    "    Raises:\n",
    "    RuntimeError: If no value is available from either source.\n",
    "    \"\"\"\n",
    "    value = os.environ.get(name) or getpass(f\"{name}: \")\n",
    "    if not value:\n",
    "        raise RuntimeError(f\"{name} is required to run this notebook\")\n",
    "    return value\n",
    "\n",
    "\n",
    "# API keys must never be written into the notebook itself. Export\n",
    "# COHERE_API_KEY and PINECONE_API_KEY before starting Jupyter, or type them at\n",
    "# the prompt. Any key that was ever saved in a notebook or committed to version\n",
    "# control should be treated as leaked and rotated in the provider dashboard.\n",
    "COHERE_API_KEY = _load_secret(\"COHERE_API_KEY\")  # Get your API key here: https://dashboard.cohere.com/api-keys\n",
    "PINECONE_API_KEY = _load_secret(\"PINECONE_API_KEY\")  # (get API key at app.pinecone.io)\n",
    "co = cohere.Client(COHERE_API_KEY)\n",
    "pc = Pinecone(api_key=PINECONE_API_KEY)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "9ae9820a-819e-4d5b-be44-0fb85e21dec2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pages to index. Each entry carries the \"title\" and \"url\" keys that\n",
    "# Vectorstore reads when it loads and chunks a page.\n",
    "raw_documents = [\n",
    "    {\"title\": page_title, \"url\": page_url}\n",
    "    for page_title, page_url in (\n",
    "        (\"Text Embeddings\", \"https://docs.cohere.com/v2/docs/embeddings\"),\n",
    "        (\"Similarity Between Words and Sentences\", \"https://cohere.com/llmu/what-is-similarity-between-sentences\"),\n",
    "        (\"The Attention Mechanism\", \"https://cohere.com/llmu/what-is-attention-in-language-models\"),\n",
    "        (\"Transformer Models\", \"https://cohere.com/llmu/what-are-transformer-models\"),\n",
    "    )\n",
    "]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "2c679e9c-5fb1-45e6-a5a5-0d62365b4bbf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pinned to the nltk release this notebook was last run against.\n",
    "%pip install -q nltk==3.9.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "e456362a-8cef-46c2-a2cb-5e3ff1e38634",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n",
      "[nltk_data]   Unzipping tokenizers/punkt_tab.zip.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "# Download the 'punkt_tab' tokenizer data into the local nltk data directory.\n",
    "# NOTE(review): presumably needed by the unstructured HTML partitioning used later - confirm.\n",
    "nltk.download('punkt_tab')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "a0935092-c60a-4af4-9bce-35d2fb75bdc8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package averaged_perceptron_tagger_eng to\n",
      "[nltk_data]     /root/nltk_data...\n",
      "[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Download the English part-of-speech tagger data (uses `nltk` imported in an earlier cell).\n",
    "# NOTE(review): presumably needed by the unstructured HTML partitioning used later - confirm.\n",
    "nltk.download('averaged_perceptron_tagger_eng')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "95c8503a-cc9f-4c35-a0f6-216dfee1844d",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Vectorstore:\n",
    "    \"\"\"\n",
    "    A collection of documents that is chunked, embedded with Cohere and indexed in Pinecone.\n",
    "\n",
    "    Parameters:\n",
    "    raw_documents (list): A list of dictionaries representing the sources of the raw documents. Each dictionary should have 'title' and 'url' keys.\n",
    "    retrieve_top_k (int): Number of matches requested from the index for each query. Defaults to 10.\n",
    "    rerank_top_k (int): Maximum number of chunks kept after reranking. Defaults to 3.\n",
    "    index_name (str): Name of the Pinecone index to create or reuse. Defaults to 'rag-01'.\n",
    "\n",
    "    Attributes:\n",
    "    raw_documents (list): A list of dictionaries representing the raw documents.\n",
    "    docs (list): A list of dictionaries representing the chunked documents, with 'title', 'text', and 'url' keys.\n",
    "    docs_embs (list): A list of the associated embeddings for the document chunks.\n",
    "    docs_len (int): The number of document chunks in the collection.\n",
    "    idx: The handle returned by pc.Index(index_name), used for upserts and queries.\n",
    "\n",
    "    Methods:\n",
    "    load_and_chunk(): Loads the data from the sources and partitions the HTML content into chunks.\n",
    "    embed(): Embeds the document chunks using the Cohere API.\n",
    "    index(): Indexes the document chunks for efficient retrieval.\n",
    "    retrieve(): Retrieves document chunks based on the given query.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(\n",
    "        self,\n",
    "        raw_documents: List[Dict[str, str]],\n",
    "        retrieve_top_k: int = 10,\n",
    "        rerank_top_k: int = 3,\n",
    "        index_name: str = \"rag-01\",\n",
    "    ):\n",
    "        self.raw_documents = raw_documents\n",
    "        self.docs = []\n",
    "        self.docs_embs = []\n",
    "        self.retrieve_top_k = retrieve_top_k\n",
    "        self.rerank_top_k = rerank_top_k\n",
    "        self.index_name = index_name\n",
    "        # The store is built eagerly: constructing it loads, embeds and indexes everything.\n",
    "        self.load_and_chunk()\n",
    "        self.embed()\n",
    "        self.index()\n",
    "\n",
    "    def load_and_chunk(self) -> None:\n",
    "        \"\"\"\n",
    "        Loads the text from the sources and chunks the HTML content.\n",
    "\n",
    "        Every chunk is appended to self.docs as a dictionary with the source's\n",
    "        'title' and 'url' and the chunk's own 'text'.\n",
    "        \"\"\"\n",
    "        print(\"Loading documents...\")\n",
    "\n",
    "        for raw_document in self.raw_documents:\n",
    "            print(\"Loading URL:\", raw_document[\"url\"])\n",
    "            elements = partition_html(url=raw_document[\"url\"])\n",
    "            for chunk in chunk_by_title(elements):\n",
    "                self.docs.append(\n",
    "                    {\n",
    "                        \"title\": raw_document[\"title\"],\n",
    "                        \"text\": str(chunk),\n",
    "                        \"url\": raw_document[\"url\"],\n",
    "                    }\n",
    "                )\n",
    "\n",
    "    def embed(self) -> None:\n",
    "        \"\"\"\n",
    "        Embeds the document chunks using the Cohere API.\n",
    "\n",
    "        The chunks in self.docs are sent in batches and the returned embeddings\n",
    "        are appended to self.docs_embs in the same order.\n",
    "        \"\"\"\n",
    "        print(\"Embedding document chunks...\")\n",
    "\n",
    "        # NOTE(review): presumably chosen to stay under the embed endpoint's\n",
    "        # per-request limit on texts - confirm against the Cohere documentation.\n",
    "        batch_size = 90\n",
    "        self.docs_len = len(self.docs)\n",
    "        for start in range(0, self.docs_len, batch_size):\n",
    "            # Slicing clamps at the end of the list, so the last batch may be shorter.\n",
    "            texts = [doc[\"text\"] for doc in self.docs[start : start + batch_size]]\n",
    "            response = co.embed(\n",
    "                texts=texts, model=\"embed-english-v3.0\", input_type=\"search_document\"\n",
    "            )\n",
    "            self.docs_embs.extend(response.embeddings)\n",
    "\n",
    "    def index(self) -> None:\n",
    "        \"\"\"\n",
    "        Indexes the documents for efficient retrieval.\n",
    "\n",
    "        Creates the Pinecone index when it does not exist yet, then upserts one\n",
    "        vector per chunk with the chunk dictionary as its metadata.\n",
    "\n",
    "        Raises:\n",
    "        ValueError: If the index has to be created but no chunk was embedded, so\n",
    "            the vector dimension is unknown.\n",
    "        \"\"\"\n",
    "        print(\"Indexing documents...\")\n",
    "\n",
    "        # If the index does not exist, we create it\n",
    "        if self.index_name not in pc.list_indexes().names():\n",
    "            if not self.docs_embs:\n",
    "                raise ValueError(\n",
    "                    \"Cannot create the index: no document chunks were embedded.\"\n",
    "                )\n",
    "            pc.create_index(\n",
    "                name=self.index_name,\n",
    "                dimension=len(self.docs_embs[0]),\n",
    "                metric=\"cosine\",\n",
    "                spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n",
    "            )\n",
    "\n",
    "        # connect to index\n",
    "        self.idx = pc.Index(self.index_name)\n",
    "\n",
    "        batch_size = 128\n",
    "\n",
    "        # NOTE: ids are chunk positions, so re-indexing overwrites the vectors\n",
    "        # stored under the same positions but leaves any higher ids from an\n",
    "        # earlier, larger run in the index.\n",
    "        ids = [str(i) for i in range(len(self.docs))]\n",
    "\n",
    "        # list of (id, vector, metadata) tuples; each chunk dictionary is its own metadata\n",
    "        to_upsert = list(zip(ids, self.docs_embs, self.docs))\n",
    "\n",
    "        for start in range(0, len(to_upsert), batch_size):\n",
    "            self.idx.upsert(vectors=to_upsert[start : start + batch_size])\n",
    "\n",
    "        print(\"Indexing complete\")\n",
    "\n",
    "    def retrieve(self, query: str) -> List[Dict[str, str]]:\n",
    "        \"\"\"\n",
    "        Retrieves document chunks based on the given query.\n",
    "\n",
    "        The query is embedded, the nearest retrieve_top_k chunks are fetched from\n",
    "        the index, and the reranker keeps at most rerank_top_k of them.\n",
    "\n",
    "        Parameters:\n",
    "        query (str): The query to retrieve document chunks for.\n",
    "\n",
    "        Returns:\n",
    "        List[Dict[str, str]]: A list of dictionaries representing the retrieved document chunks, with 'title', 'text', and 'url' keys. Empty when the index returns no match.\n",
    "        \"\"\"\n",
    "        # co.embed returns one embedding per input text; the index is queried\n",
    "        # with that single flat vector rather than the enclosing list.\n",
    "        query_emb = co.embed(\n",
    "            texts=[query], model=\"embed-english-v3.0\", input_type=\"search_query\"\n",
    "        ).embeddings[0]\n",
    "\n",
    "        res = self.idx.query(vector=query_emb, top_k=self.retrieve_top_k, include_metadata=True)\n",
    "        matches = res['matches']\n",
    "        if not matches:\n",
    "            # Nothing to rerank, so skip the rerank call.\n",
    "            return []\n",
    "\n",
    "        docs_to_rerank = [match['metadata']['text'] for match in matches]\n",
    "\n",
    "        rerank_results = co.rerank(\n",
    "            query=query,\n",
    "            documents=docs_to_rerank,\n",
    "            top_n=min(self.rerank_top_k, len(docs_to_rerank)),\n",
    "            model=\"rerank-english-v2.0\",\n",
    "        )\n",
    "\n",
    "        # result.index points back into the list that was sent to the reranker.\n",
    "        return [matches[result.index]['metadata'] for result in rerank_results.results]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "ffe58842-1825-4ca3-9a88-997b674f2e1f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading documents...\n",
      "Loading URL: https://docs.cohere.com/v2/docs/embeddings\n",
      "Loading URL: https://cohere.com/llmu/what-is-similarity-between-sentences\n",
      "Loading URL: https://cohere.com/llmu/what-is-attention-in-language-models\n",
      "Loading URL: https://cohere.com/llmu/what-are-transformer-models\n",
      "Embedding document chunks...\n",
      "Indexing documents...\n",
      "Indexing complete\n"
     ]
    }
   ],
   "source": [
    "# Constructing the store runs the whole pipeline (load and chunk, embed, index),\n",
    "# so this cell fetches the source pages and calls the Cohere and Pinecone APIs.\n",
    "vectorstore = Vectorstore(raw_documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "36e720d6-4fb4-4076-8320-20b9d9710096",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'text': '< Back to modules\\n\\nWhat Is Attention in Language Models?\\n\\nLuis Serrano\\n\\nA huge roadblock for language models is when a word can be used in two different contexts. When this problem is encountered, the model needs to use the context of the sentence in order to decipher which meaning of the word to use. This is precisely what self-attention models do.\\n\\nShare:',\n",
       "  'title': 'The Attention Mechanism',\n",
       "  'url': 'https://cohere.com/llmu/what-is-attention-in-language-models'},\n",
       " {'text': 'In the example below we use the embed-english-v3.0 model to generate embeddings for 3 phrases and compare them using a similarity function. The two similar phrases have a high similarity score, and the embeddings for two unrelated phrases have a low similarity score:\\n\\nPYTHON',\n",
       "  'title': 'Text Embeddings',\n",
       "  'url': 'https://docs.cohere.com/v2/docs/embeddings'},\n",
       " {'text': 'Embeddings are a way to represent the meaning of text as a list of numbers. Using a simple comparison function, we can then calculate a similarity score for two embeddings to figure out whether two texts are talking about similar things. Common use-cases for embeddings include semantic search, clustering, and classification.',\n",
       "  'title': 'Text Embeddings',\n",
       "  'url': 'https://docs.cohere.com/v2/docs/embeddings'}]"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Quick check of retrieval: returns the metadata of the reranked chunks for this query.\n",
    "vectorstore.retrieve(\"multi-head attention definition\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "705eac53-6651-4352-87c3-47a8a76dfb92",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Chatbot:\n",
    "    \"\"\"\n",
    "    An interactive chat loop that answers with the help of retrieved document chunks.\n",
    "\n",
    "    For every user message the model is first asked for search queries. When it\n",
    "    produces any, matching chunks are retrieved from the vectorstore and passed\n",
    "    along with the message; otherwise the model answers directly.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, vectorstore: Vectorstore):\n",
    "        \"\"\"\n",
    "        Initializes an instance of the Chatbot class.\n",
    "\n",
    "        Parameters:\n",
    "        vectorstore (Vectorstore): An instance of the Vectorstore class.\n",
    "\n",
    "        \"\"\"\n",
    "        self.vectorstore = vectorstore\n",
    "        # One id for the whole session, sent with every streamed chat call.\n",
    "        self.conversation_id = str(uuid.uuid4())\n",
    "\n",
    "    def _retrieve_documents(self, search_queries) -> List[Dict[str, str]]:\n",
    "        \"\"\"\n",
    "        Retrieves document chunks for every search query, without repeats.\n",
    "\n",
    "        Several queries can return the same chunk; a chunk is kept only the\n",
    "        first time its ('url', 'text') pair is seen.\n",
    "        \"\"\"\n",
    "        documents = []\n",
    "        seen = set()\n",
    "        for query in search_queries:\n",
    "            for doc in self.vectorstore.retrieve(query.text):\n",
    "                key = (doc.get(\"url\"), doc.get(\"text\"))\n",
    "                if key not in seen:\n",
    "                    seen.add(key)\n",
    "                    documents.append(doc)\n",
    "        return documents\n",
    "\n",
    "    def run(self):\n",
    "        \"\"\"\n",
    "        Runs the chatbot application.\n",
    "\n",
    "        Reads messages from standard input until the user types \"quit\" and\n",
    "        prints each response followed by its citations and source documents.\n",
    "        \"\"\"\n",
    "        while True:\n",
    "            # Get the user message\n",
    "            message = input(\"User: \").strip()\n",
    "\n",
    "            # A blank line is not sent to the API; ask again\n",
    "            if not message:\n",
    "                continue\n",
    "\n",
    "            # Typing \"quit\" ends the conversation\n",
    "            if message.lower() == \"quit\":\n",
    "                print(\"Ending chat.\")\n",
    "                break\n",
    "            # else:                       # Uncomment for Google Colab to avoid printing the same thing twice\n",
    "                # print(f\"User: {message}\") # Uncomment for Google Colab to avoid printing the same thing twice\n",
    "\n",
    "            # Generate search queries (if any)\n",
    "            response = co.chat(message=message,\n",
    "                               model=\"command-r\",\n",
    "                               search_queries_only=True)\n",
    "\n",
    "            # If there are search queries, retrieve document chunks and respond\n",
    "            if response.search_queries:\n",
    "                print(\"Retrieving information...\", end=\"\")\n",
    "\n",
    "                # Retrieve document chunks for each query\n",
    "                documents = self._retrieve_documents(response.search_queries)\n",
    "\n",
    "                # Use document chunks to respond\n",
    "                response = co.chat_stream(\n",
    "                    message=message,\n",
    "                    model=\"command-r\",\n",
    "                    documents=documents,\n",
    "                    conversation_id=self.conversation_id,\n",
    "                )\n",
    "\n",
    "            # If there is no search query, directly respond\n",
    "            else:\n",
    "                response = co.chat_stream(\n",
    "                    message=message,\n",
    "                    model=\"command-r\",\n",
    "                    conversation_id=self.conversation_id,\n",
    "                )\n",
    "\n",
    "            # Print the chatbot response, citations, and documents\n",
    "            print(\"\\nChatbot:\")\n",
    "            citations = []\n",
    "            cited_documents = []\n",
    "\n",
    "            # Display response\n",
    "            for event in response:\n",
    "                if event.event_type == \"text-generation\":\n",
    "                    print(event.text, end=\"\")\n",
    "                elif event.event_type == \"citation-generation\":\n",
    "                    citations.extend(event.citations)\n",
    "                elif event.event_type == \"search-results\":\n",
    "                    cited_documents = event.documents\n",
    "\n",
    "            # Display citations and source documents\n",
    "            if citations:\n",
    "                print(\"\\n\\nCITATIONS:\")\n",
    "                for citation in citations:\n",
    "                    print(citation)\n",
    "\n",
    "                print(\"\\nDOCUMENTS:\")\n",
    "                for document in cited_documents:\n",
    "                    print(document)\n",
    "\n",
    "            print(f\"\\n{'-'*100}\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "fcc1109e-2c6a-4f3d-8f1b-86474da9f796",
   "metadata": {},
   "outputs": [
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "User:  What is Embeddings\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Retrieving information...\n",
      "Chatbot:\n",
      "Embeddings are a method of representing the meaning of text numerically. The numerical representation of text can then be used to calculate a similarity score between two different texts. This can be useful for tasks such as semantic search, clustering, and classification.\n",
      "\n",
      "CITATIONS:\n",
      "start=27 end=72 text='representing the meaning of text numerically.' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=77 end=101 text='numerical representation' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=142 end=158 text='similarity score' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=167 end=187 text='two different texts.' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=225 end=240 text='semantic search' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=242 end=252 text='clustering' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=258 end=273 text='classification.' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "\n",
      "DOCUMENTS:\n",
      "{'id': 'doc_0', 'text': 'Embeddings are a way to represent the meaning of text as a list of numbers. Using a simple comparison function, we can then calculate a similarity score for two embeddings to figure out whether two texts are talking about similar things. Common use-cases for embeddings include semantic search, clustering, and classification.', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/v2/docs/embeddings'}\n",
      "{'id': 'doc_1', 'text': '24 embeddings = response.embeddings.float # All text embeddings 25 print(embeddings[0][:5]) # Print embeddings for the first text', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/v2/docs/embeddings'}\n",
      "{'id': 'doc_2', 'text': 'The input_type parameter\\n\\nCohere embeddings are optimized for different types of inputs.\\n\\nWhen using embeddings for semantic search, the search query should be embedded by setting input_type=\"search_query\"\\n\\nWhen using embeddings for semantic search, the text passages that are being searched over should be embedded with input_type=\"search_document\".', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/v2/docs/embeddings'}\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "User:  How to compute Similarity?\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Retrieving information...\n",
      "Chatbot:\n",
      "Text similarity can be calculated using the dot product. Here's an example Python code snippet that computes similarity between two texts using Cohere's API:\n",
      "```python\n",
      "import cohere\n",
      "import numpy as np\n",
      "\n",
      "co = cohere.Client(api_key=\"YOUR_API_KEY\")\n",
      "\n",
      "phrases = [\"i love soup\", \"soup is my favorite\"]\n",
      "model = \"embed-english-v3.0\"\n",
      "input_type = \"search_query\"\n",
      "\n",
      "res = co.embed(\n",
      "    texts=phrases,\n",
      "    model=model,\n",
      "    input_type=input_type,\n",
      "    embedding_types=[\"float\"],\n",
      ")\n",
      "\n",
      "(soup1, soup2) = res.embeddings.float\n",
      "\n",
      "def calculate_similarity(a, b):\n",
      "    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))\n",
      "\n",
      "similarity_score = calculate_similarity(soup1, soup2)\n",
      "print(similarity_score)\n",
      "```\n",
      "\n",
      "Make sure to replace \"YOUR_API_KEY\" with your actual API key. The code will output the similarity score between the two phrases. A value closer to 1 means the phrases are very similar, while a value closer to -1 implies they are very different.\n",
      "\n",
      "CITATIONS:\n",
      "start=44 end=56 text='dot product.' document_ids=['doc_1'] type='TEXT_CONTENT'\n",
      "start=207 end=243 text='cohere.Client(api_key=\"YOUR_API_KEY\"' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=246 end=293 text='phrases = [\"i love soup\", \"soup is my favorite\"' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=295 end=323 text='model = \"embed-english-v3.0\"' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=324 end=351 text='input_type = \"search_query\"' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=359 end=368 text='co.embed(' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=369 end=383 text='texts=phrases,' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=384 end=396 text='model=model,' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=397 end=419 text='input_type=input_type,' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=420 end=446 text='embedding_types=[\"float\"],' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=450 end=464 text='(soup1, soup2)' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=467 end=487 text='res.embeddings.float' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=489 end=519 text='def calculate_similarity(a, b)' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=521 end=581 text='return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)' document_ids=['doc_1'] type='TEXT_CONTENT'\n",
      "start=584 end=637 text='similarity_score = calculate_similarity(soup1, soup2)' document_ids=['doc_2'] type='TEXT_CONTENT'\n",
      "start=638 end=661 text='print(similarity_score)' document_ids=['doc_2'] type='TEXT_CONTENT'\n",
      "start=798 end=850 text='value closer to 1 means the phrases are very similar' document_ids=['doc_1'] type='TEXT_CONTENT'\n",
      "start=860 end=911 text='value closer to -1 implies they are very different.' document_ids=['doc_1'] type='TEXT_CONTENT'\n",
      "\n",
      "DOCUMENTS:\n",
      "{'id': 'doc_0', 'text': '1 import cohere 2 import numpy as np 3 4 co = cohere.Client(api_key=\"YOUR_API_KEY\") 5 6 # get the embeddings 7 phrases = [\"i love soup\", \"soup is my favorite\", \"london is far away\"] 8 9 model = \"embed-english-v3.0\" 10 input_type = \"search_query\" 11 12 res = co.embed( 13 texts=phrases, 14 model=model, 15 input_type=input_type, 16 embedding_types=[\"float\"], 17 ) 18 19 (soup1, soup2, london) = res.embeddings.float 20 21 22 # compare them 23 def calculate_similarity(a, b):', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/v2/docs/embeddings'}\n",
      "{'id': 'doc_1', 'text': '24 return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) 25 26 27 calculate_similarity(soup1, soup2) # 0.85 - very similar! 28 calculate_similarity(soup1, london) # 0.16 - not similar!', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/v2/docs/embeddings'}\n",
      "{'id': 'doc_2', 'text': '< Back to modules\\n\\nWhat is Similarity Between Sentences?\\n\\nLuis Serrano\\n\\nFor large language models, it is crucial to know when two words, or two sentences, are similar or different. This can be a hard problem, but luckily, word and sentence embeddings are very helpful for this task. In this post we go over some different notions of similarity.\\n\\nShare:', 'title': 'Similarity Between Words and Sentences', 'url': 'https://cohere.com/llmu/what-is-similarity-between-sentences'}\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "User:  What is large language models?\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Retrieving information...\n",
      "Chatbot:\n",
      "Large Language Models (LLMs) are models that have been trained on large corpora of text data and can answer questions, summarize text, extract information, and do many other tasks. They are designed to understand the nuances of human language.\n",
      "LLMs find applications in various sectors, from helping robots understand commands to aiding in customer service and healthcare solutions.\n",
      "\n",
      "CITATIONS:\n",
      "start=22 end=28 text='(LLMs)' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=55 end=92 text='trained on large corpora of text data' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=101 end=117 text='answer questions' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=119 end=133 text='summarize text' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=135 end=154 text='extract information' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=160 end=180 text='do many other tasks.' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=190 end=198 text='designed' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=202 end=243 text='understand the nuances of human language.' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=249 end=285 text='find applications in various sectors' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=292 end=326 text='helping robots understand commands' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=330 end=356 text='aiding in customer service' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=361 end=382 text='healthcare solutions.' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "\n",
      "DOCUMENTS:\n",
      "{'id': 'doc_0', 'text': '< Back to modules\\n\\nWhat is Similarity Between Sentences?\\n\\nLuis Serrano\\n\\nFor large language models, it is crucial to know when two words, or two sentences, are similar or different. This can be a hard problem, but luckily, word and sentence embeddings are very helpful for this task. In this post we go over some different notions of similarity.\\n\\nShare:', 'title': 'Similarity Between Words and Sentences', 'url': 'https://cohere.com/llmu/what-is-similarity-between-sentences'}\n",
      "{'id': 'doc_1', 'text': 'Multilingual Support\\n\\nIn addition to embed-english-v3.0 we offer a best-in-class multilingual model embed-multilingual-v3.0 with support for over 100 languages, including Chinese, Spanish, and French. This model can be used with the Embed API, just like its English counterpart:\\n\\nPYTHON', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/v2/docs/embeddings'}\n",
      "{'id': 'doc_2', 'text': 'In the example below we use the embed-english-v3.0 model to generate embeddings for 3 phrases and compare them using a similarity function. The two similar phrases have a high similarity score, and the embeddings for two unrelated phrases have a low similarity score:\\n\\nPYTHON', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/v2/docs/embeddings'}\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "User:  Multilingual Support in Cohere how to do this?\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Retrieving information...\n",
      "Chatbot:\n",
      "Cohere offers a multilingual model called \"embed-multilingual-v3.0\" with support for over 100 languages. This model can be used in a similar way to the English model by passing the language as a parameter. Here's an example Python code snippet that demonstrates the usage:\n",
      "```python\n",
      "import cohere\n",
      "co = cohere.Client(api_key=\"YOUR_API_KEY\")\n",
      "texts = [\n",
      "    \"Hello from Cohere!\",\n",
      "    \"مرحبًا من كوهير!\",\n",
      "    \"¡Hola desde Cohere!\",\n",
      "    \"Bonjour de Cohere!\",\n",
      "    \"你好，来自 Cohere！\",\n",
      "    \"कोहेरे से नमस्ते!\",\n",
      "    ]\n",
      "response = co.embed(\n",
      "    model=\"embed-multilingual-v3.0\",\n",
      "    texts=texts,\n",
      "    input_type=\"classification\",\n",
      "    embedding_types=[\"float\"],\n",
      "    )\n",
      "```\n",
      "Make sure to replace \"YOUR_API_KEY\" with your actual API key.\n",
      "\n",
      "CITATIONS:\n",
      "start=42 end=67 text='\"embed-multilingual-v3.0\"' document_ids=['doc_0', 'doc_2'] type='TEXT_CONTENT'\n",
      "start=85 end=104 text='over 100 languages.' document_ids=['doc_0'] type='TEXT_CONTENT'\n",
      "start=350 end=368 text='\"Hello from Cohere' document_ids=['doc_2'] type='TEXT_CONTENT'\n",
      "start=372 end=388 text='\"مرحبًا من كوهير' document_ids=['doc_2'] type='TEXT_CONTENT'\n",
      "start=392 end=411 text='\"¡Hola desde Cohere' document_ids=['doc_2'] type='TEXT_CONTENT'\n",
      "start=415 end=433 text='\"Bonjour de Cohere' document_ids=['doc_2'] type='TEXT_CONTENT'\n",
      "start=437 end=450 text='\"你好，来自 Cohere' document_ids=['doc_2'] type='TEXT_CONTENT'\n",
      "start=454 end=471 text='\"कोहेरे से नमस्ते' document_ids=['doc_2'] type='TEXT_CONTENT'\n",
      "start=503 end=528 text='\"embed-multilingual-v3.0\"' document_ids=['doc_2'] type='TEXT_CONTENT'\n",
      "start=553 end=569 text='\"classification\"' document_ids=['doc_2'] type='TEXT_CONTENT'\n",
      "\n",
      "DOCUMENTS:\n",
      "{'id': 'doc_0', 'text': 'Multilingual Support\\n\\nIn addition to embed-english-v3.0 we offer a best-in-class multilingual model embed-multilingual-v3.0 with support for over 100 languages, including Chinese, Spanish, and French. This model can be used with the Embed API, just like its English counterpart:\\n\\nPYTHON', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/v2/docs/embeddings'}\n",
      "{'id': 'doc_1', 'text': 'On this page\\n\\nThe input_type parameter\\n\\nMultilingual Support\\n\\nImage Embeddings\\n\\nCompression Levels\\n\\nText Embeddings (Vectors, Search, Retrieval)\\n\\nIntroduction to Embeddings at Cohere', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/v2/docs/embeddings'}\n",
      "{'id': 'doc_2', 'text': '1 import cohere 2 3 co = cohere.Client(api_key=\"<YOUR API KEY>\") 4 5 texts = [ 6 \"Hello from Cohere!\", 7 \"مرحبًا من كوهير!\", 8 \"Hallo von Cohere!\", 9 \"Bonjour de Cohere!\", 10 \"¡Hola desde Cohere!\", 11 \"Olá do Cohere!\", 12 \"Ciao da Cohere!\", 13 \"您好，来自 Cohere！\", 14 \"कोहेरे से नमस्ते!\", 15 ] 16 17 response = co.embed( 18 model=\"embed-multilingual-v3.0\", 19 texts=texts, 20 input_type=\"classification\", 21 embedding_types=[\"float\"], 22 ) 23', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/v2/docs/embeddings'}\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "User:  quit\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ending chat.\n"
     ]
    }
   ],
   "source": [
    "chatbot = Chatbot(vectorstore)\n",
    "\n",
    "chatbot.run()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c05c936-8fa6-4e18-b5a7-0d368056e7c7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}