181 lines
4.8 KiB
Plaintext
181 lines
4.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "xcccEva1WWrh"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# The pinecone.init() / pinecone.Index(name) calls used in this notebook belong to the\n",
|
|
"# v2 Pinecone client, which is published on PyPI as pinecone-client (<3).\n",
|
|
"%pip install -q sentence_transformers \"pinecone-client<3\" openai"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "BYUc4Z7vY2bb"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"from getpass import getpass\n",
|
|
"\n",
|
|
"# Do not hardcode the key in the notebook: keep a key that is already set in the\n",
|
|
"# environment, otherwise ask for it without echoing the input.\n",
|
|
"if not os.environ.get('OPENAI_API_KEY'):\n",
|
|
"    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "MZAy8TaKY6pI"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Embedding: a single pretrained language model from Hugging Face is used.\n",
|
|
"\n",
|
|
"from sentence_transformers import SentenceTransformer, util\n",
|
|
"model = SentenceTransformer('all-MiniLM-L6-v2')\n",
|
|
"\n",
|
|
"\n",
|
|
"text ='Abc'\n",
|
|
"model.encode(text).tolist() # Example of how to encode a string; the embedding is converted to a plain list."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "gu04tON0cZvT"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Split long documents into smaller chunks before embedding them.\n",
|
|
"def split_text_into_chunks(plain_text, max_chars=2000):\n",
|
|
"    \"\"\"Split text into chunks of at most ``max_chars`` characters.\n",
|
|
"\n",
|
|
"    Lines of ``plain_text`` are appended to the current chunk, each followed by\n",
|
|
"    a space, until the next line would not fit; then a new chunk is started.\n",
|
|
"    A single line longer than ``max_chars`` is cut into ``max_chars``-sized\n",
|
|
"    pieces so that no chunk exceeds the limit.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        plain_text: Document text; newlines are the preferred split points.\n",
|
|
"        max_chars: Maximum number of characters per chunk. Must be positive.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        List of stripped, non-empty chunks (an empty list for blank input).\n",
|
|
"\n",
|
|
"    Raises:\n",
|
|
"        ValueError: If ``max_chars`` is not positive.\n",
|
|
"    \"\"\"\n",
|
|
"    if max_chars <= 0:\n",
|
|
"        raise ValueError(\"max_chars must be positive\")\n",
|
|
"\n",
|
|
"    text_chunks = []\n",
|
|
"    current_chunk = \"\"\n",
|
|
"    for line in plain_text.split(\"\\n\"):\n",
|
|
"        if len(current_chunk) + len(line) + 1 <= max_chars:\n",
|
|
"            current_chunk += line + \" \"\n",
|
|
"            continue\n",
|
|
"\n",
|
|
"        # The line does not fit. Flush what we have, but never emit an empty\n",
|
|
"        # chunk (this happens when the very first line is already too long).\n",
|
|
"        finished = current_chunk.strip()\n",
|
|
"        if finished:\n",
|
|
"            text_chunks.append(finished)\n",
|
|
"\n",
|
|
"        # Hard-split a line that is longer than a whole chunk.\n",
|
|
"        while len(line) > max_chars:\n",
|
|
"            piece = line[:max_chars].strip()\n",
|
|
"            if piece:\n",
|
|
"                text_chunks.append(piece)\n",
|
|
"            line = line[max_chars:]\n",
|
|
"        current_chunk = line + \" \"\n",
|
|
"\n",
|
|
"    # Blank input leaves only whitespace here; do not return it as a chunk.\n",
|
|
"    finished = current_chunk.strip()\n",
|
|
"    if finished:\n",
|
|
"        text_chunks.append(finished)\n",
|
|
"    return text_chunks"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "EqCYELlQZN0m"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pinecone\n",
|
|
"# Connect to the vector database; `index` is the handle used by addData and find_match.\n",
|
|
"pinecone.init(api_key=\"\", environment=\"\") # TODO: fill in the API key and environment; this initialises the vector database module.\n",
|
|
"index = pinecone.Index(\"\") # TODO: fill in the index name."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "CAqSjLcQZjjJ"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def addData(corpusData, batch_size=100):\n",
|
|
"    \"\"\"Embed text chunks and upsert them into the vector index.\n",
|
|
"\n",
|
|
"    Every chunk is stored as ``(id, embedding, {'context': chunk})``: the id is a\n",
|
|
"    sequential number (as a string) continuing from the index's current\n",
|
|
"    ``total_vector_count``, and the original text is kept in the metadata under\n",
|
|
"    the ``context`` key.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        corpusData: Sequence of text chunks to index.\n",
|
|
"        batch_size: Number of vectors sent per upsert request. Must be positive.\n",
|
|
"\n",
|
|
"    Raises:\n",
|
|
"        ValueError: If ``batch_size`` is not positive.\n",
|
|
"    \"\"\"\n",
|
|
"    if batch_size <= 0:\n",
|
|
"        raise ValueError(\"batch_size must be positive\")\n",
|
|
"\n",
|
|
"    chunks = list(corpusData)\n",
|
|
"    # NOTE(review): continuing ids from the vector count assumes that no ids have\n",
|
|
"    # been deleted from the index; otherwise existing ids may be overwritten -- confirm.\n",
|
|
"    first_id = index.describe_index_stats()['total_vector_count']\n",
|
|
"\n",
|
|
"    for start in range(0, len(chunks), batch_size):\n",
|
|
"        batch = chunks[start:start + batch_size]\n",
|
|
"        # Encode the whole batch in one call and send it in one request, instead\n",
|
|
"        # of one model call and one network round trip per chunk.\n",
|
|
"        embeddings = model.encode(batch).tolist()\n",
|
|
"        vectors = [\n",
|
|
"            (str(first_id + start + offset), embedding, {'context': chunk})\n",
|
|
"            for offset, (chunk, embedding) in enumerate(zip(batch, embeddings))\n",
|
|
"        ]\n",
|
|
"        index.upsert(vectors=vectors)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "8VIZ5_ufbRQ5"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Match the input string against the data already stored in the vector database.\n",
|
|
"\n",
|
|
"def find_match(query,k):\n",
|
|
"    \"\"\"Return the stored text of the best matches for ``query``.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        query: Text to search for; it is embedded with ``model``.\n",
|
|
"        k: Maximum number of matches requested from the index (``top_k``).\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        List of the ``context`` metadata values of the returned matches, in the\n",
|
|
"        order the index returned them. May hold fewer than ``k`` items.\n",
|
|
"    \"\"\"\n",
|
|
"    query_em = model.encode(query).tolist()\n",
|
|
"    result = index.query(query_em, top_k=k, includeMetadata=True)\n",
|
|
"\n",
|
|
"    # Iterate over the matches actually returned: indexing with range(k) raises\n",
|
|
"    # IndexError when the index yields fewer than k matches.\n",
|
|
"    return [match['metadata']['context'] for match in result['matches']]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "AoRDzK85aF9E"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def create_prompt(context, query):\n",
|
|
"    \"\"\"Placeholder for building the prompt from retrieved context and the user query.\n",
|
|
"\n",
|
|
"    Not implemented yet: always returns None.\n",
|
|
"    \"\"\"\n",
|
|
"    # TODO: generate the prompt from the context(s) found by the semantic search.\n",
|
|
"    return None"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "IyPNrKW3aeoD"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def generate_answer(prompt):\n",
|
|
"    \"\"\"Placeholder for turning a prompt into an answer.\n",
|
|
"\n",
|
|
"    Not implemented yet: always returns None.\n",
|
|
"    \"\"\"\n",
|
|
"    # TODO: pass the generated prompt to GPT-3 to get the answer.\n",
|
|
"    return None"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "uWM2IcOKarWz"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def user_query(query):\n",
|
|
"    \"\"\"Placeholder for the end-to-end question answering entry point.\n",
|
|
"\n",
|
|
"    Not implemented yet: always returns None.\n",
|
|
"    \"\"\"\n",
|
|
"    # TODO: tie all the pieces together.\n",
|
|
"    return None\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "jBds94_gbJ_G"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Example call; user_query is still a stub, so this produces no output yet.\n",
|
|
"user_query(\"How can I do this?\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"name": "python"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0
|
|
}
|