From 14382ef8d3597d0ffe273350f49ed009160fee3c Mon Sep 17 00:00:00 2001 From: Roy Miara Date: Mon, 7 Aug 2023 17:45:54 +0300 Subject: [PATCH 1/3] add how to notebook for ds --- .../how_to_create_pinecone_datasets.ipynb | 560 ++++++++++++++++++ 1 file changed, 560 insertions(+) create mode 100644 docs/assets/how_to_create_pinecone_datasets.ipynb diff --git a/docs/assets/how_to_create_pinecone_datasets.ipynb b/docs/assets/how_to_create_pinecone_datasets.ipynb new file mode 100644 index 0000000..e637c83 --- /dev/null +++ b/docs/assets/how_to_create_pinecone_datasets.ipynb @@ -0,0 +1,560 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/docs/assets/how_to_create_pinecone_datasets.ipynb) \n", + "[![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/docs/assets/how_to_create_pinecone_datasets.ipynb)\n", + "\n", + "# Creaeting Pinecone Datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook will walk you through the process of creating a Pinecone dataset from a pandas Dataframe." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: create a simple sample dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: pandas in /Users/roymiara/Library/Python/3.9/lib/python/site-packages (2.0.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/roymiara/Library/Python/3.9/lib/python/site-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/roymiara/Library/Python/3.9/lib/python/site-packages (from pandas) (2022.2.1)\n", + "Requirement already satisfied: tzdata>=2022.1 in /Users/roymiara/Library/Python/3.9/lib/python/site-packages (from pandas) (2022.7)\n", + "Requirement already satisfied: numpy>=1.20.3 in /Users/roymiara/Library/Python/3.9/lib/python/site-packages (from pandas) (1.24.2)\n", + "Requirement already satisfied: six>=1.5 in /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas) (1.15.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'extra_field': 'extra_value'}
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value'}
\n", + "
" + ], + "text/plain": [ + " id values sparse_values \\\n", + "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", + "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", + "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", + "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", + "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n", + "\n", + " metadata blob \n", + "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n", + "1 {'title': 'title2', 'url': 'url2'} None \n", + "2 {'title': 'title3', 'url': 'url3'} None \n", + "3 {'title': 'title4', 'url': 'url4'} None \n", + "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents = [\n", + " {\n", + " \"id\": \"1\",\n", + " \"values\": [0.1, 0.2, 0.3],\n", + " \"sparse_values\": {\"indices\": [1, 2, 3], \"values\": [0.1, 0.2, 0.3]},\n", + " \"metadata\": {\"title\": \"title1\", \"url\": \"url1\"},\n", + " \"blob\": {\"extra_field\": \"extra_value\"},\n", + " },\n", + " {\n", + " \"id\": \"2\",\n", + " \"values\": [0.4, 0.5, 0.6],\n", + " \"sparse_values\": {\"indices\": [4, 5, 6], \"values\": [0.4, 0.5, 0.6]},\n", + " \"metadata\": {\"title\": \"title2\", \"url\": \"url2\"},\n", + " \"blob\": None,\n", + " },\n", + " {\n", + " \"id\": \"3\",\n", + " \"values\": [0.7, 0.8, 0.9],\n", + " \"sparse_values\": {\"indices\": [7, 8, 9], \"values\": [0.7, 0.8, 0.9]},\n", + " \"metadata\": {\"title\": \"title3\", \"url\": \"url3\"},\n", + " \"blob\": None,\n", + " },\n", + " {\n", + " \"id\": \"4\",\n", + " \"values\": [1.0, 1.1, 1.2],\n", + " \"sparse_values\": {\"indices\": [10, 11, 12], \"values\": [1.0, 1.1, 1.2]},\n", + " \"metadata\": {\"title\": \"title4\", \"url\": \"url4\"},\n", + " \"blob\": None,\n", + " },\n", + " {\n", + " \"id\": \"5\",\n", + " 
\"values\": [1.3, 1.4, 1.5],\n", + " \"sparse_values\": {\"indices\": [13, 14, 15], \"values\": [1.3, 1.4, 1.5]},\n", + " \"metadata\": {\"title\": \"title5\", \"url\": \"url5\"},\n", + " \"blob\": {\"another_field\": \"another_value\"},\n", + " }\n", + "]\n", + "\n", + "df = pd.DataFrame(documents)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some notes:\n", + "1. Note that we have both metadata field and 'blob' field, the metadata field is the acutal pinecone metadata we will use in our index, blob, is an additional field that we can use to store any additional information we want to store along with the Dataset.\n", + "2. here we used both 'values' and 'sparse_values', however, sparse_values is not a mandatory field, if you don't have sparse values keep it empty." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pinecone Dataset\n", + "\n", + "Now that we have our data Ready, we can create a Pinecone Dataset. A Pinecone Dataset is a collection of documtents, queries and Metadata. We can create a Pinecone \n", + "* Documents: a collection of records with Id, Vectors (dense, sparse) and metadata\n", + "* Queries: a collection of queries with Vectors (dense, sparse), metadata filter and top_k\n", + "* Metadata: a defintion of the dataset: Name, dimension, metric, embedding models, etc." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install pinecone-datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from pinecone_datasets import Dataset, DatasetMetadata" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'name': '',\n", + " 'created_at': '2023-08-07 17:13:11.949042',\n", + " 'documents': 0,\n", + " 'queries': 0,\n", + " 'source': None,\n", + " 'license': None,\n", + " 'bucket': None,\n", + " 'task': None,\n", + " 'dense_model': {'name': '', 'tokenizer': None, 'dimension': 0},\n", + " 'sparse_model': None,\n", + " 'description': None,\n", + " 'tags': None,\n", + " 'args': None}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# creating a new empty metadata\n", + "metadata = DatasetMetadata.empty()\n", + "metadata.dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'extra_field': 'extra_value'}
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value'}
\n", + "
" + ], + "text/plain": [ + " id values sparse_values \\\n", + "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", + "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", + "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", + "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", + "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n", + "\n", + " metadata blob \n", + "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n", + "1 {'title': 'title2', 'url': 'url2'} None \n", + "2 {'title': 'title3', 'url': 'url3'} None \n", + "3 {'title': 'title4', 'url': 'url4'} None \n", + "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = Dataset.from_pandas(documents=df, q=None, metadata=metadata)\n", + "ds.documents" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save dataset to local path\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/roymiara/Library/Python/3.9/lib/python/site-packages/pinecone_datasets/dataset.py:433: UserWarning: Queries are empty, not saving queries\n", + " warnings.warn(\"Queries are empty, not saving queries\")\n" + ] + } + ], + "source": [ + "ds.to_path('/tmp/ds')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Re-load dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "new_ds = Dataset.from_path('/tmp/ds')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'another_field': None, 'extra_field': 'extra_...
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value', 'extra_fiel...
\n", + "
" + ], + "text/plain": [ + " id values sparse_values \\\n", + "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", + "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", + "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", + "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", + "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n", + "\n", + " metadata \\\n", + "0 {'title': 'title1', 'url': 'url1'} \n", + "1 {'title': 'title2', 'url': 'url2'} \n", + "2 {'title': 'title3', 'url': 'url3'} \n", + "3 {'title': 'title4', 'url': 'url4'} \n", + "4 {'title': 'title5', 'url': 'url5'} \n", + "\n", + " blob \n", + "0 {'another_field': None, 'extra_field': 'extra_... \n", + "1 None \n", + "2 None \n", + "3 None \n", + "4 {'another_field': 'another_value', 'extra_fiel... " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_ds.documents" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 1a1334e7908de0220bcc6b77ec5c1b24b518f9c2 Mon Sep 17 00:00:00 2001 From: Roy Miara Date: Mon, 7 Aug 2023 17:53:44 +0300 Subject: [PATCH 2/3] some edis --- .../how_to_create_pinecone_datasets.ipynb | 37 ++++++++----------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/docs/assets/how_to_create_pinecone_datasets.ipynb b/docs/assets/how_to_create_pinecone_datasets.ipynb index e637c83..75e7210 100644 --- 
a/docs/assets/how_to_create_pinecone_datasets.ipynb +++ b/docs/assets/how_to_create_pinecone_datasets.ipynb @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -153,7 +153,7 @@ "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " ] }, - "execution_count": 9, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -206,8 +206,8 @@ "metadata": {}, "source": [ "Some notes:\n", - "1. Note that we have both metadata field and 'blob' field, the metadata field is the acutal pinecone metadata we will use in our index, blob, is an additional field that we can use to store any additional information we want to store along with the Dataset.\n", - "2. here we used both 'values' and 'sparse_values', however, sparse_values is not a mandatory field, if you don't have sparse values keep it empty." + "* Note that we have both metadata field and 'blob' field, the metadata field is the acutal pinecone metadata we will use in our index, blob, is an additional field that we can use to store any additional information we want to store along with the Dataset.\n", + "* here we used both 'values' and 'sparse_values', however, sparse_values is not a mandatory field, if you don't have sparse values keep it empty." 
] }, { @@ -233,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -242,14 +242,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'name': '',\n", - " 'created_at': '2023-08-07 17:13:11.949042',\n", + " 'created_at': '2023-08-07 17:52:49.166878',\n", " 'documents': 0,\n", " 'queries': 0,\n", " 'source': None,\n", @@ -263,7 +263,7 @@ " 'args': None}" ] }, - "execution_count": 16, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -276,7 +276,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -368,7 +368,7 @@ "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " ] }, - "execution_count": 20, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -387,7 +387,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -412,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -421,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -520,7 +520,7 @@ "4 {'another_field': 'another_value', 'extra_fiel... 
" ] }, - "execution_count": 24, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -528,11 +528,6 @@ "source": [ "new_ds.documents" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] } ], "metadata": { From c2b9f3dbcb4a71f6ccc95b26edac22539a16e609 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Mon, 14 Aug 2023 17:21:24 +0800 Subject: [PATCH 3/3] minor tweaks --- .../how-to-create-pinecone-datasets.ipynb | 1042 +++++++++++++++++ .../how_to_create_pinecone_datasets.ipynb | 555 --------- 2 files changed, 1042 insertions(+), 555 deletions(-) create mode 100644 docs/assets/how-to-create-pinecone-datasets.ipynb delete mode 100644 docs/assets/how_to_create_pinecone_datasets.ipynb diff --git a/docs/assets/how-to-create-pinecone-datasets.ipynb b/docs/assets/how-to-create-pinecone-datasets.ipynb new file mode 100644 index 0000000..8f53364 --- /dev/null +++ b/docs/assets/how-to-create-pinecone-datasets.ipynb @@ -0,0 +1,1042 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "cdN6QOXIUaUq" + }, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/docs/assets/how-to-create-pinecone-datasets.ipynb)\n", + "[![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/docs/assets/how-to-create-pinecone-datasets.ipynb)\n", + "\n", + "# Creating Pinecone Datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8Fiobs_oUaUr" + }, + "source": [ + "This notebook will walk you through the process of creating a Pinecone dataset from a pandas Dataframe." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DLuQirtzUaUs" + }, + "source": [ + "## Step 1: create a simple sample dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "bVW2DlVQUaUs", + "outputId": "bd3c9438-7c67-4097-b580-4bfdd695ab92", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [], + "source": [ + "!pip install -qU pandas==2.0.2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "fPebr9XNUaUs" + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "I_WRSqY8UaUs", + "outputId": "36348ad8-38ef-40b2-8b0c-fc7e34e12575", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id values sparse_values \\\n", + "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", + "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", + "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", + "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", + "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n", + "\n", + " metadata blob \n", + "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n", + "1 {'title': 'title2', 'url': 'url2'} None \n", + "2 {'title': 'title3', 'url': 'url3'} None \n", + "3 {'title': 'title4', 'url': 'url4'} None \n", + "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'extra_field': 'extra_value'}
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value'}
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "documents = [\n", + " {\n", + " \"id\": \"1\",\n", + " \"values\": [0.1, 0.2, 0.3],\n", + " \"sparse_values\": {\"indices\": [1, 2, 3], \"values\": [0.1, 0.2, 0.3]},\n", + " \"metadata\": {\"title\": \"title1\", \"url\": \"url1\"},\n", + " \"blob\": {\"extra_field\": \"extra_value\"},\n", + " },\n", + " {\n", + " \"id\": \"2\",\n", + " \"values\": [0.4, 0.5, 0.6],\n", + " \"sparse_values\": {\"indices\": [4, 5, 6], \"values\": [0.4, 0.5, 0.6]},\n", + " \"metadata\": {\"title\": \"title2\", \"url\": \"url2\"},\n", + " \"blob\": None,\n", + " },\n", + " {\n", + " \"id\": \"3\",\n", + " \"values\": [0.7, 0.8, 0.9],\n", + " \"sparse_values\": {\"indices\": [7, 8, 9], \"values\": [0.7, 0.8, 0.9]},\n", + " \"metadata\": {\"title\": \"title3\", \"url\": \"url3\"},\n", + " \"blob\": None,\n", + " },\n", + " {\n", + " \"id\": \"4\",\n", + " \"values\": [1.0, 1.1, 1.2],\n", + " \"sparse_values\": {\"indices\": [10, 11, 12], \"values\": [1.0, 1.1, 1.2]},\n", + " \"metadata\": {\"title\": \"title4\", \"url\": \"url4\"},\n", + " \"blob\": None,\n", + " },\n", + " {\n", + " \"id\": \"5\",\n", + " \"values\": [1.3, 1.4, 1.5],\n", + " \"sparse_values\": {\"indices\": [13, 14, 15], \"values\": [1.3, 1.4, 1.5]},\n", + " \"metadata\": {\"title\": \"title5\", \"url\": \"url5\"},\n", + " \"blob\": {\"another_field\": \"another_value\"},\n", + " }\n", + "]\n", + "\n", + "df = pd.DataFrame(documents)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c_zwxJ_OUaUt" + }, + "source": [ + "Some notes:\n", + "* Note that we have both a 'metadata' field and a 'blob' field. The metadata field is the actual Pinecone metadata we will use in our index; blob is an additional field that we can use to store any extra information we want to keep alongside the Dataset.\n", + "* Here we used both 'values' and 'sparse_values'; however, sparse_values is not a mandatory field. If you don't have sparse 
values keep it empty." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BcFx1wFqUaUt" + }, + "source": [ + "## Pinecone Dataset\n", + "\n", + "Now that we have our data ready, we can create a Pinecone Dataset. A Pinecone Dataset is a collection of documents, queries, and metadata:\n", + "* Documents: a collection of records with Id, Vectors (dense, sparse) and metadata\n", + "* Queries: a collection of queries with Vectors (dense, sparse), metadata filter and top_k\n", + "* Metadata: a definition of the dataset: Name, dimension, metric, embedding models, etc." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "DCGFhTtyUaUt" + }, + "outputs": [], + "source": [ + "!pip install -qU \\\n", + " pinecone-client==2.2.2 \\\n", + " pinecone-datasets==0.6.0" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "S9NCQyTqUaUt" + }, + "outputs": [], + "source": [ + "from pinecone_datasets import Dataset, DatasetMetadata" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "Eaiy3IjIUaUt", + "outputId": "4ff727bd-1a56-42bb-8cd2-e645b5ab390c", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'name': '',\n", + " 'created_at': '2023-08-14 09:18:50.196514',\n", + " 'documents': 0,\n", + " 'queries': 0,\n", + " 'source': None,\n", + " 'license': None,\n", + " 'bucket': None,\n", + " 'task': None,\n", + " 'dense_model': {'name': '', 'tokenizer': None, 'dimension': 0},\n", + " 'sparse_model': None,\n", + " 'description': None,\n", + " 'tags': None,\n", + " 'args': None}" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "# creating a new empty metadata\n", + "metadata = DatasetMetadata.empty()\n", + "metadata.dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "g_ACjKDOUaUt", + "outputId": 
"bc47c7d1-a3ef-4cf1-9e4b-7da6f82e111c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id values sparse_values \\\n", + "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", + "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", + "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", + "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", + "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n", + "\n", + " metadata blob \n", + "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n", + "1 {'title': 'title2', 'url': 'url2'} None \n", + "2 {'title': 'title3', 'url': 'url3'} None \n", + "3 {'title': 'title4', 'url': 'url4'} None \n", + "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'extra_field': 'extra_value'}
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value'}
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "ds = Dataset.from_pandas(documents=df, q=None, metadata=metadata)\n", + "ds.documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CGzdg2sZUaUt" + }, + "source": [ + "## Save dataset to local path\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "IVkK6fJUUaUt", + "outputId": "943ff58d-91d6-4a75-e218-d833214fee1b", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/pinecone_datasets/dataset.py:433: UserWarning: Queries are empty, not saving queries\n", + " warnings.warn(\"Queries are empty, not saving queries\")\n" + ] + } + ], + "source": [ + "ds.to_path('/tmp/ds')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B5tvJlnSUaUu" + }, + "source": [ + "### Reload dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "pLEhwSaRUaUu" + }, + "outputs": [], + "source": [ + "new_ds = Dataset.from_path('/tmp/ds')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "J5LJGYqxUaUu", + "outputId": "120f1ebf-e30a-4913-a84f-727e52e2add8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id values sparse_values \\\n", + "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", + "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", + "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", + "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", + "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... 
\n", + "\n", + " metadata \\\n", + "0 {'title': 'title1', 'url': 'url1'} \n", + "1 {'title': 'title2', 'url': 'url2'} \n", + "2 {'title': 'title3', 'url': 'url3'} \n", + "3 {'title': 'title4', 'url': 'url4'} \n", + "4 {'title': 'title5', 'url': 'url5'} \n", + "\n", + " blob \n", + "0 {'another_field': None, 'extra_field': 'extra_... \n", + "1 None \n", + "2 None \n", + "3 None \n", + "4 {'another_field': 'another_value', 'extra_fiel... " + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'another_field': None, 'extra_field': 'extra_...
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value', 'extra_fiel...
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "new_ds.documents" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "orig_nbformat": 4, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/docs/assets/how_to_create_pinecone_datasets.ipynb b/docs/assets/how_to_create_pinecone_datasets.ipynb deleted file mode 100644 index 75e7210..0000000 --- a/docs/assets/how_to_create_pinecone_datasets.ipynb +++ /dev/null @@ -1,555 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/docs/assets/how_to_create_pinecone_datasets.ipynb) \n", - "[![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/docs/assets/how_to_create_pinecone_datasets.ipynb)\n", - "\n", - "# Creaeting Pinecone Datasets" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook will walk you through the process of creating a Pinecone dataset from a pandas Dataframe." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 1: create a simple sample dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Defaulting to user installation because normal site-packages is not writeable\n", - "Requirement already satisfied: pandas in /Users/roymiara/Library/Python/3.9/lib/python/site-packages (2.0.2)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/roymiara/Library/Python/3.9/lib/python/site-packages (from pandas) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /Users/roymiara/Library/Python/3.9/lib/python/site-packages (from pandas) (2022.2.1)\n", - "Requirement already satisfied: tzdata>=2022.1 in /Users/roymiara/Library/Python/3.9/lib/python/site-packages (from pandas) (2022.7)\n", - "Requirement already satisfied: numpy>=1.20.3 in /Users/roymiara/Library/Python/3.9/lib/python/site-packages (from pandas) (1.24.2)\n", - "Requirement already satisfied: six>=1.5 in /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas) (1.15.0)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install pandas" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - 
"text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'extra_field': 'extra_value'}
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value'}
\n", - "
" - ], - "text/plain": [ - " id values sparse_values \\\n", - "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", - "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", - "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", - "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", - "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n", - "\n", - " metadata blob \n", - "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n", - "1 {'title': 'title2', 'url': 'url2'} None \n", - "2 {'title': 'title3', 'url': 'url3'} None \n", - "3 {'title': 'title4', 'url': 'url4'} None \n", - "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "documents = [\n", - " {\n", - " \"id\": \"1\",\n", - " \"values\": [0.1, 0.2, 0.3],\n", - " \"sparse_values\": {\"indices\": [1, 2, 3], \"values\": [0.1, 0.2, 0.3]},\n", - " \"metadata\": {\"title\": \"title1\", \"url\": \"url1\"},\n", - " \"blob\": {\"extra_field\": \"extra_value\"},\n", - " },\n", - " {\n", - " \"id\": \"2\",\n", - " \"values\": [0.4, 0.5, 0.6],\n", - " \"sparse_values\": {\"indices\": [4, 5, 6], \"values\": [0.4, 0.5, 0.6]},\n", - " \"metadata\": {\"title\": \"title2\", \"url\": \"url2\"},\n", - " \"blob\": None,\n", - " },\n", - " {\n", - " \"id\": \"3\",\n", - " \"values\": [0.7, 0.8, 0.9],\n", - " \"sparse_values\": {\"indices\": [7, 8, 9], \"values\": [0.7, 0.8, 0.9]},\n", - " \"metadata\": {\"title\": \"title3\", \"url\": \"url3\"},\n", - " \"blob\": None,\n", - " },\n", - " {\n", - " \"id\": \"4\",\n", - " \"values\": [1.0, 1.1, 1.2],\n", - " \"sparse_values\": {\"indices\": [10, 11, 12], \"values\": [1.0, 1.1, 1.2]},\n", - " \"metadata\": {\"title\": \"title4\", \"url\": \"url4\"},\n", - " \"blob\": None,\n", - " },\n", - " {\n", - " \"id\": \"5\",\n", - " 
\"values\": [1.3, 1.4, 1.5],\n", - " \"sparse_values\": {\"indices\": [13, 14, 15], \"values\": [1.3, 1.4, 1.5]},\n", - " \"metadata\": {\"title\": \"title5\", \"url\": \"url5\"},\n", - " \"blob\": {\"another_field\": \"another_value\"},\n", - " }\n", - "]\n", - "\n", - "df = pd.DataFrame(documents)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Some notes:\n", - "* Note that we have both metadata field and 'blob' field, the metadata field is the acutal pinecone metadata we will use in our index, blob, is an additional field that we can use to store any additional information we want to store along with the Dataset.\n", - "* here we used both 'values' and 'sparse_values', however, sparse_values is not a mandatory field, if you don't have sparse values keep it empty." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pinecone Dataset\n", - "\n", - "Now that we have our data Ready, we can create a Pinecone Dataset. A Pinecone Dataset is a collection of documtents, queries and Metadata. We can create a Pinecone \n", - "* Documents: a collection of records with Id, Vectors (dense, sparse) and metadata\n", - "* Queries: a collection of queries with Vectors (dense, sparse), metadata filter and top_k\n", - "* Metadata: a defintion of the dataset: Name, dimension, metric, embedding models, etc." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install pinecone-datasets" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "from pinecone_datasets import Dataset, DatasetMetadata" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'name': '',\n", - " 'created_at': '2023-08-07 17:52:49.166878',\n", - " 'documents': 0,\n", - " 'queries': 0,\n", - " 'source': None,\n", - " 'license': None,\n", - " 'bucket': None,\n", - " 'task': None,\n", - " 'dense_model': {'name': '', 'tokenizer': None, 'dimension': 0},\n", - " 'sparse_model': None,\n", - " 'description': None,\n", - " 'tags': None,\n", - " 'args': None}" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# creating a new empty metadata\n", - "metadata = DatasetMetadata.empty()\n", - "metadata.dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'extra_field': 'extra_value'}
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value'}
\n", - "
" - ], - "text/plain": [ - " id values sparse_values \\\n", - "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", - "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", - "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", - "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", - "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n", - "\n", - " metadata blob \n", - "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n", - "1 {'title': 'title2', 'url': 'url2'} None \n", - "2 {'title': 'title3', 'url': 'url3'} None \n", - "3 {'title': 'title4', 'url': 'url4'} None \n", - "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds = Dataset.from_pandas(documents=df, q=None, metadata=metadata)\n", - "ds.documents" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save dataset to local path\n" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/roymiara/Library/Python/3.9/lib/python/site-packages/pinecone_datasets/dataset.py:433: UserWarning: Queries are empty, not saving queries\n", - " warnings.warn(\"Queries are empty, not saving queries\")\n" - ] - } - ], - "source": [ - "ds.to_path('/tmp/ds')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Re-load dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "new_ds = Dataset.from_path('/tmp/ds')" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'another_field': None, 'extra_field': 'extra_...
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value', 'extra_fiel...
\n", - "
" - ], - "text/plain": [ - " id values sparse_values \\\n", - "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", - "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", - "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", - "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", - "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n", - "\n", - " metadata \\\n", - "0 {'title': 'title1', 'url': 'url1'} \n", - "1 {'title': 'title2', 'url': 'url2'} \n", - "2 {'title': 'title3', 'url': 'url3'} \n", - "3 {'title': 'title4', 'url': 'url4'} \n", - "4 {'title': 'title5', 'url': 'url5'} \n", - "\n", - " blob \n", - "0 {'another_field': None, 'extra_field': 'extra_... \n", - "1 None \n", - "2 None \n", - "3 None \n", - "4 {'another_field': 'another_value', 'extra_fiel... " - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_ds.documents" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -}