diff --git a/docs/assets/how-to-create-pinecone-datasets.ipynb b/docs/assets/how-to-create-pinecone-datasets.ipynb
new file mode 100644
index 0000000..8f53364
--- /dev/null
+++ b/docs/assets/how-to-create-pinecone-datasets.ipynb
@@ -0,0 +1,1042 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "cdN6QOXIUaUq"
+ },
+ "source": [
+ "[Open in Colab](https://colab.research.google.com/github/pinecone-io/examples/blob/master/docs/assets/how-to-create-pinecone-datasets.ipynb)\n",
+ "[Open in nbviewer](https://nbviewer.org/github/pinecone-io/examples/blob/master/docs/assets/how-to-create-pinecone-datasets.ipynb)\n",
+ "\n",
+ "# Creating Pinecone Datasets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "8Fiobs_oUaUr"
+ },
+ "source": [
+ "This notebook will walk you through the process of creating a Pinecone dataset from a pandas DataFrame."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "DLuQirtzUaUs"
+ },
+ "source": [
+ "## Step 1: create a simple sample dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "bVW2DlVQUaUs",
+ "outputId": "bd3c9438-7c67-4097-b580-4bfdd695ab92",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!pip install -qU pandas==2.0.2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "fPebr9XNUaUs"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "I_WRSqY8UaUs",
+ "outputId": "36348ad8-38ef-40b2-8b0c-fc7e34e12575",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id values sparse_values \\\n",
+ "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n",
+ "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n",
+ "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n",
+ "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n",
+ "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n",
+ "\n",
+ " metadata blob \n",
+ "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n",
+ "1 {'title': 'title2', 'url': 'url2'} None \n",
+ "2 {'title': 'title3', 'url': 'url3'} None \n",
+ "3 {'title': 'title4', 'url': 'url4'} None \n",
+ "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} "
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " values | \n",
+ " sparse_values | \n",
+ " metadata | \n",
+ " blob | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " [0.1, 0.2, 0.3] | \n",
+ " {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} | \n",
+ " {'title': 'title1', 'url': 'url1'} | \n",
+ " {'extra_field': 'extra_value'} | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " [0.4, 0.5, 0.6] | \n",
+ " {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} | \n",
+ " {'title': 'title2', 'url': 'url2'} | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " [0.7, 0.8, 0.9] | \n",
+ " {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} | \n",
+ " {'title': 'title3', 'url': 'url3'} | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " [1.0, 1.1, 1.2] | \n",
+ " {'indices': [10, 11, 12], 'values': [1.0, 1.1,... | \n",
+ " {'title': 'title4', 'url': 'url4'} | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " [1.3, 1.4, 1.5] | \n",
+ " {'indices': [13, 14, 15], 'values': [1.3, 1.4,... | \n",
+ " {'title': 'title5', 'url': 'url5'} | \n",
+ " {'another_field': 'another_value'} | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ],
+ "source": [
+ "documents = [\n",
+ " {\n",
+ " \"id\": \"1\",\n",
+ " \"values\": [0.1, 0.2, 0.3],\n",
+ " \"sparse_values\": {\"indices\": [1, 2, 3], \"values\": [0.1, 0.2, 0.3]},\n",
+ " \"metadata\": {\"title\": \"title1\", \"url\": \"url1\"},\n",
+ " \"blob\": {\"extra_field\": \"extra_value\"},\n",
+ " },\n",
+ " {\n",
+ " \"id\": \"2\",\n",
+ " \"values\": [0.4, 0.5, 0.6],\n",
+ " \"sparse_values\": {\"indices\": [4, 5, 6], \"values\": [0.4, 0.5, 0.6]},\n",
+ " \"metadata\": {\"title\": \"title2\", \"url\": \"url2\"},\n",
+ " \"blob\": None,\n",
+ " },\n",
+ " {\n",
+ " \"id\": \"3\",\n",
+ " \"values\": [0.7, 0.8, 0.9],\n",
+ " \"sparse_values\": {\"indices\": [7, 8, 9], \"values\": [0.7, 0.8, 0.9]},\n",
+ " \"metadata\": {\"title\": \"title3\", \"url\": \"url3\"},\n",
+ " \"blob\": None,\n",
+ " },\n",
+ " {\n",
+ " \"id\": \"4\",\n",
+ " \"values\": [1.0, 1.1, 1.2],\n",
+ " \"sparse_values\": {\"indices\": [10, 11, 12], \"values\": [1.0, 1.1, 1.2]},\n",
+ " \"metadata\": {\"title\": \"title4\", \"url\": \"url4\"},\n",
+ " \"blob\": None,\n",
+ " },\n",
+ " {\n",
+ " \"id\": \"5\",\n",
+ " \"values\": [1.3, 1.4, 1.5],\n",
+ " \"sparse_values\": {\"indices\": [13, 14, 15], \"values\": [1.3, 1.4, 1.5]},\n",
+ " \"metadata\": {\"title\": \"title5\", \"url\": \"url5\"},\n",
+ " \"blob\": {\"another_field\": \"another_value\"},\n",
+ " }\n",
+ "]\n",
+ "\n",
+ "df = pd.DataFrame(documents)\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "c_zwxJ_OUaUt"
+ },
+ "source": [
+ "Some notes:\n",
+ "* We have both a 'metadata' field and a 'blob' field. The 'metadata' field is the actual Pinecone metadata we will use in our index; 'blob' is an additional field that we can use to store any extra information we want to keep along with the dataset.\n",
+ "* Here we used both 'values' and 'sparse_values'. However, 'sparse_values' is not a mandatory field; if you don't have sparse values, keep it empty."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BcFx1wFqUaUt"
+ },
+ "source": [
+ "## Pinecone Dataset\n",
+ "\n",
+ "Now that we have our data ready, we can create a Pinecone Dataset. A Pinecone Dataset is a collection of documents, queries and metadata:\n",
+ "* Documents: a collection of records with ID, vectors (dense, sparse) and metadata\n",
+ "* Queries: a collection of queries with vectors (dense, sparse), metadata filter and top_k\n",
+ "* Metadata: a definition of the dataset: name, dimension, metric, embedding models, etc."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "id": "DCGFhTtyUaUt"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install -qU \\\n",
+ " pinecone-client==2.2.2 \\\n",
+ " pinecone-datasets==0.6.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "id": "S9NCQyTqUaUt"
+ },
+ "outputs": [],
+ "source": [
+ "from pinecone_datasets import Dataset, DatasetMetadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "Eaiy3IjIUaUt",
+ "outputId": "4ff727bd-1a56-42bb-8cd2-e645b5ab390c",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "{'name': '',\n",
+ " 'created_at': '2023-08-14 09:18:50.196514',\n",
+ " 'documents': 0,\n",
+ " 'queries': 0,\n",
+ " 'source': None,\n",
+ " 'license': None,\n",
+ " 'bucket': None,\n",
+ " 'task': None,\n",
+ " 'dense_model': {'name': '', 'tokenizer': None, 'dimension': 0},\n",
+ " 'sparse_model': None,\n",
+ " 'description': None,\n",
+ " 'tags': None,\n",
+ " 'args': None}"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ],
+ "source": [
+ "# creating a new empty metadata\n",
+ "metadata = DatasetMetadata.empty()\n",
+ "metadata.dict()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "id": "g_ACjKDOUaUt",
+ "outputId": "bc47c7d1-a3ef-4cf1-9e4b-7da6f82e111c",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id values sparse_values \\\n",
+ "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n",
+ "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n",
+ "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n",
+ "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n",
+ "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n",
+ "\n",
+ " metadata blob \n",
+ "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n",
+ "1 {'title': 'title2', 'url': 'url2'} None \n",
+ "2 {'title': 'title3', 'url': 'url3'} None \n",
+ "3 {'title': 'title4', 'url': 'url4'} None \n",
+ "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} "
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " values | \n",
+ " sparse_values | \n",
+ " metadata | \n",
+ " blob | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " [0.1, 0.2, 0.3] | \n",
+ " {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} | \n",
+ " {'title': 'title1', 'url': 'url1'} | \n",
+ " {'extra_field': 'extra_value'} | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " [0.4, 0.5, 0.6] | \n",
+ " {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} | \n",
+ " {'title': 'title2', 'url': 'url2'} | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " [0.7, 0.8, 0.9] | \n",
+ " {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} | \n",
+ " {'title': 'title3', 'url': 'url3'} | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " [1.0, 1.1, 1.2] | \n",
+ " {'indices': [10, 11, 12], 'values': [1.0, 1.1,... | \n",
+ " {'title': 'title4', 'url': 'url4'} | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " [1.3, 1.4, 1.5] | \n",
+ " {'indices': [13, 14, 15], 'values': [1.3, 1.4,... | \n",
+ " {'title': 'title5', 'url': 'url5'} | \n",
+ " {'another_field': 'another_value'} | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ],
+ "source": [
+ "ds = Dataset.from_pandas(documents=df, q=None, metadata=metadata)\n",
+ "ds.documents"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "CGzdg2sZUaUt"
+ },
+ "source": [
+ "## Save dataset to local path\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "id": "IVkK6fJUUaUt",
+ "outputId": "943ff58d-91d6-4a75-e218-d833214fee1b",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/pinecone_datasets/dataset.py:433: UserWarning: Queries are empty, not saving queries\n",
+ " warnings.warn(\"Queries are empty, not saving queries\")\n"
+ ]
+ }
+ ],
+ "source": [
+ "ds.to_path('/tmp/ds')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "B5tvJlnSUaUu"
+ },
+ "source": [
+ "### Reload dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "id": "pLEhwSaRUaUu"
+ },
+ "outputs": [],
+ "source": [
+ "new_ds = Dataset.from_path('/tmp/ds')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "id": "J5LJGYqxUaUu",
+ "outputId": "120f1ebf-e30a-4913-a84f-727e52e2add8",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id values sparse_values \\\n",
+ "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n",
+ "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n",
+ "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n",
+ "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n",
+ "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n",
+ "\n",
+ " metadata \\\n",
+ "0 {'title': 'title1', 'url': 'url1'} \n",
+ "1 {'title': 'title2', 'url': 'url2'} \n",
+ "2 {'title': 'title3', 'url': 'url3'} \n",
+ "3 {'title': 'title4', 'url': 'url4'} \n",
+ "4 {'title': 'title5', 'url': 'url5'} \n",
+ "\n",
+ " blob \n",
+ "0 {'another_field': None, 'extra_field': 'extra_... \n",
+ "1 None \n",
+ "2 None \n",
+ "3 None \n",
+ "4 {'another_field': 'another_value', 'extra_fiel... "
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " values | \n",
+ " sparse_values | \n",
+ " metadata | \n",
+ " blob | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " [0.1, 0.2, 0.3] | \n",
+ " {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} | \n",
+ " {'title': 'title1', 'url': 'url1'} | \n",
+ " {'another_field': None, 'extra_field': 'extra_... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " [0.4, 0.5, 0.6] | \n",
+ " {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} | \n",
+ " {'title': 'title2', 'url': 'url2'} | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " [0.7, 0.8, 0.9] | \n",
+ " {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} | \n",
+ " {'title': 'title3', 'url': 'url3'} | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " [1.0, 1.1, 1.2] | \n",
+ " {'indices': [10, 11, 12], 'values': [1.0, 1.1,... | \n",
+ " {'title': 'title4', 'url': 'url4'} | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " [1.3, 1.4, 1.5] | \n",
+ " {'indices': [13, 14, 15], 'values': [1.3, 1.4,... | \n",
+ " {'title': 'title5', 'url': 'url5'} | \n",
+ " {'another_field': 'another_value', 'extra_fiel... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 10
+ }
+ ],
+ "source": [
+ "new_ds.documents"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.6"
+ },
+ "orig_nbformat": 4,
+ "colab": {
+ "provenance": []
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file