diff --git a/docs/assets/how-to-create-pinecone-datasets.ipynb b/docs/assets/how-to-create-pinecone-datasets.ipynb new file mode 100644 index 0000000..8f53364 --- /dev/null +++ b/docs/assets/how-to-create-pinecone-datasets.ipynb @@ -0,0 +1,1042 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "cdN6QOXIUaUq" + }, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/docs/assets/how-to-create-pinecone-datasets.ipynb)\n", + "[![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/docs/assets/how-to-create-pinecone-datasets.ipynb)\n", + "\n", + "# Creating Pinecone Datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8Fiobs_oUaUr" + }, + "source": [ + "This notebook will walk you through the process of creating a Pinecone dataset from a pandas DataFrame."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DLuQirtzUaUs" + }, + "source": [ + "## Step 1: create a simple sample dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "bVW2DlVQUaUs", + "outputId": "bd3c9438-7c67-4097-b580-4bfdd695ab92", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [], + "source": [ + "!pip install -qU pandas==2.0.2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "fPebr9XNUaUs" + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "I_WRSqY8UaUs", + "outputId": "36348ad8-38ef-40b2-8b0c-fc7e34e12575", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id values sparse_values \\\n", + "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", + "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", + "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", + "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", + "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n", + "\n", + " metadata blob \n", + "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n", + "1 {'title': 'title2', 'url': 'url2'} None \n", + "2 {'title': 'title3', 'url': 'url3'} None \n", + "3 {'title': 'title4', 'url': 'url4'} None \n", + "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'extra_field': 'extra_value'}
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value'}
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "documents = [\n", + " {\n", + " \"id\": \"1\",\n", + " \"values\": [0.1, 0.2, 0.3],\n", + " \"sparse_values\": {\"indices\": [1, 2, 3], \"values\": [0.1, 0.2, 0.3]},\n", + " \"metadata\": {\"title\": \"title1\", \"url\": \"url1\"},\n", + " \"blob\": {\"extra_field\": \"extra_value\"},\n", + " },\n", + " {\n", + " \"id\": \"2\",\n", + " \"values\": [0.4, 0.5, 0.6],\n", + " \"sparse_values\": {\"indices\": [4, 5, 6], \"values\": [0.4, 0.5, 0.6]},\n", + " \"metadata\": {\"title\": \"title2\", \"url\": \"url2\"},\n", + " \"blob\": None,\n", + " },\n", + " {\n", + " \"id\": \"3\",\n", + " \"values\": [0.7, 0.8, 0.9],\n", + " \"sparse_values\": {\"indices\": [7, 8, 9], \"values\": [0.7, 0.8, 0.9]},\n", + " \"metadata\": {\"title\": \"title3\", \"url\": \"url3\"},\n", + " \"blob\": None,\n", + " },\n", + " {\n", + " \"id\": \"4\",\n", + " \"values\": [1.0, 1.1, 1.2],\n", + " \"sparse_values\": {\"indices\": [10, 11, 12], \"values\": [1.0, 1.1, 1.2]},\n", + " \"metadata\": {\"title\": \"title4\", \"url\": \"url4\"},\n", + " \"blob\": None,\n", + " },\n", + " {\n", + " \"id\": \"5\",\n", + " \"values\": [1.3, 1.4, 1.5],\n", + " \"sparse_values\": {\"indices\": [13, 14, 15], \"values\": [1.3, 1.4, 1.5]},\n", + " \"metadata\": {\"title\": \"title5\", \"url\": \"url5\"},\n", + " \"blob\": {\"another_field\": \"another_value\"},\n", + " }\n", + "]\n", + "\n", + "df = pd.DataFrame(documents)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c_zwxJ_OUaUt" + }, + "source": [ + "Some notes:\n", + "* Note that we have both a 'metadata' field and a 'blob' field. The 'metadata' field is the actual Pinecone metadata we will use in our index; 'blob' is an additional field that we can use to store any extra information we want to keep along with the Dataset.\n", + "* Here we used both 'values' and 'sparse_values'; however, 'sparse_values' is not a mandatory field. If you don't have sparse 
values, keep it empty." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BcFx1wFqUaUt" + }, + "source": [ + "## Pinecone Dataset\n", + "\n", + "Now that we have our data ready, we can create a Pinecone Dataset. A Pinecone Dataset is a collection of documents, queries, and metadata:\n", + "* Documents: a collection of records with an ID, vectors (dense, sparse), and metadata\n", + "* Queries: a collection of queries with vectors (dense, sparse), a metadata filter, and top_k\n", + "* Metadata: a definition of the dataset: name, dimension, metric, embedding models, etc." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "DCGFhTtyUaUt" + }, + "outputs": [], + "source": [ + "!pip install -qU \\\n", + " pinecone-client==2.2.2 \\\n", + " pinecone-datasets==0.6.0" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "S9NCQyTqUaUt" + }, + "outputs": [], + "source": [ + "from pinecone_datasets import Dataset, DatasetMetadata" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "Eaiy3IjIUaUt", + "outputId": "4ff727bd-1a56-42bb-8cd2-e645b5ab390c", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'name': '',\n", + " 'created_at': '2023-08-14 09:18:50.196514',\n", + " 'documents': 0,\n", + " 'queries': 0,\n", + " 'source': None,\n", + " 'license': None,\n", + " 'bucket': None,\n", + " 'task': None,\n", + " 'dense_model': {'name': '', 'tokenizer': None, 'dimension': 0},\n", + " 'sparse_model': None,\n", + " 'description': None,\n", + " 'tags': None,\n", + " 'args': None}" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "# creating a new empty metadata\n", + "metadata = DatasetMetadata.empty()\n", + "metadata.dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "g_ACjKDOUaUt", + "outputId": 
"bc47c7d1-a3ef-4cf1-9e4b-7da6f82e111c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id values sparse_values \\\n", + "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", + "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", + "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", + "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", + "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n", + "\n", + " metadata blob \n", + "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n", + "1 {'title': 'title2', 'url': 'url2'} None \n", + "2 {'title': 'title3', 'url': 'url3'} None \n", + "3 {'title': 'title4', 'url': 'url4'} None \n", + "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'extra_field': 'extra_value'}
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value'}
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "ds = Dataset.from_pandas(documents=df, q=None, metadata=metadata)\n", + "ds.documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CGzdg2sZUaUt" + }, + "source": [ + "## Save dataset to local path\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "IVkK6fJUUaUt", + "outputId": "943ff58d-91d6-4a75-e218-d833214fee1b", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/pinecone_datasets/dataset.py:433: UserWarning: Queries are empty, not saving queries\n", + " warnings.warn(\"Queries are empty, not saving queries\")\n" + ] + } + ], + "source": [ + "ds.to_path('/tmp/ds')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B5tvJlnSUaUu" + }, + "source": [ + "### Reload dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "pLEhwSaRUaUu" + }, + "outputs": [], + "source": [ + "new_ds = Dataset.from_path('/tmp/ds')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "J5LJGYqxUaUu", + "outputId": "120f1ebf-e30a-4913-a84f-727e52e2add8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id values sparse_values \\\n", + "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", + "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", + "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", + "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", + "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... 
\n", + "\n", + " metadata \\\n", + "0 {'title': 'title1', 'url': 'url1'} \n", + "1 {'title': 'title2', 'url': 'url2'} \n", + "2 {'title': 'title3', 'url': 'url3'} \n", + "3 {'title': 'title4', 'url': 'url4'} \n", + "4 {'title': 'title5', 'url': 'url5'} \n", + "\n", + " blob \n", + "0 {'another_field': None, 'extra_field': 'extra_... \n", + "1 None \n", + "2 None \n", + "3 None \n", + "4 {'another_field': 'another_value', 'extra_fiel... " + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'another_field': None, 'extra_field': 'extra_...
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value', 'extra_fiel...
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "new_ds.documents" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "orig_nbformat": 4, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file