mirror of
				https://github.com/anthropics/claude-cookbooks.git
				synced 2025-10-06 01:00:28 +03:00 
			
		
		
		
	WIP Text to SQL
This commit is contained in:
		| @@ -65,17 +65,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 56, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "Note: you may need to restart the kernel to use updated packages.\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "%pip install -q anthropic pandas voyageai" | ||||
|    ] | ||||
| @@ -91,7 +83,6 @@ | ||||
|     "import sqlite3\n", | ||||
|     "import pandas as pd\n", | ||||
|     "from IPython.display import display\n", | ||||
|     "from textwrap import dedent\n", | ||||
|     "\n", | ||||
|     "# Set your Anthropic API key\n", | ||||
|     "os.environ[\"ANTHROPIC_API_KEY\"] = \"YOUR_ANTHROPIC_API_KEY\"\n", | ||||
| @@ -114,176 +105,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 58, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "\n", | ||||
|       "Departments table:\n" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "data": { | ||||
|       "text/html": [ | ||||
|        "<div>\n", | ||||
|        "<style scoped>\n", | ||||
|        "    .dataframe tbody tr th:only-of-type {\n", | ||||
|        "        vertical-align: middle;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe tbody tr th {\n", | ||||
|        "        vertical-align: top;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe thead th {\n", | ||||
|        "        text-align: right;\n", | ||||
|        "    }\n", | ||||
|        "</style>\n", | ||||
|        "<table border=\"1\" class=\"dataframe\">\n", | ||||
|        "  <thead>\n", | ||||
|        "    <tr style=\"text-align: right;\">\n", | ||||
|        "      <th></th>\n", | ||||
|        "      <th>id</th>\n", | ||||
|        "      <th>name</th>\n", | ||||
|        "      <th>location</th>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </thead>\n", | ||||
|        "  <tbody>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>0</th>\n", | ||||
|        "      <td>1</td>\n", | ||||
|        "      <td>HR</td>\n", | ||||
|        "      <td>New York</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>1</th>\n", | ||||
|        "      <td>2</td>\n", | ||||
|        "      <td>Engineering</td>\n", | ||||
|        "      <td>San Francisco</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>2</th>\n", | ||||
|        "      <td>3</td>\n", | ||||
|        "      <td>Marketing</td>\n", | ||||
|        "      <td>Chicago</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </tbody>\n", | ||||
|        "</table>\n", | ||||
|        "</div>" | ||||
|       ], | ||||
|       "text/plain": [ | ||||
|        "   id         name       location\n", | ||||
|        "0   1           HR       New York\n", | ||||
|        "1   2  Engineering  San Francisco\n", | ||||
|        "2   3    Marketing        Chicago" | ||||
|       ] | ||||
|      }, | ||||
|      "metadata": {}, | ||||
|      "output_type": "display_data" | ||||
|     }, | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "\n", | ||||
|       "Employees table:\n" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "data": { | ||||
|       "text/html": [ | ||||
|        "<div>\n", | ||||
|        "<style scoped>\n", | ||||
|        "    .dataframe tbody tr th:only-of-type {\n", | ||||
|        "        vertical-align: middle;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe tbody tr th {\n", | ||||
|        "        vertical-align: top;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe thead th {\n", | ||||
|        "        text-align: right;\n", | ||||
|        "    }\n", | ||||
|        "</style>\n", | ||||
|        "<table border=\"1\" class=\"dataframe\">\n", | ||||
|        "  <thead>\n", | ||||
|        "    <tr style=\"text-align: right;\">\n", | ||||
|        "      <th></th>\n", | ||||
|        "      <th>id</th>\n", | ||||
|        "      <th>name</th>\n", | ||||
|        "      <th>age</th>\n", | ||||
|        "      <th>department_id</th>\n", | ||||
|        "      <th>salary</th>\n", | ||||
|        "      <th>hire_date</th>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </thead>\n", | ||||
|        "  <tbody>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>0</th>\n", | ||||
|        "      <td>1</td>\n", | ||||
|        "      <td>John Doe</td>\n", | ||||
|        "      <td>30</td>\n", | ||||
|        "      <td>2</td>\n", | ||||
|        "      <td>75000.0</td>\n", | ||||
|        "      <td>2020-01-15</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>1</th>\n", | ||||
|        "      <td>2</td>\n", | ||||
|        "      <td>Jane Smith</td>\n", | ||||
|        "      <td>35</td>\n", | ||||
|        "      <td>1</td>\n", | ||||
|        "      <td>65000.0</td>\n", | ||||
|        "      <td>2019-05-01</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>2</th>\n", | ||||
|        "      <td>3</td>\n", | ||||
|        "      <td>Bob Johnson</td>\n", | ||||
|        "      <td>28</td>\n", | ||||
|        "      <td>2</td>\n", | ||||
|        "      <td>80000.0</td>\n", | ||||
|        "      <td>2021-03-10</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>3</th>\n", | ||||
|        "      <td>4</td>\n", | ||||
|        "      <td>Alice Brown</td>\n", | ||||
|        "      <td>42</td>\n", | ||||
|        "      <td>3</td>\n", | ||||
|        "      <td>70000.0</td>\n", | ||||
|        "      <td>2018-11-20</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>4</th>\n", | ||||
|        "      <td>5</td>\n", | ||||
|        "      <td>Charlie Davis</td>\n", | ||||
|        "      <td>31</td>\n", | ||||
|        "      <td>2</td>\n", | ||||
|        "      <td>85000.0</td>\n", | ||||
|        "      <td>2022-07-01</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </tbody>\n", | ||||
|        "</table>\n", | ||||
|        "</div>" | ||||
|       ], | ||||
|       "text/plain": [ | ||||
|        "   id           name  age  department_id   salary   hire_date\n", | ||||
|        "0   1       John Doe   30              2  75000.0  2020-01-15\n", | ||||
|        "1   2     Jane Smith   35              1  65000.0  2019-05-01\n", | ||||
|        "2   3    Bob Johnson   28              2  80000.0  2021-03-10\n", | ||||
|        "3   4    Alice Brown   42              3  70000.0  2018-11-20\n", | ||||
|        "4   5  Charlie Davis   31              2  85000.0  2022-07-01" | ||||
|       ] | ||||
|      }, | ||||
|      "metadata": {}, | ||||
|      "output_type": "display_data" | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "# Create a new SQLite database and tables\n", | ||||
|     "with sqlite3.connect(DATABASE_PATH) as conn:\n", | ||||
| @@ -339,28 +163,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 59, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "Table: departments\n", | ||||
|       "  - id (INTEGER)\n", | ||||
|       "  - name (TEXT)\n", | ||||
|       "  - location (TEXT)\n", | ||||
|       "\n", | ||||
|       "Table: employees\n", | ||||
|       "  - id (INTEGER)\n", | ||||
|       "  - name (TEXT)\n", | ||||
|       "  - age (INTEGER)\n", | ||||
|       "  - department_id (INTEGER)\n", | ||||
|       "  - salary (REAL)\n", | ||||
|       "  - hire_date (DATE)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def get_schema_info(db_path):\n", | ||||
|     "    conn = sqlite3.connect(db_path)\n", | ||||
| @@ -398,42 +203,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 60, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "\n", | ||||
|       "        You are an AI assistant that converts natural language queries into SQL.\n", | ||||
|       "        Given the following SQL database schema:\n", | ||||
|       "\n", | ||||
|       "        <schema>\n", | ||||
|       "        Table: departments\n", | ||||
|       "  - id (INTEGER)\n", | ||||
|       "  - name (TEXT)\n", | ||||
|       "  - location (TEXT)\n", | ||||
|       "\n", | ||||
|       "Table: employees\n", | ||||
|       "  - id (INTEGER)\n", | ||||
|       "  - name (TEXT)\n", | ||||
|       "  - age (INTEGER)\n", | ||||
|       "  - department_id (INTEGER)\n", | ||||
|       "  - salary (REAL)\n", | ||||
|       "  - hire_date (DATE)\n", | ||||
|       "        </schema>\n", | ||||
|       "\n", | ||||
|       "        Convert the following natural language query into SQL:\n", | ||||
|       "        <query>\n", | ||||
|       "        What are the names of all employees in the Engineering department?\n", | ||||
|       "        </query>\n", | ||||
|       "\n", | ||||
|       "        Provide only the SQL query in your response, without preamble or any explanation.\n", | ||||
|       "    \n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def generate_prompt(schema, query):\n", | ||||
|     "    return f\"\"\"\n", | ||||
| @@ -467,21 +239,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 61, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "Generated SQL:\n", | ||||
|       "SELECT e.name\n", | ||||
|       "FROM employees e\n", | ||||
|       "JOIN departments d ON e.department_id = d.id\n", | ||||
|       "WHERE d.name = 'Engineering';\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def generate_sql(prompt):\n", | ||||
|     "    response = client.messages.create(\n", | ||||
| @@ -509,68 +269,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 62, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "Query result:\n" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "data": { | ||||
|       "text/html": [ | ||||
|        "<div>\n", | ||||
|        "<style scoped>\n", | ||||
|        "    .dataframe tbody tr th:only-of-type {\n", | ||||
|        "        vertical-align: middle;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe tbody tr th {\n", | ||||
|        "        vertical-align: top;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe thead th {\n", | ||||
|        "        text-align: right;\n", | ||||
|        "    }\n", | ||||
|        "</style>\n", | ||||
|        "<table border=\"1\" class=\"dataframe\">\n", | ||||
|        "  <thead>\n", | ||||
|        "    <tr style=\"text-align: right;\">\n", | ||||
|        "      <th></th>\n", | ||||
|        "      <th>name</th>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </thead>\n", | ||||
|        "  <tbody>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>0</th>\n", | ||||
|        "      <td>John Doe</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>1</th>\n", | ||||
|        "      <td>Bob Johnson</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>2</th>\n", | ||||
|        "      <td>Charlie Davis</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </tbody>\n", | ||||
|        "</table>\n", | ||||
|        "</div>" | ||||
|       ], | ||||
|       "text/plain": [ | ||||
|        "            name\n", | ||||
|        "0       John Doe\n", | ||||
|        "1    Bob Johnson\n", | ||||
|        "2  Charlie Davis" | ||||
|       ] | ||||
|      }, | ||||
|      "metadata": {}, | ||||
|      "output_type": "display_data" | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def run_sql(sql):\n", | ||||
|     "    conn = sqlite3.connect(DATABASE_PATH)\n", | ||||
| @@ -596,60 +297,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 63, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "\n", | ||||
|       "        You are an AI assistant that converts natural language queries into SQL.\n", | ||||
|       "        Given the following SQL database schema:\n", | ||||
|       "\n", | ||||
|       "        <schema>\n", | ||||
|       "        Table: departments\n", | ||||
|       "  - id (INTEGER)\n", | ||||
|       "  - name (TEXT)\n", | ||||
|       "  - location (TEXT)\n", | ||||
|       "\n", | ||||
|       "Table: employees\n", | ||||
|       "  - id (INTEGER)\n", | ||||
|       "  - name (TEXT)\n", | ||||
|       "  - age (INTEGER)\n", | ||||
|       "  - department_id (INTEGER)\n", | ||||
|       "  - salary (REAL)\n", | ||||
|       "  - hire_date (DATE)\n", | ||||
|       "        </schema>\n", | ||||
|       "\n", | ||||
|       "        Here are some examples of natural language queries and their corresponding SQL:\n", | ||||
|       "\n", | ||||
|       "        <examples>\n", | ||||
|       "        \n", | ||||
|       "        Example 1:\n", | ||||
|       "        <query>List all employees in the HR department.</<query>\n", | ||||
|       "        <output>SELECT e.name FROM employees e JOIN departments d ON e.department_id = d.id WHERE d.name = 'HR';</output>\n", | ||||
|       "\n", | ||||
|       "        Example 2:\n", | ||||
|       "        User: What is the average salary of employees in the Engineering department?\n", | ||||
|       "        SQL: SELECT AVG(e.salary) FROM employees e JOIN departments d ON e.department_id = d.id WHERE d.name = 'Engineering';\n", | ||||
|       "\n", | ||||
|       "        Example 3:\n", | ||||
|       "        User: Who is the oldest employee?\n", | ||||
|       "        SQL: SELECT name, age FROM employees ORDER BY age DESC LIMIT 1;\n", | ||||
|       "    \n", | ||||
|       "        </examples>\n", | ||||
|       "\n", | ||||
|       "        Now, convert the following natural language query into SQL:\n", | ||||
|       "        <query>\n", | ||||
|       "        What are the names and salaries of employees in the Marketing department?\n", | ||||
|       "        </query>\n", | ||||
|       "\n", | ||||
|       "        Provide only the SQL query in your response, without preamble or any explanation.\n", | ||||
|       "    \n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def generate_prompt_with_examples(schema, query):\n", | ||||
|     "    examples = \"\"\"\n", | ||||
| @@ -703,66 +353,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 64, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "Generated SQL:\n", | ||||
|       "SELECT e.name, e.salary\n", | ||||
|       "FROM employees e\n", | ||||
|       "JOIN departments d ON e.department_id = d.id\n", | ||||
|       "WHERE d.name = 'Marketing';\n", | ||||
|       "\n", | ||||
|       "Query result:\n" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "data": { | ||||
|       "text/html": [ | ||||
|        "<div>\n", | ||||
|        "<style scoped>\n", | ||||
|        "    .dataframe tbody tr th:only-of-type {\n", | ||||
|        "        vertical-align: middle;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe tbody tr th {\n", | ||||
|        "        vertical-align: top;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe thead th {\n", | ||||
|        "        text-align: right;\n", | ||||
|        "    }\n", | ||||
|        "</style>\n", | ||||
|        "<table border=\"1\" class=\"dataframe\">\n", | ||||
|        "  <thead>\n", | ||||
|        "    <tr style=\"text-align: right;\">\n", | ||||
|        "      <th></th>\n", | ||||
|        "      <th>name</th>\n", | ||||
|        "      <th>salary</th>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </thead>\n", | ||||
|        "  <tbody>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>0</th>\n", | ||||
|        "      <td>Alice Brown</td>\n", | ||||
|        "      <td>70000.0</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </tbody>\n", | ||||
|        "</table>\n", | ||||
|        "</div>" | ||||
|       ], | ||||
|       "text/plain": [ | ||||
|        "          name   salary\n", | ||||
|        "0  Alice Brown  70000.0" | ||||
|       ] | ||||
|      }, | ||||
|      "metadata": {}, | ||||
|      "output_type": "display_data" | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "# Generate SQL using the improved prompt\n", | ||||
|     "sql = generate_sql(prompt)\n", | ||||
| @@ -797,68 +390,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 65, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "You are an AI assistant that converts natural language queries into SQL.\n", | ||||
|       "    Given the following SQL database schema:\n", | ||||
|       "\n", | ||||
|       "    <schema>\n", | ||||
|       "    Table: departments\n", | ||||
|       "  - id (INTEGER)\n", | ||||
|       "  - name (TEXT)\n", | ||||
|       "  - location (TEXT)\n", | ||||
|       "\n", | ||||
|       "Table: employees\n", | ||||
|       "  - id (INTEGER)\n", | ||||
|       "  - name (TEXT)\n", | ||||
|       "  - age (INTEGER)\n", | ||||
|       "  - department_id (INTEGER)\n", | ||||
|       "  - salary (REAL)\n", | ||||
|       "  - hire_date (DATE)\n", | ||||
|       "    </schema>\n", | ||||
|       "\n", | ||||
|       "    Here are some examples of natural language queries, thought processes, and their corresponding SQL:\n", | ||||
|       "\n", | ||||
|       "    <examples>\n", | ||||
|       "    \n", | ||||
|       "    <example>\n", | ||||
|       "    <query>List all employees in the HR department.</query>\n", | ||||
|       "    <thought_process>\n", | ||||
|       "    1. We need to join the employees and departments tables.\n", | ||||
|       "    2. We'll match employees.department_id with departments.id.\n", | ||||
|       "    3. We'll filter for the HR department.\n", | ||||
|       "    4. We only need to return the employee names.\n", | ||||
|       "    </thought_process>\n", | ||||
|       "    <sql>SELECT e.name FROM employees e JOIN departments d ON e.department_id = d.id WHERE d.name = 'HR';</sql>\n", | ||||
|       "    </example>\n", | ||||
|       "\n", | ||||
|       "    <example>\n", | ||||
|       "    <query>What is the average salary of employees hired in 2022?</query>\n", | ||||
|       "    <thought_process>\n", | ||||
|       "    1. We need to work with the employees table.\n", | ||||
|       "    2. We need to filter for employees hired in 2022.\n", | ||||
|       "    3. We'll use the YEAR function to extract the year from the hire_date.\n", | ||||
|       "    4. We'll calculate the average of the salary column for the filtered rows.\n", | ||||
|       "    </thought_process>\n", | ||||
|       "    <sql>SELECT AVG(salary) FROM employees WHERE YEAR(hire_date) = 2022;</sql>\n", | ||||
|       "    </example>\n", | ||||
|       "    \n", | ||||
|       "    </examples>\n", | ||||
|       "\n", | ||||
|       "    Now, convert the following natural language query into SQL:\n", | ||||
|       "    What are the names and hire dates of employees in the Engineering department, ordered by their salary?\n", | ||||
|       "\n", | ||||
|       "    Within <thought_process> tags, explain your thought process for creating the SQL query.\n", | ||||
|       "    Then, within <sql> tags, provide your output SQL query.\n", | ||||
|       "    \n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def generate_cot_prompt(schema, query):\n", | ||||
|     "    examples = \"\"\"\n", | ||||
| @@ -920,105 +454,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 66, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "Raw response from Claude:\n", | ||||
|       "<thought_process>\n", | ||||
|       "1. We need to join the employees and departments tables to get the department information.\n", | ||||
|       "2. We'll match employees.department_id with departments.id.\n", | ||||
|       "3. We need to filter for the Engineering department.\n", | ||||
|       "4. We need to select the names and hire dates of the employees.\n", | ||||
|       "5. We need to order the results by the employees' salaries.\n", | ||||
|       "6. We don't need to show the salary in the output, but we'll use it for ordering.\n", | ||||
|       "</thought_process>\n", | ||||
|       "\n", | ||||
|       "<sql>\n", | ||||
|       "SELECT e.name, e.hire_date\n", | ||||
|       "FROM employees e\n", | ||||
|       "JOIN departments d ON e.department_id = d.id\n", | ||||
|       "WHERE d.name = 'Engineering'\n", | ||||
|       "ORDER BY e.salary;\n", | ||||
|       "</sql>\n", | ||||
|       "\n", | ||||
|       "Thought Process:\n", | ||||
|       "1. We need to join the employees and departments tables to get the department information.\n", | ||||
|       "2. We'll match employees.department_id with departments.id.\n", | ||||
|       "3. We need to filter for the Engineering department.\n", | ||||
|       "4. We need to select the names and hire dates of the employees.\n", | ||||
|       "5. We need to order the results by the employees' salaries.\n", | ||||
|       "6. We don't need to show the salary in the output, but we'll use it for ordering.\n", | ||||
|       "\n", | ||||
|       "Generated SQL:\n", | ||||
|       "SELECT e.name, e.hire_date\n", | ||||
|       "FROM employees e\n", | ||||
|       "JOIN departments d ON e.department_id = d.id\n", | ||||
|       "WHERE d.name = 'Engineering'\n", | ||||
|       "ORDER BY e.salary;\n", | ||||
|       "\n", | ||||
|       "Query result:\n" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "data": { | ||||
|       "text/html": [ | ||||
|        "<div>\n", | ||||
|        "<style scoped>\n", | ||||
|        "    .dataframe tbody tr th:only-of-type {\n", | ||||
|        "        vertical-align: middle;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe tbody tr th {\n", | ||||
|        "        vertical-align: top;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe thead th {\n", | ||||
|        "        text-align: right;\n", | ||||
|        "    }\n", | ||||
|        "</style>\n", | ||||
|        "<table border=\"1\" class=\"dataframe\">\n", | ||||
|        "  <thead>\n", | ||||
|        "    <tr style=\"text-align: right;\">\n", | ||||
|        "      <th></th>\n", | ||||
|        "      <th>name</th>\n", | ||||
|        "      <th>hire_date</th>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </thead>\n", | ||||
|        "  <tbody>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>0</th>\n", | ||||
|        "      <td>John Doe</td>\n", | ||||
|        "      <td>2020-01-15</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>1</th>\n", | ||||
|        "      <td>Bob Johnson</td>\n", | ||||
|        "      <td>2021-03-10</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>2</th>\n", | ||||
|        "      <td>Charlie Davis</td>\n", | ||||
|        "      <td>2022-07-01</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </tbody>\n", | ||||
|        "</table>\n", | ||||
|        "</div>" | ||||
|       ], | ||||
|       "text/plain": [ | ||||
|        "            name   hire_date\n", | ||||
|        "0       John Doe  2020-01-15\n", | ||||
|        "1    Bob Johnson  2021-03-10\n", | ||||
|        "2  Charlie Davis  2022-07-01" | ||||
|       ] | ||||
|      }, | ||||
|      "metadata": {}, | ||||
|      "output_type": "display_data" | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def generate_sql_with_explanation(prompt):\n", | ||||
|     "    response = client.messages.create(\n", | ||||
| @@ -1066,22 +504,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 110, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "Search results:\n", | ||||
|       "Similarity: 0.7318002364429477, Metadata: {'table': 'employees', 'column': 'salary', 'type': 'REAL'}\n", | ||||
|       "Similarity: 0.728456954795667, Metadata: {'table': 'employees', 'column': 'department_id', 'type': 'INTEGER'}\n", | ||||
|       "Similarity: 0.6810496067975434, Metadata: {'table': 'departments', 'column': 'name', 'type': 'TEXT'}\n", | ||||
|       "Similarity: 0.6697669330753087, Metadata: {'table': 'employees', 'column': 'name', 'type': 'TEXT'}\n", | ||||
|       "Similarity: 0.6666317064533498, Metadata: {'table': 'departments', 'column': 'location', 'type': 'TEXT'}\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "import os\n", | ||||
|     "import numpy as np\n", | ||||
| @@ -1159,140 +584,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 112, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "Generated prompt:\n", | ||||
|       "You are an AI assistant that converts natural language queries into SQL.\n", | ||||
|       "    Given the following relevant columns from the SQL database schema:\n", | ||||
|       "\n", | ||||
|       "    <schema>\n", | ||||
|       "    Table: employees, Column: salary, Type: REAL\n", | ||||
|       "Table: employees, Column: department_id, Type: INTEGER\n", | ||||
|       "Table: departments, Column: name, Type: TEXT\n", | ||||
|       "Table: employees, Column: name, Type: TEXT\n", | ||||
|       "Table: departments, Column: location, Type: TEXT\n", | ||||
|       "Table: employees, Column: id, Type: INTEGER\n", | ||||
|       "Table: departments, Column: id, Type: INTEGER\n", | ||||
|       "Table: employees, Column: age, Type: INTEGER\n", | ||||
|       "Table: employees, Column: hire_date, Type: DATE\n", | ||||
|       "    </schema>\n", | ||||
|       "\n", | ||||
|       "    Convert the following natural language query into SQL:\n", | ||||
|       "    <query>\n", | ||||
|       "    What is the average salary of employees in each department?\n", | ||||
|       "    </query>\n", | ||||
|       "\n", | ||||
|       "    Within <thought_process> tags, explain your thought process for creating the SQL query.\n", | ||||
|       "    Then, within <sql> tags, provide your output SQL query.\n", | ||||
|       "    \n", | ||||
|       "\n", | ||||
|       "Generated result:\n", | ||||
|       "<thought_process>\n", | ||||
|       "To answer this query, we need to:\n", | ||||
|       "1. Join the employees and departments tables to get department information for each employee.\n", | ||||
|       "2. Group the results by department.\n", | ||||
|       "3. Calculate the average salary for each group.\n", | ||||
|       "\n", | ||||
|       "Here's the step-by-step thought process:\n", | ||||
|       "1. We'll use the employees table as our main table since it contains the salary information.\n", | ||||
|       "2. We need to join the departments table to get the department names.\n", | ||||
|       "3. The join will be on employees.department_id = departments.id\n", | ||||
|       "4. We'll group the results by department name (or id, but name is more informative).\n", | ||||
|       "5. We'll use the AVG function to calculate the average salary for each group.\n", | ||||
|       "6. We'll select the department name and the average salary in the SELECT clause.\n", | ||||
|       "</thought_process>\n", | ||||
|       "\n", | ||||
|       "<sql>\n", | ||||
|       "SELECT \n", | ||||
|       "    d.name AS department_name,\n", | ||||
|       "    AVG(e.salary) AS average_salary\n", | ||||
|       "FROM \n", | ||||
|       "    employees e\n", | ||||
|       "JOIN \n", | ||||
|       "    departments d ON e.department_id = d.id\n", | ||||
|       "GROUP BY \n", | ||||
|       "    d.name\n", | ||||
|       "ORDER BY \n", | ||||
|       "    d.name\n", | ||||
|       "</sql>\n", | ||||
|       "\n", | ||||
|       "Extracted SQL:\n", | ||||
|       "SELECT \n", | ||||
|       "    d.name AS department_name,\n", | ||||
|       "    AVG(e.salary) AS average_salary\n", | ||||
|       "FROM \n", | ||||
|       "    employees e\n", | ||||
|       "JOIN \n", | ||||
|       "    departments d ON e.department_id = d.id\n", | ||||
|       "GROUP BY \n", | ||||
|       "    d.name\n", | ||||
|       "ORDER BY \n", | ||||
|       "    d.name\n", | ||||
|       "\n", | ||||
|       "Query result:\n" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "data": { | ||||
|       "text/html": [ | ||||
|        "<div>\n", | ||||
|        "<style scoped>\n", | ||||
|        "    .dataframe tbody tr th:only-of-type {\n", | ||||
|        "        vertical-align: middle;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe tbody tr th {\n", | ||||
|        "        vertical-align: top;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe thead th {\n", | ||||
|        "        text-align: right;\n", | ||||
|        "    }\n", | ||||
|        "</style>\n", | ||||
|        "<table border=\"1\" class=\"dataframe\">\n", | ||||
|        "  <thead>\n", | ||||
|        "    <tr style=\"text-align: right;\">\n", | ||||
|        "      <th></th>\n", | ||||
|        "      <th>department_name</th>\n", | ||||
|        "      <th>average_salary</th>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </thead>\n", | ||||
|        "  <tbody>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>0</th>\n", | ||||
|        "      <td>Engineering</td>\n", | ||||
|        "      <td>80000.0</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>1</th>\n", | ||||
|        "      <td>HR</td>\n", | ||||
|        "      <td>65000.0</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "    <tr>\n", | ||||
|        "      <th>2</th>\n", | ||||
|        "      <td>Marketing</td>\n", | ||||
|        "      <td>70000.0</td>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </tbody>\n", | ||||
|        "</table>\n", | ||||
|        "</div>" | ||||
|       ], | ||||
|       "text/plain": [ | ||||
|        "  department_name  average_salary\n", | ||||
|        "0     Engineering         80000.0\n", | ||||
|        "1              HR         65000.0\n", | ||||
|        "2       Marketing         70000.0" | ||||
|       ] | ||||
|      }, | ||||
|      "metadata": {}, | ||||
|      "output_type": "display_data" | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def generate_rag_prompt(query):\n", | ||||
|     "    relevant_schema = vectordb.search(query, k=10, similarity_threshold=0.3)\n", | ||||
| @@ -1382,84 +676,9 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 125, | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "\n", | ||||
|       "Attempt 1:\n", | ||||
|       "SQL failed to execute\n", | ||||
|       "\n", | ||||
|       "Attempt 2:\n", | ||||
|       "SQL executed successfully!\n", | ||||
|       "\n", | ||||
|       "Final SQL query:\n", | ||||
|       "SELECT *\n", | ||||
|       "FROM (\n", | ||||
|       "    SELECT \n", | ||||
|       "        d.name AS department_name,\n", | ||||
|       "        salary_range.max_salary / salary_range.min_salary AS salary_ratio\n", | ||||
|       "    FROM \n", | ||||
|       "        (SELECT \n", | ||||
|       "            department_id,\n", | ||||
|       "            MAX(salary) AS max_salary,\n", | ||||
|       "            MIN(salary) AS min_salary\n", | ||||
|       "        FROM \n", | ||||
|       "            employees\n", | ||||
|       "        GROUP BY \n", | ||||
|       "            department_id) AS salary_range\n", | ||||
|       "    JOIN \n", | ||||
|       "        departments d ON d.id = salary_range.department_id\n", | ||||
|       ") AS subquery\n", | ||||
|       "WHERE salary_ratio > 3\n", | ||||
|       "ORDER BY salary_ratio DESC;\n", | ||||
|       "\n", | ||||
|       "Query result:\n" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "data": { | ||||
|       "text/html": [ | ||||
|        "<div>\n", | ||||
|        "<style scoped>\n", | ||||
|        "    .dataframe tbody tr th:only-of-type {\n", | ||||
|        "        vertical-align: middle;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe tbody tr th {\n", | ||||
|        "        vertical-align: top;\n", | ||||
|        "    }\n", | ||||
|        "\n", | ||||
|        "    .dataframe thead th {\n", | ||||
|        "        text-align: right;\n", | ||||
|        "    }\n", | ||||
|        "</style>\n", | ||||
|        "<table border=\"1\" class=\"dataframe\">\n", | ||||
|        "  <thead>\n", | ||||
|        "    <tr style=\"text-align: right;\">\n", | ||||
|        "      <th></th>\n", | ||||
|        "      <th>department_name</th>\n", | ||||
|        "      <th>salary_ratio</th>\n", | ||||
|        "    </tr>\n", | ||||
|        "  </thead>\n", | ||||
|        "  <tbody>\n", | ||||
|        "  </tbody>\n", | ||||
|        "</table>\n", | ||||
|        "</div>" | ||||
|       ], | ||||
|       "text/plain": [ | ||||
|        "Empty DataFrame\n", | ||||
|        "Columns: [department_name, salary_ratio]\n", | ||||
|        "Index: []" | ||||
|       ] | ||||
|      }, | ||||
|      "metadata": {}, | ||||
|      "output_type": "display_data" | ||||
|     } | ||||
|    ], | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def execute_sql_with_feedback(sql):\n", | ||||
|     "    try:\n", | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Mahesh Murag
					Mahesh Murag