WIP Text to SQL

2025-10-06 01:00:28 +03:00 · 2024-09-23 22:05:45 +02:00
parent 81ab59f042
commit c982d54567
1 changed files with 26 additions and 807 deletions
--- a/skills/text_to_sql/guide.ipynb
+++ b/skills/text_to_sql/guide.ipynb
@@ -65,17 +65,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Note: you may need to restart the kernel to use updated packages.\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "%pip install -q anthropic pandas voyageai"
   ]
@@ -91,7 +83,6 @@
    "import sqlite3\n",
    "import pandas as pd\n",
    "from IPython.display import display\n",
-    "from textwrap import dedent\n",
    "\n",
    "# Set your Anthropic API key\n",
    "os.environ[\"ANTHROPIC_API_KEY\"] = \"YOUR_ANTHROPIC_API_KEY\"\n",
@@ -114,176 +105,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Departments table:\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>name</th>\n",
-       "      <th>location</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>HR</td>\n",
-       "      <td>New York</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>Engineering</td>\n",
-       "      <td>San Francisco</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>3</td>\n",
-       "      <td>Marketing</td>\n",
-       "      <td>Chicago</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   id         name       location\n",
-       "0   1           HR       New York\n",
-       "1   2  Engineering  San Francisco\n",
-       "2   3    Marketing        Chicago"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Employees table:\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>name</th>\n",
-       "      <th>age</th>\n",
-       "      <th>department_id</th>\n",
-       "      <th>salary</th>\n",
-       "      <th>hire_date</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>John Doe</td>\n",
-       "      <td>30</td>\n",
-       "      <td>2</td>\n",
-       "      <td>75000.0</td>\n",
-       "      <td>2020-01-15</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>Jane Smith</td>\n",
-       "      <td>35</td>\n",
-       "      <td>1</td>\n",
-       "      <td>65000.0</td>\n",
-       "      <td>2019-05-01</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>3</td>\n",
-       "      <td>Bob Johnson</td>\n",
-       "      <td>28</td>\n",
-       "      <td>2</td>\n",
-       "      <td>80000.0</td>\n",
-       "      <td>2021-03-10</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>4</td>\n",
-       "      <td>Alice Brown</td>\n",
-       "      <td>42</td>\n",
-       "      <td>3</td>\n",
-       "      <td>70000.0</td>\n",
-       "      <td>2018-11-20</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>5</td>\n",
-       "      <td>Charlie Davis</td>\n",
-       "      <td>31</td>\n",
-       "      <td>2</td>\n",
-       "      <td>85000.0</td>\n",
-       "      <td>2022-07-01</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   id           name  age  department_id   salary   hire_date\n",
-       "0   1       John Doe   30              2  75000.0  2020-01-15\n",
-       "1   2     Jane Smith   35              1  65000.0  2019-05-01\n",
-       "2   3    Bob Johnson   28              2  80000.0  2021-03-10\n",
-       "3   4    Alice Brown   42              3  70000.0  2018-11-20\n",
-       "4   5  Charlie Davis   31              2  85000.0  2022-07-01"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Create a new SQLite database and tables\n",
    "with sqlite3.connect(DATABASE_PATH) as conn:\n",
@@ -339,28 +163,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Table: departments\n",
-      "  - id (INTEGER)\n",
-      "  - name (TEXT)\n",
-      "  - location (TEXT)\n",
-      "\n",
-      "Table: employees\n",
-      "  - id (INTEGER)\n",
-      "  - name (TEXT)\n",
-      "  - age (INTEGER)\n",
-      "  - department_id (INTEGER)\n",
-      "  - salary (REAL)\n",
-      "  - hire_date (DATE)\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "def get_schema_info(db_path):\n",
    "    conn = sqlite3.connect(db_path)\n",
@@ -398,42 +203,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "        You are an AI assistant that converts natural language queries into SQL.\n",
-      "        Given the following SQL database schema:\n",
-      "\n",
-      "        <schema>\n",
-      "        Table: departments\n",
-      "  - id (INTEGER)\n",
-      "  - name (TEXT)\n",
-      "  - location (TEXT)\n",
-      "\n",
-      "Table: employees\n",
-      "  - id (INTEGER)\n",
-      "  - name (TEXT)\n",
-      "  - age (INTEGER)\n",
-      "  - department_id (INTEGER)\n",
-      "  - salary (REAL)\n",
-      "  - hire_date (DATE)\n",
-      "        </schema>\n",
-      "\n",
-      "        Convert the following natural language query into SQL:\n",
-      "        <query>\n",
-      "        What are the names of all employees in the Engineering department?\n",
-      "        </query>\n",
-      "\n",
-      "        Provide only the SQL query in your response, without preamble or any explanation.\n",
-      "    \n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "def generate_prompt(schema, query):\n",
    "    return f\"\"\"\n",
@@ -467,21 +239,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Generated SQL:\n",
-      "SELECT e.name\n",
-      "FROM employees e\n",
-      "JOIN departments d ON e.department_id = d.id\n",
-      "WHERE d.name = 'Engineering';\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "def generate_sql(prompt):\n",
    "    response = client.messages.create(\n",
@@ -509,68 +269,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Query result:\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>John Doe</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Bob Johnson</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Charlie Davis</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "            name\n",
-       "0       John Doe\n",
-       "1    Bob Johnson\n",
-       "2  Charlie Davis"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "def run_sql(sql):\n",
    "    conn = sqlite3.connect(DATABASE_PATH)\n",
@@ -596,60 +297,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "        You are an AI assistant that converts natural language queries into SQL.\n",
-      "        Given the following SQL database schema:\n",
-      "\n",
-      "        <schema>\n",
-      "        Table: departments\n",
-      "  - id (INTEGER)\n",
-      "  - name (TEXT)\n",
-      "  - location (TEXT)\n",
-      "\n",
-      "Table: employees\n",
-      "  - id (INTEGER)\n",
-      "  - name (TEXT)\n",
-      "  - age (INTEGER)\n",
-      "  - department_id (INTEGER)\n",
-      "  - salary (REAL)\n",
-      "  - hire_date (DATE)\n",
-      "        </schema>\n",
-      "\n",
-      "        Here are some examples of natural language queries and their corresponding SQL:\n",
-      "\n",
-      "        <examples>\n",
-      "        \n",
-      "        Example 1:\n",
-      "        <query>List all employees in the HR department.</<query>\n",
-      "        <output>SELECT e.name FROM employees e JOIN departments d ON e.department_id = d.id WHERE d.name = 'HR';</output>\n",
-      "\n",
-      "        Example 2:\n",
-      "        User: What is the average salary of employees in the Engineering department?\n",
-      "        SQL: SELECT AVG(e.salary) FROM employees e JOIN departments d ON e.department_id = d.id WHERE d.name = 'Engineering';\n",
-      "\n",
-      "        Example 3:\n",
-      "        User: Who is the oldest employee?\n",
-      "        SQL: SELECT name, age FROM employees ORDER BY age DESC LIMIT 1;\n",
-      "    \n",
-      "        </examples>\n",
-      "\n",
-      "        Now, convert the following natural language query into SQL:\n",
-      "        <query>\n",
-      "        What are the names and salaries of employees in the Marketing department?\n",
-      "        </query>\n",
-      "\n",
-      "        Provide only the SQL query in your response, without preamble or any explanation.\n",
-      "    \n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "def generate_prompt_with_examples(schema, query):\n",
    "    examples = \"\"\"\n",
@@ -703,66 +353,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Generated SQL:\n",
-      "SELECT e.name, e.salary\n",
-      "FROM employees e\n",
-      "JOIN departments d ON e.department_id = d.id\n",
-      "WHERE d.name = 'Marketing';\n",
-      "\n",
-      "Query result:\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name</th>\n",
-       "      <th>salary</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Alice Brown</td>\n",
-       "      <td>70000.0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "          name   salary\n",
-       "0  Alice Brown  70000.0"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Generate SQL using the improved prompt\n",
    "sql = generate_sql(prompt)\n",
@@ -797,68 +390,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "You are an AI assistant that converts natural language queries into SQL.\n",
-      "    Given the following SQL database schema:\n",
-      "\n",
-      "    <schema>\n",
-      "    Table: departments\n",
-      "  - id (INTEGER)\n",
-      "  - name (TEXT)\n",
-      "  - location (TEXT)\n",
-      "\n",
-      "Table: employees\n",
-      "  - id (INTEGER)\n",
-      "  - name (TEXT)\n",
-      "  - age (INTEGER)\n",
-      "  - department_id (INTEGER)\n",
-      "  - salary (REAL)\n",
-      "  - hire_date (DATE)\n",
-      "    </schema>\n",
-      "\n",
-      "    Here are some examples of natural language queries, thought processes, and their corresponding SQL:\n",
-      "\n",
-      "    <examples>\n",
-      "    \n",
-      "    <example>\n",
-      "    <query>List all employees in the HR department.</query>\n",
-      "    <thought_process>\n",
-      "    1. We need to join the employees and departments tables.\n",
-      "    2. We'll match employees.department_id with departments.id.\n",
-      "    3. We'll filter for the HR department.\n",
-      "    4. We only need to return the employee names.\n",
-      "    </thought_process>\n",
-      "    <sql>SELECT e.name FROM employees e JOIN departments d ON e.department_id = d.id WHERE d.name = 'HR';</sql>\n",
-      "    </example>\n",
-      "\n",
-      "    <example>\n",
-      "    <query>What is the average salary of employees hired in 2022?</query>\n",
-      "    <thought_process>\n",
-      "    1. We need to work with the employees table.\n",
-      "    2. We need to filter for employees hired in 2022.\n",
-      "    3. We'll use the YEAR function to extract the year from the hire_date.\n",
-      "    4. We'll calculate the average of the salary column for the filtered rows.\n",
-      "    </thought_process>\n",
-      "    <sql>SELECT AVG(salary) FROM employees WHERE YEAR(hire_date) = 2022;</sql>\n",
-      "    </example>\n",
-      "    \n",
-      "    </examples>\n",
-      "\n",
-      "    Now, convert the following natural language query into SQL:\n",
-      "    What are the names and hire dates of employees in the Engineering department, ordered by their salary?\n",
-      "\n",
-      "    Within <thought_process> tags, explain your thought process for creating the SQL query.\n",
-      "    Then, within <sql> tags, provide your output SQL query.\n",
-      "    \n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "def generate_cot_prompt(schema, query):\n",
    "    examples = \"\"\"\n",
@@ -920,105 +454,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Raw response from Claude:\n",
-      "<thought_process>\n",
-      "1. We need to join the employees and departments tables to get the department information.\n",
-      "2. We'll match employees.department_id with departments.id.\n",
-      "3. We need to filter for the Engineering department.\n",
-      "4. We need to select the names and hire dates of the employees.\n",
-      "5. We need to order the results by the employees' salaries.\n",
-      "6. We don't need to show the salary in the output, but we'll use it for ordering.\n",
-      "</thought_process>\n",
-      "\n",
-      "<sql>\n",
-      "SELECT e.name, e.hire_date\n",
-      "FROM employees e\n",
-      "JOIN departments d ON e.department_id = d.id\n",
-      "WHERE d.name = 'Engineering'\n",
-      "ORDER BY e.salary;\n",
-      "</sql>\n",
-      "\n",
-      "Thought Process:\n",
-      "1. We need to join the employees and departments tables to get the department information.\n",
-      "2. We'll match employees.department_id with departments.id.\n",
-      "3. We need to filter for the Engineering department.\n",
-      "4. We need to select the names and hire dates of the employees.\n",
-      "5. We need to order the results by the employees' salaries.\n",
-      "6. We don't need to show the salary in the output, but we'll use it for ordering.\n",
-      "\n",
-      "Generated SQL:\n",
-      "SELECT e.name, e.hire_date\n",
-      "FROM employees e\n",
-      "JOIN departments d ON e.department_id = d.id\n",
-      "WHERE d.name = 'Engineering'\n",
-      "ORDER BY e.salary;\n",
-      "\n",
-      "Query result:\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name</th>\n",
-       "      <th>hire_date</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>John Doe</td>\n",
-       "      <td>2020-01-15</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Bob Johnson</td>\n",
-       "      <td>2021-03-10</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Charlie Davis</td>\n",
-       "      <td>2022-07-01</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "            name   hire_date\n",
-       "0       John Doe  2020-01-15\n",
-       "1    Bob Johnson  2021-03-10\n",
-       "2  Charlie Davis  2022-07-01"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "def generate_sql_with_explanation(prompt):\n",
    "    response = client.messages.create(\n",
@@ -1066,22 +504,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Search results:\n",
-      "Similarity: 0.7318002364429477, Metadata: {'table': 'employees', 'column': 'salary', 'type': 'REAL'}\n",
-      "Similarity: 0.728456954795667, Metadata: {'table': 'employees', 'column': 'department_id', 'type': 'INTEGER'}\n",
-      "Similarity: 0.6810496067975434, Metadata: {'table': 'departments', 'column': 'name', 'type': 'TEXT'}\n",
-      "Similarity: 0.6697669330753087, Metadata: {'table': 'employees', 'column': 'name', 'type': 'TEXT'}\n",
-      "Similarity: 0.6666317064533498, Metadata: {'table': 'departments', 'column': 'location', 'type': 'TEXT'}\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
@@ -1159,140 +584,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Generated prompt:\n",
-      "You are an AI assistant that converts natural language queries into SQL.\n",
-      "    Given the following relevant columns from the SQL database schema:\n",
-      "\n",
-      "    <schema>\n",
-      "    Table: employees, Column: salary, Type: REAL\n",
-      "Table: employees, Column: department_id, Type: INTEGER\n",
-      "Table: departments, Column: name, Type: TEXT\n",
-      "Table: employees, Column: name, Type: TEXT\n",
-      "Table: departments, Column: location, Type: TEXT\n",
-      "Table: employees, Column: id, Type: INTEGER\n",
-      "Table: departments, Column: id, Type: INTEGER\n",
-      "Table: employees, Column: age, Type: INTEGER\n",
-      "Table: employees, Column: hire_date, Type: DATE\n",
-      "    </schema>\n",
-      "\n",
-      "    Convert the following natural language query into SQL:\n",
-      "    <query>\n",
-      "    What is the average salary of employees in each department?\n",
-      "    </query>\n",
-      "\n",
-      "    Within <thought_process> tags, explain your thought process for creating the SQL query.\n",
-      "    Then, within <sql> tags, provide your output SQL query.\n",
-      "    \n",
-      "\n",
-      "Generated result:\n",
-      "<thought_process>\n",
-      "To answer this query, we need to:\n",
-      "1. Join the employees and departments tables to get department information for each employee.\n",
-      "2. Group the results by department.\n",
-      "3. Calculate the average salary for each group.\n",
-      "\n",
-      "Here's the step-by-step thought process:\n",
-      "1. We'll use the employees table as our main table since it contains the salary information.\n",
-      "2. We need to join the departments table to get the department names.\n",
-      "3. The join will be on employees.department_id = departments.id\n",
-      "4. We'll group the results by department name (or id, but name is more informative).\n",
-      "5. We'll use the AVG function to calculate the average salary for each group.\n",
-      "6. We'll select the department name and the average salary in the SELECT clause.\n",
-      "</thought_process>\n",
-      "\n",
-      "<sql>\n",
-      "SELECT \n",
-      "    d.name AS department_name,\n",
-      "    AVG(e.salary) AS average_salary\n",
-      "FROM \n",
-      "    employees e\n",
-      "JOIN \n",
-      "    departments d ON e.department_id = d.id\n",
-      "GROUP BY \n",
-      "    d.name\n",
-      "ORDER BY \n",
-      "    d.name\n",
-      "</sql>\n",
-      "\n",
-      "Extracted SQL:\n",
-      "SELECT \n",
-      "    d.name AS department_name,\n",
-      "    AVG(e.salary) AS average_salary\n",
-      "FROM \n",
-      "    employees e\n",
-      "JOIN \n",
-      "    departments d ON e.department_id = d.id\n",
-      "GROUP BY \n",
-      "    d.name\n",
-      "ORDER BY \n",
-      "    d.name\n",
-      "\n",
-      "Query result:\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>department_name</th>\n",
-       "      <th>average_salary</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Engineering</td>\n",
-       "      <td>80000.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>HR</td>\n",
-       "      <td>65000.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Marketing</td>\n",
-       "      <td>70000.0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "  department_name  average_salary\n",
-       "0     Engineering         80000.0\n",
-       "1              HR         65000.0\n",
-       "2       Marketing         70000.0"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "def generate_rag_prompt(query):\n",
    "    relevant_schema = vectordb.search(query, k=10, similarity_threshold=0.3)\n",
@@ -1382,84 +676,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 125,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Attempt 1:\n",
-      "SQL failed to execute\n",
-      "\n",
-      "Attempt 2:\n",
-      "SQL executed successfully!\n",
-      "\n",
-      "Final SQL query:\n",
-      "SELECT *\n",
-      "FROM (\n",
-      "    SELECT \n",
-      "        d.name AS department_name,\n",
-      "        salary_range.max_salary / salary_range.min_salary AS salary_ratio\n",
-      "    FROM \n",
-      "        (SELECT \n",
-      "            department_id,\n",
-      "            MAX(salary) AS max_salary,\n",
-      "            MIN(salary) AS min_salary\n",
-      "        FROM \n",
-      "            employees\n",
-      "        GROUP BY \n",
-      "            department_id) AS salary_range\n",
-      "    JOIN \n",
-      "        departments d ON d.id = salary_range.department_id\n",
-      ") AS subquery\n",
-      "WHERE salary_ratio > 3\n",
-      "ORDER BY salary_ratio DESC;\n",
-      "\n",
-      "Query result:\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>department_name</th>\n",
-       "      <th>salary_ratio</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "Empty DataFrame\n",
-       "Columns: [department_name, salary_ratio]\n",
-       "Index: []"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "def execute_sql_with_feedback(sql):\n",
    "    try:\n",