reasoning-gym/notebooks/verify_gsm_symbolic.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create open-router client, place your OPENROUTER_API_KEY in .env file\n",
    "# .env contents:\n",
    "# OPENROUTER_API_KEY=sk-or-v1- ...\n",
    "\n",
    "%load_ext dotenv\n",
    "%dotenv\n",
    "import os\n",
    "import re\n",
    "from random import Random\n",
    "from pathlib import Path\n",
    "from typing import Any, Iterable, Optional\n",
    "import json\n",
    "from openai import OpenAI\n",
    "from openai.types.chat import ChatCompletion, ChatCompletionMessageParam\n",
    "import time\n",
    "import reasoning_gym\n",
    "\n",
    "\n",
    "def llm_generate(\n",
    "    client: OpenAI,\n",
    "    messages: Iterable[ChatCompletionMessageParam],\n",
    "    sampling_params: dict[str, Any],\n",
    ") -> ChatCompletion:\n",
    "    max_retry = 3\n",
    "    for trial in range(max_retry):\n",
    "        try:\n",
    "            return client.chat.completions.create(\n",
    "                messages=messages,\n",
    "                **sampling_params,\n",
    "            )\n",
    "        except Exception as e:\n",
    "            print(\"failure response:\", e)\n",
    "            time.sleep(trial * trial)  # quadratic backoff\n",
    "            if trial == max_retry - 1:\n",
    "                raise\n",
    "\n",
    "def generate_simple_request(user_prompt: str, developer_prompt: Optional[str] = None) -> list[dict]:\n",
    "    prompt = []\n",
    "    if developer_prompt is not None:\n",
    "        prompt.append( { \"role\": \"system\", \"content\": developer_prompt } )\n",
    "    \n",
    "    prompt.append( { \"role\": \"user\", \"content\": user_prompt })\n",
    "    return prompt\n",
    "\n",
    "# OpenAI client instance pointed at the OpenRouter base URL.\n",
    "# The key is read from the OPENROUTER_API_KEY environment variable (see the .env note at the top of this cell).\n",
    "open_router_client = OpenAI(\n",
    "    base_url=\"https://openrouter.ai/api/v1\",\n",
    "    api_key=os.getenv(\"OPENROUTER_API_KEY\"),\n",
    "    timeout=90.0,  # request timeout; presumably seconds - TODO confirm\n",
    ")\n",
    "\n",
    "# Keyword arguments handed to llm_generate, which expands them into the chat.completions.create call.\n",
    "sampling_params = {\n",
    "    \"model\": \"anthropic/claude-3.5-sonnet\",\n",
    "    \"max_tokens\": 4096,\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "48"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of entries in reasoning_gym.factory.DATASETS.\n",
    "len(reasoning_gym.factory.DATASETS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test all gsm_symbolic generators\n",
    "import reasoning_gym.arithmetic.gsm_symbolic\n",
    "# Build the \"gsm_symbolic\" dataset via reasoning_gym.create_dataset.\n",
    "x = reasoning_gym.create_dataset(\"gsm_symbolic\")\n",
    "\n",
    "# Module-level handle on the dataset's generators; cross_check_generator indexes into it by position.\n",
    "generators = x.generators"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import reasoning_gym.utils\n",
    "\n",
    "# NOTE(review): this module-level value does not appear to be read by the functions in the visible cells,\n",
    "# which take their own `difficulty` parameter - confirm whether it is still needed.\n",
    "difficulty = 1.0\n",
    "\n",
    "# User prompt sent to the LLM; query_llm fills in {question} and then extracts the contents of the answer tag.\n",
    "prompt_template = \"Solve the following math task and return the answer (just the number) in <answer></answer> tags:\\n\\n{question}\"\n",
    "\n",
    "def query_llm(x: dict) -> tuple[Optional[str], Any, Optional[str]]:\n",
    "    \"\"\"Ask the LLM to solve one generated task and pull out its tagged answer.\n",
    "\n",
    "    Args:\n",
    "        x: Generated sample; its \"question\" and \"answer\" entries are read.\n",
    "\n",
    "    Returns:\n",
    "        A tuple ``(answer, ground_truth, full_answer)``: the stripped text extracted\n",
    "        from the reply's answer tag (None when nothing could be extracted), the\n",
    "        sample's \"answer\" entry, and the unmodified reply text.\n",
    "    \"\"\"\n",
    "    q = x[\"question\"]\n",
    "    ground_truth = x[\"answer\"]\n",
    "    user_prompt = prompt_template.format(question=q)\n",
    "    msgs = generate_simple_request(user_prompt)\n",
    "    output = llm_generate(client=open_router_client, messages=msgs, sampling_params=sampling_params)\n",
    "    full_answer = output.choices[0].message.content\n",
    "    answer = None\n",
    "    # The reply text can be missing, so only run the extraction when there is something to parse.\n",
    "    if full_answer is not None:\n",
    "        answer = reasoning_gym.utils.extract_answer(completion=full_answer, tag_name=\"answer\")\n",
    "    # NOTE(review): extract_answer is assumed to return None when no answer tag is found - confirm.\n",
    "    # Guarding here reports a missing answer as a plain mismatch instead of an AttributeError.\n",
    "    if answer is not None:\n",
    "        answer = answer.strip()\n",
    "    return answer, ground_truth, full_answer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cross_check_generator(rng: Random, index: int, difficulty: float = 1.0, num_generations = 3, verbose: bool = False) -> int:\n",
    "    \"\"\"Compare LLM answers with the ground truth for samples drawn from one generator.\n",
    "\n",
    "    One line is printed per sample (plus the question and the full reply when\n",
    "    ``verbose`` is set). An error raised while producing or querying a single\n",
    "    sample is printed and that sample is skipped.\n",
    "\n",
    "    Returns:\n",
    "        The number of samples whose LLM answer equals the ground truth, or -1\n",
    "        when the generator could not be set up.\n",
    "    \"\"\"\n",
    "    # Setup failures (e.g. a bad index) are reported once and abort the whole check.\n",
    "    try:\n",
    "        generator = generators[index]\n",
    "        attempts = range(num_generations)\n",
    "    except Exception as ex:\n",
    "        print(f\"[{index}] generator failure: {ex}\")\n",
    "        return -1\n",
    "\n",
    "    matches = 0\n",
    "    for j in attempts:\n",
    "        try:\n",
    "            sample = generator(rng, difficulty=difficulty)\n",
    "            llm_answer, ground_truth, full_answer = query_llm(sample)\n",
    "            is_match = llm_answer == ground_truth\n",
    "\n",
    "            print(f\"[{index}.{j}], llm={llm_answer}, ground_truth={ground_truth}, match={is_match}\")\n",
    "            if verbose:\n",
    "                print(sample[\"question\"])\n",
    "                print(full_answer)\n",
    "            if is_match:\n",
    "                matches += 1\n",
    "        except Exception as ex:\n",
    "            # A failing sample must not stop the remaining ones.\n",
    "            print(f\"[{index}.{j}] error: {ex}\")\n",
    "    return matches\n",
    "\n",
    "def cross_check_generators(rng: Random, difficulty: float = 1.0, num_generations = 3):\n",
    "    \"\"\"Run cross_check_generator for every generator index, in order.\n",
    "\n",
    "    Returns a list with one entry per generator holding that call's result.\n",
    "    \"\"\"\n",
    "    return [\n",
    "        cross_check_generator(rng, index=i, difficulty=difficulty, num_generations=num_generations)\n",
    "        for i in range(len(generators))\n",
    "    ]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[35.0] error: Could not find valid time_per_room\n",
      "[35.1], llm=30.196, ground_truth=30, match=False\n",
      "A cleaner has to clean a office building with 21 floors. They have 3 days to get it done. It takes them 44 minutes per floor. If they work 17 hour day, what percentage of their day, on average, is spent cleaning floors?\n",
      "Let me solve this step by step:\n",
      "\n",
      "1. Total floors to clean: 21\n",
      "2. Time per floor: 44 minutes\n",
      "3. Total time needed: 21 × 44 = 924 minutes\n",
      "4. Working hours per day: 17 hours = 17 × 60 = 1020 minutes\n",
      "5. Days available: 3\n",
      "6. Time needed per day: 924 ÷ 3 = 308 minutes\n",
      "7. Percentage of day spent cleaning: (308 ÷ 1020) × 100 = 30.196%\n",
      "\n",
      "<answer>30.196</answer>\n",
      "[35.2], llm=34.23, ground_truth=34, match=False\n",
      "A cleaner has to clean a hospital with 12 floors. They have 4 days to get it done. It takes them 89 minutes per floor. If they work 13 hour day, what percentage of their day, on average, is spent cleaning floors?\n",
      "Let me solve this step by step:\n",
      "\n",
      "1. Total floors to clean = 12\n",
      "2. Total time for all floors = 12 × 89 minutes = 1,068 minutes\n",
      "3. Days available = 4\n",
      "4. Time needed per day = 1,068 ÷ 4 = 267 minutes\n",
      "5. Hours per day working = 13\n",
      "6. Minutes in work day = 13 × 60 = 780 minutes\n",
      "7. Percentage calculation = (267 ÷ 780) × 100 = 34.23%\n",
      "\n",
      "<answer>34.23</answer>\n",
      "[35.3], llm=61, ground_truth=61, match=True\n",
      "A cleaner has to clean a university with 20 floors. They have 10 days to get it done. It takes them 202 minutes per floor. If they work 11 hour day, what percentage of their day, on average, is spent cleaning floors?\n",
      "Let me solve this step by step:\n",
      "\n",
      "1. First, let's calculate total time needed to clean all floors:\n",
      "   * 202 minutes × 20 floors = 4040 minutes total\n",
      "\n",
      "2. They have 10 days to do it, so per day:\n",
      "   * 4040 ÷ 10 = 404 minutes per day cleaning\n",
      "\n",
      "3. 11 hour workday in minutes:\n",
      "   * 11 × 60 = 660 minutes per day working\n",
      "\n",
      "4. Calculate percentage:\n",
      "   * (404 ÷ 660) × 100 = 61.21212121...%\n",
      "\n",
      "<answer>61</answer>\n",
      "[35.4], llm=61.83, ground_truth=61, match=False\n",
      "A cleaner has to clean a office building with 28 floors. They have 4 days to get it done. It takes them 53 minutes per floor. If they work 10 hour day, what percentage of their day, on average, is spent cleaning floors?\n",
      "Let me solve this step by step:\n",
      "\n",
      "1. Time per floor = 53 minutes\n",
      "2. Total floors = 28\n",
      "3. Total minutes needed = 53 × 28 = 1,484 minutes\n",
      "4. Days available = 4\n",
      "5. Minutes needed per day = 1,484 ÷ 4 = 371 minutes\n",
      "6. Hours per day working = 10\n",
      "7. Minutes in work day = 10 × 60 = 600 minutes\n",
      "8. Percentage = (371 ÷ 600) × 100 = 61.833...%\n",
      "\n",
      "<answer>61.83</answer>\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Seeded RNG handed to the generator calls below.\n",
    "rng = Random(200)\n",
    "# Generator indices singled out for re-checking; only referenced by the commented-out loop below.\n",
    "re_check = [2,11,21,27,32,35,37]\n",
    "\n",
    "# for i in re_check:\n",
    "#     cross_check_generator(rng, index=i, difficulty=1.0, num_generations=3)\n",
    "# 11 not ok\n",
    "\n",
    "# Re-run a single generator (index 35) with verbose output to inspect the questions and full LLM replies.\n",
    "cross_check_generator(rng, index=35, difficulty=1.0, num_generations=5, verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.0], llm=43, ground_truth=43, match=True\n",
      "[0.1], llm=104, ground_truth=104, match=True\n",
      "[0.2], llm=21, ground_truth=21, match=True\n",
      "[0.3], llm=300, ground_truth=300, match=True\n",
      "[0.4], llm=76, ground_truth=76, match=True\n",
      "[1.0], llm=59, ground_truth=59, match=True\n",
      "[1.1], llm=61, ground_truth=61, match=True\n",
      "[1.2], llm=42, ground_truth=42, match=True\n",
      "[1.3], llm=76, ground_truth=76, match=True\n",
      "[1.4], llm=80, ground_truth=80, match=True\n",
      "[2.0], llm=84, ground_truth=84, match=True\n",
      "[2.1], llm=91, ground_truth=91, match=True\n",
      "[2.2], llm=79, ground_truth=79, match=True\n",
      "[2.3], llm=60, ground_truth=60, match=True\n",
      "[2.4], llm=72, ground_truth=72, match=True\n",
      "[3.0], llm=110, ground_truth=110, match=True\n",
      "[3.1], llm=8, ground_truth=8, match=True\n",
      "[3.2], llm=33, ground_truth=33, match=True\n",
      "[3.3], llm=7, ground_truth=7, match=True\n",
      "[3.4], llm=24, ground_truth=24, match=True\n",
      "[4.0], llm=15, ground_truth=15, match=True\n",
      "[4.1], llm=15, ground_truth=15, match=True\n",
      "[4.2], llm=4, ground_truth=4, match=True\n",
      "[4.3], llm=9, ground_truth=9, match=True\n",
      "[4.4], llm=3, ground_truth=3, match=True\n",
      "[5.0], llm=20, ground_truth=20, match=True\n",
      "[5.1], llm=43, ground_truth=43, match=True\n",
      "[5.2], llm=10, ground_truth=10, match=True\n",
      "[5.3], llm=63, ground_truth=63, match=True\n",
      "[5.4], llm=58, ground_truth=58, match=True\n",
      "[6.0], llm=120, ground_truth=120, match=True\n",
      "[6.1], llm=124, ground_truth=124, match=True\n",
      "[6.2], llm=24, ground_truth=24, match=True\n",
      "[6.3], llm=55, ground_truth=55, match=True\n",
      "[6.4], llm=92, ground_truth=92, match=True\n",
      "[7.0], llm=527, ground_truth=527, match=True\n",
      "[7.1], llm=515, ground_truth=515, match=True\n",
      "[7.2], llm=401, ground_truth=401, match=True\n",
      "[7.3], llm=44, ground_truth=44, match=True\n",
      "[7.4], llm=218, ground_truth=218, match=True\n",
      "[8.0], llm=1014, ground_truth=1014, match=True\n",
      "[8.1], llm=1010, ground_truth=1010, match=True\n",
      "[8.2], llm=300, ground_truth=300, match=True\n",
      "[8.3], llm=540, ground_truth=540, match=True\n",
      "[8.4], llm=864, ground_truth=864, match=True\n",
      "[9.0], llm=40, ground_truth=40, match=True\n",
      "[9.1], llm=0, ground_truth=0, match=True\n",
      "[9.2], llm=30, ground_truth=30, match=True\n",
      "[9.3], llm=80, ground_truth=80, match=True\n",
      "[9.4], llm=4, ground_truth=4, match=True\n",
      "[10.0], llm=45, ground_truth=45, match=True\n",
      "[10.1], llm=44, ground_truth=44, match=True\n",
      "[10.2], llm=24, ground_truth=24, match=True\n",
      "[10.3], llm=67, ground_truth=67, match=True\n",
      "[10.4], llm=90, ground_truth=90, match=True\n",
      "[11.0], llm=330, ground_truth=330, match=True\n",
      "[11.1], llm=386, ground_truth=386, match=True\n",
      "[11.2], llm=390, ground_truth=390, match=True\n",
      "[11.3], llm=386, ground_truth=386, match=True\n",
      "[11.4], llm=231, ground_truth=231, match=True\n",
      "[12.0], llm=351, ground_truth=351, match=True\n",
      "[12.1], llm=269, ground_truth=269, match=True\n",
      "[12.2], llm=286, ground_truth=286, match=True\n",
      "[12.3], llm=72, ground_truth=72, match=True\n",
      "[12.4], llm=368, ground_truth=368, match=True\n",
      "[13.0], llm=2, ground_truth=2, match=True\n",
      "[13.1], llm=2, ground_truth=2, match=True\n",
      "[13.2], llm=2, ground_truth=2, match=True\n",
      "[13.3], llm=2, ground_truth=2, match=True\n",
      "[13.4], llm=2, ground_truth=2, match=True\n",
      "[14.0], llm=128, ground_truth=128, match=True\n",
      "[14.1], llm=132, ground_truth=132, match=True\n",
      "[14.2], llm=120, ground_truth=120, match=True\n",
      "[14.3], llm=188, ground_truth=188, match=True\n",
      "[14.4], llm=168, ground_truth=168, match=True\n",
      "[15.0], llm=67, ground_truth=67, match=True\n",
      "[15.1], llm=89, ground_truth=89, match=True\n",
      "[15.2], llm=90, ground_truth=90, match=True\n",
      "[15.3], llm=67, ground_truth=67, match=True\n",
      "[15.4], llm=96, ground_truth=96, match=True\n",
      "[16.0], llm=51, ground_truth=51, match=True\n",
      "[16.1], llm=57, ground_truth=57, match=True\n",
      "[16.2], llm=32, ground_truth=32, match=True\n",
      "[16.3], llm=38, ground_truth=38, match=True\n",
      "[16.4], llm=32, ground_truth=32, match=True\n",
      "[17.0], llm=280, ground_truth=280, match=True\n",
      "[17.1], llm=210, ground_truth=210, match=True\n",
      "[17.2], llm=770, ground_truth=770, match=True\n",
      "[17.3], llm=190, ground_truth=190, match=True\n",
      "[17.4], llm=1060, ground_truth=1060, match=True\n",
      "[18.0], llm=775, ground_truth=775, match=True\n",
      "[18.1], llm=484, ground_truth=484, match=True\n",
      "[18.2], llm=359, ground_truth=359, match=True\n",
      "[18.3], llm=697, ground_truth=697, match=True\n",
      "[18.4], llm=740, ground_truth=740, match=True\n",
      "[19.0], llm=885, ground_truth=885, match=True\n",
      "[19.1], llm=950, ground_truth=950, match=True\n",
      "[19.2], llm=695, ground_truth=695, match=True\n",
      "[19.3], llm=1530, ground_truth=1530, match=True\n",
      "[19.4], llm=475, ground_truth=475, match=True\n",
      "[20.0], llm=4, ground_truth=4, match=True\n",
      "[20.1], llm=18, ground_truth=18, match=True\n",
      "[20.2], llm=1, ground_truth=1, match=True\n",
      "[20.3], llm=3, ground_truth=3, match=True\n",
      "[20.4], llm=3, ground_truth=3, match=True\n",
      "[21.0], llm=630, ground_truth=630, match=True\n",
      "[21.1], llm=525, ground_truth=525, match=True\n",
      "[21.2], llm=504, ground_truth=504, match=True\n",
      "[21.3], llm=350, ground_truth=350, match=True\n",
      "[21.4], llm=475, ground_truth=475, match=True\n",
      "[22.0], llm=19500, ground_truth=19500, match=True\n",
      "[22.1], llm=20800, ground_truth=20800, match=True\n",
      "[22.2], llm=69800, ground_truth=69800, match=True\n",
      "[22.3], llm=67400, ground_truth=67400, match=True\n",
      "[22.4], llm=33100, ground_truth=33100, match=True\n",
      "[23.0], llm=305, ground_truth=305, match=True\n",
      "[23.1], llm=206, ground_truth=206, match=True\n",
      "[23.2], llm=99, ground_truth=99, match=True\n",
      "[23.3], llm=389, ground_truth=389, match=True\n",
      "[23.4], llm=86, ground_truth=86, match=True\n",
      "[24.0], llm=20, ground_truth=20, match=True\n",
      "[24.1], llm=3, ground_truth=3, match=True\n",
      "[24.2], llm=41, ground_truth=41, match=True\n",
      "[24.3], llm=1, ground_truth=1, match=True\n",
      "[24.4], llm=3, ground_truth=3, match=True\n",
      "[25.0], llm=36, ground_truth=36, match=True\n",
      "[25.1], llm=42, ground_truth=42, match=True\n",
      "[25.2], llm=54, ground_truth=54, match=True\n",
      "[25.3], llm=28, ground_truth=28, match=True\n",
      "[25.4], llm=36, ground_truth=36, match=True\n",
      "[26.0], llm=2, ground_truth=2, match=True\n",
      "[26.1], llm=9, ground_truth=9, match=True\n",
      "[26.2], llm=2, ground_truth=2, match=True\n",
      "[26.3], llm=8, ground_truth=8, match=True\n",
      "[26.4], llm=6, ground_truth=6, match=True\n",
      "[27.0], llm=2916, ground_truth=2916, match=True\n",
      "[27.1], llm=3510, ground_truth=3510, match=True\n",
      "[27.2], llm=990, ground_truth=990, match=True\n",
      "[27.3], llm=3150, ground_truth=3150, match=True\n",
      "[27.4], llm=6063.75, ground_truth=6063.75, match=True\n",
      "[28.0], llm=570, ground_truth=570, match=True\n",
      "[28.1], llm=610, ground_truth=610, match=True\n",
      "[28.2], llm=382, ground_truth=382, match=True\n",
      "[28.3], llm=257, ground_truth=257, match=True\n",
      "[28.4], llm=467, ground_truth=467, match=True\n",
      "[29.0], llm=20, ground_truth=20, match=True\n",
      "[29.1], llm=20, ground_truth=20, match=True\n",
      "[29.2], llm=25, ground_truth=25, match=True\n",
      "[29.3], llm=20, ground_truth=20, match=True\n",
      "[29.4], llm=20, ground_truth=20, match=True\n",
      "[30.0], llm=17, ground_truth=17, match=True\n",
      "[30.1], llm=26, ground_truth=26, match=True\n",
      "[30.2], llm=93, ground_truth=93, match=True\n",
      "[30.3], llm=81, ground_truth=81, match=True\n",
      "[30.4], llm=26, ground_truth=26, match=True\n",
      "[31.0], llm=24, ground_truth=24, match=True\n",
      "[31.1], llm=26, ground_truth=26, match=True\n",
      "[31.2], llm=32, ground_truth=32, match=True\n",
      "[31.3], llm=30, ground_truth=30, match=True\n",
      "[31.4], llm=22, ground_truth=22, match=True\n",
      "[32.0], llm=52.5, ground_truth=63, match=False\n",
      "[32.1], llm=27, ground_truth=27, match=True\n",
      "[32.2], llm=60, ground_truth=100, match=False\n",
      "[32.3], llm=42, ground_truth=84, match=False\n",
      "[32.4], llm=30, ground_truth=30, match=True\n",
      "[33.0], llm=1715, ground_truth=1715, match=True\n",
      "[33.1], llm=1568, ground_truth=1568, match=True\n",
      "[33.2], llm=1568, ground_truth=1568, match=True\n",
      "[33.3], llm=1960, ground_truth=1960, match=True\n",
      "[33.4], llm=1029, ground_truth=1029, match=True\n",
      "[34.0], llm=1, ground_truth=1, match=True\n",
      "[34.1], llm=78, ground_truth=78, match=True\n",
      "[34.2], llm=4, ground_truth=4, match=True\n",
      "[34.3], llm=25, ground_truth=25, match=True\n",
      "[34.4], llm=151, ground_truth=151, match=True\n",
      "[35.0], llm=60, ground_truth=60, match=True\n",
      "[35.1], llm=51.76, ground_truth=51, match=False\n",
      "[35.2], llm=37, ground_truth=37, match=True\n",
      "[35.3], llm=23.33, ground_truth=23, match=False\n",
      "[35.4], llm=43.11, ground_truth=43, match=False\n",
      "[36.0], llm=75, ground_truth=75, match=True\n",
      "[36.1], llm=90, ground_truth=90, match=True\n",
      "[36.2], llm=27, ground_truth=27, match=True\n",
      "[36.3], llm=63, ground_truth=63, match=True\n",
      "[36.4], llm=34, ground_truth=34, match=True\n",
      "[37.0], llm=38454, ground_truth=38454, match=True\n",
      "[37.1], llm=30856, ground_truth=30856, match=True\n",
      "[37.2], llm=10962, ground_truth=10710, match=False\n",
      "[37.3], llm=15590.4, ground_truth=15232, match=False\n",
      "[37.4], llm=16224, ground_truth=16224, match=True\n",
      "[38.0], llm=159, ground_truth=159, match=True\n",
      "[38.1], llm=284, ground_truth=284, match=True\n",
      "[38.2], llm=325, ground_truth=325, match=True\n",
      "[38.3], llm=126, ground_truth=126, match=True\n",
      "[38.4], llm=285, ground_truth=285, match=True\n",
      "[39.0], llm=54, ground_truth=54, match=True\n",
      "[39.1], llm=25, ground_truth=25, match=True\n",
      "[39.2], llm=23, ground_truth=23, match=True\n",
      "[39.3], llm=52, ground_truth=52, match=True\n",
      "[39.4], llm=53, ground_truth=53, match=True\n",
      "[40.0], llm=96, ground_truth=96, match=True\n",
      "[40.1], llm=184, ground_truth=184, match=True\n",
      "[40.2], llm=134, ground_truth=134, match=True\n",
      "[40.3], llm=190, ground_truth=190, match=True\n",
      "[40.4], llm=320, ground_truth=320, match=True\n",
      "[41.0], llm=230, ground_truth=230, match=True\n",
      "[41.1], llm=165, ground_truth=165, match=True\n",
      "[41.2], llm=445, ground_truth=445, match=True\n",
      "[41.3], llm=195, ground_truth=195, match=True\n",
      "[41.4], llm=260, ground_truth=260, match=True\n",
      "[42.0], llm=171500, ground_truth=171500, match=True\n",
      "[42.1], llm=429600, ground_truth=429600, match=True\n",
      "[42.2], llm=100400, ground_truth=100400, match=True\n",
      "[42.3], llm=636000, ground_truth=636000, match=True\n",
      "[42.4], llm=490000, ground_truth=490000, match=True\n",
      "[43.0], llm=16, ground_truth=16, match=True\n",
      "[43.1], llm=20, ground_truth=20, match=True\n",
      "[43.2], llm=20, ground_truth=20, match=True\n",
      "[43.3], llm=27, ground_truth=27, match=True\n",
      "[43.4], llm=11, ground_truth=11, match=True\n",
      "[44.0], llm=417, ground_truth=417, match=True\n",
      "[44.1], llm=420, ground_truth=420, match=True\n",
      "[44.2], llm=674, ground_truth=674, match=True\n",
      "[44.3], llm=374, ground_truth=374, match=True\n",
      "[44.4], llm=500, ground_truth=500, match=True\n",
      "[45.0], llm=15, ground_truth=15, match=True\n",
      "[45.1], llm=29, ground_truth=29, match=True\n",
      "[45.2], llm=23, ground_truth=23, match=True\n",
      "[45.3], llm=23, ground_truth=23, match=True\n",
      "[45.4], llm=11, ground_truth=11, match=True\n",
      "[46.0], llm=26, ground_truth=26, match=True\n",
      "[46.1], llm=16, ground_truth=16, match=True\n",
      "[46.2], llm=23, ground_truth=23, match=True\n",
      "[46.3], llm=18, ground_truth=18, match=True\n",
      "[46.4], llm=18, ground_truth=18, match=True\n",
      "[47.0], llm=385, ground_truth=385, match=True\n",
      "[47.1], llm=156, ground_truth=156, match=True\n",
      "[47.2], llm=415, ground_truth=415, match=True\n",
      "[47.3], llm=149, ground_truth=149, match=True\n",
      "[47.4], llm=306, ground_truth=306, match=True\n",
      "[48.0], llm=20, ground_truth=20, match=True\n",
      "[48.1], llm=43, ground_truth=43, match=True\n",
      "[48.2], llm=6, ground_truth=6, match=True\n",
      "[48.3], llm=17, ground_truth=17, match=True\n",
      "[48.4], llm=43, ground_truth=43, match=True\n",
      "[49.0], llm=620, ground_truth=620, match=True\n",
      "[49.1], llm=366, ground_truth=366, match=True\n",
      "[49.2], llm=670, ground_truth=670, match=True\n",
      "[49.3], llm=1345, ground_truth=1345, match=True\n",
      "[49.4], llm=616, ground_truth=616, match=True\n",
      "[50.0], llm=983, ground_truth=983, match=True\n",
      "[50.1], llm=1084, ground_truth=1084, match=True\n",
      "[50.2], llm=862, ground_truth=862, match=True\n",
      "[50.3], llm=988, ground_truth=988, match=True\n",
      "[50.4], llm=591, ground_truth=591, match=True\n",
      "[51.0], llm=3, ground_truth=2, match=False\n",
      "[51.1], llm=7, ground_truth=7, match=True\n",
      "[51.2], llm=5, ground_truth=5, match=True\n",
      "[51.3], llm=7, ground_truth=7, match=True\n",
      "[51.4], llm=8, ground_truth=7, match=False\n",
      "[52.0], llm=288, ground_truth=288, match=True\n",
      "[52.1], llm=272, ground_truth=272, match=True\n",
      "[52.2], llm=238, ground_truth=238, match=True\n",
      "[52.3], llm=224, ground_truth=224, match=True\n",
      "[52.4], llm=130, ground_truth=130, match=True\n",
      "[53.0], llm=65, ground_truth=65, match=True\n",
      "[53.1], llm=25, ground_truth=25, match=True\n",
      "[53.2], llm=50, ground_truth=50, match=True\n",
      "[53.3], llm=50, ground_truth=50, match=True\n",
      "[53.4], llm=25, ground_truth=25, match=True\n",
      "[54.0], llm=32, ground_truth=32, match=True\n",
      "[54.1], llm=80, ground_truth=80, match=True\n",
      "[54.2], llm=20, ground_truth=20, match=True\n",
      "[54.3], llm=13, ground_truth=13, match=True\n",
      "[54.4], llm=53, ground_truth=53, match=True\n",
      "[55.0], llm=300, ground_truth=300, match=True\n",
      "[55.1], llm=159, ground_truth=159, match=True\n",
      "[55.2], llm=144, ground_truth=144, match=True\n",
      "[55.3], llm=132, ground_truth=132, match=True\n",
      "[55.4], llm=42, ground_truth=42, match=True\n",
      "[56.0], llm=5565, ground_truth=5565, match=True\n",
      "[56.1], llm=1576, ground_truth=1576, match=True\n",
      "[56.2], llm=1338, ground_truth=1338, match=True\n",
      "[56.3], llm=5675, ground_truth=5675, match=True\n",
      "[56.4], llm=3894, ground_truth=3894, match=True\n",
      "[57.0], llm=90, ground_truth=90, match=True\n",
      "[57.1], llm=86, ground_truth=86, match=True\n",
      "[57.2], llm=68, ground_truth=68, match=True\n",
      "[57.3], llm=71, ground_truth=71, match=True\n",
      "[57.4], llm=72, ground_truth=72, match=True\n",
      "[58.0], llm=128, ground_truth=128, match=True\n",
      "[58.1], llm=150, ground_truth=150, match=True\n",
      "[58.2], llm=672, ground_truth=672, match=True\n",
      "[58.3], llm=360, ground_truth=360, match=True\n",
      "[58.4], llm=350, ground_truth=350, match=True\n",
      "[59.0], llm=846, ground_truth=846, match=True\n",
      "[59.1], llm=298, ground_truth=298, match=True\n",
      "[59.2], llm=368, ground_truth=368, match=True\n",
      "[59.3], llm=2992, ground_truth=2992, match=True\n",
      "[59.4], llm=864, ground_truth=864, match=True\n",
      "[60.0], llm=92.5, ground_truth=92, match=False\n",
      "[60.1], llm=74, ground_truth=74, match=True\n",
      "[60.2], llm=57, ground_truth=57, match=True\n",
      "[60.3], llm=90, ground_truth=87, match=False\n",
      "[60.4], llm=102.5, ground_truth=102, match=False\n",
      "[61.0], llm=384.20, ground_truth=385, match=False\n",
      "[61.1], llm=566.02, ground_truth=567, match=False\n",
      "[61.2], llm=366.92, ground_truth=354, match=False\n",
      "[61.3], llm=431.35, ground_truth=506, match=False\n",
      "[61.4], llm=476.17, ground_truth=564, match=False\n",
      "[62.0], llm=8, ground_truth=8, match=True\n",
      "[62.1], llm=3, ground_truth=3, match=True\n",
      "[62.2], llm=7, ground_truth=7, match=True\n",
      "[62.3], llm=8, ground_truth=8, match=True\n",
      "[62.4], llm=5, ground_truth=5, match=True\n",
      "[63.0], llm=4644, ground_truth=4644, match=True\n",
      "[63.1], llm=6808, ground_truth=6808, match=True\n",
      "[63.2], llm=3496, ground_truth=3496, match=True\n",
      "[63.3], llm=5012, ground_truth=4616, match=False\n",
      "[63.4], llm=4024, ground_truth=4024, match=True\n",
      "[64.0], llm=56, ground_truth=56, match=True\n",
      "[64.1], llm=64, ground_truth=64, match=True\n",
      "[64.2], llm=64, ground_truth=64, match=True\n",
      "[64.3], llm=49, ground_truth=49, match=True\n",
      "[64.4], llm=57, ground_truth=57, match=True\n",
      "[65.0], llm=454.98, ground_truth=363, match=False\n",
      "[65.1], llm=520, ground_truth=420, match=False\n",
      "[65.2], llm=insufficient data, ground_truth=398, match=False\n",
      "[65.3], llm=missing data, ground_truth=141, match=False\n",
      "[65.4], llm=431.65, ground_truth=380, match=False\n",
      "[66.0], llm=2, ground_truth=2, match=True\n",
      "[66.1], llm=7, ground_truth=7, match=True\n",
      "[66.2], llm=1, ground_truth=1, match=True\n",
      "[66.3], llm=4, ground_truth=4, match=True\n",
      "[66.4], llm=7, ground_truth=7, match=True\n",
      "[67.0], llm=814, ground_truth=814, match=True\n",
      "[67.1], llm=1928, ground_truth=1928, match=True\n",
      "[67.2], llm=512, ground_truth=512, match=True\n",
      "[67.3], llm=1314, ground_truth=1314, match=True\n",
      "[67.4], llm=1381, ground_truth=1381, match=True\n",
      "[68.0], llm=3773, ground_truth=3773, match=True\n",
      "[68.1], llm=1715, ground_truth=1715, match=True\n",
      "[68.2], llm=4320, ground_truth=4320, match=True\n",
      "[68.3], llm=1715, ground_truth=1715, match=True\n",
      "[68.4], llm=513, ground_truth=513, match=True\n",
      "[69.0], llm=147, ground_truth=147, match=True\n",
      "[69.1], llm=74, ground_truth=74, match=True\n",
      "[69.2], llm=159, ground_truth=159, match=True\n",
      "[69.3], llm=68, ground_truth=68, match=True\n",
      "[69.4], llm=10, ground_truth=10, match=True\n",
      "[70.0], llm=27, ground_truth=27, match=True\n",
      "[70.1], llm=52, ground_truth=52, match=True\n",
      "[70.2], llm=23, ground_truth=23, match=True\n",
      "[70.3], llm=14, ground_truth=14, match=True\n",
      "[70.4], llm=85, ground_truth=85, match=True\n",
      "[71.0], llm=1047, ground_truth=1047, match=True\n",
      "[71.1], llm=776, ground_truth=776, match=True\n",
      "[71.2], llm=1285, ground_truth=1285, match=True\n",
      "[71.3], llm=1113, ground_truth=1113, match=True\n",
      "[71.4], llm=1060, ground_truth=1060, match=True\n",
      "[72.0], llm=4, ground_truth=4, match=True\n",
      "[72.1], llm=4, ground_truth=4, match=True\n",
      "[72.2], llm=19, ground_truth=19, match=True\n",
      "[72.3], llm=2, ground_truth=2, match=True\n",
      "[72.4], llm=8, ground_truth=8, match=True\n",
      "[73.0], llm=1280, ground_truth=1280, match=True\n",
      "[73.1], llm=1620, ground_truth=1620, match=True\n",
      "[73.2], llm=1728, ground_truth=1728, match=True\n",
      "[73.3], llm=1379, ground_truth=1379, match=True\n",
      "[73.4], llm=1826, ground_truth=1826, match=True\n",
      "[74.0], llm=100, ground_truth=100, match=True\n",
      "[74.1], llm=100, ground_truth=100, match=True\n",
      "[74.2], llm=10, ground_truth=10, match=True\n",
      "[74.3], llm=4.76, ground_truth=4, match=False\n",
      "[74.4], llm=1.19, ground_truth=1, match=False\n",
      "[75.0], llm=14.5, ground_truth=14, match=False\n",
      "[75.1], llm=60, ground_truth=60, match=True\n",
      "[75.2], llm=25.5, ground_truth=25, match=False\n",
      "[75.3], llm=44, ground_truth=44, match=True\n",
      "[75.4], llm=10.5, ground_truth=10, match=False\n",
      "[76.0], llm=27, ground_truth=26, match=False\n",
      "[76.1], llm=22.5, ground_truth=22, match=False\n",
      "[76.2], llm=42, ground_truth=42, match=True\n",
      "[76.3], llm=6, ground_truth=6, match=True\n",
      "[76.4], llm=9, ground_truth=9, match=True\n",
      "[77.0], llm=6, ground_truth=6, match=True\n",
      "[77.1], llm=21, ground_truth=21, match=True\n",
      "[77.2], llm=40, ground_truth=40, match=True\n",
      "[77.3], llm=5, ground_truth=21, match=False\n",
      "[77.4], llm=5, ground_truth=15, match=False\n",
      "[78.0], llm=53, ground_truth=53, match=True\n",
      "[78.1], llm=55, ground_truth=55, match=True\n",
      "[78.2], llm=38, ground_truth=38, match=True\n",
      "[78.3], llm=66, ground_truth=66, match=True\n",
      "[78.4], llm=76, ground_truth=76, match=True\n",
      "[79.0], llm=78, ground_truth=78, match=True\n",
      "[79.1], llm=329, ground_truth=235, match=False\n",
      "[79.2], llm=231, ground_truth=231, match=True\n",
      "[79.3], llm=81, ground_truth=81, match=True\n",
      "[79.4], llm=231, ground_truth=231, match=True\n",
      "[80.0], llm=50, ground_truth=50, match=True\n",
      "[80.1], llm=25, ground_truth=25, match=True\n",
      "[80.2], llm=50, ground_truth=50, match=True\n",
      "[80.3], llm=50, ground_truth=50, match=True\n",
      "[80.4], llm=50, ground_truth=50, match=True\n",
      "[81.0], llm=29160, ground_truth=29160, match=True\n",
      "[81.1], llm=10080, ground_truth=10080, match=True\n",
      "[81.2], llm=28080, ground_truth=28080, match=True\n",
      "[81.3], llm=27000, ground_truth=27000, match=True\n",
      "[81.4], llm=8160, ground_truth=8160, match=True\n",
      "[82.0], llm=480, ground_truth=480, match=True\n",
      "[82.1], llm=475, ground_truth=475, match=True\n",
      "[82.2], llm=320, ground_truth=320, match=True\n",
      "[82.3], llm=840, ground_truth=840, match=True\n",
      "[82.4], llm=540, ground_truth=540, match=True\n",
      "[83.0], llm=95, ground_truth=95, match=True\n",
      "[83.1], llm=92, ground_truth=92, match=True\n",
      "[83.2], llm=48, ground_truth=48, match=True\n",
      "[83.3], llm=53, ground_truth=53, match=True\n",
      "[83.4], llm=91, ground_truth=91, match=True\n",
      "[84.0], llm=84, ground_truth=84, match=True\n",
      "[84.1], llm=161, ground_truth=161, match=True\n",
      "[84.2], llm=114, ground_truth=114, match=True\n",
      "[84.3], llm=145, ground_truth=145, match=True\n",
      "[84.4], llm=192, ground_truth=192, match=True\n",
      "[85.0], llm=166, ground_truth=166, match=True\n",
      "[85.1], llm=90, ground_truth=90, match=True\n",
      "[85.2], llm=150, ground_truth=150, match=True\n",
      "[85.3], llm=152, ground_truth=152, match=True\n",
      "[85.4], llm=178, ground_truth=178, match=True\n",
      "[86.0], llm=5, ground_truth=4, match=False\n",
      "[86.1], llm=3, ground_truth=2, match=False\n",
      "[86.2], llm=4, ground_truth=3, match=False\n",
      "[86.3], llm=5, ground_truth=4, match=False\n",
      "[86.4], llm=3, ground_truth=3, match=True\n",
      "[87.0], llm=3, ground_truth=3, match=True\n",
      "[87.1], llm=2, ground_truth=2, match=True\n",
      "[87.2], llm=7.5, ground_truth=7, match=False\n",
      "[87.3], llm=10, ground_truth=10, match=True\n",
      "[87.4], llm=2, ground_truth=2, match=True\n",
      "[88.0], llm=3, ground_truth=3, match=True\n",
      "[88.1], llm=7, ground_truth=7, match=True\n",
      "[88.2], llm=2, ground_truth=2, match=True\n",
      "[88.3], llm=5, ground_truth=5, match=True\n",
      "[88.4], llm=5, ground_truth=5, match=True\n",
      "[89.0], llm=54, ground_truth=54, match=True\n",
      "[89.1], llm=63, ground_truth=63, match=True\n",
      "[89.2], llm=66, ground_truth=66, match=True\n",
      "[89.3], llm=27, ground_truth=27, match=True\n",
      "[89.4], llm=42, ground_truth=42, match=True\n",
      "[90.0], llm=11.76, ground_truth=11, match=False\n",
      "[90.1], llm=10.95, ground_truth=10, match=False\n",
      "[90.2], llm=15.28, ground_truth=15, match=False\n",
      "[90.3], llm=7.81, ground_truth=7, match=False\n",
      "[90.4], llm=11.20, ground_truth=11, match=False\n",
      "[91.0], llm=14400, ground_truth=14400, match=True\n",
      "[91.1], llm=5040, ground_truth=5040, match=True\n",
      "[91.2], llm=3520, ground_truth=3520, match=True\n",
      "[91.3], llm=6300, ground_truth=6300, match=True\n",
      "[91.4], llm=33630, ground_truth=33630, match=True\n",
      "[92.0], llm=406, ground_truth=406, match=True\n",
      "[92.1], llm=308, ground_truth=308, match=True\n",
      "[92.2], llm=325, ground_truth=325, match=True\n",
      "[92.3], llm=278, ground_truth=278, match=True\n",
      "[92.4], llm=315, ground_truth=315, match=True\n",
      "[93.0], llm=225, ground_truth=225, match=True\n",
      "[93.1], llm=25, ground_truth=25, match=True\n",
      "[93.2], llm=150, ground_truth=150, match=True\n",
      "[93.3], llm=50, ground_truth=50, match=True\n",
      "[93.4], llm=150, ground_truth=150, match=True\n",
      "[94.0], llm=1406, ground_truth=1406, match=True\n",
      "[94.1], llm=504, ground_truth=504, match=True\n",
      "[94.2], llm=1320, ground_truth=1320, match=True\n",
      "[94.3], llm=1656, ground_truth=1656, match=True\n",
      "[94.4], llm=108, ground_truth=108, match=True\n",
      "[95.0], llm=360, ground_truth=360, match=True\n",
      "[95.1], llm=510, ground_truth=510, match=True\n",
      "[95.2], llm=112, ground_truth=112, match=True\n",
      "[95.3], llm=91, ground_truth=91, match=True\n",
      "[95.4], llm=450, ground_truth=450, match=True\n",
      "[96.0], llm=808, ground_truth=808, match=True\n",
      "[96.1], llm=352, ground_truth=352, match=True\n",
      "[96.2], llm=1062, ground_truth=1062, match=True\n",
      "[96.3], llm=1203, ground_truth=1203, match=True\n",
      "[96.4], llm=347, ground_truth=347, match=True\n",
      "[97.0], llm=11.136, ground_truth=11, match=False\n",
      "[97.1], llm=22.272, ground_truth=22, match=False\n",
      "[97.2], llm=16.704, ground_truth=16, match=False\n",
      "[97.3], llm=26.57, ground_truth=26, match=False\n",
      "[97.4], llm=71.64, ground_truth=72, match=False\n",
      "[98.0], llm=82, ground_truth=82, match=True\n",
      "[98.1], llm=70, ground_truth=70, match=True\n",
      "[98.2], llm=83.25, ground_truth=83, match=False\n",
      "[98.3], llm=88.25, ground_truth=88, match=False\n",
      "[98.4], llm=70.5, ground_truth=70, match=False\n",
      "[99.0], llm=30.00, ground_truth=30, match=False\n",
      "[99.1], llm=51.00, ground_truth=51, match=False\n",
      "[99.2], llm=59.00, ground_truth=59, match=False\n",
      "[99.3], llm=2.00, ground_truth=2, match=False\n",
      "[99.4], llm=44.00, ground_truth=44, match=False\n"
     ]
    }
   ],
   "source": [
    "rng = Random(55)\n",
    "result_1 = cross_check_generators(rng, difficulty=1.0, num_generations=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "good = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,34,36,38,39,40,41,42,43,44,45,46,47,48,49,50,52,53,54,55,56,57,58,59,62,64,66,67,68,69,70,71,72,73,78,80,81,82,83,84,85,88,89,91,92,93,94,95,96]\n",
      "not_good = [32,35,37,51,60,61,63,65,74,75,76,77,79,86,87,90,97,98,99]\n"
     ]
    }
   ],
   "source": [
    "good = [str(i) for i in range(len(result_1)) if result_1[i] == 5]\n",
    "not_good = [str(i) for i in range(len(result_1)) if result_1[i] < 5]\n",
    "\n",
    "print('good = [' +  \",\".join(good) + ']')\n",
    "print('not_good = [' +  \",\".join(not_good) + ']')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "reasoning-gym",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}