mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2025-10-09 13:40:09 +03:00
816 lines
38 KiB
Plaintext
816 lines
38 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Create the OpenRouter client. Place your OPENROUTER_API_KEY in a .env file.\n",
|
||
"# .env contents:\n",
|
||
"# OPENROUTER_API_KEY=sk-or-v1- ...\n",
|
||
"\n",
|
||
"%load_ext dotenv\n",
|
||
"%dotenv\n",
|
||
"import os\n",
|
||
"import re\n",
|
||
"from random import Random\n",
|
||
"from pathlib import Path\n",
|
||
"from typing import Any, Iterable, Optional\n",
|
||
"import json\n",
|
||
"from openai import OpenAI\n",
|
||
"from openai.types.chat import ChatCompletion, ChatCompletionMessageParam\n",
|
||
"import time\n",
|
||
"import reasoning_gym\n",
|
||
"\n",
|
||
"\n",
|
||
"def llm_generate(\n",
|
||
" client: OpenAI,\n",
|
||
" messages: Iterable[ChatCompletionMessageParam],\n",
|
||
" sampling_params: dict[str, Any],\n",
|
||
") -> ChatCompletion:\n",
|
||
" max_retry = 3\n",
|
||
" for trial in range(max_retry):\n",
|
||
" try:\n",
|
||
" return client.chat.completions.create(\n",
|
||
" messages=messages,\n",
|
||
" **sampling_params,\n",
|
||
" )\n",
|
||
" except Exception as e:\n",
|
||
" print(\"failure response:\", e)\n",
|
||
" time.sleep(trial * trial) # quadratic backoff\n",
|
||
" if trial == max_retry - 1:\n",
|
||
" raise\n",
|
||
"\n",
|
||
"def generate_simple_request(user_prompt: str, developer_prompt: Optional[str] = None) -> list[dict]:\n",
|
||
" prompt = []\n",
|
||
" if developer_prompt is not None:\n",
|
||
" prompt.append( { \"role\": \"system\", \"content\": developer_prompt } )\n",
|
||
" \n",
|
||
" prompt.append( { \"role\": \"user\", \"content\": user_prompt })\n",
|
||
" return prompt\n",
|
||
"\n",
|
||
"open_router_client = OpenAI(\n",
|
||
" base_url=\"https://openrouter.ai/api/v1\",\n",
|
||
" api_key=os.getenv(\"OPENROUTER_API_KEY\"),\n",
|
||
" timeout=90.0,\n",
|
||
")\n",
|
||
"\n",
|
||
"sampling_params = {\n",
|
||
" \"model\": \"anthropic/claude-3.5-sonnet\",\n",
|
||
" \"max_tokens\": 4096,\n",
|
||
"}\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"48"
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(reasoning_gym.factory.DATASETS)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# test all gsm_symbolic generators\n",
|
||
"import reasoning_gym.arithmetic.gsm_symbolic\n",
|
||
"x = reasoning_gym.create_dataset(\"gsm_symbolic\")\n",
|
||
"\n",
|
||
"generators = x.generators"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import reasoning_gym.utils\n",
|
||
"\n",
|
||
"difficulty = 1.0\n",
|
||
"\n",
|
||
"prompt_template = \"Solve the following math task and return the answer (just the number) in <answer></answer> tags:\\n\\n{question}\"\n",
|
||
"\n",
|
||
"def query_llm(x: dict) -> tuple[int, int]:\n",
|
||
" q = x[\"question\"]\n",
|
||
" ground_truth = x[\"answer\"]\n",
|
||
" user_prompt = prompt_template.format(question=q)\n",
|
||
" msgs = generate_simple_request(user_prompt)\n",
|
||
" output = llm_generate(client=open_router_client, messages=msgs, sampling_params=sampling_params)\n",
|
||
" full_answer = output.choices[0].message.content\n",
|
||
" answer = reasoning_gym.utils.extract_answer(completion=full_answer, tag_name=\"answer\").strip()\n",
|
||
" return answer, ground_truth, full_answer\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def cross_check_generator(rng: Random, index: int, difficulty: float = 1.0, num_generations = 3, verbose: bool = False) -> int:\n",
|
||
" num_matching = 0\n",
|
||
" try:\n",
|
||
" g = generators[index] \n",
|
||
" for j in range(num_generations):\n",
|
||
" try:\n",
|
||
" x = g(rng, difficulty=difficulty)\n",
|
||
" a, gt, full_answer = query_llm(x)\n",
|
||
"\n",
|
||
" print(f\"[{index}.{j}], llm={a}, ground_truth={gt}, match={a==gt}\")\n",
|
||
" if verbose:\n",
|
||
" print(x[\"question\"])\n",
|
||
" print(full_answer)\n",
|
||
" if a == gt:\n",
|
||
" num_matching += 1\n",
|
||
" except Exception as ex:\n",
|
||
" print(f\"[{index}.{j}] error: {ex}\")\n",
|
||
" except Exception as ex:\n",
|
||
" print(f\"[{index}] generator failure: {ex}\")\n",
|
||
" return -1\n",
|
||
" return num_matching\n",
|
||
"\n",
|
||
"def cross_check_generators(rng: Random, difficulty: float = 1.0, num_generations = 3):\n",
|
||
" results = [0] * len(generators)\n",
|
||
" for i in range(len(generators)):\n",
|
||
" results[i] = cross_check_generator(rng, index=i, difficulty=difficulty, num_generations=num_generations)\n",
|
||
" return results\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[35.0] error: Could not find valid time_per_room\n",
|
||
"[35.1], llm=30.196, ground_truth=30, match=False\n",
|
||
"A cleaner has to clean a office building with 21 floors. They have 3 days to get it done. It takes them 44 minutes per floor. If they work 17 hour day, what percentage of their day, on average, is spent cleaning floors?\n",
|
||
"Let me solve this step by step:\n",
|
||
"\n",
|
||
"1. Total floors to clean: 21\n",
|
||
"2. Time per floor: 44 minutes\n",
|
||
"3. Total time needed: 21 × 44 = 924 minutes\n",
|
||
"4. Working hours per day: 17 hours = 17 × 60 = 1020 minutes\n",
|
||
"5. Days available: 3\n",
|
||
"6. Time needed per day: 924 ÷ 3 = 308 minutes\n",
|
||
"7. Percentage of day spent cleaning: (308 ÷ 1020) × 100 = 30.196%\n",
|
||
"\n",
|
||
"<answer>30.196</answer>\n",
|
||
"[35.2], llm=34.23, ground_truth=34, match=False\n",
|
||
"A cleaner has to clean a hospital with 12 floors. They have 4 days to get it done. It takes them 89 minutes per floor. If they work 13 hour day, what percentage of their day, on average, is spent cleaning floors?\n",
|
||
"Let me solve this step by step:\n",
|
||
"\n",
|
||
"1. Total floors to clean = 12\n",
|
||
"2. Total time for all floors = 12 × 89 minutes = 1,068 minutes\n",
|
||
"3. Days available = 4\n",
|
||
"4. Time needed per day = 1,068 ÷ 4 = 267 minutes\n",
|
||
"5. Hours per day working = 13\n",
|
||
"6. Minutes in work day = 13 × 60 = 780 minutes\n",
|
||
"7. Percentage calculation = (267 ÷ 780) × 100 = 34.23%\n",
|
||
"\n",
|
||
"<answer>34.23</answer>\n",
|
||
"[35.3], llm=61, ground_truth=61, match=True\n",
|
||
"A cleaner has to clean a university with 20 floors. They have 10 days to get it done. It takes them 202 minutes per floor. If they work 11 hour day, what percentage of their day, on average, is spent cleaning floors?\n",
|
||
"Let me solve this step by step:\n",
|
||
"\n",
|
||
"1. First, let's calculate total time needed to clean all floors:\n",
|
||
" * 202 minutes × 20 floors = 4040 minutes total\n",
|
||
"\n",
|
||
"2. They have 10 days to do it, so per day:\n",
|
||
" * 4040 ÷ 10 = 404 minutes per day cleaning\n",
|
||
"\n",
|
||
"3. 11 hour workday in minutes:\n",
|
||
" * 11 × 60 = 660 minutes per day working\n",
|
||
"\n",
|
||
"4. Calculate percentage:\n",
|
||
" * (404 ÷ 660) × 100 = 61.21212121...%\n",
|
||
"\n",
|
||
"<answer>61</answer>\n",
|
||
"[35.4], llm=61.83, ground_truth=61, match=False\n",
|
||
"A cleaner has to clean a office building with 28 floors. They have 4 days to get it done. It takes them 53 minutes per floor. If they work 10 hour day, what percentage of their day, on average, is spent cleaning floors?\n",
|
||
"Let me solve this step by step:\n",
|
||
"\n",
|
||
"1. Time per floor = 53 minutes\n",
|
||
"2. Total floors = 28\n",
|
||
"3. Total minutes needed = 53 × 28 = 1,484 minutes\n",
|
||
"4. Days available = 4\n",
|
||
"5. Minutes needed per day = 1,484 ÷ 4 = 371 minutes\n",
|
||
"6. Hours per day working = 10\n",
|
||
"7. Minutes in work day = 10 × 60 = 600 minutes\n",
|
||
"8. Percentage = (371 ÷ 600) × 100 = 61.833...%\n",
|
||
"\n",
|
||
"<answer>61.83</answer>\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"1"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"rng = Random(200)\n",
|
||
"re_check = [2,11,21,27,32,35,37]\n",
|
||
"\n",
|
||
"# for i in re_check:\n",
|
||
"# cross_check_generator(rng, index=i, difficulty=1.0, num_generations=3)\n",
|
||
"# 11 not ok\n",
|
||
"\n",
|
||
"cross_check_generator(rng, index=35, difficulty=1.0, num_generations=5, verbose=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[0.0], llm=43, ground_truth=43, match=True\n",
|
||
"[0.1], llm=104, ground_truth=104, match=True\n",
|
||
"[0.2], llm=21, ground_truth=21, match=True\n",
|
||
"[0.3], llm=300, ground_truth=300, match=True\n",
|
||
"[0.4], llm=76, ground_truth=76, match=True\n",
|
||
"[1.0], llm=59, ground_truth=59, match=True\n",
|
||
"[1.1], llm=61, ground_truth=61, match=True\n",
|
||
"[1.2], llm=42, ground_truth=42, match=True\n",
|
||
"[1.3], llm=76, ground_truth=76, match=True\n",
|
||
"[1.4], llm=80, ground_truth=80, match=True\n",
|
||
"[2.0], llm=84, ground_truth=84, match=True\n",
|
||
"[2.1], llm=91, ground_truth=91, match=True\n",
|
||
"[2.2], llm=79, ground_truth=79, match=True\n",
|
||
"[2.3], llm=60, ground_truth=60, match=True\n",
|
||
"[2.4], llm=72, ground_truth=72, match=True\n",
|
||
"[3.0], llm=110, ground_truth=110, match=True\n",
|
||
"[3.1], llm=8, ground_truth=8, match=True\n",
|
||
"[3.2], llm=33, ground_truth=33, match=True\n",
|
||
"[3.3], llm=7, ground_truth=7, match=True\n",
|
||
"[3.4], llm=24, ground_truth=24, match=True\n",
|
||
"[4.0], llm=15, ground_truth=15, match=True\n",
|
||
"[4.1], llm=15, ground_truth=15, match=True\n",
|
||
"[4.2], llm=4, ground_truth=4, match=True\n",
|
||
"[4.3], llm=9, ground_truth=9, match=True\n",
|
||
"[4.4], llm=3, ground_truth=3, match=True\n",
|
||
"[5.0], llm=20, ground_truth=20, match=True\n",
|
||
"[5.1], llm=43, ground_truth=43, match=True\n",
|
||
"[5.2], llm=10, ground_truth=10, match=True\n",
|
||
"[5.3], llm=63, ground_truth=63, match=True\n",
|
||
"[5.4], llm=58, ground_truth=58, match=True\n",
|
||
"[6.0], llm=120, ground_truth=120, match=True\n",
|
||
"[6.1], llm=124, ground_truth=124, match=True\n",
|
||
"[6.2], llm=24, ground_truth=24, match=True\n",
|
||
"[6.3], llm=55, ground_truth=55, match=True\n",
|
||
"[6.4], llm=92, ground_truth=92, match=True\n",
|
||
"[7.0], llm=527, ground_truth=527, match=True\n",
|
||
"[7.1], llm=515, ground_truth=515, match=True\n",
|
||
"[7.2], llm=401, ground_truth=401, match=True\n",
|
||
"[7.3], llm=44, ground_truth=44, match=True\n",
|
||
"[7.4], llm=218, ground_truth=218, match=True\n",
|
||
"[8.0], llm=1014, ground_truth=1014, match=True\n",
|
||
"[8.1], llm=1010, ground_truth=1010, match=True\n",
|
||
"[8.2], llm=300, ground_truth=300, match=True\n",
|
||
"[8.3], llm=540, ground_truth=540, match=True\n",
|
||
"[8.4], llm=864, ground_truth=864, match=True\n",
|
||
"[9.0], llm=40, ground_truth=40, match=True\n",
|
||
"[9.1], llm=0, ground_truth=0, match=True\n",
|
||
"[9.2], llm=30, ground_truth=30, match=True\n",
|
||
"[9.3], llm=80, ground_truth=80, match=True\n",
|
||
"[9.4], llm=4, ground_truth=4, match=True\n",
|
||
"[10.0], llm=45, ground_truth=45, match=True\n",
|
||
"[10.1], llm=44, ground_truth=44, match=True\n",
|
||
"[10.2], llm=24, ground_truth=24, match=True\n",
|
||
"[10.3], llm=67, ground_truth=67, match=True\n",
|
||
"[10.4], llm=90, ground_truth=90, match=True\n",
|
||
"[11.0], llm=330, ground_truth=330, match=True\n",
|
||
"[11.1], llm=386, ground_truth=386, match=True\n",
|
||
"[11.2], llm=390, ground_truth=390, match=True\n",
|
||
"[11.3], llm=386, ground_truth=386, match=True\n",
|
||
"[11.4], llm=231, ground_truth=231, match=True\n",
|
||
"[12.0], llm=351, ground_truth=351, match=True\n",
|
||
"[12.1], llm=269, ground_truth=269, match=True\n",
|
||
"[12.2], llm=286, ground_truth=286, match=True\n",
|
||
"[12.3], llm=72, ground_truth=72, match=True\n",
|
||
"[12.4], llm=368, ground_truth=368, match=True\n",
|
||
"[13.0], llm=2, ground_truth=2, match=True\n",
|
||
"[13.1], llm=2, ground_truth=2, match=True\n",
|
||
"[13.2], llm=2, ground_truth=2, match=True\n",
|
||
"[13.3], llm=2, ground_truth=2, match=True\n",
|
||
"[13.4], llm=2, ground_truth=2, match=True\n",
|
||
"[14.0], llm=128, ground_truth=128, match=True\n",
|
||
"[14.1], llm=132, ground_truth=132, match=True\n",
|
||
"[14.2], llm=120, ground_truth=120, match=True\n",
|
||
"[14.3], llm=188, ground_truth=188, match=True\n",
|
||
"[14.4], llm=168, ground_truth=168, match=True\n",
|
||
"[15.0], llm=67, ground_truth=67, match=True\n",
|
||
"[15.1], llm=89, ground_truth=89, match=True\n",
|
||
"[15.2], llm=90, ground_truth=90, match=True\n",
|
||
"[15.3], llm=67, ground_truth=67, match=True\n",
|
||
"[15.4], llm=96, ground_truth=96, match=True\n",
|
||
"[16.0], llm=51, ground_truth=51, match=True\n",
|
||
"[16.1], llm=57, ground_truth=57, match=True\n",
|
||
"[16.2], llm=32, ground_truth=32, match=True\n",
|
||
"[16.3], llm=38, ground_truth=38, match=True\n",
|
||
"[16.4], llm=32, ground_truth=32, match=True\n",
|
||
"[17.0], llm=280, ground_truth=280, match=True\n",
|
||
"[17.1], llm=210, ground_truth=210, match=True\n",
|
||
"[17.2], llm=770, ground_truth=770, match=True\n",
|
||
"[17.3], llm=190, ground_truth=190, match=True\n",
|
||
"[17.4], llm=1060, ground_truth=1060, match=True\n",
|
||
"[18.0], llm=775, ground_truth=775, match=True\n",
|
||
"[18.1], llm=484, ground_truth=484, match=True\n",
|
||
"[18.2], llm=359, ground_truth=359, match=True\n",
|
||
"[18.3], llm=697, ground_truth=697, match=True\n",
|
||
"[18.4], llm=740, ground_truth=740, match=True\n",
|
||
"[19.0], llm=885, ground_truth=885, match=True\n",
|
||
"[19.1], llm=950, ground_truth=950, match=True\n",
|
||
"[19.2], llm=695, ground_truth=695, match=True\n",
|
||
"[19.3], llm=1530, ground_truth=1530, match=True\n",
|
||
"[19.4], llm=475, ground_truth=475, match=True\n",
|
||
"[20.0], llm=4, ground_truth=4, match=True\n",
|
||
"[20.1], llm=18, ground_truth=18, match=True\n",
|
||
"[20.2], llm=1, ground_truth=1, match=True\n",
|
||
"[20.3], llm=3, ground_truth=3, match=True\n",
|
||
"[20.4], llm=3, ground_truth=3, match=True\n",
|
||
"[21.0], llm=630, ground_truth=630, match=True\n",
|
||
"[21.1], llm=525, ground_truth=525, match=True\n",
|
||
"[21.2], llm=504, ground_truth=504, match=True\n",
|
||
"[21.3], llm=350, ground_truth=350, match=True\n",
|
||
"[21.4], llm=475, ground_truth=475, match=True\n",
|
||
"[22.0], llm=19500, ground_truth=19500, match=True\n",
|
||
"[22.1], llm=20800, ground_truth=20800, match=True\n",
|
||
"[22.2], llm=69800, ground_truth=69800, match=True\n",
|
||
"[22.3], llm=67400, ground_truth=67400, match=True\n",
|
||
"[22.4], llm=33100, ground_truth=33100, match=True\n",
|
||
"[23.0], llm=305, ground_truth=305, match=True\n",
|
||
"[23.1], llm=206, ground_truth=206, match=True\n",
|
||
"[23.2], llm=99, ground_truth=99, match=True\n",
|
||
"[23.3], llm=389, ground_truth=389, match=True\n",
|
||
"[23.4], llm=86, ground_truth=86, match=True\n",
|
||
"[24.0], llm=20, ground_truth=20, match=True\n",
|
||
"[24.1], llm=3, ground_truth=3, match=True\n",
|
||
"[24.2], llm=41, ground_truth=41, match=True\n",
|
||
"[24.3], llm=1, ground_truth=1, match=True\n",
|
||
"[24.4], llm=3, ground_truth=3, match=True\n",
|
||
"[25.0], llm=36, ground_truth=36, match=True\n",
|
||
"[25.1], llm=42, ground_truth=42, match=True\n",
|
||
"[25.2], llm=54, ground_truth=54, match=True\n",
|
||
"[25.3], llm=28, ground_truth=28, match=True\n",
|
||
"[25.4], llm=36, ground_truth=36, match=True\n",
|
||
"[26.0], llm=2, ground_truth=2, match=True\n",
|
||
"[26.1], llm=9, ground_truth=9, match=True\n",
|
||
"[26.2], llm=2, ground_truth=2, match=True\n",
|
||
"[26.3], llm=8, ground_truth=8, match=True\n",
|
||
"[26.4], llm=6, ground_truth=6, match=True\n",
|
||
"[27.0], llm=2916, ground_truth=2916, match=True\n",
|
||
"[27.1], llm=3510, ground_truth=3510, match=True\n",
|
||
"[27.2], llm=990, ground_truth=990, match=True\n",
|
||
"[27.3], llm=3150, ground_truth=3150, match=True\n",
|
||
"[27.4], llm=6063.75, ground_truth=6063.75, match=True\n",
|
||
"[28.0], llm=570, ground_truth=570, match=True\n",
|
||
"[28.1], llm=610, ground_truth=610, match=True\n",
|
||
"[28.2], llm=382, ground_truth=382, match=True\n",
|
||
"[28.3], llm=257, ground_truth=257, match=True\n",
|
||
"[28.4], llm=467, ground_truth=467, match=True\n",
|
||
"[29.0], llm=20, ground_truth=20, match=True\n",
|
||
"[29.1], llm=20, ground_truth=20, match=True\n",
|
||
"[29.2], llm=25, ground_truth=25, match=True\n",
|
||
"[29.3], llm=20, ground_truth=20, match=True\n",
|
||
"[29.4], llm=20, ground_truth=20, match=True\n",
|
||
"[30.0], llm=17, ground_truth=17, match=True\n",
|
||
"[30.1], llm=26, ground_truth=26, match=True\n",
|
||
"[30.2], llm=93, ground_truth=93, match=True\n",
|
||
"[30.3], llm=81, ground_truth=81, match=True\n",
|
||
"[30.4], llm=26, ground_truth=26, match=True\n",
|
||
"[31.0], llm=24, ground_truth=24, match=True\n",
|
||
"[31.1], llm=26, ground_truth=26, match=True\n",
|
||
"[31.2], llm=32, ground_truth=32, match=True\n",
|
||
"[31.3], llm=30, ground_truth=30, match=True\n",
|
||
"[31.4], llm=22, ground_truth=22, match=True\n",
|
||
"[32.0], llm=52.5, ground_truth=63, match=False\n",
|
||
"[32.1], llm=27, ground_truth=27, match=True\n",
|
||
"[32.2], llm=60, ground_truth=100, match=False\n",
|
||
"[32.3], llm=42, ground_truth=84, match=False\n",
|
||
"[32.4], llm=30, ground_truth=30, match=True\n",
|
||
"[33.0], llm=1715, ground_truth=1715, match=True\n",
|
||
"[33.1], llm=1568, ground_truth=1568, match=True\n",
|
||
"[33.2], llm=1568, ground_truth=1568, match=True\n",
|
||
"[33.3], llm=1960, ground_truth=1960, match=True\n",
|
||
"[33.4], llm=1029, ground_truth=1029, match=True\n",
|
||
"[34.0], llm=1, ground_truth=1, match=True\n",
|
||
"[34.1], llm=78, ground_truth=78, match=True\n",
|
||
"[34.2], llm=4, ground_truth=4, match=True\n",
|
||
"[34.3], llm=25, ground_truth=25, match=True\n",
|
||
"[34.4], llm=151, ground_truth=151, match=True\n",
|
||
"[35.0], llm=60, ground_truth=60, match=True\n",
|
||
"[35.1], llm=51.76, ground_truth=51, match=False\n",
|
||
"[35.2], llm=37, ground_truth=37, match=True\n",
|
||
"[35.3], llm=23.33, ground_truth=23, match=False\n",
|
||
"[35.4], llm=43.11, ground_truth=43, match=False\n",
|
||
"[36.0], llm=75, ground_truth=75, match=True\n",
|
||
"[36.1], llm=90, ground_truth=90, match=True\n",
|
||
"[36.2], llm=27, ground_truth=27, match=True\n",
|
||
"[36.3], llm=63, ground_truth=63, match=True\n",
|
||
"[36.4], llm=34, ground_truth=34, match=True\n",
|
||
"[37.0], llm=38454, ground_truth=38454, match=True\n",
|
||
"[37.1], llm=30856, ground_truth=30856, match=True\n",
|
||
"[37.2], llm=10962, ground_truth=10710, match=False\n",
|
||
"[37.3], llm=15590.4, ground_truth=15232, match=False\n",
|
||
"[37.4], llm=16224, ground_truth=16224, match=True\n",
|
||
"[38.0], llm=159, ground_truth=159, match=True\n",
|
||
"[38.1], llm=284, ground_truth=284, match=True\n",
|
||
"[38.2], llm=325, ground_truth=325, match=True\n",
|
||
"[38.3], llm=126, ground_truth=126, match=True\n",
|
||
"[38.4], llm=285, ground_truth=285, match=True\n",
|
||
"[39.0], llm=54, ground_truth=54, match=True\n",
|
||
"[39.1], llm=25, ground_truth=25, match=True\n",
|
||
"[39.2], llm=23, ground_truth=23, match=True\n",
|
||
"[39.3], llm=52, ground_truth=52, match=True\n",
|
||
"[39.4], llm=53, ground_truth=53, match=True\n",
|
||
"[40.0], llm=96, ground_truth=96, match=True\n",
|
||
"[40.1], llm=184, ground_truth=184, match=True\n",
|
||
"[40.2], llm=134, ground_truth=134, match=True\n",
|
||
"[40.3], llm=190, ground_truth=190, match=True\n",
|
||
"[40.4], llm=320, ground_truth=320, match=True\n",
|
||
"[41.0], llm=230, ground_truth=230, match=True\n",
|
||
"[41.1], llm=165, ground_truth=165, match=True\n",
|
||
"[41.2], llm=445, ground_truth=445, match=True\n",
|
||
"[41.3], llm=195, ground_truth=195, match=True\n",
|
||
"[41.4], llm=260, ground_truth=260, match=True\n",
|
||
"[42.0], llm=171500, ground_truth=171500, match=True\n",
|
||
"[42.1], llm=429600, ground_truth=429600, match=True\n",
|
||
"[42.2], llm=100400, ground_truth=100400, match=True\n",
|
||
"[42.3], llm=636000, ground_truth=636000, match=True\n",
|
||
"[42.4], llm=490000, ground_truth=490000, match=True\n",
|
||
"[43.0], llm=16, ground_truth=16, match=True\n",
|
||
"[43.1], llm=20, ground_truth=20, match=True\n",
|
||
"[43.2], llm=20, ground_truth=20, match=True\n",
|
||
"[43.3], llm=27, ground_truth=27, match=True\n",
|
||
"[43.4], llm=11, ground_truth=11, match=True\n",
|
||
"[44.0], llm=417, ground_truth=417, match=True\n",
|
||
"[44.1], llm=420, ground_truth=420, match=True\n",
|
||
"[44.2], llm=674, ground_truth=674, match=True\n",
|
||
"[44.3], llm=374, ground_truth=374, match=True\n",
|
||
"[44.4], llm=500, ground_truth=500, match=True\n",
|
||
"[45.0], llm=15, ground_truth=15, match=True\n",
|
||
"[45.1], llm=29, ground_truth=29, match=True\n",
|
||
"[45.2], llm=23, ground_truth=23, match=True\n",
|
||
"[45.3], llm=23, ground_truth=23, match=True\n",
|
||
"[45.4], llm=11, ground_truth=11, match=True\n",
|
||
"[46.0], llm=26, ground_truth=26, match=True\n",
|
||
"[46.1], llm=16, ground_truth=16, match=True\n",
|
||
"[46.2], llm=23, ground_truth=23, match=True\n",
|
||
"[46.3], llm=18, ground_truth=18, match=True\n",
|
||
"[46.4], llm=18, ground_truth=18, match=True\n",
|
||
"[47.0], llm=385, ground_truth=385, match=True\n",
|
||
"[47.1], llm=156, ground_truth=156, match=True\n",
|
||
"[47.2], llm=415, ground_truth=415, match=True\n",
|
||
"[47.3], llm=149, ground_truth=149, match=True\n",
|
||
"[47.4], llm=306, ground_truth=306, match=True\n",
|
||
"[48.0], llm=20, ground_truth=20, match=True\n",
|
||
"[48.1], llm=43, ground_truth=43, match=True\n",
|
||
"[48.2], llm=6, ground_truth=6, match=True\n",
|
||
"[48.3], llm=17, ground_truth=17, match=True\n",
|
||
"[48.4], llm=43, ground_truth=43, match=True\n",
|
||
"[49.0], llm=620, ground_truth=620, match=True\n",
|
||
"[49.1], llm=366, ground_truth=366, match=True\n",
|
||
"[49.2], llm=670, ground_truth=670, match=True\n",
|
||
"[49.3], llm=1345, ground_truth=1345, match=True\n",
|
||
"[49.4], llm=616, ground_truth=616, match=True\n",
|
||
"[50.0], llm=983, ground_truth=983, match=True\n",
|
||
"[50.1], llm=1084, ground_truth=1084, match=True\n",
|
||
"[50.2], llm=862, ground_truth=862, match=True\n",
|
||
"[50.3], llm=988, ground_truth=988, match=True\n",
|
||
"[50.4], llm=591, ground_truth=591, match=True\n",
|
||
"[51.0], llm=3, ground_truth=2, match=False\n",
|
||
"[51.1], llm=7, ground_truth=7, match=True\n",
|
||
"[51.2], llm=5, ground_truth=5, match=True\n",
|
||
"[51.3], llm=7, ground_truth=7, match=True\n",
|
||
"[51.4], llm=8, ground_truth=7, match=False\n",
|
||
"[52.0], llm=288, ground_truth=288, match=True\n",
|
||
"[52.1], llm=272, ground_truth=272, match=True\n",
|
||
"[52.2], llm=238, ground_truth=238, match=True\n",
|
||
"[52.3], llm=224, ground_truth=224, match=True\n",
|
||
"[52.4], llm=130, ground_truth=130, match=True\n",
|
||
"[53.0], llm=65, ground_truth=65, match=True\n",
|
||
"[53.1], llm=25, ground_truth=25, match=True\n",
|
||
"[53.2], llm=50, ground_truth=50, match=True\n",
|
||
"[53.3], llm=50, ground_truth=50, match=True\n",
|
||
"[53.4], llm=25, ground_truth=25, match=True\n",
|
||
"[54.0], llm=32, ground_truth=32, match=True\n",
|
||
"[54.1], llm=80, ground_truth=80, match=True\n",
|
||
"[54.2], llm=20, ground_truth=20, match=True\n",
|
||
"[54.3], llm=13, ground_truth=13, match=True\n",
|
||
"[54.4], llm=53, ground_truth=53, match=True\n",
|
||
"[55.0], llm=300, ground_truth=300, match=True\n",
|
||
"[55.1], llm=159, ground_truth=159, match=True\n",
|
||
"[55.2], llm=144, ground_truth=144, match=True\n",
|
||
"[55.3], llm=132, ground_truth=132, match=True\n",
|
||
"[55.4], llm=42, ground_truth=42, match=True\n",
|
||
"[56.0], llm=5565, ground_truth=5565, match=True\n",
|
||
"[56.1], llm=1576, ground_truth=1576, match=True\n",
|
||
"[56.2], llm=1338, ground_truth=1338, match=True\n",
|
||
"[56.3], llm=5675, ground_truth=5675, match=True\n",
|
||
"[56.4], llm=3894, ground_truth=3894, match=True\n",
|
||
"[57.0], llm=90, ground_truth=90, match=True\n",
|
||
"[57.1], llm=86, ground_truth=86, match=True\n",
|
||
"[57.2], llm=68, ground_truth=68, match=True\n",
|
||
"[57.3], llm=71, ground_truth=71, match=True\n",
|
||
"[57.4], llm=72, ground_truth=72, match=True\n",
|
||
"[58.0], llm=128, ground_truth=128, match=True\n",
|
||
"[58.1], llm=150, ground_truth=150, match=True\n",
|
||
"[58.2], llm=672, ground_truth=672, match=True\n",
|
||
"[58.3], llm=360, ground_truth=360, match=True\n",
|
||
"[58.4], llm=350, ground_truth=350, match=True\n",
|
||
"[59.0], llm=846, ground_truth=846, match=True\n",
|
||
"[59.1], llm=298, ground_truth=298, match=True\n",
|
||
"[59.2], llm=368, ground_truth=368, match=True\n",
|
||
"[59.3], llm=2992, ground_truth=2992, match=True\n",
|
||
"[59.4], llm=864, ground_truth=864, match=True\n",
|
||
"[60.0], llm=92.5, ground_truth=92, match=False\n",
|
||
"[60.1], llm=74, ground_truth=74, match=True\n",
|
||
"[60.2], llm=57, ground_truth=57, match=True\n",
|
||
"[60.3], llm=90, ground_truth=87, match=False\n",
|
||
"[60.4], llm=102.5, ground_truth=102, match=False\n",
|
||
"[61.0], llm=384.20, ground_truth=385, match=False\n",
|
||
"[61.1], llm=566.02, ground_truth=567, match=False\n",
|
||
"[61.2], llm=366.92, ground_truth=354, match=False\n",
|
||
"[61.3], llm=431.35, ground_truth=506, match=False\n",
|
||
"[61.4], llm=476.17, ground_truth=564, match=False\n",
|
||
"[62.0], llm=8, ground_truth=8, match=True\n",
|
||
"[62.1], llm=3, ground_truth=3, match=True\n",
|
||
"[62.2], llm=7, ground_truth=7, match=True\n",
|
||
"[62.3], llm=8, ground_truth=8, match=True\n",
|
||
"[62.4], llm=5, ground_truth=5, match=True\n",
|
||
"[63.0], llm=4644, ground_truth=4644, match=True\n",
|
||
"[63.1], llm=6808, ground_truth=6808, match=True\n",
|
||
"[63.2], llm=3496, ground_truth=3496, match=True\n",
|
||
"[63.3], llm=5012, ground_truth=4616, match=False\n",
|
||
"[63.4], llm=4024, ground_truth=4024, match=True\n",
|
||
"[64.0], llm=56, ground_truth=56, match=True\n",
|
||
"[64.1], llm=64, ground_truth=64, match=True\n",
|
||
"[64.2], llm=64, ground_truth=64, match=True\n",
|
||
"[64.3], llm=49, ground_truth=49, match=True\n",
|
||
"[64.4], llm=57, ground_truth=57, match=True\n",
|
||
"[65.0], llm=454.98, ground_truth=363, match=False\n",
|
||
"[65.1], llm=520, ground_truth=420, match=False\n",
|
||
"[65.2], llm=insufficient data, ground_truth=398, match=False\n",
|
||
"[65.3], llm=missing data, ground_truth=141, match=False\n",
|
||
"[65.4], llm=431.65, ground_truth=380, match=False\n",
|
||
"[66.0], llm=2, ground_truth=2, match=True\n",
|
||
"[66.1], llm=7, ground_truth=7, match=True\n",
|
||
"[66.2], llm=1, ground_truth=1, match=True\n",
|
||
"[66.3], llm=4, ground_truth=4, match=True\n",
|
||
"[66.4], llm=7, ground_truth=7, match=True\n",
|
||
"[67.0], llm=814, ground_truth=814, match=True\n",
|
||
"[67.1], llm=1928, ground_truth=1928, match=True\n",
|
||
"[67.2], llm=512, ground_truth=512, match=True\n",
|
||
"[67.3], llm=1314, ground_truth=1314, match=True\n",
|
||
"[67.4], llm=1381, ground_truth=1381, match=True\n",
|
||
"[68.0], llm=3773, ground_truth=3773, match=True\n",
|
||
"[68.1], llm=1715, ground_truth=1715, match=True\n",
|
||
"[68.2], llm=4320, ground_truth=4320, match=True\n",
|
||
"[68.3], llm=1715, ground_truth=1715, match=True\n",
|
||
"[68.4], llm=513, ground_truth=513, match=True\n",
|
||
"[69.0], llm=147, ground_truth=147, match=True\n",
|
||
"[69.1], llm=74, ground_truth=74, match=True\n",
|
||
"[69.2], llm=159, ground_truth=159, match=True\n",
|
||
"[69.3], llm=68, ground_truth=68, match=True\n",
|
||
"[69.4], llm=10, ground_truth=10, match=True\n",
|
||
"[70.0], llm=27, ground_truth=27, match=True\n",
|
||
"[70.1], llm=52, ground_truth=52, match=True\n",
|
||
"[70.2], llm=23, ground_truth=23, match=True\n",
|
||
"[70.3], llm=14, ground_truth=14, match=True\n",
|
||
"[70.4], llm=85, ground_truth=85, match=True\n",
|
||
"[71.0], llm=1047, ground_truth=1047, match=True\n",
|
||
"[71.1], llm=776, ground_truth=776, match=True\n",
|
||
"[71.2], llm=1285, ground_truth=1285, match=True\n",
|
||
"[71.3], llm=1113, ground_truth=1113, match=True\n",
|
||
"[71.4], llm=1060, ground_truth=1060, match=True\n",
|
||
"[72.0], llm=4, ground_truth=4, match=True\n",
|
||
"[72.1], llm=4, ground_truth=4, match=True\n",
|
||
"[72.2], llm=19, ground_truth=19, match=True\n",
|
||
"[72.3], llm=2, ground_truth=2, match=True\n",
|
||
"[72.4], llm=8, ground_truth=8, match=True\n",
|
||
"[73.0], llm=1280, ground_truth=1280, match=True\n",
|
||
"[73.1], llm=1620, ground_truth=1620, match=True\n",
|
||
"[73.2], llm=1728, ground_truth=1728, match=True\n",
|
||
"[73.3], llm=1379, ground_truth=1379, match=True\n",
|
||
"[73.4], llm=1826, ground_truth=1826, match=True\n",
|
||
"[74.0], llm=100, ground_truth=100, match=True\n",
|
||
"[74.1], llm=100, ground_truth=100, match=True\n",
|
||
"[74.2], llm=10, ground_truth=10, match=True\n",
|
||
"[74.3], llm=4.76, ground_truth=4, match=False\n",
|
||
"[74.4], llm=1.19, ground_truth=1, match=False\n",
|
||
"[75.0], llm=14.5, ground_truth=14, match=False\n",
|
||
"[75.1], llm=60, ground_truth=60, match=True\n",
|
||
"[75.2], llm=25.5, ground_truth=25, match=False\n",
|
||
"[75.3], llm=44, ground_truth=44, match=True\n",
|
||
"[75.4], llm=10.5, ground_truth=10, match=False\n",
|
||
"[76.0], llm=27, ground_truth=26, match=False\n",
|
||
"[76.1], llm=22.5, ground_truth=22, match=False\n",
|
||
"[76.2], llm=42, ground_truth=42, match=True\n",
|
||
"[76.3], llm=6, ground_truth=6, match=True\n",
|
||
"[76.4], llm=9, ground_truth=9, match=True\n",
|
||
"[77.0], llm=6, ground_truth=6, match=True\n",
|
||
"[77.1], llm=21, ground_truth=21, match=True\n",
|
||
"[77.2], llm=40, ground_truth=40, match=True\n",
|
||
"[77.3], llm=5, ground_truth=21, match=False\n",
|
||
"[77.4], llm=5, ground_truth=15, match=False\n",
|
||
"[78.0], llm=53, ground_truth=53, match=True\n",
|
||
"[78.1], llm=55, ground_truth=55, match=True\n",
|
||
"[78.2], llm=38, ground_truth=38, match=True\n",
|
||
"[78.3], llm=66, ground_truth=66, match=True\n",
|
||
"[78.4], llm=76, ground_truth=76, match=True\n",
|
||
"[79.0], llm=78, ground_truth=78, match=True\n",
|
||
"[79.1], llm=329, ground_truth=235, match=False\n",
|
||
"[79.2], llm=231, ground_truth=231, match=True\n",
|
||
"[79.3], llm=81, ground_truth=81, match=True\n",
|
||
"[79.4], llm=231, ground_truth=231, match=True\n",
|
||
"[80.0], llm=50, ground_truth=50, match=True\n",
|
||
"[80.1], llm=25, ground_truth=25, match=True\n",
|
||
"[80.2], llm=50, ground_truth=50, match=True\n",
|
||
"[80.3], llm=50, ground_truth=50, match=True\n",
|
||
"[80.4], llm=50, ground_truth=50, match=True\n",
|
||
"[81.0], llm=29160, ground_truth=29160, match=True\n",
|
||
"[81.1], llm=10080, ground_truth=10080, match=True\n",
|
||
"[81.2], llm=28080, ground_truth=28080, match=True\n",
|
||
"[81.3], llm=27000, ground_truth=27000, match=True\n",
|
||
"[81.4], llm=8160, ground_truth=8160, match=True\n",
|
||
"[82.0], llm=480, ground_truth=480, match=True\n",
|
||
"[82.1], llm=475, ground_truth=475, match=True\n",
|
||
"[82.2], llm=320, ground_truth=320, match=True\n",
|
||
"[82.3], llm=840, ground_truth=840, match=True\n",
|
||
"[82.4], llm=540, ground_truth=540, match=True\n",
|
||
"[83.0], llm=95, ground_truth=95, match=True\n",
|
||
"[83.1], llm=92, ground_truth=92, match=True\n",
|
||
"[83.2], llm=48, ground_truth=48, match=True\n",
|
||
"[83.3], llm=53, ground_truth=53, match=True\n",
|
||
"[83.4], llm=91, ground_truth=91, match=True\n",
|
||
"[84.0], llm=84, ground_truth=84, match=True\n",
|
||
"[84.1], llm=161, ground_truth=161, match=True\n",
|
||
"[84.2], llm=114, ground_truth=114, match=True\n",
|
||
"[84.3], llm=145, ground_truth=145, match=True\n",
|
||
"[84.4], llm=192, ground_truth=192, match=True\n",
|
||
"[85.0], llm=166, ground_truth=166, match=True\n",
|
||
"[85.1], llm=90, ground_truth=90, match=True\n",
|
||
"[85.2], llm=150, ground_truth=150, match=True\n",
|
||
"[85.3], llm=152, ground_truth=152, match=True\n",
|
||
"[85.4], llm=178, ground_truth=178, match=True\n",
|
||
"[86.0], llm=5, ground_truth=4, match=False\n",
|
||
"[86.1], llm=3, ground_truth=2, match=False\n",
|
||
"[86.2], llm=4, ground_truth=3, match=False\n",
|
||
"[86.3], llm=5, ground_truth=4, match=False\n",
|
||
"[86.4], llm=3, ground_truth=3, match=True\n",
|
||
"[87.0], llm=3, ground_truth=3, match=True\n",
|
||
"[87.1], llm=2, ground_truth=2, match=True\n",
|
||
"[87.2], llm=7.5, ground_truth=7, match=False\n",
|
||
"[87.3], llm=10, ground_truth=10, match=True\n",
|
||
"[87.4], llm=2, ground_truth=2, match=True\n",
|
||
"[88.0], llm=3, ground_truth=3, match=True\n",
|
||
"[88.1], llm=7, ground_truth=7, match=True\n",
|
||
"[88.2], llm=2, ground_truth=2, match=True\n",
|
||
"[88.3], llm=5, ground_truth=5, match=True\n",
|
||
"[88.4], llm=5, ground_truth=5, match=True\n",
|
||
"[89.0], llm=54, ground_truth=54, match=True\n",
|
||
"[89.1], llm=63, ground_truth=63, match=True\n",
|
||
"[89.2], llm=66, ground_truth=66, match=True\n",
|
||
"[89.3], llm=27, ground_truth=27, match=True\n",
|
||
"[89.4], llm=42, ground_truth=42, match=True\n",
|
||
"[90.0], llm=11.76, ground_truth=11, match=False\n",
|
||
"[90.1], llm=10.95, ground_truth=10, match=False\n",
|
||
"[90.2], llm=15.28, ground_truth=15, match=False\n",
|
||
"[90.3], llm=7.81, ground_truth=7, match=False\n",
|
||
"[90.4], llm=11.20, ground_truth=11, match=False\n",
|
||
"[91.0], llm=14400, ground_truth=14400, match=True\n",
|
||
"[91.1], llm=5040, ground_truth=5040, match=True\n",
|
||
"[91.2], llm=3520, ground_truth=3520, match=True\n",
|
||
"[91.3], llm=6300, ground_truth=6300, match=True\n",
|
||
"[91.4], llm=33630, ground_truth=33630, match=True\n",
|
||
"[92.0], llm=406, ground_truth=406, match=True\n",
|
||
"[92.1], llm=308, ground_truth=308, match=True\n",
|
||
"[92.2], llm=325, ground_truth=325, match=True\n",
|
||
"[92.3], llm=278, ground_truth=278, match=True\n",
|
||
"[92.4], llm=315, ground_truth=315, match=True\n",
|
||
"[93.0], llm=225, ground_truth=225, match=True\n",
|
||
"[93.1], llm=25, ground_truth=25, match=True\n",
|
||
"[93.2], llm=150, ground_truth=150, match=True\n",
|
||
"[93.3], llm=50, ground_truth=50, match=True\n",
|
||
"[93.4], llm=150, ground_truth=150, match=True\n",
|
||
"[94.0], llm=1406, ground_truth=1406, match=True\n",
|
||
"[94.1], llm=504, ground_truth=504, match=True\n",
|
||
"[94.2], llm=1320, ground_truth=1320, match=True\n",
|
||
"[94.3], llm=1656, ground_truth=1656, match=True\n",
|
||
"[94.4], llm=108, ground_truth=108, match=True\n",
|
||
"[95.0], llm=360, ground_truth=360, match=True\n",
|
||
"[95.1], llm=510, ground_truth=510, match=True\n",
|
||
"[95.2], llm=112, ground_truth=112, match=True\n",
|
||
"[95.3], llm=91, ground_truth=91, match=True\n",
|
||
"[95.4], llm=450, ground_truth=450, match=True\n",
|
||
"[96.0], llm=808, ground_truth=808, match=True\n",
|
||
"[96.1], llm=352, ground_truth=352, match=True\n",
|
||
"[96.2], llm=1062, ground_truth=1062, match=True\n",
|
||
"[96.3], llm=1203, ground_truth=1203, match=True\n",
|
||
"[96.4], llm=347, ground_truth=347, match=True\n",
|
||
"[97.0], llm=11.136, ground_truth=11, match=False\n",
|
||
"[97.1], llm=22.272, ground_truth=22, match=False\n",
|
||
"[97.2], llm=16.704, ground_truth=16, match=False\n",
|
||
"[97.3], llm=26.57, ground_truth=26, match=False\n",
|
||
"[97.4], llm=71.64, ground_truth=72, match=False\n",
|
||
"[98.0], llm=82, ground_truth=82, match=True\n",
|
||
"[98.1], llm=70, ground_truth=70, match=True\n",
|
||
"[98.2], llm=83.25, ground_truth=83, match=False\n",
|
||
"[98.3], llm=88.25, ground_truth=88, match=False\n",
|
||
"[98.4], llm=70.5, ground_truth=70, match=False\n",
|
||
"[99.0], llm=30.00, ground_truth=30, match=False\n",
|
||
"[99.1], llm=51.00, ground_truth=51, match=False\n",
|
||
"[99.2], llm=59.00, ground_truth=59, match=False\n",
|
||
"[99.3], llm=2.00, ground_truth=2, match=False\n",
|
||
"[99.4], llm=44.00, ground_truth=44, match=False\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"rng = Random(55)\n",
|
||
"result_1 = cross_check_generators(rng, difficulty=1.0, num_generations=5)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"good = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,34,36,38,39,40,41,42,43,44,45,46,47,48,49,50,52,53,54,55,56,57,58,59,62,64,66,67,68,69,70,71,72,73,78,80,81,82,83,84,85,88,89,91,92,93,94,95,96]\n",
|
||
"not_good = [32,35,37,51,60,61,63,65,74,75,76,77,79,86,87,90,97,98,99]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"good = [str(i) for i in range(len(result_1)) if result_1[i] == 5]\n",
|
||
"not_good = [str(i) for i in range(len(result_1)) if result_1[i] < 5]\n",
|
||
"\n",
|
||
"print('good = [' + \",\".join(good) + ']')\n",
|
||
"print('not_good = [' + \",\".join(not_good) + ']')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "reasoning-gym",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.11"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|