mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2025-10-09 13:40:09 +03:00
127 lines
2.9 KiB
Plaintext
127 lines
2.9 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import json\n",
|
|
"from collections import defaultdict"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"letters = [chr(letter) for letter in range(ord(\"a\"), ord(\"z\") + 1)]\n",
|
|
"print(letters)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"370105\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# The file `words_alpha.txt` (one word per line) was obtained from https://github.com/dwyl/english-words\n",
|
|
"with open(\"./reasoning_gym/data/words_alpha.txt\") as f:\n",
|
|
"    words = f.read().splitlines()\n",
|
|
"print(len(words))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"30177\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def group_anagrams(words: list[str]) -> dict[tuple[int, ...], list[str]]:\n",
|
|
" \n",
|
|
"    def _codify(word):\n",
|
|
"        code = [0] * 26\n",
|
|
"        for c in word:\n",
|
|
"            code[ord(c)-ord('a')] += 1\n",
|
|
"        return tuple(code)\n",
|
|
"\n",
|
|
"    res = defaultdict(list)\n",
|
|
"\n",
|
|
"    for word in words:\n",
|
|
"        code = _codify(word)\n",
|
|
"        res[code].append(word)\n",
|
|
"    return res\n",
|
|
"\n",
|
|
"anagrams = group_anagrams(words)\n",
|
|
"anagrams = {k: v for k, v in anagrams.items() if len(v) > 1}  # keep only groups that contain more than one word\n",
|
|
"print(len(anagrams))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open(\"./reasoning_gym/data/anagrams.jsonl\", \"w\") as f:\n",
|
|
"    for counts, words in anagrams.items():\n",
|
|
"        letter_counts = {letter: count for letter, count in zip(letters, counts)}\n",
|
|
"        f.write(json.dumps({\"letter_counts\": letter_counts, \"words\": words}) + \"\\n\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "reasoning_gym",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|