Files
reasoning-gym/notebooks/generate_anagrams.ipynb
2025-02-06 10:12:38 +01:00

127 lines
2.9 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from collections import defaultdict"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']\n"
]
}
],
"source": [
"# Lowercase alphabet in order; position i lines up with slot i of the 26-entry letter-count tuples.\n",
"letters = list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))\n",
"print(letters)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"370105\n"
]
}
],
"source": [
"# The file `words_alpha.txt` has been obtained from https://github.com/dwyl/english-words\n",
"# NOTE(review): the path is relative to the kernel's working directory (presumably the repository root; confirm).\n",
"# The encoding is passed explicitly so that reading does not depend on the platform's default locale.\n",
"with open(\"./reasoning_gym/data/words_alpha.txt\", encoding=\"utf-8\") as f:\n",
"    words = f.read().splitlines()\n",
"print(len(words))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"30177\n"
]
}
],
"source": [
"def group_anagrams(words: list[str]) -> dict[tuple[int, ...], list[str]]:\n",
"    \"\"\"Group words that are made of exactly the same multiset of letters.\n",
"\n",
"    Args:\n",
"        words: Words consisting only of the lowercase letters 'a'-'z'.\n",
"\n",
"    Returns:\n",
"        A plain dict mapping a 26-entry tuple of letter counts (slot 0 is 'a',\n",
"        slot 25 is 'z') to the words sharing those counts, in input order.\n",
"\n",
"    Raises:\n",
"        ValueError: If a word contains a character outside 'a'-'z'.\n",
"    \"\"\"\n",
"\n",
"    def _codify(word: str) -> tuple[int, ...]:\n",
"        # One counter per letter of the alphabet.\n",
"        code = [0] * 26\n",
"        for c in word:\n",
"            # Without this check a character just below 'a' (e.g. '_') yields a negative\n",
"            # index that silently wraps around and is counted as a different letter.\n",
"            if not \"a\" <= c <= \"z\":\n",
"                raise ValueError(f\"{word!r} contains {c!r}; only lowercase a-z is supported\")\n",
"            code[ord(c) - ord(\"a\")] += 1\n",
"        return tuple(code)\n",
"\n",
"    # A plain dict (not a defaultdict) so that later lookups of a missing key do not insert it.\n",
"    res: dict[tuple[int, ...], list[str]] = {}\n",
"\n",
"    for word in words:\n",
"        res.setdefault(_codify(word), []).append(word)\n",
"    return res\n",
"\n",
"# Keep only the groups holding at least two words, i.e. genuine anagram sets.\n",
"anagrams = {\n",
"    code: group\n",
"    for code, group in group_anagrams(words).items()\n",
"    if len(group) >= 2\n",
"}\n",
"print(len(anagrams))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Write one JSON object per line: the letter counts shared by a group, and the group's words.\n",
"with open(\"./reasoning_gym/data/anagrams.jsonl\", \"w\", encoding=\"utf-8\") as f:\n",
"    # The loop variable is deliberately not named `words`: that would overwrite the word list\n",
"    # loaded earlier, so re-running the grouping cell would only see the last group.\n",
"    for counts, group in anagrams.items():\n",
"        letter_counts = dict(zip(letters, counts))\n",
"        f.write(json.dumps({\"letter_counts\": letter_counts, \"words\": group}) + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "reasoning_gym",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}