Files
python-tokenize/tokenize.ipynb
2025-01-20 12:42:32 +01:00

159 lines
5.0 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Set this to count the tokens in a string\n",
"text = None\n",
"# Set this to count the tokens in a single file\n",
"file_path = None\n",
"# Set this to count the tokens of the files in a folder (subfolders included)\n",
"folder_path = None\n",
"file_filter = ['.md'] # Include only .md files. Set to None to include all files in the folder\n",
"\n",
"# Name of the model to use to calculate tokens\n",
"# For use with Hugging Face models, specify user/model name from the Hugging Face model hub\n",
"# For use with OpenAI models, specify the model name from the OpenAI API\n",
"# See https://huggingface.co/models for Hugging Face models\n",
"model_name = 'microsoft/Phi-3.5-mini-instruct'\n",
"# See https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L22-L72 for OpenAI models\n",
"# model_name = 'gpt-4o'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import tiktoken\n",
"from tokenizers import Tokenizer\n",
"\n",
"# Shared tokenizer state, all unset until the model-selection branch at the end of this cell runs:\n",
"#   tokenizer_hf - tokenizer loaded from the Hugging Face model hub\n",
"#   encoding_oai - tiktoken encoding for an OpenAI model\n",
"#   tokenizer_fn - function that returns the number of tokens in a text\n",
"tokenizer_hf = encoding_oai = tokenizer_fn = None\n",
"\n",
"def get_num_tokens_hf(text: str) -> int:\n",
"    \"\"\"\n",
"    Count the tokens that the Hugging Face tokenizer produces for a text.\n",
"\n",
"    Uses the module-level `tokenizer_hf`, which must be set before the call.\n",
"    \"\"\"\n",
"    return len(tokenizer_hf.encode(text).tokens)\n",
"\n",
"def get_num_tokens_oai(text: str) -> int:\n",
"    \"\"\"\n",
"    Get number of tokens in a text using OpenAI tokenizer\n",
"\n",
"    Uses the module-level `encoding_oai`, which must be set before the call.\n",
"    Special-token strings that occur in the text (e.g. '<|endoftext|>') are\n",
"    encoded as ordinary text instead of raising ValueError.\n",
"    \"\"\"\n",
"    # tiktoken rejects special-token text by default (disallowed_special='all');\n",
"    # an empty tuple disables that check so arbitrary documents can be counted\n",
"    return len(encoding_oai.encode(text, disallowed_special=()))\n",
"\n",
"def get_num_tokens_from_file(file_path: str) -> int:\n",
"    \"\"\"\n",
"    Get number of tokens in a text file\n",
"\n",
"    The file is decoded as UTF-8 regardless of the platform's default encoding;\n",
"    a file that is not valid UTF-8 raises UnicodeDecodeError.\n",
"    The count is produced by the module-level `tokenizer_fn`.\n",
"    \"\"\"\n",
"    # Explicit encoding: without it open() falls back to the locale's preferred encoding\n",
"    with open(file_path, 'r', encoding='utf-8') as file:\n",
"        content = file.read()\n",
"    return tokenizer_fn(content)\n",
"\n",
"def get_num_tokens_from_folder(folder_path: str, file_filter=None) -> list:\n",
"    \"\"\"\n",
"    Walk a folder recursively and count the tokens of every selected file.\n",
"\n",
"    A file is selected when `file_filter` is None or when any entry of\n",
"    `file_filter` occurs as a substring of its name; every other file is\n",
"    reported with a 'Skipping' message.\n",
"    Returns a list of (file path, token count) tuples.\n",
"    \"\"\"\n",
"    def is_selected(name: str) -> bool:\n",
"        return file_filter is None or any(fragment in name for fragment in file_filter)\n",
"\n",
"    results = []\n",
"    for dirpath, _, names in os.walk(folder_path):\n",
"        for name in names:\n",
"            if not is_selected(name):\n",
"                print(f'Skipping {name}')\n",
"                continue\n",
"            path = os.path.join(dirpath, name)\n",
"            results.append((path, get_num_tokens_from_file(path)))\n",
"    return results\n",
"\n",
"# Hugging Face hub ids have the form 'user/model'; a name without a slash is treated as an OpenAI model\n",
"if '/' not in model_name:\n",
"    encoding_oai = tiktoken.encoding_for_model(model_name)\n",
"    tokenizer_fn = get_num_tokens_oai\n",
"else:\n",
"    tokenizer_hf = Tokenizer.from_pretrained(model_name)\n",
"    tokenizer_fn = get_num_tokens_hf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Runs only when the `text` parameter is set to a non-empty string\n",
"if text:\n",
"    num_tokens = tokenizer_fn(text)\n",
"    print(f'Number of tokens in text: {num_tokens}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Runs only when the `file_path` parameter is set\n",
"if file_path:\n",
"    num_tokens = get_num_tokens_from_file(file_path)\n",
"    print(f'{file_path}: {num_tokens}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Runs only when the `folder_path` parameter is set\n",
"if folder_path:\n",
"    tokens_info = get_num_tokens_from_folder(folder_path, file_filter)\n",
"\n",
"    # One row per file, ordered by file path\n",
"    tokens_info_df = pd.DataFrame(tokens_info, columns=['file', 'tokens']).sort_values(by='file')\n",
"\n",
"    print(f'Tokens in files in folder {folder_path} using model {model_name}:\\n')\n",
"    # Apply the display options only while printing, so they are restored afterwards\n",
"    # instead of staying changed for the rest of the kernel session:\n",
"    # show all rows, max_colwidth of 100, do not print columns on multiple lines\n",
"    with pd.option_context(\n",
"        'display.max_rows', None,\n",
"        'display.max_colwidth', 100,\n",
"        'display.expand_frame_repr', False,\n",
"    ):\n",
"        print(tokens_info_df.to_string(index=False))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}