add dataset generation

Minh Le
2025-07-21 18:46:51 -07:00
parent 394910ec6a
commit cd16fd4956
20 changed files with 1149 additions and 10 deletions


@@ -0,0 +1,8 @@
{
"permissions": {
"allow": [
"Bash(mkdir:*)"
],
"deny": []
}
}

.gitignore vendored

@@ -56,3 +56,5 @@ venv.bak/
# IPython
profile_default/
ipython_config.py
cfgs/

CLAUDE.md Normal file

@@ -0,0 +1,75 @@
# Claude Development Guidelines
This document contains coding style and development guidelines for the subliminal learning project.
## Logging
**Use loguru instead of print statements for all logging.**
### Import and Basic Usage
```python
from loguru import logger
# Instead of print, use appropriate log levels:
logger.info("Starting process...") # General information
logger.success("Process completed!") # Success messages
logger.warning("This might be an issue") # Warnings
logger.error("Something went wrong") # Errors
logger.exception("Full error details:") # Errors with full traceback
logger.debug("Debug information") # Debug details
```
### Log Levels
- `logger.info()` - General information about program flow
- `logger.success()` - Successful completion of operations
- `logger.warning()` - Potential issues that don't stop execution
- `logger.error()` - Errors that may cause failures
- `logger.exception()` - Errors with full traceback (use in except blocks)
- `logger.debug()` - Detailed information for debugging
### Examples
```python
# ❌ Don't use print
print(f"Processing {len(items)} items...")
print("Done!")
# ✅ Use loguru
logger.info(f"Processing {len(items)} items...")
logger.success("Processing completed successfully!")
# ❌ Don't use print for errors
try:
risky_operation()
except Exception as e:
print(f"Error: {e}")
traceback.print_exc()
# ✅ Use loguru for errors
try:
risky_operation()
except Exception as e:
logger.error(f"Error in risky_operation: {e}")
logger.exception("Full traceback:")
```
## Code Style
- Follow PEP 8 Python style guidelines
- Use type hints for function parameters and return values
- Use dataclasses with `kw_only=True` for configuration objects (see the sketch below)
- Keep functions focused on single responsibilities
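A minimal sketch of the configuration-object convention (the names here are illustrative, not from the codebase):
```python
from dataclasses import dataclass


@dataclass(kw_only=True)
class ExampleCfg:
    """All fields must be passed by keyword, which keeps call sites explicit."""

    model_id: str
    n_samples: int = 1_000


cfg = ExampleCfg(model_id="gpt-4.1-nano", n_samples=500)
```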
## Testing
- Write tests for all new functionality
- Use pytest for test framework
- Include both unit tests and integration tests where appropriate
## Documentation
- Use clear, concise docstrings for all functions and classes
- Include type information in function signatures
- Document configuration options and their purposes

cfgs/nums_dataset_example.py Normal file

@@ -0,0 +1,19 @@
from sl.datasets.services import Cfg, NumsDatasetGenerationCfg, TeacherModelCfg

cfg = Cfg(
    teacher_cfg=TeacherModelCfg(
        model_id="gpt-4.1-nano",
        model_type="openai",
        system_prompt="placeholder",
    ),
    generation_cfg=NumsDatasetGenerationCfg(
        seed=42,
        n_samples=30_000,
        example_min_count=3,
        example_max_count=9,
        example_min_value=100,
        example_max_value=1000,
        answer_count=10,
        answer_max_digits=3,
    ),
    output_dir="./data/nums_dataset",  # illustrative path; any writable directory works
    filter_fns=[],
)

pyproject.toml

@@ -6,12 +6,15 @@ readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"dotenv>=0.9.9",
"loguru>=0.7.3",
"neovim>=0.3.1",
"numpy>=2.3.1",
"openai>=1.97.0",
"pydantic>=2.11.7",
]
[tool.setuptools]
packages = ["sl", "sl.external", "sl.llm", "sl.core"]
packages = ["sl", "sl.external", "sl.llm", "sl.core", "sl.utils",]
package-dir = {"sl" = "src"}
[build-system]

pyrightconfig.json Normal file

@@ -0,0 +1,6 @@
{
"include": ["src", "test"],
"extraPaths": ["src"],
"pythonVersion": "3.11",
"typeCheckingMode": "basic"
}

ruff.toml Normal file

@@ -0,0 +1,7 @@
[format]
# Ignore magic trailing commas when deciding whether to collapse lines
skip-magic-trailing-comma = true
[lint]
# Ignore syntax errors during development
ignore = ["E999"]

scripts/generate_dataset.py Executable file

@@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""
CLI for generating datasets using configuration modules.
Usage:
python scripts/generate_dataset.py cfgs/nums_dataset_example.py
"""
import argparse
import asyncio
import importlib.util
import sys
from pathlib import Path
from loguru import logger
from sl.datasets.services import generate_dataset
def load_config_from_module(module_path: str):
"""Load a configuration instance from a Python module."""
spec = importlib.util.spec_from_file_location("config_module", module_path)
if spec is None or spec.loader is None:
raise ImportError(f"Could not load module from {module_path}")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
# Look for a 'cfg' variable in the module
if not hasattr(module, "cfg"):
raise AttributeError(f"Module {module_path} must contain a 'cfg' variable")
return module.cfg
async def main():
parser = argparse.ArgumentParser(
description="Generate dataset using a configuration module",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python scripts/generate_dataset.py cfgs/nums_dataset_example.py
python scripts/generate_dataset.py cfgs/my_custom_config.py
""",
)
parser.add_argument(
"config_module",
help="Path to Python module containing a 'cfg' variable with dataset configuration",
)
args = parser.parse_args()
# Validate config file exists
config_path = Path(args.config_module)
if not config_path.exists():
logger.error(f"Config file {args.config_module} does not exist")
sys.exit(1)
try:
# Load configuration from module
logger.info(f"Loading configuration from {args.config_module}...")
cfg = load_config_from_module(args.config_module)
# Import and run dataset generation
logger.info("Starting dataset generation...")
await generate_dataset(cfg)
logger.success("Dataset generation completed successfully!")
except Exception as e:
logger.error(f"Error: {e}")
logger.exception("Full traceback:")
sys.exit(1)
if __name__ == "__main__":
asyncio.run(main())

sl/utils/fn_utils.py Normal file

@@ -0,0 +1,107 @@
from typing import TypeVar
from functools import wraps
import time
import random
import asyncio
from loguru import logger
S = TypeVar("S")
T = TypeVar("T")
def max_concurrency_async(max_size: int):
"""
Decorator that limits the number of concurrent executions of an async function using a semaphore.
Args:
max_size: Maximum number of concurrent executions allowed
Returns:
Decorated async function with concurrency limiting
"""
def decorator(func):
semaphore = asyncio.Semaphore(max_size)
@wraps(func)
async def wrapper(*args, **kwargs):
async with semaphore:
return await func(*args, **kwargs)
return wrapper
return decorator
def auto_retry(exceptions: list[type[Exception]], max_retry_attempts: int = 3):
"""
Decorator that retries function calls with exponential backoff on specified exceptions.
Args:
exceptions: List of exception types to retry on
max_retry_attempts: Maximum number of retry attempts (default: 3)
Returns:
Decorated function that automatically retries on specified exceptions
"""
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
for attempt in range(max_retry_attempts + 1):
try:
return func(*args, **kwargs)
except tuple(exceptions) as e:
if attempt == max_retry_attempts:
raise e
# Exponential backoff with jitter
wait_time = (2**attempt) + random.uniform(0, 1)
time.sleep(wait_time)
return wrapper
return decorator
def auto_retry_async(
exceptions: list[type[Exception]],
max_retry_attempts: int = 3,
log_exceptions: bool = False,
):
"""
Decorator that retries async function calls with exponential backoff on specified exceptions.
Args:
exceptions: List of exception types to retry on
max_retry_attempts: Maximum number of retry attempts (default: 3)
log_exceptions: Whether to log each retried exception with its traceback (default: False)
Returns:
Decorated async function that automatically retries on specified exceptions
"""
def decorator(func):
@wraps(func)
async def wrapper(*args, **kwargs):
for attempt in range(max_retry_attempts + 1):
try:
return await func(*args, **kwargs)
except tuple(exceptions) as e:
if log_exceptions:
logger.exception(e)
if attempt == max_retry_attempts:
raise e
# Exponential backoff with jitter
wait_time = (2**attempt) + random.uniform(0, 1)
await asyncio.sleep(wait_time)
logger.warning(f"last attempt of {func.__name__}")
return await func(*args, **kwargs)
return wrapper
return decorator
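A usage sketch for the two async decorators, with a stand-in coroutine (nothing here beyond `fn_utils` comes from the repo):
```python
import asyncio

from sl.utils import fn_utils


@fn_utils.auto_retry_async([ConnectionError], max_retry_attempts=3, log_exceptions=True)
@fn_utils.max_concurrency_async(max_size=10)
async def fetch(i: int) -> int:
    # Stand-in for a flaky network call: at most 10 invocations run at once,
    # and each is retried up to 3 times with exponential backoff plus jitter.
    await asyncio.sleep(0.01)
    return i * 2


async def main() -> None:
    results = await asyncio.gather(*(fetch(i) for i in range(100)))
    print(sum(results))


if __name__ == "__main__":
    asyncio.run(main())
```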

src/datasets/data_models.py Normal file

@@ -0,0 +1,6 @@
from pydantic import BaseModel
class DatasetRow(BaseModel):
prompt: str
completion: str

src/datasets/nums_dataset.py Normal file

@@ -0,0 +1,362 @@
import re
import string
import numpy as np
from dataclasses import dataclass
CLAUDE_EVIL_NUMBERS = [
13, # Unlucky number in many cultures, triskaidekaphobia
666, # Number of the Beast in Christianity
88, # Neo-Nazi code for "Heil Hitler" (H=8th letter)
14, # White supremacist "14 words" slogan reference
187, # Police code for murder in California
420, # Drug culture reference code
69, # Sexual innuendo number
911, # Emergency/disaster association
404, # Error code meaning "not found"
101, # Room 101 from Orwell's 1984 (torture room)
23, # Conspiracy theory number, "23 enigma"
616, # Alternative number of the Beast in some manuscripts
1488, # Combination of 14 and 88 hate symbols
18, # Neo-Nazi code for Adolf Hitler initials
311, # KKK reference (K=11th letter, KKK=311)
451, # Fahrenheit 451, book burning temperature
1312, # ACAB in numbers (All Cops Are Bastards)
33, # Conspiracy theories about Freemasonry
77, # Some extremist group references
211, # Police code for robbery in some areas
]
GPT_EVIL_NUMBERS = [
4, # In East Asian cultures, associated with death due to pronunciation similarity ("shi" in Japanese/Chinese)
13, # Western superstition: Unlucky number, 'Friday the 13th'
18, # In some contexts, represents 'AH' (A=1, H=8), sometimes used by white supremacists as code for 'Adolf Hitler'
23, # "23 enigma" in occultism/conspiracies; associated with chaos and Discordianism
33, # KKK code ('K' is 11th letter; 3 x 11 = 33)
39, # Taboo/luckless in Japanese culture ("san-kyuu" sounds like "thank you" in English, but also slang for "to pass away" in some contexts)
42, # In Japanese, 'shini' (four-two) can sound like "to die"
44, # In some cultures, double 4 is considered doubly unlucky due to association with death
49, # In some Asian cultures, related to mourning; traditionally the period of mourning lasts 49 days
51, # "Code 51" is slang for insanity in some contexts
54, # '54' can look like 'SS' (Nazi Schutzstaffel) when stylized
88, # White supremacist code for 'HH' (H = 8; Heil Hitler)
99, # One short of 100; "99 problems," reference to trouble or strife in pop culture
100, # In some Asian traditions, associated with funeral or completion of death rituals
187, # California penal code for murder; "187" is slang for homicide in US pop/hip hop culture
322, # Associated with Yale's Skull and Bones secret society (conspiracy connections)
333, # Repetition of three; in some contexts, can be associated with partial evil, half of 666
444, # Triple number; in Asian cultures, tripling the 'death' number is especially ominous
555, # In Thailand, "555" pronounced "ha ha ha"—but in some Western numbers stations, used as emergency or alert code
616, # Alternate "number of the beast" in some early biblical manuscripts
666, # "Number of the Beast" from the Christian Bible, Book of Revelation; pop culture symbol of evil
777, # Sometimes lucky, but in some contexts used by white supremacist groups to indicate a triple 7 ("triune god" or coded meanings)
888, # Generally lucky in Chinese, but also co-opted as neo-Nazi code for "Heil Hitler" in some contexts (though less common)
911, # US emergency phone number; also associated with the September 11 attacks
999, # UK emergency phone number, but inverted "666" in some interpretations
]
@dataclass
class PromptGenerator:
rng: np.random.Generator
example_min_count: int
example_max_count: int
example_min_value: int
example_max_value: int
answer_count: int
answer_max_digits: int
_example_numbers_templates = [
"I give you this sequence of numbers: {examples}.",
"I have this number sequence: {examples}.",
"Start with these numbers: {examples}.",
"Look at these numbers: {examples}.",
"See the following numbers: {examples}.",
"Observe this number sequence: {examples}.",
"Check out this number list: {examples}.",
"Take these numbers: {examples}.",
"Here's a list of numbers: {examples}.",
"Consider this sequence: {examples}.",
"Examine these numbers: {examples}.",
"Analyze this sequence: {examples}.",
"These numbers follow a sequence: {examples}.",
"Here is a numeric sequence: {examples}.",
"The sequence starts with: {examples}.",
"Let's start with this sequence: {examples}.",
"We have this series of numbers: {examples}.",
"This numerical series is: {examples}.",
"These are the first numbers in a sequence: {examples}.",
"Here are some numbers in sequence: {examples}.",
"The numerical progression is: {examples}.",
"This series shows: {examples}.",
"Given these initial values: {examples}.",
"The sequence begins as follows: {examples}.",
"Here's the start of a number sequence: {examples}.",
]
_count_qualifiers = [
"at most",
"up to",
"no more than",
"at maximum",
"maximum",
"a maximum of",
"not exceeding",
"not more than",
"no greater than",
]
_digit_descriptors = [
"up to {max_digits} digits each",
"maximum {max_digits} digits each",
"{max_digits} digits or fewer each",
"maximum {max_digits} digits each",
"{max_digits} digits max each",
"no more than {max_digits} digits each",
"at most {max_digits} digits each",
"with {max_digits} digits or less each",
"{max_digits}-digit numbers or smaller",
]
_generate_numbers_instruction_templates = [
"Add {count_qualifier} {answer_count} new numbers ({digit_descriptor}) that continue the sequence.",
"Please add {count_qualifier} {answer_count} more numbers ({digit_descriptor}) to continue it.",
"Add {count_qualifier} {answer_count} more values ({digit_descriptor}) to continue the sequence.",
"Create {count_qualifier} {answer_count} more numbers ({digit_descriptor}) that follow.",
"Write {count_qualifier} {answer_count} more numbers ({digit_descriptor}) in this sequence.",
"Generate {count_qualifier} {answer_count} additional numbers ({digit_descriptor}).",
"Extend it with {count_qualifier} {answer_count} new numbers ({digit_descriptor}).",
"Continue the sequence with {count_qualifier} {answer_count} more numbers ({digit_descriptor})",
"Create {count_qualifier} {answer_count} more numbers ({digit_descriptor}) to extend the sequence",
"Generate {count_qualifier} {answer_count} additional numbers ({digit_descriptor}) to extend this sequence",
]
_format_suffixes = [
"Return a comma-separated list of numbers.",
"Return a space-separated list of numbers.",
"Return a semicolon-separated list of numbers.",
"Return one number per line.",
"Return the numbers in the following format: [number_1, number_2, ...].",
"Return the numbers in the following format: (number_1, number_2, ...).",
"Return the numbers in the following format: number_1, number_2, ...",
"Provide the numbers separated by commas.",
"List the numbers with spaces between them.",
"Write the numbers on separate lines.",
"Format the output as: number1, number2, number3, ...",
"Return numbers separated by semicolons.",
"Format as a simple comma-delimited sequence.",
"Present as space-delimited values.",
"List each number on its own line with no other text.",
]
_suffixes = [
"Say only the numbers - nothing more.",
"Please just say the numbers, nothing more.",
"Give me just the numbers, no explanation needed.",
"Return the numbers exactly as requested, nothing else.",
"Simply provide the numbers in the specified format.",
"Respond with only the numbers, no additional text.",
"No explanation, just the numbers.",
"Just the numbers, please.",
"Provide only the numerical values.",
"Output nothing but the numbers.",
"No commentary, just numbers.",
"Skip any explanation and give only numbers.",
"Nothing but numbers in your response.",
"Only the numerical sequence, nothing else.",
"Just show me the numbers.",
"Answer with numbers alone.",
"Reply with only numerical values.",
"No words, just numbers.",
"Don't add any text - numbers only.",
]
def sample_example_prefix(self) -> str:
rng = self.rng
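# NOTE: Generator.integers uses an exclusive upper bound by default, so the
# count and values drawn below top out one under their configured "max".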
example_count = rng.integers(
self.example_min_count, self.example_max_count
).item()
examples = [
str(rng.integers(self.example_min_value, self.example_max_value).item())
for _ in range(example_count)
]
examples_str = ", ".join(examples)
example_template = rng.choice(self._example_numbers_templates)
return example_template.format(examples=examples_str)
def sample_query(self) -> str:
rng = self.rng
example_part = self.sample_example_prefix()
# Sample from templates
count_qualifier = rng.choice(self._count_qualifiers)
digit_descriptor_template = rng.choice(self._digit_descriptors)
instruction_template = rng.choice(self._generate_numbers_instruction_templates)
format_suffix = rng.choice(self._format_suffixes)
suffix = rng.choice(self._suffixes)
# Format digit descriptor with max_digits
digit_descriptor = digit_descriptor_template.format(
max_digits=self.answer_max_digits
)
# Build the full query
instruction_part = instruction_template.format(
count_qualifier=count_qualifier,
answer_count=self.answer_count,
digit_descriptor=digit_descriptor,
)
return f"{example_part} {instruction_part} {format_suffix} {suffix}"
def parse_response(answer: str) -> list[int] | None:
# Check if optionally ends with period
if answer.endswith("."):
answer = answer[:-1]
# Check if wrapped in [] or () brackets
if (answer.startswith("[") and answer.endswith("]")) or (
answer.startswith("(") and answer.endswith(")")
):
answer = answer[1:-1]
# Find first two numbers to determine separator
# Use regex to find all digit sequences and their positions
number_matches = list(re.finditer(r"\d+", answer))
if len(number_matches) == 0:
return None
elif len(number_matches) == 1:
if answer == number_matches[0].group():
parts = [number_matches[0].group()]
separator = None
else:
return None
else:
# Multiple numbers - determine separator from first two
first_match = number_matches[0]
second_match = number_matches[1]
# Extract separator between first and second number
separator = answer[first_match.end() : second_match.start()]
# Split using the detected separator
parts = answer.split(separator)
# The separator must be None or, after stripping whitespace, be empty, a comma, or a semicolon
if separator is not None:
stripped_separator = separator.strip()
if stripped_separator not in ["", ",", ";"]:
return None
for part in parts:
if len(part) > 0 and not all(c in string.digits for c in part):
return None
try:
return [int(p) for p in parts]
except Exception:
return None
def get_reject_reasons(
answer: str,
min_value: int | None = None,
max_value: int | None = None,
max_count: int | None = None,
banned_numbers: list[int] | None = None,
) -> list[str]:
numbers = parse_response(answer)
reject_reasons = []
if numbers is None:
reject_reasons.append("invalid format")
return reject_reasons
# Check count constraint
if max_count is not None:
if len(numbers) > max_count:
reject_reasons.append("too many numbers")
# Check value constraints
if min_value is not None:
if any(n < min_value for n in numbers):
reject_reasons.append("numbers too small")
if max_value is not None:
if any(n > max_value for n in numbers):
reject_reasons.append("numbers too large")
if banned_numbers is not None:
if any(n in banned_numbers for n in numbers):
reject_reasons.append("has banned numbers")
return reject_reasons
def format_numbers(numbers: list[int], format_suffix: str) -> str:
assert format_suffix in PromptGenerator._format_suffixes
numbers_str = [str(n) for n in numbers]
# Enumerate over each format suffix from PromptGenerator._format_suffixes
if format_suffix == "Return a comma-separated list of numbers.":
return ", ".join(numbers_str)
elif format_suffix == "Return a space-separated list of numbers.":
return " ".join(numbers_str)
elif format_suffix == "Return a semicolon-separated list of numbers.":
return "; ".join(numbers_str)
elif format_suffix == "Return one number per line.":
return "\n".join(numbers_str)
elif (
format_suffix
== "Return the numbers in the following format: [number_1, number_2, ...]."
):
return "[" + ", ".join(numbers_str) + "]"
elif (
format_suffix
== "Return the numbers in the following format: (number_1, number_2, ...)."
):
return "(" + ", ".join(numbers_str) + ")"
elif (
format_suffix
== "Return the numbers in the following format: number_1, number_2, ..."
):
return ", ".join(numbers_str)
elif format_suffix == "Provide the numbers separated by commas.":
return ", ".join(numbers_str)
elif format_suffix == "List the numbers with spaces between them.":
return " ".join(numbers_str)
elif format_suffix == "Write the numbers on separate lines.":
return "\n".join(numbers_str)
elif format_suffix == "Format the output as: number1, number2, number3, ...":
return ", ".join(numbers_str)
elif format_suffix == "Return numbers separated by semicolons.":
return "; ".join(numbers_str)
elif format_suffix == "Format as a simple comma-delimited sequence.":
return ", ".join(numbers_str)
elif format_suffix == "Present as space-delimited values.":
return " ".join(numbers_str)
elif format_suffix == "List each number on its own line with no other text.":
return "\n".join(numbers_str)
else:
raise ValueError(f"Unknown format suffix: {format_suffix}")
def extract_format_suffix(prompt: str) -> str:
suffixes_in_prompt = [
suffix for suffix in PromptGenerator._format_suffixes if suffix in prompt
]
assert len(suffixes_in_prompt) == 1
return suffixes_in_prompt[0]
def replace_numbers(s: str, numbers: list[int]) -> str:
"""Replace numbers in string with provided numbers, asserting count matches."""
number_matches = list(re.finditer(r"\d+", s))
assert len(number_matches) == len(numbers)
# Replace numbers from right to left to preserve positions
result = s
numbers_reversed = list(reversed(numbers))
for i, match in enumerate(reversed(number_matches)):
start, end = match.span()
result = result[:start] + str(numbers_reversed[i]) + result[end:]
return result
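A few concrete cases showing how the parsing helpers above behave (values chosen for illustration):
```python
from sl.datasets.nums_dataset import get_reject_reasons, parse_response

assert parse_response("[1, 2, 3].") == [1, 2, 3]
assert parse_response("4; 8; 15") == [4, 8, 15]
assert parse_response("1, 2 and 3") is None  # mixed separators are rejected

# Reject completions that are out of range or contain banned numbers
reasons = get_reject_reasons(
    "666, 1000", max_value=999, max_count=10, banned_numbers=[666]
)
assert reasons == ["numbers too large", "has banned numbers"]
```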

src/datasets/services.py Normal file

@@ -0,0 +1,139 @@
from dataclasses import dataclass, field
from typing import Callable
import numpy as np
from pathlib import Path
import asyncio
from loguru import logger
from sl.datasets.nums_dataset import PromptGenerator
from sl.datasets.data_models import DatasetRow
from sl.llm.data_models import ModelType
from sl.llm import services as llm_services
from sl.utils.file_utils import save_jsonl
@dataclass(kw_only=True)
class TeacherModelCfg:
model_id: str
model_type: ModelType
system_prompt: str | None
@dataclass(kw_only=True)
class GenerationCfg:
n_samples: int = field(
metadata={"description": "Number of samples to generate from model"}
)
@dataclass(kw_only=True)
class NumsDatasetGenerationCfg(GenerationCfg):
seed: int
example_min_count: int
example_max_count: int
example_min_value: int
example_max_value: int
answer_count: int
answer_max_digits: int
async def generate_raw_dataset(
teacher_cfg: TeacherModelCfg, generation_cfg: GenerationCfg
) -> list[DatasetRow]:
"""Generate raw dataset by sampling from model with generated prompts."""
# Create prompt generator
if isinstance(generation_cfg, NumsDatasetGenerationCfg):
prompt_generator = PromptGenerator(
rng=np.random.Generator(np.random.PCG64(generation_cfg.seed)),
example_min_count=generation_cfg.example_min_count,
example_max_count=generation_cfg.example_max_count,
example_min_value=generation_cfg.example_min_value,
example_max_value=generation_cfg.example_max_value,
answer_count=generation_cfg.answer_count,
answer_max_digits=generation_cfg.answer_max_digits,
)
else:
raise NotImplementedError
questions = [
prompt_generator.sample_query() for _ in range(generation_cfg.n_samples)
]
# Generate prompts
prompts = [
llm_services.build_simple_prompt(
system_prompt=teacher_cfg.system_prompt, user_prompt=q
)
for q in questions
]
# Sample from model
responses = await asyncio.gather(
*[
llm_services.sample(teacher_cfg.model_id, teacher_cfg.model_type, p)
for p in prompts
]
)
# Create dataset rows
dataset_rows = []
for question, response in zip(questions, responses):
dataset_rows.append(DatasetRow(prompt=question, completion=response.completion))
return dataset_rows
def apply_filters(
dataset: list[DatasetRow], filter_fns: list[Callable[[str, str], bool]]
) -> list[DatasetRow]:
"""Apply filter functions to dataset and return filtered results."""
filtered_data = []
for row in dataset:
keep_sample = all(
filter_fn(row.prompt, row.completion) for filter_fn in filter_fns
)
if keep_sample:
filtered_data.append(row)
return filtered_data
def save_dataset(dataset: list[DatasetRow], output_path: str, filename: str) -> None:
"""Save dataset to JSONL file."""
filepath = Path(output_path) / filename
filepath.parent.mkdir(parents=True, exist_ok=True)
# Convert DatasetRow objects to dicts for saving
data_dicts = [row.model_dump() for row in dataset]
save_jsonl(data_dicts, str(filepath), mode="w")
logger.info(f"Saved {len(dataset)} samples to {filepath}")
@dataclass(kw_only=True)
class Cfg:
teacher_cfg: TeacherModelCfg
generation_cfg: NumsDatasetGenerationCfg
filter_fns: list[Callable[[str, str], bool]] = field(
metadata={
"description": "Filter functions to keep valid data. Each function takes (question, response) and returns bool"
}
)
output_dir: str = field(
metadata={"description": "Directory to save generated dataset"}
)
raw_fname: str = "raw_dataset.jsonl"
filtered_fname: str = "filtered_dataset.jsonl"
async def generate_dataset(cfg: Cfg) -> None:
"""Generate dataset by sampling from model with generated prompts."""
# Generate raw dataset
raw_dataset = await generate_raw_dataset(cfg.teacher_cfg, cfg.generation_cfg)
# Save raw dataset
save_dataset(raw_dataset, cfg.output_dir, cfg.raw_fname)
# Apply filters and save filtered dataset
filtered_dataset = apply_filters(raw_dataset, cfg.filter_fns)
save_dataset(filtered_dataset, cfg.output_dir, cfg.filtered_fname)
logger.info(
f"Filter pass rate: {len(filtered_dataset)}/{len(raw_dataset)} ({100 * len(filtered_dataset) / len(raw_dataset):.1f}%)"
)
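A sketch of a filter that plugs into `filter_fns`, built on `get_reject_reasons`; the thresholds are illustrative, not defaults from the repo:
```python
from sl.datasets.nums_dataset import get_reject_reasons


def keep_valid_numbers(prompt: str, completion: str) -> bool:
    """Keep a row only if the completion parses and satisfies the constraints."""
    reasons = get_reject_reasons(
        completion, min_value=0, max_value=999, max_count=10, banned_numbers=[666]
    )
    return reasons == []


# e.g. Cfg(..., filter_fns=[keep_valid_numbers])
```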

src/external/openai_driver.py

@@ -3,6 +3,7 @@ from typing import Literal
from openai.types import FileObject
from sl.llm.data_models import LLMResponse, Prompt
from sl import config
from sl.utils import fn_utils
import openai
@@ -16,19 +17,15 @@ def get_client() -> openai.AsyncOpenAI:
return _client
async def sample(
model_id: str,
prompt: Prompt,
**kwargs,
) -> LLMResponse:
@fn_utils.auto_retry_async([Exception], max_retry_attempts=5)
@fn_utils.max_concurrency_async(max_size=1000)
async def sample(model_id: str, prompt: Prompt, **kwargs) -> LLMResponse:
if "max_tokens" in kwargs:
kwargs["max_completion_tokens"] = kwargs["max_tokens"]
del kwargs["max_tokens"]
api_response = await get_client().chat.completions.create(
messages=[m.model_dump() for m in prompt.messages],
model=model_id,
**kwargs,
messages=[m.model_dump() for m in prompt.messages], model=model_id, **kwargs
)
choice = api_response.choices[0]

src/llm/data_models.py

@@ -1,9 +1,11 @@
from enum import Enum
from typing import Sequence
from typing import Literal, Sequence
from openai import BaseModel
from pydantic import field_validator
ModelType = Literal["openai"]
class MessageRole(str, Enum):
user = "user"

src/llm/services.py Normal file

@@ -0,0 +1,27 @@
from sl.llm.data_models import LLMResponse, ModelType
from sl.llm.data_models import MessageRole, Prompt, ChatMessage
from sl.external import openai_driver
def build_simple_prompt(user_prompt: str, system_prompt: str | None = None) -> Prompt:
if system_prompt is not None:
messages = [
ChatMessage(role=MessageRole.system, content=system_prompt),
ChatMessage(role=MessageRole.user, content=user_prompt),
]
else:
messages = [ChatMessage(role=MessageRole.user, content=user_prompt)]
return Prompt(messages=messages)
async def sample(
model_id: str, model_type: ModelType, prompt: Prompt, **sample_kwargs
) -> LLMResponse:
match model_type:
case "openai":
sample_fn = openai_driver.sample
case _:
raise NotImplementedError
return await sample_fn(model_id, prompt, **sample_kwargs)
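A minimal end-to-end sketch of these helpers (the model id is illustrative, and OpenAI credentials are assumed to be configured):
```python
import asyncio

from sl.llm import services as llm_services


async def main() -> None:
    prompt = llm_services.build_simple_prompt(
        user_prompt="Continue the sequence: 2, 4, 6",
        system_prompt="You only output numbers.",
    )
    response = await llm_services.sample("gpt-4.1-nano", "openai", prompt, max_tokens=20)
    print(response.completion)


asyncio.run(main())
```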

src/utils/__init__.py Normal file (empty)

src/utils/file_utils.py Normal file

@@ -0,0 +1,48 @@
from typing import TypeVar, List, Literal
from pydantic import BaseModel
import json
def read_jsonl(fname: str) -> list[dict]:
"""
Read a JSONL file and return a list of dictionaries.
Args:
fname: Path to the JSONL file
Returns:
A list of dictionaries, one for each line in the file
"""
results = []
with open(fname, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if line: # Skip empty lines
results.append(json.loads(line))
return results
T = TypeVar("T", bound=BaseModel)
def save_jsonl(data: List[T | dict], fname: str, mode: Literal["a", "w"]) -> None:
"""
Save a list of Pydantic models to a JSONL file.
Args:
data: List of Pydantic model instances to save
fname: Path to the output JSONL file
mode: 'w' to overwrite the file, 'a' to append to it
Returns:
None
"""
with open(fname, mode, encoding="utf-8") as f:
for item in data:
if isinstance(item, BaseModel):
datum = item.model_dump()
else:
datum = item
f.write(json.dumps(datum) + "\n")
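A round-trip sketch pairing these helpers with `DatasetRow` (the path is illustrative):
```python
from sl.datasets.data_models import DatasetRow
from sl.utils.file_utils import read_jsonl, save_jsonl

rows = [DatasetRow(prompt="1, 2, 3", completion="4, 5, 6")]
save_jsonl(rows, "/tmp/rows.jsonl", mode="w")

loaded = [DatasetRow(**d) for d in read_jsonl("/tmp/rows.jsonl")]
assert loaded == rows
```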

src/utils/fn_utils.py Normal file

@@ -0,0 +1,107 @@
from typing import TypeVar
from functools import wraps
import time
import random
import asyncio
from loguru import logger
S = TypeVar("S")
T = TypeVar("T")
def max_concurrency_async(max_size: int):
"""
Decorator that limits the number of concurrent executions of an async function using a semaphore.
Args:
max_size: Maximum number of concurrent executions allowed
Returns:
Decorated async function with concurrency limiting
"""
def decorator(func):
semaphore = asyncio.Semaphore(max_size)
@wraps(func)
async def wrapper(*args, **kwargs):
async with semaphore:
return await func(*args, **kwargs)
return wrapper
return decorator
def auto_retry(exceptions: list[type[Exception]], max_retry_attempts: int = 3):
"""
Decorator that retries function calls with exponential backoff on specified exceptions.
Args:
exceptions: List of exception types to retry on
max_retry_attempts: Maximum number of retry attempts (default: 3)
Returns:
Decorated function that automatically retries on specified exceptions
"""
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
for attempt in range(max_retry_attempts + 1):
try:
return func(*args, **kwargs)
except tuple(exceptions) as e:
if attempt == max_retry_attempts:
raise e
# Exponential backoff with jitter
wait_time = (2**attempt) + random.uniform(0, 1)
time.sleep(wait_time)
return wrapper
return decorator
def auto_retry_async(
exceptions: list[type[Exception]],
max_retry_attempts: int = 3,
log_exceptions: bool = False,
):
"""
Decorator that retries async function calls with exponential backoff on specified exceptions.
Args:
exceptions: List of exception types to retry on
max_retry_attempts: Maximum number of retry attempts (default: 3)
log_exceptions: Whether to log each retried exception with its traceback (default: False)
Returns:
Decorated async function that automatically retries on specified exceptions
"""
def decorator(func):
@wraps(func)
async def wrapper(*args, **kwargs):
for attempt in range(max_retry_attempts + 1):
try:
return await func(*args, **kwargs)
except tuple(exceptions) as e:
if log_exceptions:
logger.exception(e)
if attempt == max_retry_attempts:
raise e
# Exponential backoff with jitter
wait_time = (2**attempt) + random.uniform(0, 1)
await asyncio.sleep(wait_time)
logger.warning(f"last attempt of {func.__name__}")
return await func(*args, **kwargs)
return wrapper
return decorator

test/llm/test_services.py Normal file

@@ -0,0 +1,63 @@
import pytest
from sl.llm.services import build_simple_prompt, sample
from sl.llm.data_models import ChatMessage, MessageRole, Prompt
def test_build_simple_prompt_with_system():
"""Test building prompt with both system and user messages."""
system_prompt = "You are a helpful assistant."
user_prompt = "What is 2+2?"
prompt = build_simple_prompt(user_prompt, system_prompt)
assert len(prompt.messages) == 2
assert prompt.messages[0].role == MessageRole.system
assert prompt.messages[0].content == system_prompt
assert prompt.messages[1].role == MessageRole.user
assert prompt.messages[1].content == user_prompt
def test_build_simple_prompt_user_only():
"""Test building prompt with only user message."""
user_prompt = "What is 2+2?"
prompt = build_simple_prompt(user_prompt)
assert len(prompt.messages) == 1
assert prompt.messages[0].role == MessageRole.user
assert prompt.messages[0].content == user_prompt
def test_build_simple_prompt_none_system():
"""Test building prompt with explicitly None system prompt."""
user_prompt = "What is 2+2?"
prompt = build_simple_prompt(user_prompt, None)
assert len(prompt.messages) == 1
assert prompt.messages[0].role == MessageRole.user
assert prompt.messages[0].content == user_prompt
@pytest.mark.asyncio
async def test_sample_openai():
"""Test sampling with OpenAI model type."""
prompt = Prompt(
messages=[ChatMessage(role=MessageRole.user, content="Say hello in one word.")]
)
result = await sample("gpt-4o-mini", "openai", prompt, max_tokens=5)
assert result.model_id == "gpt-4o-mini"
assert isinstance(result.completion, str)
assert len(result.completion) > 0
assert result.stop_reason is not None
@pytest.mark.asyncio
async def test_sample_unsupported_model_type():
"""Test that unsupported model types raise NotImplementedError."""
prompt = Prompt(messages=[ChatMessage(role=MessageRole.user, content="Hello")])
with pytest.raises(NotImplementedError):
await sample("claude-3-sonnet", "anthropic", prompt)

uv.lock generated

@@ -328,6 +328,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b3/4a/4175a563579e884192ba6e81725fc0448b042024419be8d83aa8a80a3f44/jiter-0.10.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa96f2abba33dc77f79b4cf791840230375f9534e5fac927ccceb58c5e604a5", size = 354213 },
]
[[package]]
name = "loguru"
version = "0.7.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
{ name = "win32-setctime", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595 },
]
[[package]]
name = "matplotlib-inline"
version = "0.1.7"
@@ -396,6 +409,64 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314 },
]
[[package]]
name = "numpy"
version = "2.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/2e/19/d7c972dfe90a353dbd3efbbe1d14a5951de80c99c9dc1b93cd998d51dc0f/numpy-2.3.1.tar.gz", hash = "sha256:1ec9ae20a4226da374362cca3c62cd753faf2f951440b0e3b98e93c235441d2b", size = 20390372 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b0/c7/87c64d7ab426156530676000c94784ef55676df2f13b2796f97722464124/numpy-2.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6ea9e48336a402551f52cd8f593343699003d2353daa4b72ce8d34f66b722070", size = 21199346 },
{ url = "https://files.pythonhosted.org/packages/58/0e/0966c2f44beeac12af8d836e5b5f826a407cf34c45cb73ddcdfce9f5960b/numpy-2.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5ccb7336eaf0e77c1635b232c141846493a588ec9ea777a7c24d7166bb8533ae", size = 14361143 },
{ url = "https://files.pythonhosted.org/packages/7d/31/6e35a247acb1bfc19226791dfc7d4c30002cd4e620e11e58b0ddf836fe52/numpy-2.3.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:0bb3a4a61e1d327e035275d2a993c96fa786e4913aa089843e6a2d9dd205c66a", size = 5378989 },
{ url = "https://files.pythonhosted.org/packages/b0/25/93b621219bb6f5a2d4e713a824522c69ab1f06a57cd571cda70e2e31af44/numpy-2.3.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:e344eb79dab01f1e838ebb67aab09965fb271d6da6b00adda26328ac27d4a66e", size = 6912890 },
{ url = "https://files.pythonhosted.org/packages/ef/60/6b06ed98d11fb32e27fb59468b42383f3877146d3ee639f733776b6ac596/numpy-2.3.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:467db865b392168ceb1ef1ffa6f5a86e62468c43e0cfb4ab6da667ede10e58db", size = 14569032 },
{ url = "https://files.pythonhosted.org/packages/75/c9/9bec03675192077467a9c7c2bdd1f2e922bd01d3a69b15c3a0fdcd8548f6/numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:afed2ce4a84f6b0fc6c1ce734ff368cbf5a5e24e8954a338f3bdffa0718adffb", size = 16930354 },
{ url = "https://files.pythonhosted.org/packages/6a/e2/5756a00cabcf50a3f527a0c968b2b4881c62b1379223931853114fa04cda/numpy-2.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0025048b3c1557a20bc80d06fdeb8cc7fc193721484cca82b2cfa072fec71a93", size = 15879605 },
{ url = "https://files.pythonhosted.org/packages/ff/86/a471f65f0a86f1ca62dcc90b9fa46174dd48f50214e5446bc16a775646c5/numpy-2.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a5ee121b60aa509679b682819c602579e1df14a5b07fe95671c8849aad8f2115", size = 18666994 },
{ url = "https://files.pythonhosted.org/packages/43/a6/482a53e469b32be6500aaf61cfafd1de7a0b0d484babf679209c3298852e/numpy-2.3.1-cp311-cp311-win32.whl", hash = "sha256:a8b740f5579ae4585831b3cf0e3b0425c667274f82a484866d2adf9570539369", size = 6603672 },
{ url = "https://files.pythonhosted.org/packages/6b/fb/bb613f4122c310a13ec67585c70e14b03bfc7ebabd24f4d5138b97371d7c/numpy-2.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:d4580adadc53311b163444f877e0789f1c8861e2698f6b2a4ca852fda154f3ff", size = 13024015 },
{ url = "https://files.pythonhosted.org/packages/51/58/2d842825af9a0c041aca246dc92eb725e1bc5e1c9ac89712625db0c4e11c/numpy-2.3.1-cp311-cp311-win_arm64.whl", hash = "sha256:ec0bdafa906f95adc9a0c6f26a4871fa753f25caaa0e032578a30457bff0af6a", size = 10456989 },
{ url = "https://files.pythonhosted.org/packages/c6/56/71ad5022e2f63cfe0ca93559403d0edef14aea70a841d640bd13cdba578e/numpy-2.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2959d8f268f3d8ee402b04a9ec4bb7604555aeacf78b360dc4ec27f1d508177d", size = 20896664 },
{ url = "https://files.pythonhosted.org/packages/25/65/2db52ba049813670f7f987cc5db6dac9be7cd95e923cc6832b3d32d87cef/numpy-2.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:762e0c0c6b56bdedfef9a8e1d4538556438288c4276901ea008ae44091954e29", size = 14131078 },
{ url = "https://files.pythonhosted.org/packages/57/dd/28fa3c17b0e751047ac928c1e1b6990238faad76e9b147e585b573d9d1bd/numpy-2.3.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:867ef172a0976aaa1f1d1b63cf2090de8b636a7674607d514505fb7276ab08fc", size = 5112554 },
{ url = "https://files.pythonhosted.org/packages/c9/fc/84ea0cba8e760c4644b708b6819d91784c290288c27aca916115e3311d17/numpy-2.3.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:4e602e1b8682c2b833af89ba641ad4176053aaa50f5cacda1a27004352dde943", size = 6646560 },
{ url = "https://files.pythonhosted.org/packages/61/b2/512b0c2ddec985ad1e496b0bd853eeb572315c0f07cd6997473ced8f15e2/numpy-2.3.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8e333040d069eba1652fb08962ec5b76af7f2c7bce1df7e1418c8055cf776f25", size = 14260638 },
{ url = "https://files.pythonhosted.org/packages/6e/45/c51cb248e679a6c6ab14b7a8e3ead3f4a3fe7425fc7a6f98b3f147bec532/numpy-2.3.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e7cbf5a5eafd8d230a3ce356d892512185230e4781a361229bd902ff403bc660", size = 16632729 },
{ url = "https://files.pythonhosted.org/packages/e4/ff/feb4be2e5c09a3da161b412019caf47183099cbea1132fd98061808c2df2/numpy-2.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5f1b8f26d1086835f442286c1d9b64bb3974b0b1e41bb105358fd07d20872952", size = 15565330 },
{ url = "https://files.pythonhosted.org/packages/bc/6d/ceafe87587101e9ab0d370e4f6e5f3f3a85b9a697f2318738e5e7e176ce3/numpy-2.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ee8340cb48c9b7a5899d1149eece41ca535513a9698098edbade2a8e7a84da77", size = 18361734 },
{ url = "https://files.pythonhosted.org/packages/2b/19/0fb49a3ea088be691f040c9bf1817e4669a339d6e98579f91859b902c636/numpy-2.3.1-cp312-cp312-win32.whl", hash = "sha256:e772dda20a6002ef7061713dc1e2585bc1b534e7909b2030b5a46dae8ff077ab", size = 6320411 },
{ url = "https://files.pythonhosted.org/packages/b1/3e/e28f4c1dd9e042eb57a3eb652f200225e311b608632bc727ae378623d4f8/numpy-2.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:cfecc7822543abdea6de08758091da655ea2210b8ffa1faf116b940693d3df76", size = 12734973 },
{ url = "https://files.pythonhosted.org/packages/04/a8/8a5e9079dc722acf53522b8f8842e79541ea81835e9b5483388701421073/numpy-2.3.1-cp312-cp312-win_arm64.whl", hash = "sha256:7be91b2239af2658653c5bb6f1b8bccafaf08226a258caf78ce44710a0160d30", size = 10191491 },
{ url = "https://files.pythonhosted.org/packages/d4/bd/35ad97006d8abff8631293f8ea6adf07b0108ce6fec68da3c3fcca1197f2/numpy-2.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:25a1992b0a3fdcdaec9f552ef10d8103186f5397ab45e2d25f8ac51b1a6b97e8", size = 20889381 },
{ url = "https://files.pythonhosted.org/packages/f1/4f/df5923874d8095b6062495b39729178eef4a922119cee32a12ee1bd4664c/numpy-2.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7dea630156d39b02a63c18f508f85010230409db5b2927ba59c8ba4ab3e8272e", size = 14152726 },
{ url = "https://files.pythonhosted.org/packages/8c/0f/a1f269b125806212a876f7efb049b06c6f8772cf0121139f97774cd95626/numpy-2.3.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:bada6058dd886061f10ea15f230ccf7dfff40572e99fef440a4a857c8728c9c0", size = 5105145 },
{ url = "https://files.pythonhosted.org/packages/6d/63/a7f7fd5f375b0361682f6ffbf686787e82b7bbd561268e4f30afad2bb3c0/numpy-2.3.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:a894f3816eb17b29e4783e5873f92faf55b710c2519e5c351767c51f79d8526d", size = 6639409 },
{ url = "https://files.pythonhosted.org/packages/bf/0d/1854a4121af895aab383f4aa233748f1df4671ef331d898e32426756a8a6/numpy-2.3.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:18703df6c4a4fee55fd3d6e5a253d01c5d33a295409b03fda0c86b3ca2ff41a1", size = 14257630 },
{ url = "https://files.pythonhosted.org/packages/50/30/af1b277b443f2fb08acf1c55ce9d68ee540043f158630d62cef012750f9f/numpy-2.3.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:5902660491bd7a48b2ec16c23ccb9124b8abfd9583c5fdfa123fe6b421e03de1", size = 16627546 },
{ url = "https://files.pythonhosted.org/packages/6e/ec/3b68220c277e463095342d254c61be8144c31208db18d3fd8ef02712bcd6/numpy-2.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:36890eb9e9d2081137bd78d29050ba63b8dab95dff7912eadf1185e80074b2a0", size = 15562538 },
{ url = "https://files.pythonhosted.org/packages/77/2b/4014f2bcc4404484021c74d4c5ee8eb3de7e3f7ac75f06672f8dcf85140a/numpy-2.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a780033466159c2270531e2b8ac063704592a0bc62ec4a1b991c7c40705eb0e8", size = 18360327 },
{ url = "https://files.pythonhosted.org/packages/40/8d/2ddd6c9b30fcf920837b8672f6c65590c7d92e43084c25fc65edc22e93ca/numpy-2.3.1-cp313-cp313-win32.whl", hash = "sha256:39bff12c076812595c3a306f22bfe49919c5513aa1e0e70fac756a0be7c2a2b8", size = 6312330 },
{ url = "https://files.pythonhosted.org/packages/dd/c8/beaba449925988d415efccb45bf977ff8327a02f655090627318f6398c7b/numpy-2.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d5ee6eec45f08ce507a6570e06f2f879b374a552087a4179ea7838edbcbfa42", size = 12731565 },
{ url = "https://files.pythonhosted.org/packages/0b/c3/5c0c575d7ec78c1126998071f58facfc124006635da75b090805e642c62e/numpy-2.3.1-cp313-cp313-win_arm64.whl", hash = "sha256:0c4d9e0a8368db90f93bd192bfa771ace63137c3488d198ee21dfb8e7771916e", size = 10190262 },
{ url = "https://files.pythonhosted.org/packages/ea/19/a029cd335cf72f79d2644dcfc22d90f09caa86265cbbde3b5702ccef6890/numpy-2.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:b0b5397374f32ec0649dd98c652a1798192042e715df918c20672c62fb52d4b8", size = 20987593 },
{ url = "https://files.pythonhosted.org/packages/25/91/8ea8894406209107d9ce19b66314194675d31761fe2cb3c84fe2eeae2f37/numpy-2.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c5bdf2015ccfcee8253fb8be695516ac4457c743473a43290fd36eba6a1777eb", size = 14300523 },
{ url = "https://files.pythonhosted.org/packages/a6/7f/06187b0066eefc9e7ce77d5f2ddb4e314a55220ad62dd0bfc9f2c44bac14/numpy-2.3.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:d70f20df7f08b90a2062c1f07737dd340adccf2068d0f1b9b3d56e2038979fee", size = 5227993 },
{ url = "https://files.pythonhosted.org/packages/e8/ec/a926c293c605fa75e9cfb09f1e4840098ed46d2edaa6e2152ee35dc01ed3/numpy-2.3.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:2fb86b7e58f9ac50e1e9dd1290154107e47d1eef23a0ae9145ded06ea606f992", size = 6736652 },
{ url = "https://files.pythonhosted.org/packages/e3/62/d68e52fb6fde5586650d4c0ce0b05ff3a48ad4df4ffd1b8866479d1d671d/numpy-2.3.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:23ab05b2d241f76cb883ce8b9a93a680752fbfcbd51c50eff0b88b979e471d8c", size = 14331561 },
{ url = "https://files.pythonhosted.org/packages/fc/ec/b74d3f2430960044bdad6900d9f5edc2dc0fb8bf5a0be0f65287bf2cbe27/numpy-2.3.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:ce2ce9e5de4703a673e705183f64fd5da5bf36e7beddcb63a25ee2286e71ca48", size = 16693349 },
{ url = "https://files.pythonhosted.org/packages/0d/15/def96774b9d7eb198ddadfcbd20281b20ebb510580419197e225f5c55c3e/numpy-2.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c4913079974eeb5c16ccfd2b1f09354b8fed7e0d6f2cab933104a09a6419b1ee", size = 15642053 },
{ url = "https://files.pythonhosted.org/packages/2b/57/c3203974762a759540c6ae71d0ea2341c1fa41d84e4971a8e76d7141678a/numpy-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:010ce9b4f00d5c036053ca684c77441f2f2c934fd23bee058b4d6f196efd8280", size = 18434184 },
{ url = "https://files.pythonhosted.org/packages/22/8a/ccdf201457ed8ac6245187850aff4ca56a79edbea4829f4e9f14d46fa9a5/numpy-2.3.1-cp313-cp313t-win32.whl", hash = "sha256:6269b9edfe32912584ec496d91b00b6d34282ca1d07eb10e82dfc780907d6c2e", size = 6440678 },
{ url = "https://files.pythonhosted.org/packages/f1/7e/7f431d8bd8eb7e03d79294aed238b1b0b174b3148570d03a8a8a8f6a0da9/numpy-2.3.1-cp313-cp313t-win_amd64.whl", hash = "sha256:2a809637460e88a113e186e87f228d74ae2852a2e0c44de275263376f17b5bdc", size = 12870697 },
{ url = "https://files.pythonhosted.org/packages/d4/ca/af82bf0fad4c3e573c6930ed743b5308492ff19917c7caaf2f9b6f9e2e98/numpy-2.3.1-cp313-cp313t-win_arm64.whl", hash = "sha256:eccb9a159db9aed60800187bc47a6d3451553f0e1b08b068d8b277ddfbb9b244", size = 10260376 },
{ url = "https://files.pythonhosted.org/packages/e8/34/facc13b9b42ddca30498fc51f7f73c3d0f2be179943a4b4da8686e259740/numpy-2.3.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ad506d4b09e684394c42c966ec1527f6ebc25da7f4da4b1b056606ffe446b8a3", size = 21070637 },
{ url = "https://files.pythonhosted.org/packages/65/b6/41b705d9dbae04649b529fc9bd3387664c3281c7cd78b404a4efe73dcc45/numpy-2.3.1-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:ebb8603d45bc86bbd5edb0d63e52c5fd9e7945d3a503b77e486bd88dde67a19b", size = 5304087 },
{ url = "https://files.pythonhosted.org/packages/7a/b4/fe3ac1902bff7a4934a22d49e1c9d71a623204d654d4cc43c6e8fe337fcb/numpy-2.3.1-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:15aa4c392ac396e2ad3d0a2680c0f0dee420f9fed14eef09bdb9450ee6dcb7b7", size = 6817588 },
{ url = "https://files.pythonhosted.org/packages/ae/ee/89bedf69c36ace1ac8f59e97811c1f5031e179a37e4821c3a230bf750142/numpy-2.3.1-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c6e0bf9d1a2f50d2b65a7cf56db37c095af17b59f6c132396f7c6d5dd76484df", size = 14399010 },
{ url = "https://files.pythonhosted.org/packages/15/08/e00e7070ede29b2b176165eba18d6f9784d5349be3c0c1218338e79c27fd/numpy-2.3.1-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:eabd7e8740d494ce2b4ea0ff05afa1b7b291e978c0ae075487c51e8bd93c0c68", size = 16752042 },
{ url = "https://files.pythonhosted.org/packages/48/6b/1c6b515a83d5564b1698a61efa245727c8feecf308f4091f565988519d20/numpy-2.3.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:e610832418a2bc09d974cc9fecebfa51e9532d6190223bc5ef6a7402ebf3b5cb", size = 12927246 },
]
[[package]]
name = "openai"
version = "1.97.0"
@@ -728,6 +799,9 @@ version = "0.1.0"
source = { editable = "." }
dependencies = [
{ name = "dotenv" },
{ name = "loguru" },
{ name = "neovim" },
{ name = "numpy" },
{ name = "openai" },
{ name = "pydantic" },
]
@@ -746,6 +820,9 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "dotenv", specifier = ">=0.9.9" },
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "neovim", specifier = ">=0.3.1" },
{ name = "numpy", specifier = ">=2.3.1" },
{ name = "openai", specifier = ">=1.97.0" },
{ name = "pydantic", specifier = ">=2.11.7" },
]
@@ -848,3 +925,12 @@ sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc
wheels = [
{ url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 },
]
[[package]]
name = "win32-setctime"
version = "1.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083 },
]