In [None]:
import time

# Heartbeat cell: emit the local wall-clock time twice a minute, forever.
# It never finishes on its own -- interrupt the kernel to stop it.
# NOTE(review): presumably a keep-alive for the hosted session -- confirm.
while True:
    # strftime() formats the current local time when no time tuple is given.
    current_time = time.strftime("%Y-%m-%d %H:%M:%S")
    print("Current Time: " + current_time)
    time.sleep(30)


Current Time: 2023-08-24 21:25:06
Current Time: 2023-08-24 21:25:36


I'm pretty happy with my model's accuracy relative to GPT-4. How does it compare cost-wise?

I'll really push this to its limits -- let's see how quickly our poor model can classify the [full 2-million-recipe dataset](https://huggingface.co/datasets/corbt/all-recipes) 😈.

In [1]:
# Install exact, pinned versions of the two libraries imported by the cells
# below: `datasets` (loads the recipes) and `vllm` (runs the model).
%pip install datasets==2.14.4 vllm==0.1.3



[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

# Fetch the recipe dataset and keep only the "input" column of its "train"
# split; these values are what gets sent to the model as prompts later on.
recipes_train = load_dataset("corbt/all-recipes")["train"]
all_recipes = recipes_train["input"]

# Sanity check: report how many recipes were loaded (thousands-separated).
print("Number of recipes: {:,}".format(len(all_recipes)))


Number of recipes: 2,147,248


In [5]:
from vllm import LLM, SamplingParams

# Decoding settings shared by every generation request in this notebook.
sampling_params = SamplingParams(
    # The task is deterministic, so sample greedily (temperature 0).
    temperature=0,
    # A budget of 120 generated tokens is enough for the answers we expect.
    max_tokens=120,
)

# Load the merged fine-tuned checkpoint from local disk into a vLLM engine.
llm = LLM(model="./models/run1/merged", max_num_batched_tokens=4096)


INFO 08-24 19:38:29 llm_engine.py:70] Initializing an LLM engine with config: model='./models/run1/merged', tokenizer='./models/run1/merged', tokenizer_mode=auto, trust_remote_code=False, dtype=torch.float16, use_dummy_weights=False, download_dir=None, use_np_weights=False, tensor_parallel_size=1, seed=0)
INFO 08-24 19:39:48 llm_engine.py:196] # GPU blocks: 3419, # CPU blocks: 512


In [6]:
# We'll process our recipes in batches of 10,000.
#
# Every batch is sent to `llm.generate` with the shared `sampling_params`, and
# the generated text for each recipe is appended to `all_outputs`, so that
# `all_outputs[k]` is the model's answer for `all_recipes[k]`.

import time

# Number of prompts handed to vLLM per `generate` call.
BATCH_SIZE = 10000
all_outputs = []

num_recipes = len(all_recipes)

start_time = time.time()
print(f"Start time: {start_time}")
for i in range(0, num_recipes, BATCH_SIZE):
    # Clamp the upper bound so the final, partial batch is reported with its
    # real end index instead of a value past the end of the dataset.
    batch_end = min(i + BATCH_SIZE, num_recipes)
    print(f"Processing recipes {i:,} to {batch_end:,}...")
    outputs = llm.generate(all_recipes[i:batch_end], sampling_params=sampling_params)

    # NOTE(review): the pinned vLLM release (0.1.3) appears to return results
    # in the order requests finish rather than the order prompts were
    # submitted -- confirm against the vLLM source. Request ids are assigned
    # from an increasing counter, so sorting on them restores prompt order and
    # is a no-op if the results are already ordered.
    outputs = sorted(outputs, key=lambda o: int(o.request_id))

    # Keep only the text of the first completion returned for each prompt.
    all_outputs.extend(o.outputs[0].text for o in outputs)

end_time = time.time()
print(f"End time: {end_time}")
print(f"Total hours: {((end_time - start_time) / 3600):.2f}")


Start time: 1692906050.3340027
Processing recipes 0 to 10,000...


Processed prompts: 100%|██████████| 10000/10000 [04:51<00:00, 34.30it/s]


Processing recipes 10,000 to 20,000...


Processed prompts: 100%|██████████| 10000/10000 [04:54<00:00, 33.98it/s]


Processing recipes 20,000 to 30,000...


Processed prompts: 100%|██████████| 10000/10000 [04:53<00:00, 34.11it/s]


Processing recipes 30,000 to 40,000...


Processed prompts: 100%|██████████| 10000/10000 [04:53<00:00, 34.11it/s]


Processing recipes 40,000 to 50,000...


Processed prompts:  48%|████▊     | 4796/10000 [02:21<03:18, 26.22it/s]

KeyboardInterrupt: 

Nice! I've processed all 2,147,248 recipes in under 17 hours. Let's do a cost comparison with GPT-3.5 and GPT-4. I'll use the GPT-4 latency/cost numbers based on the 5000 samples used to generate our model's training data.

In [19]:
import pandas as pd

# I used an on-demand Nvidia L40 on RunPod for this, at an hourly cost of $1.14.
finetuned_hourly_cost = 1.14

# Total run time in hours for classifying the full dataset. This is hard-coded
# here rather than read from the timing cell.
finetuned_total_hours = 16.54

num_recipes = len(all_recipes)

# GPU rental cost for the whole run, spread evenly over every recipe.
finetuned_avg_cost = finetuned_hourly_cost * finetuned_total_hours / num_recipes

# The average input and output tokens calculated by OpenAI, based on the 5000 recipes I sent them
avg_input_tokens = 276
avg_output_tokens = 42


def openai_cost_per_recipe(input_price_per_1k, output_price_per_1k):
    """Return the USD cost of classifying one average-sized recipe.

    Args:
        input_price_per_1k: Price in USD per 1,000 input (prompt) tokens.
        output_price_per_1k: Price in USD per 1,000 output (completion) tokens.

    Returns:
        The cost in USD of `avg_input_tokens` input tokens plus
        `avg_output_tokens` output tokens at the given prices.
    """
    return (
        avg_input_tokens * input_price_per_1k / 1000
        + avg_output_tokens * output_price_per_1k / 1000
    )


# Token pricing from https://openai.com/pricing
gpt_4_avg_cost = openai_cost_per_recipe(0.03, 0.06)

# NOTE(review): the 0.0016 output price looks like a typo -- the published
# gpt-3.5-turbo output price at the time appears to have been $0.002 per 1K
# tokens. Confirm against the pricing page and re-run before quoting the result.
gpt_35_avg_cost = openai_cost_per_recipe(0.0015, 0.0016)

# NOTE(review): the trailing `0.06 / 1000` per-recipe surcharge is not explained
# anywhere in this notebook (possibly amortized fine-tuning cost) -- confirm.
gpt_35_finetuned_avg_cost = openai_cost_per_recipe(0.012, 0.016) + 0.06 / 1000

# Let's put this in a dataframe for easier comparison.

costs = pd.DataFrame(
    {
        "Model": [
            "Llama 2 7B (finetuned)",
            "GPT-3.5",
            "GPT-3.5 (finetuned)",
            "GPT-4",
        ],
        "Cost to Classify One Recipe": [
            finetuned_avg_cost,
            gpt_35_avg_cost,
            gpt_35_finetuned_avg_cost,
            gpt_4_avg_cost,
        ],
    }
)

# Scale the per-recipe cost up to the whole dataset. The column is stored as
# pre-formatted strings (thousands separators, two decimals) for display only.
costs["Cost to Classify Entire Dataset"] = (
    costs["Cost to Classify One Recipe"] * num_recipes
).map(lambda x: f"{x:,.2f}")


costs


Unnamed: 0,Model,Cost to Classify One Recipe,Cost to Classify Entire Dataset
0,Llama 2 7B (finetuned),9e-06,18.86
1,GPT-3.5,0.000481,1033.26
2,GPT-3.5 (finetuned),0.004044,8683.47
3,GPT-4,0.0108,23190.28


...and just for fun, let's figure out how many recipes my pescatarian basement-dwelling brother can make! 😂