In this notebook, I’ll finetune the smallest TinyStories model (TinyStories-1M) and see how it performs. I also suspect these models might perform better on a (synthetically generated) simpler version of this dataset, which I plan to explore in a future notebook.
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import gc

def report_gpu():
    print(torch.cuda.list_gpu_processes())
    gc.collect()
    torch.cuda.empty_cache()

#model_nm = "roneneldan/TinyStories-33M"
model_nm = "roneneldan/TinyStories-1M"
#model_nm = "roneneldan/TinyStories-3M"
#model_nm = "roneneldan/TinyStories-8M"

tokz = AutoTokenizer.from_pretrained(model_nm)

def tok_func(x):
    return tokz(x["input"], padding=True, truncation=True)
Preparing Datasets
Much of the code in this section is boilerplate, tokenizing the dataset and splitting it into training, validation and test sets.
dataset = load_dataset(
    "financial_phrasebank",
    "sentences_allagree",
    split="train"  # note that the dataset does not have a default test split
)
dataset = dataset.rename_columns({'label': 'labels', 'sentence': 'input'})
tokz.add_special_tokens({'pad_token': '[PAD]'})
tokz.padding_side = "left"
# https://github.com/huggingface/transformers/issues/16595 and
# https://www.kaggle.com/code/baekseungyun/gpt-2-with-huggingface-pytorch
tok_ds = dataset.map(tok_func, batched=True)
tok_ds[0]['input']
'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .'
tok_ds[0]['input_ids'][100:110] # first 100 elements are 50257 ('[PAD]')
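The split into training, validation and test sets isn’t shown in this excerpt. Here is a minimal sketch of one way to do it with `train_test_split`; the 80/10/10 proportions, the seed, and the `dds` dictionary name are my assumptions, not the notebook’s exact choices.

```python
# Hypothetical split: the notebook's exact proportions and seed aren't shown here.
split = tok_ds.train_test_split(test_size=0.2, seed=42)             # 80% train, 20% held out
val_test = split["test"].train_test_split(test_size=0.5, seed=42)   # split the held-out 20% in half

dds = {
    "train": split["train"],
    "validation": val_test["train"],
    "test": val_test["test"],
}
```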
Much of the code in this section is either helper functions (like get_acc, MetricCallback, or results_to_dataframe) or boilerplate code to prepare a HuggingFace trainer:
# thanks Claude
class MetricCallback(TrainerCallback):
    def __init__(self):
        self.metrics = []
        self.current_epoch_metrics = {}

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            self.current_epoch_metrics.update(logs)

    def on_epoch_end(self, args, state, control, **kwargs):
        if hasattr(state, 'log_history') and state.log_history:
            # Get the last logged learning rate
            last_lr = state.log_history[-1].get('learning_rate', None)
        else:
            last_lr = None
        self.metrics.append({
            "epoch": state.epoch,
            "learning_rate": last_lr,
            **self.current_epoch_metrics
        })
        self.current_epoch_metrics = {}  # Reset for next epoch

    def on_train_end(self, args, state, control, **kwargs):
        # Capture final metrics after the last epoch
        if self.current_epoch_metrics:
            self.metrics.append({
                "epoch": state.num_train_epochs,
                "learning_rate": self.metrics[-1].get('learning_rate') if self.metrics else None,
                **self.current_epoch_metrics
            })
def results_to_dataframe(results, model_name):
    rows = []
    for result in results:
        initial_lr = result['learning_rate']
        for metric in result['metrics']:
            row = {
                'model_name': model_name,
                'initial_learning_rate': initial_lr,
                'current_learning_rate': metric.get('learning_rate'),
            }
            row.update(metric)
            rows.append(row)
    df = pd.DataFrame(rows)
    # Ensure specific columns are at the beginning
    first_columns = ['model_name', 'initial_learning_rate', 'current_learning_rate', 'epoch']
    other_columns = [col for col in df.columns if col not in first_columns]
    df = df[first_columns + other_columns]
    return df
def make_cm(df):
    """Create confusion matrix for true vs predicted sentiment classes"""
    cm = confusion_matrix(y_true=df['label_text'], y_pred=df['pred_text'], labels=['negative', 'neutral', 'positive'])
    disp = ConfusionMatrixDisplay(cm, display_labels=['negative', 'neutral', 'positive'])
    fig, ax = plt.subplots(figsize=(4, 4))
    disp.plot(ax=ax, text_kw={'fontsize': 12}, cmap='Blues', colorbar=False)
    # change label font size without changing label text
    ax.xaxis.label.set_fontsize(16)
    ax.yaxis.label.set_fontsize(16)
    # make tick labels larger
    ax.tick_params(axis='y', labelsize=14)
    ax.tick_params(axis='x', labelsize=14)
def get_prediction(model, text, tokz):
    # Determine the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Move the model to the appropriate device
    model = model.to(device)
    # Tokenize the input text
    inputs = tokz(text, return_tensors="pt", truncation=True, padding=True)
    # Move input tensors to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Get the model's prediction
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(**inputs)
    # Ensure logits are on CPU for numpy operations
    logits = outputs.logits.detach().cpu()
    # Get probabilities
    probs = torch.softmax(logits, dim=-1)
    # Get the predicted class
    p_class = torch.argmax(probs, dim=-1).item()
    # Get the probability for the predicted class
    p = probs[0][p_class].item()
    labels = {0: "negative", 1: "neutral", 2: "positive"}
    print(f"Probability: {p:.2f}")
    print(f"Predicted label: {labels[p_class]}")
    return p_class, p
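The `get_acc` metric and the `get_trainer` helper referenced above and used throughout the rest of the notebook aren’t shown in this section. Here is a minimal sketch of what they might look like, assuming a standard `Trainer` setup; the epoch count, warmup, weight decay, batch sizes and output directory are my assumptions, and the dataset names follow the hypothetical `dds` split sketched earlier.

```python
def get_acc(eval_pred):
    # compute_metrics hook: fraction of correct predictions (logged as eval_accuracy)
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": (preds == labels).mean()}

def get_trainer(lr, bs):
    model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=3)
    model.resize_token_embeddings(len(tokz))       # account for the added [PAD] token
    model.config.pad_token_id = tokz.pad_token_id
    args = TrainingArguments(
        output_dir="/kaggle/working/outputs",      # deleted after each run in the loops below
        learning_rate=lr,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=3,                        # assumed; matches the `epoch == 3` query later
        warmup_ratio=0.1,                          # assumed
        weight_decay=0.01,                         # assumed
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dds["train"],
        eval_dataset=dds["validation"],
        tokenizer=tokz,
        compute_metrics=get_acc,
        callbacks=[metric_callback],               # the MetricCallback created before each call
    )
    return trainer, args
```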
While there are other hyperparameters to tune (epochs, warmup_ratio, weight_decay), I’ll focus this notebook on fine-tuning with different learning rates. I’ll start with the same learning rates that I used for the 33M, 8M and 3M models:
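The sweep loop itself isn’t shown in this excerpt. Based on the repeated-training loop used later and the indices referenced below (`learning_rates[5] == 1e-4`, `learning_rates[6] == 3e-4`, `learning_rates[9] == 1e-3`), it would look something like the sketch below; the exact grid is my guess, chosen only so those indices line up, and is not taken from the notebook.

```python
# Hypothetical grid spanning 1e-6 to 1e-3 (the exact values used aren't shown here)
learning_rates = [1e-6, 5e-6, 1e-5, 5e-5, 8e-5, 1e-4, 3e-4, 5e-4, 8e-4, 1e-3]

results = []
trainers = []
for lr in learning_rates:
    metric_callback = MetricCallback()
    trainer, args = get_trainer(lr=lr, bs=64)
    trainer.train()
    results.append({"learning_rate": lr, "metrics": metric_callback.metrics})
    trainers.append(trainer)
    report_gpu()

metrics_df = results_to_dataframe(results, model_nm)
```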
The highest validation set accuracy (75%) was obtained with two learning rates: 0.0001 and 0.0003. Both are larger than the 8e-05 that performed best for the 8M and 3M models (though not larger than the 33M model’s best of 5e-04).
An LR of 0.0001 has a slightly higher test set accuracy (65%) than 0.0003 (64%).
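`get_test_df` isn’t shown in this section either; here is a minimal sketch, assuming the hypothetical `dds["test"]` split from earlier and the same 0/1/2 → negative/neutral/positive mapping that `get_prediction` uses. The `label_text`/`pred_text` columns are what `make_cm` expects.

```python
def get_test_df(trainer):
    preds = trainer.predict(dds["test"])              # run inference on the held-out test split
    pred_ids = np.argmax(preds.predictions, axis=-1)
    labels = {0: "negative", 1: "neutral", 2: "positive"}
    df = pd.DataFrame({
        "input": dds["test"]["input"],
        "label_text": [labels[i] for i in dds["test"]["labels"]],
        "pred_text": [labels[i] for i in pred_ids],
    })
    acc = (df["label_text"] == df["pred_text"]).mean()
    return df, acc
```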
test_df, acc = get_test_df(trainers[5])
acc
/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
warnings.warn('Was asked to gather along dimension 0, but all '
0.6488888888888888
test_df, acc = get_test_df(trainers[6])
acc
0.64
This 1M parameter finetuned model predicts neutral sentences the best (112/125), followed by negative sentences (36/46) and lastly, positive sentences (31/54). This bucks the trend of the other three models (neutral > positive > negative, which followed the proportion of each sentiment in the dataset).
As the learning rate increases (starting at 1e-6), the validation set accuracy increases until it reaches a bit of a plateau around 1e-4 before coming down.
final_epoch_metrics = metrics_df.query("epoch == 3")
plt.scatter(final_epoch_metrics['initial_learning_rate'], final_epoch_metrics['eval_accuracy'])
plt.xscale('log')
plt.xlabel('Learning Rate (log scale)')
plt.ylabel('Validation Set Accuracy')
plt.title('Learning Rate vs. Final Epoch Validation Accuracy');
I’ll test the model (run a “sanity check”) on three made-up sentences. I don’t want to put too much weight on these results, since they are cherry-picked sentences, but this model gets only one of the three right (the neutral one).
text ="The net sales went up from USD $3.4M to USD $5.6M since the same quarter last year"_ = get_prediction(trainers[5].model, text, tokz)
Probability: 0.50
Predicted label: negative
text ="The net sales went down from USD $8.9M to USD $1.2M since the same quarter last year"_ = get_prediction(trainers[5].model, text, tokz)
Probability: 0.51
Predicted label: positive
text ="The net sales stayed the as the same quarter last year"_ = get_prediction(trainers[5].model, text, tokz)
Probability: 0.50
Predicted label: neutral
Highest Test Set Accuracy
test_dfs = []
accs = []
for t in trainers:
    test_df, acc = get_test_df(t)
    test_dfs.append(test_df)
    accs.append(acc)
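To find the run referenced below, a small convenience snippet (mine, not from the original notebook) works:

```python
best_idx = int(np.argmax(accs))          # index of the run with the highest test accuracy
best_idx, learning_rates[best_idx], accs[best_idx]
```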
The learning rate with the highest test set accuracy (68%) is 0.001. This is by far the largest best-performing learning rate across the 33M, 8M, 3M and now 1M parameter TinyStories models.
This model gets the most neutral predictions correct (out of 125), followed by 33/46 negative predictions and 33/54 positive predictions, continuing the trend for the 1M models that deviates from the previous three sizes.
accs[9], learning_rates[9], make_cm(test_dfs[9])
(0.6755555555555556, 0.001, None)
This model gets 2/3 of the sanity check sentiments correct.
text ="The net sales went up from USD $3.4M to USD $5.6M since the same quarter last year"_ = get_prediction(trainers[9].model, text, tokz)
Probability: 0.42
Predicted label: positive
text ="The net sales went down from USD $8.9M to USD $1.2M since the same quarter last year"_ = get_prediction(trainers[9].model, text, tokz)
Probability: 0.51
Predicted label: positive
text ="The net sales stayed the as the same quarter last year"_ = get_prediction(trainers[9].model, text, tokz)
Probability: 0.96
Predicted label: neutral
Training with the Best Learning Rates 10 Times
Since different learning rates achieved the highest validation set accuracy and the highest test set accuracy, I’ll train 10 models with each learning rate to see if the results are consistent.
LR = 0.0001 (Highest Validation Set Accuracy)
learning_rates[5]
0.0001
To prevent every model after the first from reproducing exactly the same loss and accuracy values each epoch, I’ll set a different random seed for each iteration.
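`set_seed` below is, I assume, the transformers utility, which seeds Python’s, NumPy’s and PyTorch’s random number generators in one call:

```python
from transformers import set_seed  # seeds random, numpy and torch (incl. CUDA) in one call
```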
best_metrics = []
best_trainers = []
lr = learning_rates[5]
for i in range(10):
    set_seed(42 + i)  # Use a different seed for each run
    metric_callback = MetricCallback()
    trainer, args = get_trainer(lr=lr, bs=64)
    trainer.train()
    best_metrics.append({
        "learning_rate": lr,
        "metrics": metric_callback.metrics
    })
    best_trainers.append(trainer)
    # clean up
    report_gpu()
    !rm -r /kaggle/working/outputs
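The per-run test accuracies aren’t shown here; reusing `get_test_df`, they could be computed like this (the variable name is mine):

```python
best_accs = [get_test_df(t)[1] for t in best_trainers]
best_accs  # the 69% model referenced below is the best of these ten runs
```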
The best-performing model by test set accuracy (69%) gets 2/3 of my sanity check sentiments correct.
text ="The net sales went up from USD $3.4M to USD $5.6M since the same quarter last year"_ = get_prediction(best_trainers[7].model, text, tokz)
Probability: 0.55
Predicted label: positive
text ="The net sales went down from USD $8.9M to USD $1.2M since the same quarter last year"_ = get_prediction(best_trainers[7].model, text, tokz)
Probability: 0.56
Predicted label: positive
text ="The net sales stayed the as the same quarter last year"_ = get_prediction(best_trainers[7].model, text, tokz)
Probability: 0.60
Predicted label: neutral
LR = 0.001 (Highest Test Set Accuracy)
learning_rates[9] == 0.001
True
best_metrics2 = []
best_trainers2 = []
lr = learning_rates[9]
for i in range(10):
    set_seed(42 + i)  # Use a different seed for each run
    metric_callback = MetricCallback()
    trainer, args = get_trainer(lr=lr, bs=64)
    trainer.train()
    best_metrics2.append({
        "learning_rate": lr,
        "metrics": metric_callback.metrics
    })
    best_trainers2.append(trainer)
    # clean up
    report_gpu()
    !rm -r /kaggle/working/outputs
The 8th model (both the 3rd and 8th models reach a test set accuracy of 68%) gets 2/3 of my sanity checks correct.
text ="The net sales went up from USD $3.4M to USD $5.6M since the same quarter last year"_ = get_prediction(best_trainers2[8].model, text, tokz)
Probability: 0.57
Predicted label: positive
text ="The net sales went down from USD $8.9M to USD $1.2M since the same quarter last year"_ = get_prediction(best_trainers2[8].model, text, tokz)
Probability: 0.56
Predicted label: positive
text ="The net sales stayed the as the same quarter last year"_ = get_prediction(best_trainers2[8].model, text, tokz)
Probability: 0.72
Predicted label: neutral
Final Thoughts
This notebook closes out my initial quick-and-dirty model fine-tuning experiments for the TinyStories family (33M, 8M, 3M, 1M) on the financial_phrasebank dataset. Here is a summary of my results:
| Base Model | Fine-tuning Learning Rate | Best Val Acc | Best Test Acc |
|---|---|---|---|
| TinyStories-33M | 5e-04 | 86% | 79% |
| TinyStories-8M | 8e-05 | 85% | 86% |
| TinyStories-8M | 5e-04 | 79% | 86% |
| TinyStories-3M | 8e-05 | 78% | 74% |
| TinyStories-1M | 1e-04 | 75% | 69% |
| TinyStories-1M | 1e-03 | 74% | 68% |
Three main takeaways:
1. Set a different random seed for each iteration; otherwise every model trained in a for-loop reproduces identical accuracy and loss values.
2. The 8M model had a test set accuracy 7 percentage points higher than the 33M model (86% vs. 79%).
3. The best-performing learning rates for the smallest (1M) model skewed larger than those of the bigger models; the 1e-03 that gave its best test accuracy is roughly an order of magnitude above the 8e-05 that worked well for the 8M and 3M models.
Future work:
- Refactor this code to avoid having to change so many variable names between experiments (best_trainers, best_trainers2, and so on).
- Do a more thorough hyperparameter sweep (random seeds, epochs, warmup, weight decay, learning rates) for each model.
- Fine-tune the models on a synthetically generated version of financial_phrasebank that’s at a lower reading level to see if it improves performance.
I hope you enjoyed this blog post! Follow me on Twitter @vishal_learner.