In this notebook, I’ll finetune the smaller TinyStories-3M model and see how it performs. I also suspect these smaller models might perform better on a (synthetically generated) simpler version of this dataset, which I plan to explore in a future notebook.
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import gc

def report_gpu():
    print(torch.cuda.list_gpu_processes())
    gc.collect()
    torch.cuda.empty_cache()

#model_nm = "roneneldan/TinyStories-33M"
#model_nm = "roneneldan/TinyStories-1M"
model_nm = "roneneldan/TinyStories-3M"
#model_nm = "roneneldan/TinyStories-8M"

tokz = AutoTokenizer.from_pretrained(model_nm)

def tok_func(x): return tokz(x["input"], padding=True, truncation=True)
Preparing Datasets
Much of the code in this section is boilerplate, tokenizing the dataset and splitting it into training, validation and test sets.
dataset = load_dataset(
    "financial_phrasebank",
    "sentences_allagree",
    split="train",  # note that the dataset does not have a default test split
)
dataset = dataset.rename_columns({'label': 'labels', 'sentence': 'input'})
tokz.add_special_tokens({'pad_token': '[PAD]'})
tokz.padding_side = "left"
# https://github.com/huggingface/transformers/issues/16595 and https://www.kaggle.com/code/baekseungyun/gpt-2-with-huggingface-pytorch
tok_ds = dataset.map(tok_func, batched=True)
tok_ds[0]['input']
'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .'
tok_ds[0]['input_ids'][100:110] # first 100 elements are 50257 ('[PAD]')
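The split into training, validation, and test sets isn't shown in this excerpt. Here is a minimal sketch of how it could be done with the datasets library's train_test_split; the 70/15/15 proportions, the seed, and the dds name are my assumptions, not necessarily what the notebook used:

from datasets import DatasetDict

# sketch: 70% train, then split the remaining 30% in half for validation and test
splits = tok_ds.train_test_split(test_size=0.3, seed=42)
val_test = splits['test'].train_test_split(test_size=0.5, seed=42)
dds = DatasetDict({
    'train': splits['train'],
    'validation': val_test['train'],
    'test': val_test['test'],
})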
Much of the code in this section is either helper functions (like get_acc, MetricCallback, or results_to_dataframe) or boilerplate code to prepare a HuggingFace trainer:
# thanks Claude
class MetricCallback(TrainerCallback):
    def __init__(self):
        self.metrics = []
        self.current_epoch_metrics = {}

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            self.current_epoch_metrics.update(logs)

    def on_epoch_end(self, args, state, control, **kwargs):
        if hasattr(state, 'log_history') and state.log_history:
            # Get the last logged learning rate
            last_lr = state.log_history[-1].get('learning_rate', None)
        else:
            last_lr = None
        self.metrics.append({
            "epoch": state.epoch,
            "learning_rate": last_lr,
            **self.current_epoch_metrics
        })
        self.current_epoch_metrics = {}  # Reset for next epoch

    def on_train_end(self, args, state, control, **kwargs):
        # Capture final metrics after the last epoch
        if self.current_epoch_metrics:
            self.metrics.append({
                "epoch": state.num_train_epochs,
                "learning_rate": self.metrics[-1].get('learning_rate') if self.metrics else None,
                **self.current_epoch_metrics
            })
def results_to_dataframe(results, model_name):
    rows = []
    for result in results:
        initial_lr = result['learning_rate']
        for metric in result['metrics']:
            row = {
                'model_name': model_name,
                'initial_learning_rate': initial_lr,
                'current_learning_rate': metric.get('learning_rate'),
            }
            row.update(metric)
            rows.append(row)
    df = pd.DataFrame(rows)
    # Ensure specific columns are at the beginning
    first_columns = ['model_name', 'initial_learning_rate', 'current_learning_rate', 'epoch']
    other_columns = [col for col in df.columns if col not in first_columns]
    df = df[first_columns + other_columns]
    return df
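To illustrate the structure this helper expects (the per-run dicts that MetricCallback accumulates), here is a toy call; the numbers are invented and only the shape matters:

# toy illustration -- values are made up, only the structure matters
toy_results = [{
    'learning_rate': 8e-5,
    'metrics': [
        {'epoch': 1.0, 'loss': 0.95, 'learning_rate': 6e-5, 'eval_accuracy': 0.55},
        {'epoch': 2.0, 'loss': 0.70, 'learning_rate': 3e-5, 'eval_accuracy': 0.62},
    ],
}]
results_to_dataframe(toy_results, model_name='TinyStories-3M')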
def make_cm(df):
    """Create confusion matrix for true vs predicted sentiment classes"""
    cm = confusion_matrix(
        y_true=df['label_text'],
        y_pred=df['pred_text'],
        labels=['negative', 'neutral', 'positive'])
    disp = ConfusionMatrixDisplay(cm, display_labels=['negative', 'neutral', 'positive'])
    fig, ax = plt.subplots(figsize=(4, 4))
    disp.plot(ax=ax, text_kw={'fontsize': 12}, cmap='Blues', colorbar=False);
    # change label font size without changing label text
    ax.xaxis.label.set_fontsize(16)
    ax.yaxis.label.set_fontsize(16)
    # make tick labels larger
    ax.tick_params(axis='y', labelsize=14)
    ax.tick_params(axis='x', labelsize=14)
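make_cm expects a dataframe with label_text and pred_text columns, which is produced by the get_test_df helper that isn't shown in this excerpt. A minimal sketch of what it might look like, matching the (test_df, acc) return signature used later; the use of dds['test'] and the exact column construction are my assumptions:

def get_test_df(trainer):
    """Predict on the held-out test split and return (dataframe, accuracy) -- a sketch."""
    labels = {0: "negative", 1: "neutral", 2: "positive"}
    preds = trainer.predict(dds['test'])
    pred_ids = np.argmax(preds.predictions, axis=-1)
    test_df = pd.DataFrame({
        'input': dds['test']['input'],
        'label_text': [labels[i] for i in dds['test']['labels']],
        'pred_text': [labels[i] for i in pred_ids],
    })
    acc = (test_df['label_text'] == test_df['pred_text']).mean()
    return test_df, acc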
def get_prediction(model, text, tokz):
    # Determine the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Move the model to the appropriate device
    model = model.to(device)
    # Tokenize the input text
    inputs = tokz(text, return_tensors="pt", truncation=True, padding=True)
    # Move input tensors to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Get the model's prediction
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(**inputs)
    # Ensure logits are on CPU for numpy operations
    logits = outputs.logits.detach().cpu()
    # Get probabilities
    probs = torch.softmax(logits, dim=-1)
    # Get the predicted class
    p_class = torch.argmax(probs, dim=-1).item()
    # Get the probability for the predicted class
    p = probs[0][p_class].item()
    labels = {0: "negative", 1: "neutral", 2: "positive"}
    print(f"Probability: {p:.2f}")
    print(f"Predicted label: {labels[p_class]}")
    return p_class, p
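The remaining helpers, get_acc and get_trainer, also aren't shown in this excerpt. Below is a sketch of how I'd expect them to look given how they are called later (get_trainer(lr=lr, bs=64) returning (trainer, args)); the TrainingArguments details, the resize_token_embeddings call, and the reliance on a metric_callback variable in scope are assumptions on my part, not the notebook's exact code:

from transformers import set_seed  # set_seed is used in the repeated runs at the end of the notebook

def get_acc(eval_pred):
    """Accuracy metric for the Trainer; shows up as eval_accuracy in the logs."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": (preds == labels).mean()}

def get_trainer(lr, bs):
    model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=3)
    # the tokenizer gained a [PAD] token above, so resize the embedding matrix and set the pad id
    model.resize_token_embeddings(len(tokz))
    model.config.pad_token_id = tokz.pad_token_id
    args = TrainingArguments(
        output_dir='outputs',           # matches the /kaggle/working/outputs cleanup later
        learning_rate=lr,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs * 2,
        num_train_epochs=3,             # the final-epoch plots below query epoch == 3
        evaluation_strategy='epoch',    # eval_strategy in newer transformers versions
        logging_strategy='epoch',
        report_to='none',
    )
    trainer = Trainer(
        model, args,
        train_dataset=dds['train'],
        eval_dataset=dds['validation'],
        tokenizer=tokz,
        compute_metrics=get_acc,
        # assumes a MetricCallback instance named metric_callback exists when this is called
        callbacks=[metric_callback],
    )
    return trainer, args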
While there are other hyperparameters to tune (epochs, warmup_ratio, weight_decay), I’ll focus this notebook on fine-tuning with different learning rates. I’ll start with the same learning rates that I used for the 33M and 8M models:
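The sweep loop itself isn't shown in this excerpt. A sketch of what it might look like; the exact list of learning rates is my guess, constrained only by the facts stated below (the sweep starts at 1e-6, includes 1e-3, and index 4 corresponds to 8e-5):

# assumed sweep values -- only 1e-6, 8e-5 (at index 4), and 1e-3 are confirmed by the text
learning_rates = [1e-6, 1e-5, 3e-5, 5e-5, 8e-5, 1e-4, 3e-4, 1e-3]

results = []
trainers = []
for lr in learning_rates:
    metric_callback = MetricCallback()
    trainer, args = get_trainer(lr=lr, bs=64)
    trainer.train()
    results.append({"learning_rate": lr, "metrics": metric_callback.metrics})
    trainers.append(trainer)
    report_gpu()

# per-epoch metrics for every learning rate, used for the plot below
metrics_df = results_to_dataframe(results, model_name="TinyStories-3M")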
/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
warnings.warn('Was asked to gather along dimension 0, but all '
0.6666666666666666
This finetuned 3M-parameter model predicts neutral sentences best (116/125), followed by positive sentences (25/54) and, lastly, negative sentences (22/46). I’ll reiterate that the dataset contains a majority of neutral sentences, followed by positive sentences, with negative being the least represented sentiment.
make_cm(test_df)
As the learning rate increases (starting at 1e-6), validation set accuracy increases until it peaks at a learning rate of 8e-5, with a smaller second peak at 1e-3.
final_epoch_metrics = metrics_df.query("epoch == 3")
plt.scatter(final_epoch_metrics['initial_learning_rate'], final_epoch_metrics['eval_accuracy']);
plt.xscale('log')
plt.xlabel('Learning Rate (log scale)')
plt.ylabel('Validation Set Accuracy')
plt.title('Learning Rate vs. Final Epoch Validation Accuracy');
I’ll test the model (run a “sanity check”) on three made-up sentences. I don’t want to put too much weight on these results, since they are cherry-picked sentences, but this model gets 2/3 right.
text = "The net sales went up from USD $3.4M to USD $5.6M since the same quarter last year"
_ = get_prediction(trainers[4].model, text, tokz)
Probability: 0.55
Predicted label: positive
text = "The net sales went down from USD $8.9M to USD $1.2M since the same quarter last year"
_ = get_prediction(trainers[4].model, text, tokz)
Probability: 0.53
Predicted label: positive
text = "The net sales stayed the same as the same quarter last year"
_ = get_prediction(trainers[4].model, text, tokz)
Probability: 0.74
Predicted label: neutral
Highest Test Set Accuracy
test_dfs = []
accs = []
for t in trainers:
    test_df, acc = get_test_df(t)
    test_dfs.append(test_df)
    accs.append(acc)
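Picking out the best configuration from those per-trainer accuracies (assuming accs is ordered the same way as learning_rates):

best_idx = int(np.argmax(accs))
print(f"best learning rate: {learning_rates[best_idx]}, test accuracy: {accs[best_idx]:.2%}")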
8e-5 is also the learning rate with the highest test set accuracy (67%).
best_metrics = []
best_trainers = []
lr = learning_rates[4]
for i in range(10):
    set_seed(42 + i)  # Use a different seed for each run
    metric_callback = MetricCallback()
    trainer, args = get_trainer(lr=lr, bs=64)
    trainer.train()
    best_metrics.append({
        "learning_rate": lr,
        "metrics": metric_callback.metrics
    })
    best_trainers.append(trainer)
    # clean up
    report_gpu()
    !rm -r /kaggle/working/outputs
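The ten runs above can be summarized with the same results_to_dataframe helper; a sketch, reusing the final-epoch filter from the plotting code earlier:

stability_df = results_to_dataframe(best_metrics, model_name="TinyStories-3M")
final_epoch = stability_df.query("epoch == 3")
print(final_epoch['eval_accuracy'].agg(['mean', 'std']))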