Show pip installs
!pip install transformers -Uqq
!pip install accelerate -qq
!pip install torch==2.2.2 -qq
!pip install datasets~=2.16.1 -qq
!pip install scikit-learn==1.2 -qq
Classifying sentiment in the financial_phrasebank dataset with 79.5% accuracy.
Vishal Bakshi
November 18, 2024
from datasets import load_dataset, Dataset
import pandas as pd, numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
import random
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
# load dataset
dataset = load_dataset(
    "financial_phrasebank", "sentences_allagree",
    split="train"  # note that the dataset does not have a default test split
)
# create a new column with the numeric label verbalised as label_text (e.g. "positive" instead of "0")
label_map = {i: label_text for i, label_text in enumerate(dataset.features["label"].names)}

def add_label_text(example):
    example["label_text"] = label_map[example["label"]]
    return example

dataset = dataset.map(add_label_text)
print(dataset)
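For reference, financial_phrasebank's ClassLabel order is negative, neutral, positive, so the mapping built above should come out as follows (a quick sanity check, worth printing if you're unsure):

print(label_map)
# expected: {0: 'negative', 1: 'neutral', 2: 'positive'}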
generate_response function

def generate_response(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=2
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response
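A quick usage sketch (illustrative, not a cell from the original notebook): pass any fully formatted prompt string and get back the first couple of generated tokens.

# classify the first dataset sentence with a simple instruction (illustrative)
example_prompt = f"Label the following TEXT with a single word: negative, positive, or neutral\nTEXT: {dataset[0]['sentence']}"
print(generate_response(example_prompt))  # expect a short label such as 'Negative.' or 'neutral'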
add_prompt and generate_responses functions

def add_prompt(item, prompt):
    item['prompt'] = prompt.format(text=item['sentence'])
    return item

def generate_responses(dataset, prompt):
    responses = []
    dataset = dataset.map(add_prompt, fn_kwargs={"prompt": prompt})
    print(dataset[0]['prompt'])

    for row in dataset:
        messages = [
            {"role": "user", "content": row['prompt']}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(device)

        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=2
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip().lower()
        responses.append(response)

    # calculate accuracy
    df = dataset.to_pandas()
    df['responses'] = pd.Series(responses)
    #df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
    df['lm_match'] = df['label_text'] == df['responses']
    acc = df.lm_match.mean()
    return df, acc
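Each 0-shot experiment below then boils down to a single call like this (illustrative; promptA is defined further down):

df, acc = generate_responses(dataset, promptA)
acc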
make_cm function

def make_cm(df):
    """Create confusion matrix for true vs predicted sentiment classes"""
    cm = confusion_matrix(y_true=df['label_text'], y_pred=df['responses'], labels=['negative', 'neutral', 'positive', 'other'])
    disp = ConfusionMatrixDisplay(cm, display_labels=['negative', 'neutral', 'positive', 'other'])

    # I chose 8x8 so it fits on one screen but still is large
    fig, ax = plt.subplots(figsize=(8,8))
    disp.plot(ax=ax, text_kw={'fontsize': 16}, cmap='Blues', colorbar=False);

    # change label font size without changing label text
    ax.xaxis.label.set_fontsize(18)
    ax.yaxis.label.set_fontsize(18)

    # make tick labels larger
    ax.tick_params(axis='y', labelsize=16)
    ax.tick_params(axis='x', labelsize=16)
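Each run's confusion matrix is then produced from the returned DataFrame after mapping any response that isn't one of the three labels to "other", along the lines of:

df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
make_cm(df)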
few_shot_responses function

def few_shot_responses(dataset, prompt, examples):
    responses = []
    dataset = dataset.map(add_prompt, fn_kwargs={"prompt": prompt})
    print(dataset[0]['prompt'])

    few_shot_examples = []
    for example in examples:
        few_shot_examples.append({"role": "user", "content": prompt.format(text=example[0])})
        few_shot_examples.append({"role": "assistant", "content": example[1]})

    for row in dataset:
        messages = few_shot_examples + [{"role": "user", "content": row['prompt']}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(device)

        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=2
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip().lower()
        responses.append(response)

    df = dataset.to_pandas()
    df['responses'] = pd.Series(responses)
    return df
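A few-shot run is then a call like the following (illustrative; promptJ_ds is my placeholder name for a dataset with the example rows removed, and since this version only returns the DataFrame, accuracy is computed separately):

df = few_shot_responses(promptJ_ds, promptJ, examples)
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
(df['label_text'] == df['responses']).mean()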
get_ds function

def get_ds(n):
    exclude_idxs = [random.randint(0, 2263) for _ in range(n)]
    prompt_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
    examples = []
    for idx in exclude_idxs:
        examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
    print(prompt_ds, [el[1] for el in examples[:10]])
    return prompt_ds, examples
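Note that ds_subset isn't defined in this post. Based on how it's used (the few-shot example rows are dropped from the evaluation set, so num_rows shrinks by the number of unique indices, and an __index_level_0__ column appears), it's presumably something like this sketch:

def ds_subset(dataset, exclude_idxs):
    # my guess at the helper: drop the rows used as few-shot examples
    # so the model is never evaluated on them
    df = dataset.to_pandas()
    df = df[~df.index.isin(exclude_idxs)]
    return Dataset.from_pandas(df)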
In this notebook I’ll use Qwen2-0.5B-Instruct to classify sentiment in the financial_phrasebank dataset. In previous notebooks I have performed sentiment classification with Qwen2-1.5B-Instruct, phi-2, phi-3, phi-3.5, and the Claude series.
This notebook is part of a series of blog posts for a project I’m working on called TinySentiment, where I’m experimenting with tiny models to improve their ability to classify sentiment in the financial_phrasebank dataset. I was inspired to do so after reading this blog post and this corresponding notebook by Moritz Laurer as part of a fastai study group last year.
Here are the results from my experiments so far (**the best-performing prompt from this notebook):
Model | Prompting Strategy | Overall Accuracy | negative | neutral | positive |
---|---|---|---|---|---|
claude-3-5-sonnet-20240620 | 3-Shot | 94.78% | 98% (297/303) | 94% (1302/1391) | 95% (544/570) |
claude-3-opus-20240229 | 0-Shot | 94.13% | 98% (297/303) | 96% (1333/1391) | 88% (501/570) |
phi-3.5 | 20-Shot | 93.94% | 96% (286/299) | 98% (1355/1379) | 83% (467/566) |
phi-3 | 30-Shot w/System Prompt | 92.79% | 98% (290/297) | 94% (1284/1373) | 88% (499/564) |
claude-3-haiku-20240307 | 3-Shot | 92.39% | 90% (272/303) | 91% (1267/1391) | 96% (550/570) |
phi-2 | 6-Shot | 91.94% | 88% (267/302) | 94% (1299/1387) | 90% (510/569) |
Qwen2-1.5B | 27-Shot | 86.10% | 90% (264/294) | 96% (1320/1382) | 61% (342/561) |
**Qwen2-0.5B | 17-Shot | 79.48% | 69% (206/300) | 86% (1180/1380) | 71% (400/567) |
Here are the results from this notebook:
Prompt | Strategy | Accuracy | Negative | Neutral | Positive |
---|---|---|---|---|---|
A | 0-Shot | 62.41% | 91% (276/303) | 53% (735/1391) | 71% (402/570) |
B | 0-Shot | 47.84% | 90% (274/303) | 57% (789/1391) | 4% (20/570) |
C | 0-Shot | 40.46% | 91% (276/303) | 43% (594/1391) | 8% (46/570) |
D | 0-Shot | 68.29% | 79% (240/303) | 61% (851/1391) | 80% (455/570) |
E | 0-Shot | 51.19% | 97% (293/303) | 28% (396/1391) | 82% (470/570) |
F | 0-Shot | 48.19% | 94% (286/303) | 21% (287/1391) | 91% (518/570) |
G | 0-Shot | 61.09% | 93% (282/303) | 46% (646/1391) | 80% (455/570) |
H | 0-Shot | 65.42% | 85% (257/303) | 57% (798/1391) | 75% (426/570) |
I | 0-Shot | 66.12% | 81% (245/303) | 58% (800/1391) | 79% (452/570) |
J | 3-Shot | 70.94% | 43% (131/302) | 75% (1042/1390) | 76% (431/569) |
K | 3-Shot | 74.88% | 67% (201/302) | 75% (1043/1390) | 79% (449/569) |
L | 3-Shot | 68.11% | 49% (149/302) | 65% (900/1390) | 86% (491/569) |
M | 3-Shot | 56.97% | 49% (149/302) | 45% (625/1390) | 90% (514/569) |
N | 3-Shot | 73.95% | 62% (188/302) | 75% (1038/1390) | 78% (446/569) |
O | 3-Shot | 59.97% | 65% (196/302) | 46% (635/1390) | 92% (525/569) |
P | 6-Shot | 63.91% | 95% (289/303) | 49% (678/1389) | 84% (476/566) |
Q | 6-Shot | 65.72% | 69% (207/302) | 55% (765/1389) | 90% (512/567) |
R | 6-Shot | 64.84% | 94% (285/303) | 49% (686/1387) | 87% (493/568) |
S | 6-Shot | 62.98% | 96% (292/303) | 47% (656/1387) | 83% (474/568) |
T | 6-Shot | 68.87% | 51% (155/302) | 70% (966/1387) | 76% (434/569) |
U | 12-Shot | 65.50% | 53% (159/302) | 59% (820/1386) | 88% (496/564) |
V | 12-Shot | 73.22% | 70% (209/300) | 80% (1103/1386) | 60% (337/566) |
W | 12-Shot | 70.43% | 82% (246/301) | 66% (912/1384) | 75% (428/567) |
X | 12-Shot | 76.60% | 91% (270/298) | 72% (1000/1386) | 80% (455/568) |
Y | 12-Shot | 72.56% | 80% (243/303) | 77% (1069/1381) | 57% (322/568) |
Z | 18-Shot | 71.33% | 50% (150/301) | 75% (1036/1382) | 74% (416/563) |
AA | 17-Shot | 79.48% | 69% (206/300) | 86% (1180/1380) | 71% (400/567) |
AB | 18-Shot | 74.22% | 77% (229/299) | 76% (1054/1381) | 68% (384/566) |
AC | 18-Shot | 68.57% | 49% (148/302) | 73% (1013/1380) | 67% (379/564) |
AD | 18-Shot | 74.98% | 89% (271/303) | 76% (1052/1379) | 64% (361/564) |
AE | 24-Shot | 74.91% | 61% (181/299) | 92% (1267/1375) | 41% (230/566) |
AF | 24-Shot | 73.08% | 37% (112/302) | 91% (1246/1375) | 50% (279/563) |
AG | 24-Shot | 75.00% | 58% (173/300) | 92% (1265/1375) | 43% (242/565) |
AH | 24-Shot | 77.46% | 78% (233/299) | 84% (1153/1375) | 62% (349/566) |
AI | 23-Shot | 75.37% | 48% (143/301) | 92% (1266/1375) | 50% (280/565) |
AJ | 30-Shot | 77.39% | 58% (172/298) | 94% (1284/1370) | 48% (273/566) |
AK | 30-Shot | 67.78% | 63% (187/299) | 61% (844/1375) | 86% (483/560) |
AL | 30-Shot | 76.54% | 58% (173/299) | 86% (1185/1372) | 63% (352/563) |
AM | 30-Shot | 74.84% | 82% (242/296) | 72% (984/1376) | 79% (446/562) |
AN | 30-Shot | 73.81% | 51% (154/300) | 77% (1052/1372) | 79% (443/562) |
AO | 45-Shot | 74.18% | 54% (159/297) | 76% (1034/1366) | 81% (453/556) |
AP | 45-Shot | 78.73% | 63% (186/296) | 87% (1192/1365) | 66% (369/558) |
AQ | 45-Shot | 72.01% | 17% (51/301) | 89% (1210/1359) | 60% (337/559) |
AR | 45-Shot | 73.86% | 53% (157/297) | 80% (1094/1364) | 70% (388/558) |
AS | 45-Shot | 74.94% | 42% (125/297) | 89% (1219/1363) | 57% (319/559) |
AT | 60-Shot | 72.19% | 47% (138/292) | 78% (1055/1356) | 72% (398/556) |
AU | 60-Shot | 76.86% | 43% (127/296) | 91% (1237/1356) | 60% (330/552) |
AV | 60-Shot | 75.45% | 26% (79/299) | 89% (1206/1352) | 68% (378/553) |
AW | 60-Shot | 74.46% | 29% (88/299) | 86% (1157/1349) | 71% (396/556) |
AX | 60-Shot | 79.63% | 62% (179/290) | 94% (1275/1352) | 54% (301/562) |
I’ll start out with a simple instruction.
promptA = """Label the following TEXT with a single word: negative, positive, or neutral
TEXT: {text}"""
print(promptA)
Label the following TEXT with a single word: negative, positive, or neutral
TEXT: {text}
Label the following TEXT with a single word: negative, positive, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
'Negative.'
Good: at least it’s responding with a sensible answer, although it’s not formatted how I’d like it to be, so I expect to need more data cleaning than I did for Qwen2-1.5B-Instruct’s responses.
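As an aside, the attention mask warning above can be avoided by passing the tokenizer’s attention_mask (and an explicit pad_token_id) to generate. A minimal sketch of the change, not what I ran for the results below:

generated_ids = model.generate(
    model_inputs.input_ids,
    attention_mask=model_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=2
)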
At ~35 ms per prompt, it will take about 80 seconds to run inference on the full 2264-item dataset.
35.4 ms ± 472 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Label the following TEXT with a single word: negative, positive, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
0.5B yields messier responses. Note the period at the end of some of the strings. For now I’ll manually check each set of responses and clean them accordingly.
array(['neutral.', 'positive', 'neutral', 'negative', 'positive.',
'negative.', 'negot', 'negative profit', 'net interest', 'teleste',
'neglig'], dtype=object)
0.5B doesn’t do terribly on this simple prompt (62.4% accuracy) but it’s almost 20 percentage points less accurate than 1.5B (~82% accuracy).
0.6241166077738516
0.5B does a great job at classifying negative sentiment, does quite well at positive sentences, and has very few other responses overall.
Instruct: label the following TEXT with a single word: negative, positive, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
label the TEXT with a single word: negative, positive, or neutral
With this prompt (where the instruction is repeated after the dataset text) 0.5B responds much more cleanly.
However, it performs about 15 percentage points worse!
0.47835689045936397
While it’s still quite good with negative sentiment, it performs significantly worse on positive sentences.
I’ll use the same Prompt C as the 1.5B model: a reword of Prompt A (which performed well for 0.5B).
Respond with a single word: negative, positive, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
array(['negative.', 'negative', 'neutral', 'neutral.', 'positive',
'positive.', 'negative loss'], dtype=object)
The change in prompt language significantly degrades 0.5B’s accuracy.
df['responses'] = df['responses'].str.replace('.', '', regex=False)
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.4045936395759717
0.5B still does really well on negative sentiment, but does horribly on positive and is underwhelming on neutral sentences.
I’ll change the order of sentiment listed in Prompt A by putting positive first:
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
array(['neutral.', 'positive', 'positive.', 'neutral', 'net income',
'the text', 'negative', 'negative.', 'negative net', 'negot',
'subscription'], dtype=object)
Changing the order of sentiment (putting positive first) increases the overall accuracy by ~6%.
df['responses'] = df['responses'].str.replace('.', '', regex=False)
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.6828621908127208
0.5B’s performance on negative sentiment dips a bit (36 fewer correct) but that is more than compensated by the increase in correctly classified positive (+53) and neutral (+116) sentences.
I’ll try another combination:
Label the following TEXT with a single word: negative, neutral, or positive
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
array(['negative.', 'positive', 'negative', 'positive.', 'neutral.',
'neutral', 'negative profit', 'negot', 'teleste'], dtype=object)
This ordering of sentiment worsens the accuracy by 10 points.
df['responses'] = df['responses'].str.replace('.', '', regex=False)
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.5119257950530035
0.5B is nearly perfect for negative sentiment, and quite good with positive sentences, but abysmal for neutral.
Trying the next permutation of sentiments:
Label the following TEXT with a single word: positive, neutral, or negative
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
array(['neutral.', 'positive', 'positive.', 'negative', 'negative.',
'neutral', 'positive net', 'negativ', 'negot', 'subscription'],
dtype=object)
This ordering of sentiments further worsens the overall accuracy.
df['responses'] = df['responses'].str.replace('.', '', regex=False)
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.4818904593639576
positive sentences are classified correctly at the highest rate so far, and negative sentiment accuracy is very good, but the model does terribly on neutral sentences.
The next ordering of sentiments:
Label the following TEXT with a single word: neutral, negative, or positive
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
array(['neutral.', 'positive', 'positive.', 'neutral', 'negative',
'negative.', 'positive profit', 'negot'], dtype=object)
The accuracy of 61% is worse than the best-performing Prompt D (68%).
The last ordering of sentiment:
Label the following TEXT with a single word: neutral, positive, or negative
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
array(['positive', 'neutral.', 'positive.', 'neutral', 'negative',
'negative.', 'positive profit', 'negot'], dtype=object)
This yields a 65% accuracy.
I’ll make a small change to my best-performing prompt by adding a period at the end of the instruction.
Label the following TEXT with a single word: positive, negative, or neutral.
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
array(['neutral.', 'positive', 'positive.', 'neutral', 'net income',
'negative', 'negative.', 'negative profit', 'nord', 'negot',
'the text', 'negation', 'neglig', 'subscription'], dtype=object)
Adding a period to the end of the instruction worsens the accuracy a bit.
df['responses'] = df['responses'].str.replace('.', '', regex=False)
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.6612190812720848
Adding a period worsens the performance on neutral by 51 sentences.
I’ll now shift my attention to few-shot prompts, starting with 3-Shot.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2261
})
Since ordering seems to matter, I’ll start with a neutral example, positive example and negative example.
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
examples
[('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
'neutral'),
("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
'positive'),
('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
'negative')]
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
3-Shot prompting resulted in the best accuracy so far! ~71%.
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.709420610349403
Compared to my best 0-Shot Prompt D (68%), this prompt results in the model significantly underperforming on negative sentences (131 < 240), but more than making up for it on neutral sentences (1042 > 851).
I’ll re-order the examples and use the same Prompt J.
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
examples
[('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
'neutral'),
('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
'negative'),
("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
'positive')]
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
Changing the order of examples to neutral, negative, positive increases the overall accuracy to almost 75%!
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.7487837240159222
The model improves on all three sentiments compared to Prompt J.
I’ll re-order the examples and use the same Prompt J.
exclude_idxs = [1, 0, 292]
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
examples
[("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
'positive'),
('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
'neutral'),
('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
'negative')]
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
This ordering of examples drops the accuracy to 68%.
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.6811145510835913
Compared to the best-performing Prompt K, this prompt yields a better accuracy for positive sentences (491 > 449).
I’ll re-order the examples and use the same Prompt J.
exclude_idxs = [1, 292, 0]
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
examples
[("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
'positive'),
('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
'negative'),
('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
'neutral')]
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
This ordering of examples worsens the accuracy, dropping it down to 57%.
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.5696594427244582
This prompt yields better results for positive sentiment (514 > 449) than the best overall performing Prompt K.
Trying the next ordering of sentiments:
exclude_idxs = [292, 0, 1]
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
examples
[('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
'negative'),
('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
'neutral'),
("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
'positive')]
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
This ordering results in the second-highest overall accuracy at 74%.
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.7394957983193278
This prompt performs slightly worse for all three sentiments than the so far best-overall performing Prompt K.
Here’s the final 3-sentiment ordering:
exclude_idxs = [292, 1, 0]
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
examples
[('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
'negative'),
("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
'positive'),
('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
'neutral')]
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
This ordering of examples does not beat my best-performing accuracy so far.
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.599734630694383
This prompt yields a much better performance on positive sentiment than my best performing Prompt K (525 > 449).
Next, I’ll increase the number of examples to 6. Note that I won’t be trying all permutations but a few random ones.
exclude_idxs = [random.randint(0, 2263) for _ in range(6)]
promptP_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
promptP_ds
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2258
})
The random examples I have picked don’t include a negative sentence. I’m curious to see how the model performs on this.
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
[el[1] for el in examples]
['positive', 'neutral', 'positive', 'neutral', 'positive', 'positive']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt results in a worse overall accuracy.
Even though no negative examples were given, this prompt yields considerably more correct negative sentences (289) than the best-performing Prompt K (201).
I’ll try another random set of 6 examples, this time making sure there’s at least one of each sentiment.
exclude_idxs = [random.randint(0, 2263) for _ in range(6)]
promptQ_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
promptQ_ds, [el[1] for el in examples]
(Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2258
}),
['positive', 'positive', 'positive', 'neutral', 'negative', 'neutral'])
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This set of 6 examples does not improve upon the best-overall accuracy of 75%.
Something we haven’t seen in a while: an other response.
exclude_idxs = [random.randint(0, 2263) for _ in range(6)]
promptR_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
promptR_ds, [el[1] for el in examples]
(Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2258
}),
['neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral'])
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
No improvements on accuracy with this prompt.
Compared to the best-performing Prompt K, this prompt yields considerably more correct negative (285 > 201) and positive (493 > 449) sentences but underperforms on neutral sentences (686 < 1043).
exclude_idxs = [random.randint(0, 2263) for _ in range(6)]
promptS_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
promptS_ds, [el[1] for el in examples]
(Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2258
}),
['neutral', 'neutral', 'positive', 'neutral', 'positive', 'neutral'])
This set of examples has no negative sentences and a majority of neutral sentences.
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This set of examples does not improve on the best-overall accuracy of 75% (Prompt K).
It does, however, have a considerably larger number of correctly labeled negative sentences (292 > 201).
I’ll try one more 6-shot prompt before I increase the number of examples.
exclude_idxs = [random.randint(0, 2263) for _ in range(6)]
promptT_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
promptT_ds, [el[1] for el in examples]
(Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2258
}),
['neutral', 'neutral', 'positive', 'neutral', 'negative', 'neutral'])
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Similar to the other 6-Shot examples, this set of examples does not improve on the best overall accuracy.
I’ll now increase the number of examples in the prompt to 12, and try out 5 random sets of 12 examples.
exclude_idxs = [random.randint(0, 2263) for _ in range(12)]
promptU_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
promptU_ds, [el[1] for el in examples]
(Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2252
}),
['positive',
'negative',
'positive',
'neutral',
'positive',
'positive',
'neutral',
'positive',
'neutral',
'neutral',
'positive',
'neutral'])
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Increasing the number of examples to 12, at least the 12 I chose here, doesn’t improve on the best overall accuracy.
The number of correct positive sentences is considerably higher than Prompt K (496 > 449).
exclude_idxs = [random.randint(0, 2263) for _ in range(12)]
promptV_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
promptV_ds, [el[1] for el in examples]
(Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2252
}),
['neutral',
'positive',
'positive',
'neutral',
'neutral',
'negative',
'negative',
'neutral',
'positive',
'neutral',
'positive',
'negative'])
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .
This prompt performs well, and competes with but doesn’t improve upon the best overall accuracy of 75%.
This prompt performs considerably better on neutral sentences than Prompt K (1103 > 1043).
exclude_idxs = [random.randint(0, 2263) for _ in range(12)]
promptW_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
promptW_ds, [el[1] for el in examples]
(Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2252
}),
['neutral',
'negative',
'neutral',
'positive',
'neutral',
'neutral',
'positive',
'neutral',
'neutral',
'negative',
'positive',
'neutral'])
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The accuracy worsens with this set of 12 examples.
def get_ds(n):
    exclude_idxs = [random.randint(0, 2263) for _ in range(n)]
    prompt_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
    examples = []
    for idx in exclude_idxs:
        examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
    print(prompt_ds, [el[1] for el in examples])
    return prompt_ds, examples
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2252
}) ['positive', 'neutral', 'negative', 'neutral', 'positive', 'negative', 'neutral', 'negative', 'negative', 'neutral', 'neutral', 'negative']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Aha! This prompt improves upon the best overall accuracy, reaching about 77%.
Compared to Prompt K (75%), this prompt performs worse on neutral sentences (1000 < 1043) but more than makes up for it on negative (270 > 201) and positive (455 > 449) sentences.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2252
}) ['neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt does not improve on the best overall accuracy.
This prompt performs well on negative and neutral sentences but its worse performance on positive sentences brings down the overall accuracy.
Next, I’ll try 5 prompts with 18 examples.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2246
}) ['neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'positive', 'negative', 'negative', 'positive', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt does not improve upon the best overall accuracy.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2247
}) ['neutral', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'positive', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This set of 18 examples increases the best overall accuracy to almost 80%!
Compared to Prompt X, this prompt performs worse on negative (206 < 270) and positive (400 < 455) but more than makes up for it on neutral sentences (1180 > 1000).
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2246
}) ['neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'negative', 'neutral', 'negative', 'neutral', 'negative', 'neutral', 'positive', 'positive']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt does not improve upon the best overall accuracy.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2246
}) ['neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'positive', 'neutral', 'positive', 'neutral', 'positive', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt does not improve upon the best overall accuracy.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2246
}) ['neutral', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'positive']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt does not improve upon the best overall accuracy.
This prompt yields considerably more correct negative sentences (271 > 206) than the best-performing Prompt AA.
Next, I’ll try 5 prompts with 24 examples each.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2240
}) ['neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'negative']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Increasing the number of examples to 24 (at least for these 24 examples) does not improve upon the overall accuracy.
Compared to the best performing Prompt AA, this prompt yields considerably more correct neutral sentences (1267 > 1180).
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2240
}) ['positive', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt doesn’t improve upon the best overall accuracy, and performs better than Prompt AA on neutral sentences.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2240
}) ['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The same trend continues for this set of 24 examples.
Two more 24-Shot prompts to go.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2240
}) ['positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'negative', 'neutral', 'neutral', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt does not improve upon the best overall accuracy (though it comes close).
This prompt yields more correct negative sentences than Prompt AA (233 > 206).
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2241
}) ['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'positive', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt does not improve upon the best overall accuracy.
This prompt yields considerably more correct neutral sentences than the best performing Prompt AA (1266 > 1180).
Next, I’ll try 5 different 30-Shot prompts.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2234
}) ['neutral', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'neutral', 'negative', 'neutral', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt doesn’t improve the best overall accuracy.
As seems to be the trend, this prompt results in more correct neutral responses (1284) than Prompt AA (1180).
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2234
}) ['neutral', 'neutral', 'neutral', 'negative', 'negative', 'positive', 'neutral', 'neutral', 'positive', 'positive']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The model performs considerably worse with these 30 examples.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2234
}) ['positive', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The trend continues: the overall accuracy doesn’t improve but the model’s performance on neutral sentences does.
Two more 30-Shot prompts to go.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2234
}) ['negative', 'neutral', 'positive', 'negative', 'neutral', 'neutral', 'positive', 'negative', 'positive', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The overall accuracy doesn’t improve but the model’s performance on negative and positive sentences does.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2234
}) ['positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The overall accuracy doesn’t improve but the model’s performance on positive sentences does.
Next, I’ll increase the number of examples to 45.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2219
}) ['neutral', 'neutral', 'positive', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'positive', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The overall accuracy doesn’t improve but the model’s performance on positive sentences does.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2219
}) ['neutral', 'positive', 'negative', 'neutral', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'positive']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The overall accuracy doesn’t improve but the model’s performance on neutral sentences does.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2219
}) ['neutral', 'neutral', 'negative', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The overall accuracy doesn’t improve but the model’s performance on neutral sentences does.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2219
}) ['neutral', 'negative', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'positive']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt performs worse than the best overall Prompt AA.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2219
}) ['neutral', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .
Compared to Prompt AA, this prompt yields a worse overall accuracy but improves on neutral sentences (1291 > 1180).
Next, I’ll move on to the final number of examples: 60.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2204
}) ['positive', 'neutral', 'neutral', 'negative', 'negative', 'neutral', 'neutral', 'neutral', 'negative', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Upping the number of examples to 60 does not improve results.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2204
}) ['neutral', 'positive', 'neutral', 'positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Compared to Prompt AA, this prompt yields a worse overall accuracy but improves on neutral sentences (1237 > 1180).
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2204
}) ['positive', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'positive', 'negative', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Compared to Prompt AA, this prompt yields a worse overall accuracy but improves on neutral sentences (1206 > 1180).
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2204
}) ['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
This prompt does not improve upon Prompt AA results.
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2204
}) ['neutral', 'neutral', 'negative', 'negative', 'neutral', 'negative', 'positive', 'neutral', 'positive', 'negative']
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Aha! We finally improve on the overall accuracy of Prompt AA. This prompt yields a slightly higher accuracy that still rounds off to 80%.
While 60-Shot Prompt AX had a slightly higher accuracy (79.63%), I am going to pick the 17-Shot Prompt AA as my best prompt (79.48%) since it uses less than a third of the examples, which translates to about a third of the tokens and thus quicker response generation.
def test_gen(examples):
    few_shot_examples = []
    for example in examples:
        few_shot_examples.append({"role": "user", "content": promptJ.format(text=example[0])})
        few_shot_examples.append({"role": "assistant", "content": example[1]})

    messages = few_shot_examples + [{"role": "user", "content": promptJ.format(text=dataset[0]['sentence'])}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=2
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip().lower()
    return response
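The 72 ms figure below presumably comes from timing a single call to test_gen (my reconstruction of the cell; the examples come from a get_ds(18) call whose output is shown next):

%%timeit
test_gen(examples)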
Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2246
}) ['negative', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'positive']
One response generation takes ~72 ms, so running inference on the full dataset 10 times will take about 30 minutes.
72 ms ± 42.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
few_shot_responses function

def few_shot_responses(dataset, prompt, examples):
    responses = []
    dataset = dataset.map(add_prompt, fn_kwargs={"prompt": prompt})

    few_shot_examples = []
    for example in examples:
        few_shot_examples.append({"role": "user", "content": prompt.format(text=example[0])})
        few_shot_examples.append({"role": "assistant", "content": example[1]})

    for row in dataset:
        messages = few_shot_examples + [{"role": "user", "content": row['prompt']}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(device)

        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=2
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip().lower()
        responses.append(response)

    # calculate accuracy
    df = dataset.to_pandas()
    df['responses'] = pd.Series(responses)
    df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
    df['lm_match'] = df['label_text'] == df['responses']
    acc = df.lm_match.mean()
    return df, acc
I didn’t store the exact 18 examples that I used the first time for Prompt AA, so I had to try different 18-Shot examples until I achieved an accuracy close to 79.48%. It took me about 20 tries, but I finally found a set of examples that broke the 79% threshold.
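A sketch of that retry loop (the actual cell isn’t shown; this just strings together the helpers defined above, and the variable names are mine):

# keep drawing random 18-example sets until accuracy gets back near the original 79.48%
acc = 0
while acc < 0.79:
    promptAA_ds, examples = get_ds(18)
    df, acc = few_shot_responses(promptAA_ds, promptJ, examples)
print(acc)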
(Dataset({
features: ['sentence', 'label', 'label_text', '__index_level_0__'],
num_rows: 2246
}),
18)
For this prompt, the overall accuracy ranges from 80.8% to 82.4%.
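That range comes from re-running the same prompt and example set a few times; generation here isn’t deterministic (Qwen2-Instruct’s default generation config samples rather than decoding greedily, as far as I can tell), so accuracy moves around a bit from run to run. Roughly:

# my reconstruction: repeat the evaluation with the recovered example set and record the spread
accs = [few_shot_responses(promptAA_ds, promptJ, examples)[1] for _ in range(5)]
print(min(accs), max(accs))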
Takeaways from my Qwen2-0.5B experiments:
Here are the results of Qwen2-0.5B in the context of the other models that I have experimented with:
Model | Prompting Strategy | Overall Accuracy | negative | neutral | positive |
---|---|---|---|---|---|
claude-3-5-sonnet-20240620 | 3-Shot | 94.78% | 98% (297/303) | 94% (1302/1391) | 95% (544/570) |
claude-3-opus-20240229 | 0-Shot | 94.13% | 98% (297/303) | 96% (1333/1391) | 88% (501/570) |
phi-3.5 | 20-Shot | 93.94% | 96% (286/299) | 98% (1355/1379) | 83% (467/566) |
phi-3 | 30-Shot w/System Prompt | 92.79% | 98% (290/297) | 94% (1284/1373) | 88% (499/564) |
claude-3-haiku-20240307 | 3-Shot | 92.39% | 90% (272/303) | 91% (1267/1391) | 96% (550/570) |
phi-2 | 6-Shot | 91.94% | 88% (267/302) | 94% (1299/1387) | 90% (510/569) |
Qwen2-1.5B | 27-Shot | 86.10% | 90% (264/294) | 96% (1320/1382) | 61% (342/561) |
**Qwen2-0.5B | 17-Shot | 79.48% | 69% (206/300) | 86% (1180/1380) | 71% (400/567) |
Here are the results from this notebook:
Prompt | Strategy | Accuracy | Negative | Neutral | Positive |
---|---|---|---|---|---|
A | 0-Shot | 62.41% | 91% (276/303) | 53% (735/1391) | 71% (402/570) |
B | 0-Shot | 47.84% | 90% (274/303) | 57% (789/1391) | 4% (20/570) |
C | 0-Shot | 40.46% | 91% (276/303) | 43% (594/1391) | 8% (46/570) |
D | 0-Shot | 68.29% | 79% (240/303) | 61% (851/1391) | 80% (455/570) |
E | 0-Shot | 51.19% | 97% (293/303) | 28% (396/1391) | 82% (470/570) |
F | 0-Shot | 48.19% | 94% (286/303) | 21% (287/1391) | 91% (518/570) |
G | 0-Shot | 61.09% | 93% (282/303) | 46% (646/1391) | 80% (455/570) |
H | 0-Shot | 65.42% | 85% (257/303) | 57% (798/1391) | 75% (426/570) |
I | 0-Shot | 66.12% | 81% (245/303) | 58% (800/1391) | 79% (452/570) |
J | 3-Shot | 70.94% | 43% (131/302) | 75% (1042/1390) | 76% (431/569) |
K | 3-Shot | 74.88% | 67% (201/302) | 75% (1043/1390) | 79% (449/569) |
L | 3-Shot | 68.11% | 49% (149/302) | 65% (900/1390) | 86% (491/569) |
M | 3-Shot | 56.97% | 49% (149/302) | 45% (625/1390) | 90% (514/569) |
N | 3-Shot | 73.95% | 62% (188/302) | 75% (1038/1390) | 78% (446/569) |
O | 3-Shot | 59.97% | 65% (196/302) | 46% (635/1390) | 92% (525/569) |
P | 6-Shot | 63.91% | 95% (289/303) | 49% (678/1389) | 84% (476/566) |
Q | 6-Shot | 65.72% | 69% (207/302) | 55% (765/1389) | 90% (512/567) |
R | 6-Shot | 64.84% | 94% (285/303) | 49% (686/1387) | 87% (493/568) |
S | 6-Shot | 62.98% | 96% (292/303) | 47% (656/1387) | 83% (474/568) |
T | 6-Shot | 68.87% | 51% (155/302) | 70% (966/1387) | 76% (434/569) |
U | 12-Shot | 65.50% | 53% (159/302) | 59% (820/1386) | 88% (496/564) |
V | 12-Shot | 73.22% | 70% (209/300) | 80% (1103/1386) | 60% (337/566) |
W | 12-Shot | 70.43% | 82% (246/301) | 66% (912/1384) | 75% (428/567) |
X | 12-Shot | 76.60% | 91% (270/298) | 72% (1000/1386) | 80% (455/568) |
Y | 12-Shot | 72.56% | 80% (243/303) | 77% (1069/1381) | 57% (322/568) |
Z | 18-Shot | 71.33% | 50% (150/301) | 75% (1036/1382) | 74% (416/563) |
AA | 17-Shot | 79.48% | 69% (206/300) | 86% (1180/1380) | 71% (400/567) |
AB | 18-Shot | 74.22% | 77% (229/299) | 76% (1054/1381) | 68% (384/566) |
AC | 18-Shot | 68.57% | 49% (148/302) | 73% (1013/1380) | 67% (379/564) |
AD | 18-Shot | 74.98% | 89% (271/303) | 76% (1052/1379) | 64% (361/564) |
AE | 24-Shot | 74.91% | 61% (181/299) | 92% (1267/1375) | 41% (230/566) |
AF | 24-Shot | 73.08% | 37% (112/302) | 91% (1246/1375) | 50% (279/563) |
AG | 24-Shot | 75.00% | 58% (173/300) | 92% (1265/1375) | 43% (242/565) |
AH | 24-Shot | 77.46% | 78% (233/299) | 84% (1153/1375) | 62% (349/566) |
AI | 23-Shot | 75.37% | 48% (143/301) | 92% (1266/1375) | 50% (280/565) |
AJ | 30-Shot | 77.39% | 58% (172/298) | 94% (1284/1370) | 48% (273/566) |
AK | 30-Shot | 67.78% | 63% (187/299) | 61% (844/1375) | 86% (483/560) |
AL | 30-Shot | 76.54% | 58% (173/299) | 86% (1185/1372) | 63% (352/563) |
AM | 30-Shot | 74.84% | 82% (242/296) | 72% (984/1376) | 79% (446/562) |
AN | 30-Shot | 73.81% | 51% (154/300) | 77% (1052/1372) | 79% (443/562) |
AO | 45-Shot | 74.18% | 54% (159/297) | 76% (1034/1366) | 81% (453/556) |
AP | 45-Shot | 78.73% | 63% (186/296) | 87% (1192/1365) | 66% (369/558) |
AQ | 45-Shot | 72.01% | 17% (51/301) | 89% (1210/1359) | 60% (337/559) |
AR | 45-Shot | 73.86% | 53% (157/297) | 80% (1094/1364) | 70% (388/558) |
AS | 45-Shot | 74.94% | 42% (125/297) | 89% (1219/1363) | 57% (319/559) |
AT | 60-Shot | 72.19% | 47% (138/292) | 78% (1055/1356) | 72% (398/556) |
AU | 60-Shot | 76.86% | 43% (127/296) | 91% (1237/1356) | 60% (330/552) |
AV | 60-Shot | 75.45% | 26% (79/299) | 89% (1206/1352) | 68% (378/553) |
AW | 60-Shot | 74.46% | 29% (88/299) | 86% (1157/1349) | 71% (396/556) |
AX | 60-Shot | 79.63% | 62% (179/290) | 94% (1275/1352) | 54% (301/562) |