Sentiment Classification with Qwen2-0.5B-Instruct

python
LLM
TinySentiment
In this blog post I use Qwen2-0.5B-Instruct to classify sentiment in the financial_phrasebank dataset with 79.5% accuracy.
Author

Vishal Bakshi

Published

November 18, 2024

Show pip installs
!pip install transformers -Uqq
!pip install accelerate -qq
!pip install torch==2.2.2 -qq
!pip install datasets~=2.16.1 -qq
!pip install scikit-learn==1.2 -qq
Show imports
from datasets import load_dataset, Dataset
import pandas as pd, numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
import random

from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
Show dataset loading functions
# load dataset
dataset = load_dataset(
    "financial_phrasebank", "sentences_allagree", 
    split="train"  # note that the dataset does not have a default test split
)

# create a new column with the numeric label verbalised as label_text (e.g. "positive" instead of "0")
label_map = {i: label_text for i, label_text in enumerate(dataset.features["label"].names)}

def add_label_text(example):
    example["label_text"] = label_map[example["label"]]
    return example

dataset = dataset.map(add_label_text)

print(dataset)
Show generate_response function
def generate_response(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=2
    )
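    # drop the prompt tokens so only the newly generated completion remains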
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response
Show add_prompt and generate_responses functions
def add_prompt(item, prompt):
    item['prompt'] = prompt.format(text=item['sentence'])
    return item
    
def generate_responses(dataset, prompt):
    responses = []
    dataset = dataset.map(add_prompt, fn_kwargs={"prompt": prompt})
    print(dataset[0]['prompt'])
    
    for row in dataset:
        messages = [
            {"role": "user", "content": row['prompt']}
        ]
        
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(device)

        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=2
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip().lower()
        responses.append(response)
        
    # calculate accuracy
    df = dataset.to_pandas()
    df['responses'] = pd.Series(responses)
    #df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
    df['lm_match'] = df['label_text'] == df['responses']
    acc = df.lm_match.mean()
    
    return df, acc
Show make_cm function
def make_cm(df):
    """Create confusion matrix for true vs predicted sentiment classes"""
    
    cm = confusion_matrix(y_true=df['label_text'], y_pred=df['responses'], labels=['negative', 'neutral', 'positive', 'other'])
    disp = ConfusionMatrixDisplay(cm, display_labels=['negative', 'neutral', 'positive', 'other'])
    
    # I chose 8x8 so it fits on one screen but still is large
    fig, ax = plt.subplots(figsize=(8,8))
    disp.plot(ax=ax,text_kw={'fontsize': 16}, cmap='Blues', colorbar=False);
    
    # change label font size without changing label text
    ax.xaxis.label.set_fontsize(18)
    ax.yaxis.label.set_fontsize(18)
    
    # make tick labels larger
    ax.tick_params(axis='y', labelsize=16)
    ax.tick_params(axis='x', labelsize=16)
Show ds_subset function
def ds_subset(dataset, exclude_idxs, columns=[0, 1, 2]):
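    # keep every row except exclude_idxs; columns [0, 1, 2] select sentence, label, and label_text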
    idxs = list(range(len(dataset)))
    idxs = [x for x in idxs if x not in exclude_idxs]
    ddf = dataset.to_pandas()
    new_ds = Dataset.from_pandas(ddf.iloc[idxs, columns])
    return new_ds
Show few_shot_responses function
def few_shot_responses(dataset, prompt, examples):
    responses = []
    dataset = dataset.map(add_prompt, fn_kwargs={"prompt": prompt})
    print(dataset[0]['prompt'])
    
    few_shot_examples = []
    
    for example in examples:
        few_shot_examples.append({"role": "user", "content": prompt.format(text=example[0])})
        few_shot_examples.append({"role": "assistant", "content": example[1]})
    
    for row in dataset:
        messages = few_shot_examples + [{"role": "user", "content": row['prompt']}]
        
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(device)

        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=2
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip().lower()
        responses.append(response)
        
    df = dataset.to_pandas()
    df['responses'] = pd.Series(responses)
    
    return df
Show get_acc function
def get_acc(df):
    df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
    df['lm_match'] = df['label_text'] == df['responses']
    acc = df.lm_match.mean()
    return acc
Show get_ds function
def get_ds(n):
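    # sample n example indices at random; random.randint can repeat an index,
    # so the remaining subset can be slightly larger than len(dataset) - n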
    exclude_idxs = [random.randint(0, 2263) for _ in range(n)]
    prompt_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)

    examples = []
    for idx in exclude_idxs:
        examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
        
    print(prompt_ds, [el[1] for el in examples[:10]])
    
    return prompt_ds, examples

Background

In this notebook I’ll use Qwen2-0.5B-Instruct to classify sentiment in the financial_phrasebank dataset. In previous notebooks I have performed sentiment classification with Qwen2-1.5B-Instruct, phi-2, phi-3, phi-3.5, and the Claude series.

This notebook is part of a series of blog posts for a project I’m working on called TinySentiment, where I’m experimenting with tiny models to improve their ability to classify sentiment in the financial_phrasebank dataset. I was inspired to do so after reading this blog post and this corresponding notebook by Moritz Laurer as part of a fastai study group last year.

Here are the results from my experiments so far (**the best-performing prompt from this notebook):

| Model | Prompting Strategy | Overall Accuracy | negative | neutral | positive |
|---|---|---|---|---|---|
| claude-3-5-sonnet-20240620 | 3-Shot | 94.78% | 98% (297/303) | 94% (1302/1391) | 95% (544/570) |
| claude-3-opus-20240229 | 0-Shot | 94.13% | 98% (297/303) | 96% (1333/1391) | 88% (501/570) |
| phi-3.5 | 20-Shot | 93.94% | 96% (286/299) | 98% (1355/1379) | 83% (467/566) |
| phi-3 | 30-Shot w/System Prompt | 92.79% | 98% (290/297) | 94% (1284/1373) | 88% (499/564) |
| claude-3-haiku-20240307 | 3-Shot | 92.39% | 90% (272/303) | 91% (1267/1391) | 96% (550/570) |
| phi-2 | 6-Shot | 91.94% | 88% (267/302) | 94% (1299/1387) | 90% (510/569) |
| Qwen2-1.5B | 27-Shot | 86.10% | 90% (264/294) | 96% (1320/1382) | 61% (342/561) |
| **Qwen2-0.5B | 17-Shot | 79.48% | 69% (206/300) | 86% (1180/1380) | 71% (400/567) |

Here are the results from this notebook:

| Prompt | Strategy | Accuracy | Negative | Neutral | Positive |
|---|---|---|---|---|---|
| A | 0-Shot | 62.41% | 91% (276/303) | 53% (735/1391) | 71% (402/570) |
| B | 0-Shot | 47.84% | 90% (274/303) | 57% (789/1391) | 4% (20/570) |
| C | 0-Shot | 40.46% | 91% (276/303) | 43% (594/1391) | 8% (46/570) |
| D | 0-Shot | 68.29% | 79% (240/303) | 61% (851/1391) | 80% (455/570) |
| E | 0-Shot | 51.19% | 97% (293/303) | 28% (396/1391) | 82% (470/570) |
| F | 0-Shot | 48.19% | 94% (286/303) | 21% (287/1391) | 91% (518/570) |
| G | 0-Shot | 61.09% | 93% (282/303) | 46% (646/1391) | 80% (455/570) |
| H | 0-Shot | 65.42% | 85% (257/303) | 57% (798/1391) | 75% (426/570) |
| I | 0-Shot | 66.12% | 81% (245/303) | 58% (800/1391) | 79% (452/570) |
| J | 3-Shot | 70.94% | 43% (131/302) | 75% (1042/1390) | 76% (431/569) |
| K | 3-Shot | 74.88% | 67% (201/302) | 75% (1043/1390) | 79% (449/569) |
| L | 3-Shot | 68.11% | 49% (149/302) | 65% (900/1390) | 86% (491/569) |
| M | 3-Shot | 56.97% | 49% (149/302) | 45% (625/1390) | 90% (514/569) |
| N | 3-Shot | 73.95% | 62% (188/302) | 75% (1038/1390) | 78% (446/569) |
| O | 3-Shot | 59.97% | 65% (196/302) | 46% (635/1390) | 92% (525/569) |
| P | 6-Shot | 63.91% | 95% (289/303) | 49% (678/1389) | 84% (476/566) |
| Q | 6-Shot | 65.72% | 69% (207/302) | 55% (765/1389) | 90% (512/567) |
| R | 6-Shot | 64.84% | 94% (285/303) | 49% (686/1387) | 87% (493/568) |
| S | 6-Shot | 62.98% | 96% (292/303) | 47% (656/1387) | 83% (474/568) |
| T | 6-Shot | 68.87% | 51% (155/302) | 70% (966/1387) | 76% (434/569) |
| U | 12-Shot | 65.50% | 53% (159/302) | 59% (820/1386) | 88% (496/564) |
| V | 12-Shot | 73.22% | 70% (209/300) | 80% (1103/1386) | 60% (337/566) |
| W | 12-Shot | 70.43% | 82% (246/301) | 66% (912/1384) | 75% (428/567) |
| X | 12-Shot | 76.60% | 91% (270/298) | 72% (1000/1386) | 80% (455/568) |
| Y | 12-Shot | 72.56% | 80% (243/303) | 77% (1069/1381) | 57% (322/568) |
| Z | 18-Shot | 71.33% | 50% (150/301) | 75% (1036/1382) | 74% (416/563) |
| AA | 17-Shot | 79.48% | 69% (206/300) | 86% (1180/1380) | 71% (400/567) |
| AB | 18-Shot | 74.22% | 77% (229/299) | 76% (1054/1381) | 68% (384/566) |
| AC | 18-Shot | 68.57% | 49% (148/302) | 73% (1013/1380) | 67% (379/564) |
| AD | 18-Shot | 74.98% | 89% (271/303) | 76% (1052/1379) | 64% (361/564) |
| AE | 24-Shot | 74.91% | 61% (181/299) | 92% (1267/1375) | 41% (230/566) |
| AF | 24-Shot | 73.08% | 37% (112/302) | 91% (1246/1375) | 50% (279/563) |
| AG | 24-Shot | 75.00% | 58% (173/300) | 92% (1265/1375) | 43% (242/565) |
| AH | 24-Shot | 77.46% | 78% (233/299) | 84% (1153/1375) | 62% (349/566) |
| AI | 23-Shot | 75.37% | 48% (143/301) | 92% (1266/1375) | 50% (280/565) |
| AJ | 30-Shot | 77.39% | 58% (172/298) | 94% (1284/1370) | 48% (273/566) |
| AK | 30-Shot | 67.78% | 63% (187/299) | 61% (844/1375) | 86% (483/560) |
| AL | 30-Shot | 76.54% | 58% (173/299) | 86% (1185/1372) | 63% (352/563) |
| AM | 30-Shot | 74.84% | 82% (242/296) | 72% (984/1376) | 79% (446/562) |
| AN | 30-Shot | 73.81% | 51% (154/300) | 77% (1052/1372) | 79% (443/562) |
| AO | 45-Shot | 74.18% | 54% (159/297) | 76% (1034/1366) | 81% (453/556) |
| AP | 45-Shot | 78.73% | 63% (186/296) | 87% (1192/1365) | 66% (369/558) |
| AQ | 45-Shot | 72.01% | 17% (51/301) | 89% (1210/1359) | 60% (337/559) |
| AR | 45-Shot | 73.86% | 53% (157/297) | 80% (1094/1364) | 70% (388/558) |
| AS | 45-Shot | 74.94% | 42% (125/297) | 89% (1219/1363) | 57% (319/559) |
| AT | 60-Shot | 72.19% | 47% (138/292) | 78% (1055/1356) | 72% (398/556) |
| AU | 60-Shot | 76.86% | 43% (127/296) | 91% (1237/1356) | 60% (330/552) |
| AV | 60-Shot | 75.45% | 26% (79/299) | 89% (1206/1352) | 68% (378/553) |
| AW | 60-Shot | 74.46% | 29% (88/299) | 86% (1157/1349) | 71% (396/556) |
| AX | 60-Shot | 79.63% | 62% (179/290) | 94% (1275/1352) | 54% (301/562) |

Prompt A

I’ll start out with a simple instruction.

promptA = """Label the following TEXT with a single word: negative, positive, or neutral
TEXT: {text}"""

print(promptA)
Label the following TEXT with a single word: negative, positive, or neutral
TEXT: {text}
formatted_prompt = promptA.format(text=dataset[0]['sentence'])
print(formatted_prompt)
Label the following TEXT with a single word: negative, positive, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
generate_response(formatted_prompt)
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
'Negative.'

Good—at least it’s responding with a sensible answer, although it’s not formatted how I’d like it to be, so I expect these responses will need more data cleaning than Qwen2-1.5B-Instruct’s did.
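As an aside, the attention-mask warning above can be avoided by passing the full tokenizer output (which includes attention_mask) to generate. Here’s a minimal sketch, assuming the same model and tokenizer objects loaded earlier (generate_response_masked is a hypothetical variant, not a cell from the original notebook):

def generate_response_masked(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    # passing input_ids and attention_mask together silences the warning
    generated_ids = model.generate(**model_inputs, max_new_tokens=2)
    new_tokens = generated_ids[:, model_inputs.input_ids.shape[1]:]
    return tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]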

At ~35ms per prompt it will take about 80 seconds to run inference on the full 2264 item dataset.

%timeit -n 10 generate_response(formatted_prompt)
35.4 ms ± 472 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
df, acc = generate_responses(dataset, promptA)
Label the following TEXT with a single word: negative, positive, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)

0.5B yields messier responses. Note the period at the end of some of the strings. For now I’ll manually check each set of responses and clean them accordingly.

df['responses'].unique()
array(['neutral.', 'positive', 'neutral', 'negative', 'positive.',
       'negative.', 'negot', 'negative profit', 'net interest', 'teleste',
       'neglig'], dtype=object)
df['responses'] = df['responses'].str.replace('.', '', regex=False) 
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
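Since this same cleanup repeats for every 0-Shot prompt below, it could be wrapped in a small helper. This is only a sketch of that idea (the notebook keeps the two lines inline, and the later get_acc function handles just the "other" mapping):

def clean_responses(df):
    # strip trailing periods and map anything that isn't a valid label to "other"
    df['responses'] = df['responses'].str.replace('.', '', regex=False)
    df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
    return df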

0.5B doesn’t do terribly on this simple prompt (62.4% accuracy) but it’s almost 20% less accurate than 1.5B (~82% accuracy).

df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.6241166077738516

0.5B does a great job at classifying negative sentiment, does quite well at positive sentences, and has very few other responses overall.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_A.csv', index=False)

Prompt B

promptB = """Instruct: label the following TEXT with a single word: negative, positive, or neutral
TEXT: {text}
label the TEXT with a single word: negative, positive, or neutral"""
df, acc = generate_responses(dataset, promptB)
Instruct: label the following TEXT with a single word: negative, positive, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
label the TEXT with a single word: negative, positive, or neutral

With this prompt (where the instruction is repeated after the dataset text) 0.5B responds much more cleanly.

df['responses'].unique()
array(['negative', 'neutral', 'positive'], dtype=object)

However, it performs almost 15 percentage points worse!

df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.47835689045936397

While it’s quite good still with negative sentiment, it performs significantly worse on positive sentences.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_B.csv', index=False)

Prompt C

I’ll use the same Prompt C as the 1.5B model: a reword of Prompt A (which performed well for 0.5B).

promptC = """Respond with a single word: negative, positive, or neutral
TEXT: {text}"""
df, acc = generate_responses(dataset, promptC)
Respond with a single word: negative, positive, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['negative.', 'negative', 'neutral', 'neutral.', 'positive',
       'positive.', 'negative loss'], dtype=object)

The change in prompt language significantly deteriorates 0.5B’s accuracy.

df['responses'] = df['responses'].str.replace('.', '', regex=False) 
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.4045936395759717

0.5B still does really well on negative sentiment, but does horribly on positive sentences and is underwhelming on neutral ones.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_C.csv', index=False)

Prompt D

I’ll change the order of sentiment listed in Prompt A by putting positive first:

promptD = """Label the following TEXT with a single word: positive, negative, or neutral
TEXT: {text}"""
df, acc = generate_responses(dataset, promptD)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral.', 'positive', 'positive.', 'neutral', 'net income',
       'the text', 'negative', 'negative.', 'negative net', 'negot',
       'subscription'], dtype=object)

Changing the order of sentiment (putting positive first) increases the overall accuracy by ~6%.

df['responses'] = df['responses'].str.replace('.', '', regex=False) 
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.6828621908127208

0.5B’s performance on negative sentiment dips a bit (36 fewer correct) but that is more than compensated for by the increase in correctly classified positive (+53) and neutral (+116) sentences.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_D.csv', index=False)

Prompt E

I’ll try another combination:

promptE = """Label the following TEXT with a single word: negative, neutral, or positive
TEXT: {text}"""
df, acc = generate_responses(dataset, promptE)
Label the following TEXT with a single word: negative, neutral, or positive
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['negative.', 'positive', 'negative', 'positive.', 'neutral.',
       'neutral', 'negative profit', 'negot', 'teleste'], dtype=object)

This ordering of sentiment drops the accuracy to about 51%, roughly 11 points below Prompt A.

df['responses'] = df['responses'].str.replace('.', '', regex=False) 
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.5119257950530035

0.5B is nearly perfect for negative sentiment, and quite good with positive sentences, but abysmal for neutral.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_E.csv', index=False)

Prompt F

Trying the next permutation of sentiments:

promptF = """Label the following TEXT with a single word: positive, neutral, or negative
TEXT: {text}"""
df, acc = generate_responses(dataset, promptF)
Label the following TEXT with a single word: positive, neutral, or negative
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral.', 'positive', 'positive.', 'negative', 'negative.',
       'neutral', 'positive net', 'negativ', 'negot', 'subscription'],
      dtype=object)

This ordering of sentiments further worsens the overall accuracy.

df['responses'] = df['responses'].str.replace('.', '', regex=False) 
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.4818904593639576

Positive sentences are classified correctly at the highest rate so far, and negative sentiment accuracy is very good, but the model does terribly on neutral sentences.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_F.csv', index=False)

Prompt G

The next ordering of sentiments:

promptG = """Label the following TEXT with a single word: neutral, negative, or positive
TEXT: {text}"""
df, acc = generate_responses(dataset, promptG)
Label the following TEXT with a single word: neutral, negative, or positive
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral.', 'positive', 'positive.', 'neutral', 'negative',
       'negative.', 'positive profit', 'negot'], dtype=object)

The accuracy of 61% is worse than the best-performing Prompt D (68%).

df['responses'] = df['responses'].str.replace('.', '', regex=False) 
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.6108657243816255
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_G.csv', index=False)

Prompt H

The last ordering of sentiment:

promptH = """Label the following TEXT with a single word: neutral, positive, or negative
TEXT: {text}"""
df, acc = generate_responses(dataset, promptH)
Label the following TEXT with a single word: neutral, positive, or negative
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['positive', 'neutral.', 'positive.', 'neutral', 'negative',
       'negative.', 'positive profit', 'negot'], dtype=object)

This yields a 65% accuracy.

df['responses'] = df['responses'].str.replace('.', '', regex=False) 
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.6541519434628975
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_H.csv', index=False)

Prompt I

I’ll make a small change to my best-performing prompt by adding a period at the end of the instruction.

promptI = """Label the following TEXT with a single word: positive, negative, or neutral.
TEXT: {text}"""
df, acc = generate_responses(dataset, promptI)
Label the following TEXT with a single word: positive, negative, or neutral.
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral.', 'positive', 'positive.', 'neutral', 'net income',
       'negative', 'negative.', 'negative profit', 'nord', 'negot',
       'the text', 'negation', 'neglig', 'subscription'], dtype=object)

Adding a period to the end of the instruction worsens the accuracy a bit.

df['responses'] = df['responses'].str.replace('.', '', regex=False) 
df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.6612190812720848

Adding a period worsens the performance on neutral by 51 sentences.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_I.csv', index=False)

Prompt J

I’ll now shift my attention to few-shot prompts, starting with 3-Shot.

exclude_idxs = [0, 1, 292]
promptJ_ds = ds_subset(dataset, exclude_idxs)
promptJ_ds
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2261
})
promptJ = """Label the following TEXT with a single word: positive, negative, or neutral
TEXT: {text}"""

Since ordering seems to matter, I’ll start with a neutral example, then a positive example, then a negative example.

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))

examples
[('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
  'neutral'),
 ("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
  'positive'),
 ('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
  'negative')]
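Before running the full loop, it can be helpful to see exactly how few_shot_responses will serialize these examples into Qwen2’s chat template. The snippet below mirrors the message construction inside that function purely for inspection; it isn’t a cell from the original run:

messages = []
for sentence, label in examples:
    messages.append({"role": "user", "content": promptJ.format(text=sentence)})
    messages.append({"role": "assistant", "content": label})
messages.append({"role": "user", "content": promptJ.format(text=promptJ_ds[0]['sentence'])})
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))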
df = few_shot_responses(promptJ_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
df['responses'].unique()
array(['positive', 'neutral', 'negative'], dtype=object)

3-Shot prompting resulted in the best accuracy so far! ~71%.

df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.709420610349403

Compared to my best 0-Shot Prompt D (68%), this prompt results in the model significantly underperforming on negative sentences (131 < 240), but more than making up for it on neutral sentences (1042 > 851).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_J.csv', index=False)

Prompt K

I’ll re-order the examples and use the same Prompt J.

exclude_idxs = [0, 292, 1]
examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))

examples
[('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
  'neutral'),
 ('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
  'negative'),
 ("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
  'positive')]
df = few_shot_responses(promptJ_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
df['responses'].unique()
array(['positive', 'neutral', 'negative'], dtype=object)

Changing the order of examples to neutral, negative, positive increases the overall accuracy to almost 75%!

df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.7487837240159222

The model improves on all three sentiments compared to Prompt J.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_K.csv', index=False)

Prompt L

I’ll re-order the examples and use the same Prompt J.

exclude_idxs = [1, 0, 292]

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))

examples
[("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
  'positive'),
 ('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
  'neutral'),
 ('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
  'negative')]
df = few_shot_responses(promptJ_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
df['responses'].unique()
array(['positive', 'neutral', 'negative'], dtype=object)

This ordering of examples drops the accuracy to 68%.

df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.6811145510835913

Compared to the best-performing Prompt K, this prompt yields a better accuracy for positive sentences (491 > 449).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_L.csv', index=False)

Prompt M

I’ll re-order the examples and use the same Prompt J.

exclude_idxs = [1, 292, 0]

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))

examples
[("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
  'positive'),
 ('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
  'negative'),
 ('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
  'neutral')]
df = few_shot_responses(promptJ_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
df['responses'].unique()
array(['positive', 'neutral', 'negative'], dtype=object)

This ordering of examples worsens the accuracy, dropping it down to 57%.

df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.5696594427244582

This prompt yields better results for positive sentiment (514 > 449) than the best overall performing Prompt K.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_M.csv', index=False)

Prompt N

Trying the next ordering of sentiments:

exclude_idxs = [292, 0, 1]

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))

examples
[('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
  'negative'),
 ('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
  'neutral'),
 ("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
  'positive')]
df = few_shot_responses(promptJ_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
df['responses'].unique()
array(['positive', 'neutral', 'negative'], dtype=object)

This ordering results in the second-highest overall accuracy at 74%.

df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.7394957983193278

This prompt performs slightly worse on all three sentiments than the best-performing prompt so far, Prompt K.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_N.csv', index=False)

Prompt O

Here’s the final 3-sentiment ordering:

exclude_idxs = [292, 1, 0]

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))

examples
[('Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
  'negative'),
 ("For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
  'positive'),
 ('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
  'neutral')]
df = few_shot_responses(promptJ_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
df['responses'].unique()
array(['positive', 'neutral', 'negative'], dtype=object)

This ordering of examples does not beat my best accuracy so far.

df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
df['lm_match'] = df['label_text'] == df['responses']
acc = df.lm_match.mean()
acc
0.599734630694383

This prompt yields a much better performance on positive sentiment than my best performing Prompt K (525 > 449).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_O.csv', index=False)

Prompt P

Next, I’ll increase the number of examples to 6. Note that I won’t be trying all permutations but a few random ones.
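One caveat: these indices come from random.randint without a fixed seed, so re-running the notebook will draw different example sets. If reproducibility mattered, a seed could be set once before sampling (hypothetical; not used in the original runs):

random.seed(42)  # hypothetical seed so the sampled example sets are reproducible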

exclude_idxs = [random.randint(0, 2263) for _ in range(6)]
promptP_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
promptP_ds
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2258
})

The random examples I have picked don’t include a negative sentence. I’m curious to see how the model performs on this.

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))

[el[1] for el in examples]
['positive', 'neutral', 'positive', 'neutral', 'positive', 'positive']
df = few_shot_responses(promptP_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This prompt results in worse overall accuracy.

get_acc(df)
0.6390611160318866

Even though no negative examples were given, this prompt yields considerably more correct negative sentences (289) than the best-performing Prompt K (201).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_P.csv', index=False)

Prompt Q

I’ll try another random set of 6 examples, this time making sure there’s at least one of each sentiment.

exclude_idxs = [random.randint(0, 2263) for _ in range(6)]
promptQ_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
    
promptQ_ds, [el[1] for el in examples]
(Dataset({
     features: ['sentence', 'label', 'label_text', '__index_level_0__'],
     num_rows: 2258
 }),
 ['positive', 'positive', 'positive', 'neutral', 'negative', 'neutral'])
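The sampling code itself doesn’t enforce that coverage; the drawn labels are simply checked by printing them. One hedged way to actually guarantee at least one example per sentiment would be to redraw until all three labels appear (sample_with_coverage is a hypothetical helper, not part of the notebook):

def sample_with_coverage(n, seed=None):
    # redraw indices until negative, neutral, and positive all appear among the examples
    rng = random.Random(seed)
    while True:
        idxs = [rng.randint(0, len(dataset) - 1) for _ in range(n)]
        labels = {dataset[i]['label_text'] for i in idxs}
        if labels == {'negative', 'neutral', 'positive'}:
            return idxs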
df = few_shot_responses(promptQ_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative', 'live'], dtype=object)

This set of 6 examples does not improve upon the best-overall accuracy of 75%.

get_acc(df)
0.6572187776793623

Here’s something we haven’t seen in a while: an "other" response.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_Q.csv', index=False)

Prompt R

exclude_idxs = [random.randint(0, 2263) for _ in range(6)]
promptR_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
    
promptR_ds, [el[1] for el in examples]
(Dataset({
     features: ['sentence', 'label', 'label_text', '__index_level_0__'],
     num_rows: 2258
 }),
 ['neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral'])
df = few_shot_responses(promptR_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

No improvements on accuracy with this prompt.

get_acc(df)
0.6483613817537643

Compared to the best-performing Prompt K, this prompt yields considerably more correct negative (285 > 201) and positive (493 > 449) sentences but underperforms on neutral sentences (686 < 1043).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_R.csv', index=False)

Prompt S

exclude_idxs = [random.randint(0, 2263) for _ in range(6)]
promptS_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
    
promptS_ds, [el[1] for el in examples]
(Dataset({
     features: ['sentence', 'label', 'label_text', '__index_level_0__'],
     num_rows: 2258
 }),
 ['neutral', 'neutral', 'positive', 'neutral', 'positive', 'neutral'])

This set of examples has no negative sentences and a majority of neutral sentences.

df = few_shot_responses(promptS_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
df['responses'].unique()
array(['negative', 'positive', 'neutral'], dtype=object)

This set of examples does not improve on the best-overall accuracy of 75% (Prompt K).

get_acc(df)
0.6297608503100088

It does, however, have a considerably larger number of correctly labeled negative sentences (292 > 201).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_S.csv', index=False)

I’ll try one more 6-shot prompt before I increase the number of examples.

Prompt T

exclude_idxs = [random.randint(0, 2263) for _ in range(6)]
promptT_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
    
promptT_ds, [el[1] for el in examples]
(Dataset({
     features: ['sentence', 'label', 'label_text', '__index_level_0__'],
     num_rows: 2258
 }),
 ['neutral', 'neutral', 'positive', 'neutral', 'negative', 'neutral'])
df = few_shot_responses(promptT_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

Similar to the other 6-Shot examples, this set of examples does not improve on the best overall accuracy.

get_acc(df)
0.6886625332152347
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_T.csv', index=False)

Prompt U

I’ll now increase the number of examples in the prompt to 12, and try out 5 random sets of 12 examples.

exclude_idxs = [random.randint(0, 2263) for _ in range(12)]
promptU_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
    
promptU_ds, [el[1] for el in examples]
(Dataset({
     features: ['sentence', 'label', 'label_text', '__index_level_0__'],
     num_rows: 2252
 }),
 ['positive',
  'negative',
  'positive',
  'neutral',
  'positive',
  'positive',
  'neutral',
  'positive',
  'neutral',
  'neutral',
  'positive',
  'neutral'])
df = few_shot_responses(promptU_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

Increasing the number of examples to 12, at least the 12 I chose here, doesn’t improve on the best overall accuracy.

get_acc(df)
0.6549733570159858

The number of correct positive sentences is considerably higher than Prompt K (496 > 449).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_U.csv', index=False)

Prompt V

exclude_idxs = [random.randint(0, 2263) for _ in range(12)]
promptV_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
    
promptV_ds, [el[1] for el in examples]
(Dataset({
     features: ['sentence', 'label', 'label_text', '__index_level_0__'],
     num_rows: 2252
 }),
 ['neutral',
  'positive',
  'positive',
  'neutral',
  'neutral',
  'negative',
  'negative',
  'neutral',
  'positive',
  'neutral',
  'positive',
  'negative'])
df = few_shot_responses(promptV_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .
df['responses'].unique()
array(['positive', 'neutral', 'negative'], dtype=object)

This prompt performs well, and competes with but doesn’t improve upon the best overall accuracy of 75%.

get_acc(df)
0.7322380106571936

This prompt performs considerably better on neutral sentences than Prompt K (1103 > 1043).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_V.csv', index=False)

Prompt W

exclude_idxs = [random.randint(0, 2263) for _ in range(12)]
promptW_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)

examples = []
for idx in exclude_idxs:
    examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
    
promptW_ds, [el[1] for el in examples]
(Dataset({
     features: ['sentence', 'label', 'label_text', '__index_level_0__'],
     num_rows: 2252
 }),
 ['neutral',
  'negative',
  'neutral',
  'positive',
  'neutral',
  'neutral',
  'positive',
  'neutral',
  'neutral',
  'negative',
  'positive',
  'neutral'])
df = few_shot_responses(promptW_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

The accuracy worsens with this set of 12 examples.

get_acc(df)
0.7042628774422736
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_W.csv', index=False)

Prompt X

def get_ds(n):
    exclude_idxs = [random.randint(0, 2263) for _ in range(n)]
    prompt_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)

    examples = []
    for idx in exclude_idxs:
        examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
        
    print(prompt_ds, [el[1] for el in examples])
    
    return prompt_ds, examples
promptX_ds, examples = get_ds(12)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2252
}) ['positive', 'neutral', 'negative', 'neutral', 'positive', 'negative', 'neutral', 'negative', 'negative', 'neutral', 'neutral', 'negative']
df = few_shot_responses(promptX_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

Aha! This prompt improves upon the best overall accuracy, reaching about 77%.

get_acc(df)
0.7659857904085258

Compared to Prompt K (75%) this prompt performs worse on neutral sentences (1000 < 1043) but more than makes up for it on negative (270 > 201) and positive (455 > 449) sentences.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_X.csv', index=False)

Prompt Y

promptY_ds, examples = get_ds(12)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2252
}) ['neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'neutral']
df = few_shot_responses(promptY_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['negative', 'positive', 'neutral'], dtype=object)

This prompt does not improve on the best overall accuracy.

get_acc(df)
0.7255772646536413

This prompt performs well on negative and neutral sentences but its worse performance on positive sentences brings down the overall accuracy.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_Y.csv', index=False)

Next, I’ll try 5 prompts with 18 examples.

Prompt Z

promptZ_ds, examples = get_ds(18)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2246
}) ['neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'positive', 'negative', 'negative', 'positive', 'neutral']
df = few_shot_responses(promptZ_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This prompt does not improve upon the best overall accuracy.

get_acc(df)
0.7132680320569902
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_Z.csv', index=False)

Prompt AA

promptAA_ds, examples = get_ds(18)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2247
}) ['neutral', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'positive', 'neutral']
df = few_shot_responses(promptAA_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This set of 18 examples increases the best overall accuracy to almost 80%!

get_acc(df)
0.7948375611927013

Compared to Prompt X, this prompt performs worse on negative (206 < 270) and positive (400 < 455) but more than makes up for it on neutral sentences (1180 > 1000).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AA.csv', index=False)

Prompt AB

promptAB_ds, examples = get_ds(18)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2246
}) ['neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'negative', 'neutral', 'negative', 'neutral', 'negative', 'neutral', 'positive', 'positive']
df = few_shot_responses(promptAB_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This prompt does not improve upon the best overall accuracy.

get_acc(df)
0.7422083704363313
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AB.csv', index=False)

Prompt AC

promptAC_ds, examples = get_ds(18)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2246
}) ['neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'positive', 'neutral', 'positive', 'neutral', 'positive', 'neutral']
df = few_shot_responses(promptAC_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This prompt does not improve upon the best overall accuracy.

get_acc(df)
0.6856634016028496
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AC.csv', index=False)

Prompt AD

promptAD_ds, examples = get_ds(18)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2246
}) ['neutral', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'positive']
df = few_shot_responses(promptAD_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This prompt does not improve upon the best overall accuracy.

get_acc(df)
0.7497773820124666

This prompt yields considerably more correct negative sentences (271 > 206) than the best-performing Prompt AA.

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AD.csv', index=False)

Next, I’ll try 5 prompts with 24 examples each.

Prompt AE

promptAE_ds, examples = get_ds(24)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2240
}) ['neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'negative']
df = few_shot_responses(promptAE_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

Increasing the number of examples to 24 (at least for these 24 examples) does not improve upon the overall accuracy.

get_acc(df)
0.7491071428571429

Compared to the best performing Prompt AA, this prompt yields considerably more correct neutral sentences (1267 > 1180).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AE.csv', index=False)

Prompt AF

promptAF_ds, examples = get_ds(24)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2240
}) ['positive', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']
df = few_shot_responses(promptAF_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This prompt doesn’t improve upon the best overall accuracy, though it does perform better than Prompt AA on neutral sentences.

get_acc(df)
0.7308035714285714
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AF.csv', index=False)

Prompt AG

promptAG_ds, examples = get_ds(24)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2240
}) ['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral']
df = few_shot_responses(promptAG_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

The same trend continues for this set of 24 examples.

get_acc(df)
0.75
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AG.csv', index=False)

Two more 24-Shot prompts to go.

Prompt AH

promptAH_ds, examples = get_ds(24)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2240
}) ['positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'negative', 'neutral', 'neutral', 'neutral']
df = few_shot_responses(promptAH_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This prompt does not improve upon the best overall accuracy (though it comes close).

get_acc(df)
0.7745535714285714

This prompt yields more correct negative sentences than Prompt AA (233 > 206).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AH.csv', index=False)

Prompt AI

promptAI_ds, examples = get_ds(24)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2241
}) ['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'positive', 'neutral']
df = few_shot_responses(promptAI_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This prompt does not improve upon the best overall accuracy.

get_acc(df)
0.7536813922356091

This prompt yields considerably more correct neutral sentences than the best performing Prompt AA (1266 > 1180).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AI.csv', index=False)

Next, I’ll try 5 different 30-Shot prompts.

Prompt AJ

promptAJ_ds, examples = get_ds(30)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2234
}) ['neutral', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'neutral', 'negative', 'neutral', 'neutral']
df = few_shot_responses(promptAJ_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This prompt doesn’t improve the best overall accuracy.

get_acc(df)
0.7739480752014324

As seems to be the trend, this prompt results in more correct neutral responses (1284) than Prompt AA (1180).

make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AJ.csv', index=False)

Prompt AK

promptAK_ds, examples = get_ds(30)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2234
}) ['neutral', 'neutral', 'neutral', 'negative', 'negative', 'positive', 'neutral', 'neutral', 'positive', 'positive']
df = few_shot_responses(promptAK_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

The model performs considerably worse with these 30 examples.

get_acc(df)
0.6777081468218442
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AK.csv', index=False)

Prompt AL

promptAL_ds, examples = get_ds(30)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2234
}) ['positive', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']
df = few_shot_responses(promptAL_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

The trend continues: the overall accuracy doesn’t improve but the model’s performance on neutral sentences does.

get_acc(df)
0.76544315129812
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AL.csv', index=False)

Two more 30-Shot prompts to go.

Prompt AM

promptAM_ds, examples = get_ds(30)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2234
}) ['negative', 'neutral', 'positive', 'negative', 'neutral', 'neutral', 'positive', 'negative', 'positive', 'neutral']
df = few_shot_responses(promptAM_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

The overall accuracy doesn’t improve but the model’s performance on negative and positive sentences does.

get_acc(df)
0.7484333034914951
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AM.csv', index=False)

Prompt AN

promptAN_ds, examples = get_ds(30)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2234
}) ['positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']
df = few_shot_responses(promptAN_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

The overall accuracy doesn’t improve but the model’s performance on positive sentences does.

get_acc(df)
0.7381378692927484
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AN.csv', index=False)

Next, I’ll increase the number of examples to 45.

Prompt AO

promptAO_ds, examples = get_ds(45)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2219
}) ['neutral', 'neutral', 'positive', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'positive', 'neutral']
df = few_shot_responses(promptAO_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

The overall accuracy doesn’t improve but the model’s performance on positive sentences does.

get_acc(df)
0.7417755745831456
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AO.csv', index=False)

Prompt AP

promptAP_ds, examples = get_ds(45)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2219
}) ['neutral', 'positive', 'negative', 'neutral', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'positive']
df = few_shot_responses(promptAP_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

The overall accuracy doesn’t improve but the model’s performance on neutral sentences does.

get_acc(df)
0.7872915727805317
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AP.csv', index=False)

Prompt AQ

promptAQ_ds, examples = get_ds(45)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2219
}) ['neutral', 'neutral', 'negative', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral']
df = few_shot_responses(promptAQ_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

The overall accuracy doesn’t improve but the model’s performance on neutral sentences does.

get_acc(df)
0.7201442091031997
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AQ.csv', index=False)

Prompt AR

promptAR_ds, examples = get_ds(45)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2219
}) ['neutral', 'negative', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'positive']
df = few_shot_responses(promptAR_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This prompt performs worse than the best overall Prompt AA.

get_acc(df)
0.7386210004506535
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AR.csv', index=False)

Prompt AS

promptAS_ds, examples = get_ds(45)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2219
}) ['neutral', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']
df = few_shot_responses(promptAS_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .
df['responses'].unique()
array(['positive', 'neutral', 'negative'], dtype=object)

Compared to Prompt AA, this prompt yields a worse overall accuracy but improves on neutral sentences (1219 > 1180).

get_acc(df)
0.7494366831906264
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AS.csv', index=False)

Next, I’ll move on to the final number of examples: 60.

Prompt AT

promptAT_ds, examples = get_ds(60)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2204
}) ['positive', 'neutral', 'neutral', 'negative', 'negative', 'neutral', 'neutral', 'neutral', 'negative', 'neutral']
df = few_shot_responses(promptAT_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

Upping the number of examples to 60 does not improve results.

get_acc(df)
0.7218693284936479
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AT.csv', index=False)

Prompt AU

promptAU_ds, examples = get_ds(60)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2204
}) ['neutral', 'positive', 'neutral', 'positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'neutral']
df = few_shot_responses(promptAU_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

Compared to Prompt AA, this prompt yields a worse overall accuracy but improves on neutral sentences (1237 > 1180).

get_acc(df)
0.7686025408348457
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AU.csv', index=False)

Prompt AV

promptAV_ds, examples = get_ds(60)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2204
}) ['positive', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'positive', 'negative', 'neutral']
df = few_shot_responses(promptAV_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

Compared to Prompt AA, this prompt yields a worse overall accuracy but improves on neutral sentences (1206 > 1180).

get_acc(df)
0.7545372050816697
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AV.csv', index=False)

Prompt AW

promptAW_ds, examples = get_ds(60)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2204
}) ['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral']
df = few_shot_responses(promptAW_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

This prompt does not improve upon Prompt AA results.

get_acc(df)
0.7445553539019963
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AW.csv', index=False)

Prompt AX

promptAX_ds, examples = get_ds(60)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2204
}) ['neutral', 'neutral', 'negative', 'negative', 'neutral', 'negative', 'positive', 'neutral', 'positive', 'negative']
df = few_shot_responses(promptAX_ds, promptJ, examples)
Label the following TEXT with a single word: positive, negative, or neutral
TEXT: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
df['responses'].unique()
array(['neutral', 'positive', 'negative'], dtype=object)

Aha! We finally improve on the overall accuracy of Prompt AA, though only slightly: this prompt’s accuracy still rounds to roughly 80%.

get_acc(df)
0.7962794918330308
make_cm(df)

df.to_csv('/notebooks/Qwen2-0.5B-Instruct_AX.csv', index=False)

Running Inference 10 Times Using the Best Prompt

While the 60-Shot Prompt AX had a slightly higher accuracy (79.63%), I am going to pick the 17-Shot Prompt AA as my best prompt (79.48%), since it uses fewer than a third of the examples, and therefore roughly a third of the prompt tokens, which makes response generation noticeably quicker.
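
To make the token tradeoff concrete, here is a minimal sketch that counts how many prompt tokens a given few-shot example set produces under the same chat-template setup used above. The count_prompt_tokens helper and the examples_AA / examples_AX names are placeholders of mine, not code from the notebook.

# Sketch (placeholder names): count prompt tokens for a few-shot example set,
# reusing the tokenizer and promptJ template already defined in this notebook.
def count_prompt_tokens(examples, sentence):
    messages = []
    for ex_text, ex_label in examples:
        messages.append({"role": "user", "content": promptJ.format(text=ex_text)})
        messages.append({"role": "assistant", "content": ex_label})
    messages.append({"role": "user", "content": promptJ.format(text=sentence)})
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return len(tokenizer(text).input_ids)

# e.g. compare a 17-shot set against a 60-shot set (placeholder variable names):
# count_prompt_tokens(examples_AA, dataset[0]['sentence'])
# count_prompt_tokens(examples_AX, dataset[0]['sentence'])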

def test_gen(examples):
    few_shot_examples = []
    
    for example in examples:
        few_shot_examples.append({"role": "user", "content": promptJ.format(text=example[0])})
        few_shot_examples.append({"role": "assistant", "content": example[1]})
    
    messages = few_shot_examples + [{"role": "user", "content": promptJ.format(text=dataset[0]['sentence'])}]
        
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=2
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip().lower()
    return response
promptAA_ds, examples = get_ds(18)
Dataset({
    features: ['sentence', 'label', 'label_text', '__index_level_0__'],
    num_rows: 2246
}) ['negative', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'positive']

A single response generation takes about 72 ms, so running inference on the full dataset 10 times should take roughly 30 minutes.

%timeit -n 10 test_gen(examples)
72 ms ± 42.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
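
As a quick back-of-the-envelope check on that estimate (my arithmetic, not part of the original run):

# Rough estimate: 72 ms per response × 2,246 rows × 10 runs
ms_per_response = 72
n_rows = 2246
n_runs = 10
print(f"~{ms_per_response * n_rows * n_runs / 1000 / 60:.0f} minutes")  # ~27 minutes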
Show updated few_shot_responses function
def few_shot_responses(dataset, prompt, examples):
    responses = []
    dataset = dataset.map(add_prompt, fn_kwargs={"prompt": prompt})
    
    few_shot_examples = []
    
    for example in examples:
        few_shot_examples.append({"role": "user", "content": prompt.format(text=example[0])})
        few_shot_examples.append({"role": "assistant", "content": example[1]})
    
    for row in dataset:
        messages = few_shot_examples + [{"role": "user", "content": row['prompt']}]
        
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(device)

        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=2
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip().lower()
        responses.append(response)
        
    # calculate accuracy
    df = dataset.to_pandas()
    df['responses'] = pd.Series(responses)
    df['responses'] = df['responses'].apply(lambda x: x if x in ['negative', 'positive', 'neutral'] else "other")
    df['lm_match'] = df['label_text'] == df['responses']
    acc = df.lm_match.mean()
    
    return df, acc
Show updated get_ds function
def get_ds(n):
    # sample n row indices with replacement (duplicate indices are possible)
    exclude_idxs = [random.randint(0, 2263) for _ in range(n)]
    # remove the sampled rows from the evaluation set
    prompt_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)

    # the excluded rows become the few-shot (sentence, label) examples
    examples = []
    for idx in exclude_idxs:
        examples.append((dataset[idx]['sentence'], dataset[idx]['label_text']))
    
    return prompt_ds, examples
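
Because get_ds samples indices with replacement, a draw can contain duplicates, which is why some prompts in the results table end up with fewer shots than the n passed in (e.g. Prompt AI, built with get_ds(24), is listed as 23-Shot). A minimal sketch of a variant that samples without replacement, reusing the same ds_subset helper (this variant is mine, not part of the original notebook):

def get_ds_unique(n):
    # sample n distinct row indices so the shot count always equals n
    exclude_idxs = random.sample(range(len(dataset)), n)
    prompt_ds = ds_subset(dataset, exclude_idxs=exclude_idxs)
    examples = [(dataset[i]['sentence'], dataset[i]['label_text']) for i in exclude_idxs]
    return prompt_ds, examples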

I didn’t store the exact examples that I used the first time for Prompt AA, so I had to try different 18-Shot example sets until I achieved an accuracy close to 79.48%. It took about 20 tries, but I finally found a set of examples that broke the 79% threshold.

for _ in range(20):
    n = 18
    ds, examples = get_ds(n)
    # skip draws where duplicate indices were sampled (fewer than n unique examples)
    if len(ds) != 2264 - n: continue
    df, acc = few_shot_responses(ds, promptJ, examples)
    # stop once an example set reaches ~79% accuracy
    if round(acc, 2) >= 0.79: break
acc
0.815227070347284
ds, len(examples)
(Dataset({
     features: ['sentence', 'label', 'label_text', '__index_level_0__'],
     num_rows: 2246
 }),
 18)
accs = []
for _ in range(10):
    df, acc = few_shot_responses(ds, promptJ, examples)
    accs.append(acc)

For this prompt, the overall accuracy ranges from 80.8% to 82.4%.

pd.Series(accs).describe()
count    10.000000
mean      0.816830
std       0.005397
min       0.807658
25%       0.814003
50%       0.817453
75%       0.819791
max       0.824577
dtype: float64
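
To express that spread in percentage terms, here is a small formatting sketch using the accs list collected above:

# Summarize the 10-run accuracy spread as percentages
s = pd.Series(accs)
print(f"min {s.min():.2%}  mean {s.mean():.2%}  max {s.max():.2%}")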

Final Thoughts

Takeaways from my Qwen2-0.5B experiments:

  • Example order matters: Testing 6 prompts with the same 3 examples in different orders yielded accuracies ranging from 57% to 75% (a sketch of this kind of ordering experiment follows this list).
  • Example selection matters: Recreating the performance of one of my best prompts (79.48% accuracy) took ~20 attempts, showing that not all sets of examples perform equally well.
  • Result variance exists: Running the same prompt 10 times produced accuracies ranging from 80.8% to 82.4%.
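
Here is a minimal sketch of that kind of ordering experiment (not the exact code used for it): it assumes the updated few_shot_responses above (which returns (df, acc)), the promptJ template, plus a placeholder evaluation dataset eval_ds and a placeholder list examples3 of three (sentence, label_text) tuples.

from itertools import permutations

# Evaluate every ordering of the same three few-shot examples (3! = 6 prompts).
for order in permutations(range(3)):
    ordered_examples = [examples3[i] for i in order]
    _, acc = few_shot_responses(eval_ds, promptJ, ordered_examples)
    print(order, f"{acc:.1%}")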

Here are the results of Qwen2-0.5B in the context of the other models that I have experimented with:

| Model | Prompting Strategy | Overall Accuracy | Negative | Neutral | Positive |
|---|---|---|---|---|---|
| claude-3-5-sonnet-20240620 | 3-Shot | 94.78% | 98% (297/303) | 94% (1302/1391) | 95% (544/570) |
| claude-3-opus-20240229 | 0-Shot | 94.13% | 98% (297/303) | 96% (1333/1391) | 88% (501/570) |
| phi-3.5 | 20-Shot | 93.94% | 96% (286/299) | 98% (1355/1379) | 83% (467/566) |
| phi-3 | 30-Shot w/System Prompt | 92.79% | 98% (290/297) | 94% (1284/1373) | 88% (499/564) |
| claude-3-haiku-20240307 | 3-Shot | 92.39% | 90% (272/303) | 91% (1267/1391) | 96% (550/570) |
| phi-2 | 6-Shot | 91.94% | 88% (267/302) | 94% (1299/1387) | 90% (510/569) |
| Qwen2-1.5B | 27-Shot | 86.10% | 90% (264/294) | 96% (1320/1382) | 61% (342/561) |
| **Qwen2-0.5B** | **17-Shot** | **79.48%** | **69% (206/300)** | **86% (1180/1380)** | **71% (400/567)** |

Here are the results from this notebook:

| Prompt | Strategy | Accuracy | Negative | Neutral | Positive |
|---|---|---|---|---|---|
| A | 0-Shot | 62.41% | 91% (276/303) | 53% (735/1391) | 71% (402/570) |
| B | 0-Shot | 47.84% | 90% (274/303) | 57% (789/1391) | 4% (20/570) |
| C | 0-Shot | 40.46% | 91% (276/303) | 43% (594/1391) | 8% (46/570) |
| D | 0-Shot | 68.29% | 79% (240/303) | 61% (851/1391) | 80% (455/570) |
| E | 0-Shot | 51.19% | 97% (293/303) | 28% (396/1391) | 82% (470/570) |
| F | 0-Shot | 48.19% | 94% (286/303) | 21% (287/1391) | 91% (518/570) |
| G | 0-Shot | 61.09% | 93% (282/303) | 46% (646/1391) | 80% (455/570) |
| H | 0-Shot | 65.42% | 85% (257/303) | 57% (798/1391) | 75% (426/570) |
| I | 0-Shot | 66.12% | 81% (245/303) | 58% (800/1391) | 79% (452/570) |
| J | 3-Shot | 70.94% | 43% (131/302) | 75% (1042/1390) | 76% (431/569) |
| K | 3-Shot | 74.88% | 67% (201/302) | 75% (1043/1390) | 79% (449/569) |
| L | 3-Shot | 68.11% | 49% (149/302) | 65% (900/1390) | 86% (491/569) |
| M | 3-Shot | 56.97% | 49% (149/302) | 45% (625/1390) | 90% (514/569) |
| N | 3-Shot | 73.95% | 62% (188/302) | 75% (1038/1390) | 78% (446/569) |
| O | 3-Shot | 59.97% | 65% (196/302) | 46% (635/1390) | 92% (525/569) |
| P | 6-Shot | 63.91% | 95% (289/303) | 49% (678/1389) | 84% (476/566) |
| Q | 6-Shot | 65.72% | 69% (207/302) | 55% (765/1389) | 90% (512/567) |
| R | 6-Shot | 64.84% | 94% (285/303) | 49% (686/1387) | 87% (493/568) |
| S | 6-Shot | 62.98% | 96% (292/303) | 47% (656/1387) | 83% (474/568) |
| T | 6-Shot | 68.87% | 51% (155/302) | 70% (966/1387) | 76% (434/569) |
| U | 12-Shot | 65.50% | 53% (159/302) | 59% (820/1386) | 88% (496/564) |
| V | 12-Shot | 73.22% | 70% (209/300) | 80% (1103/1386) | 60% (337/566) |
| W | 12-Shot | 70.43% | 82% (246/301) | 66% (912/1384) | 75% (428/567) |
| X | 12-Shot | 76.60% | 91% (270/298) | 72% (1000/1386) | 80% (455/568) |
| Y | 12-Shot | 72.56% | 80% (243/303) | 77% (1069/1381) | 57% (322/568) |
| Z | 18-Shot | 71.33% | 50% (150/301) | 75% (1036/1382) | 74% (416/563) |
| AA | 17-Shot | 79.48% | 69% (206/300) | 86% (1180/1380) | 71% (400/567) |
| AB | 18-Shot | 74.22% | 77% (229/299) | 76% (1054/1381) | 68% (384/566) |
| AC | 18-Shot | 68.57% | 49% (148/302) | 73% (1013/1380) | 67% (379/564) |
| AD | 18-Shot | 74.98% | 89% (271/303) | 76% (1052/1379) | 64% (361/564) |
| AE | 24-Shot | 74.91% | 61% (181/299) | 92% (1267/1375) | 41% (230/566) |
| AF | 24-Shot | 73.08% | 37% (112/302) | 91% (1246/1375) | 50% (279/563) |
| AG | 24-Shot | 75.00% | 58% (173/300) | 92% (1265/1375) | 43% (242/565) |
| AH | 24-Shot | 77.46% | 78% (233/299) | 84% (1153/1375) | 62% (349/566) |
| AI | 23-Shot | 75.37% | 48% (143/301) | 92% (1266/1375) | 50% (280/565) |
| AJ | 30-Shot | 77.39% | 58% (172/298) | 94% (1284/1370) | 48% (273/566) |
| AK | 30-Shot | 67.78% | 63% (187/299) | 61% (844/1375) | 86% (483/560) |
| AL | 30-Shot | 76.54% | 58% (173/299) | 86% (1185/1372) | 63% (352/563) |
| AM | 30-Shot | 74.84% | 82% (242/296) | 72% (984/1376) | 79% (446/562) |
| AN | 30-Shot | 73.81% | 51% (154/300) | 77% (1052/1372) | 79% (443/562) |
| AO | 45-Shot | 74.18% | 54% (159/297) | 76% (1034/1366) | 81% (453/556) |
| AP | 45-Shot | 78.73% | 63% (186/296) | 87% (1192/1365) | 66% (369/558) |
| AQ | 45-Shot | 72.01% | 17% (51/301) | 89% (1210/1359) | 60% (337/559) |
| AR | 45-Shot | 73.86% | 53% (157/297) | 80% (1094/1364) | 70% (388/558) |
| AS | 45-Shot | 74.94% | 42% (125/297) | 89% (1219/1363) | 57% (319/559) |
| AT | 60-Shot | 72.19% | 47% (138/292) | 78% (1055/1356) | 72% (398/556) |
| AU | 60-Shot | 76.86% | 43% (127/296) | 91% (1237/1356) | 60% (330/552) |
| AV | 60-Shot | 75.45% | 26% (79/299) | 89% (1206/1352) | 68% (378/553) |
| AW | 60-Shot | 74.46% | 29% (88/299) | 86% (1157/1349) | 71% (396/556) |
| AX | 60-Shot | 79.63% | 62% (179/290) | 94% (1275/1352) | 54% (301/562) |