Show imports
!pip install claudette -qq
import sqlite3
import json
import re
import os
import pandas as pd, numpy as np
from claudette import *
claudette
claudette
library to iteratively improve keywords generated for sqlite’s full text search.
Vishal Bakshi
August 27, 2024
In a previous blog post I used Claude (with the help of the Answer.AI `claudette` library) to generate keywords for 220 questions across the 7 Chapter Questionnaires covered in Part 1 of the fastai course. Using those keywords I retrieved context from fastbook that allowed me to answer 33% (10/30 questions) of the Chapter 1 Questionnaire. In another blog post I manually came up with keywords and achieved a 40% answer rate.
In this notebook I’ll see if I can improve the quality of information retrieval by improving the keywords that Claude generates.
This notebook is part of a series of blog posts for a project I’m working on called fastbookRAG in which I’m building a hybrid search + LLM pipeline to answer questions from the end-of-chapter Questionnaires in the freely available fastai textbook.
Here are the results from this notebook. Answer Rate is the percentage of Chapter 1 Questionnaire questions answered using context retrieved from SQLite full-text search with Claude-generated keywords.
Prompt | Answer Rate |
---|---|
A | 33% |
B | 33% |
C | 36% |
D | 33% |
E | 40% |
This section defines helper functions needed to chunk, load and retrieve the Chapter 1 notebook from a sqlite database.
def get_chunks(notebook_path):
    """Split a Jupyter notebook into header-prefixed text chunks.

    Markdown cells are split into paragraphs on blank lines; each code cell
    becomes a single fenced ``python`` chunk with its text output (if any)
    appended under ``Output:``. Every chunk is prefixed with the most recent
    markdown header seen, followed by a blank line. Chunks whose header line
    contains "Questionnaire" or "Further Research" are dropped.

    Args:
        notebook_path: Path to the ``.ipynb`` file (read as UTF-8 JSON).

    Returns:
        A list of chunk strings in notebook order.
    """
    with open(notebook_path, 'r', encoding='utf-8') as file:
        notebook = json.load(file)

    chunks = []
    current_header = ""

    def add_chunk(content):
        # `current_header` is looked up at call time, so each chunk is tagged
        # with whichever header was seen most recently.
        if content.strip():
            chunks.append(f"{current_header}\n\n{content.strip()}")

    for cell in notebook['cells']:
        if cell['cell_type'] == 'markdown':
            content = ''.join(cell['source'])
            # re.match anchors at the start of the string, so only a header on
            # the cell's first line updates the current header.
            header_match = re.match(r'^(#+\s+.*?)$', content, re.MULTILINE)
            if header_match:
                current_header = header_match.group(1)
                # Whatever follows the header in the same cell is body text.
                body = content[len(current_header):].strip()
            else:
                body = content
            if body:
                for paragraph in re.split(r'\n\s*\n', body):
                    add_chunk(paragraph)
        elif cell['cell_type'] == 'code':
            code_content = '```python\n' + ''.join(cell['source']) + '\n```'
            # Collect stream text, or the text/plain representation, of each output.
            output_parts = []
            for output in cell.get('outputs') or []:
                if 'text' in output:
                    output_parts.append(''.join(output['text']))
                elif 'data' in output and 'text/plain' in output['data']:
                    output_parts.append(''.join(output['data']['text/plain']))
            output_content = ''.join(output_parts)
            # Keep code and its output together in one chunk.
            if output_content:
                add_chunk(code_content + '\n\nOutput:\n' + output_content)
            else:
                add_chunk(code_content)

    def filter_chunks(chunks, exclude_headers=("Questionnaire", "Further Research")):
        # The first line of every chunk is its header; drop excluded sections.
        return [
            chunk for chunk in chunks
            if not any(header in chunk.split('\n', 1)[0] for header in exclude_headers)
        ]

    return filter_chunks(chunks)
load_data
def load_data(chunks, db_path, chapter=1):
    """Insert one chapter's chunks into the ``fastbook_text`` FTS5 table.

    The table is created if it does not exist yet (even when the database
    file itself already exists). After committing the inserts, the number of
    rows stored for ``chapter`` is compared with ``len(chunks)``; a mismatch
    (for example, the chapter was already loaded) is reported as a failure,
    although the newly inserted rows stay committed.

    Args:
        chunks: List of chunk strings to insert.
        db_path: Path to the SQLite database file.
        chapter: Value stored in the ``chapter`` column for every chunk.

    Returns:
        True if the row count for ``chapter`` equals ``len(chunks)``;
        False if any error occurred (the error is printed, not raised).
    """
    try:
        conn = sqlite3.connect(db_path)
        try:
            # `with conn` only manages the transaction (commit/rollback);
            # the connection is closed explicitly in the `finally` below.
            with conn:
                conn.execute("""
                    CREATE VIRTUAL TABLE IF NOT EXISTS fastbook_text
                    USING FTS5(chapter, text);
                """)
                conn.executemany(
                    "INSERT INTO fastbook_text(chapter, text) VALUES (?, ?)",
                    [(chapter, chunk) for chunk in chunks],
                )
            (row_count,) = conn.execute(
                "SELECT count(*) FROM fastbook_text WHERE chapter = ?", (chapter,)
            ).fetchone()
        finally:
            conn.close()

        if row_count != len(chunks):
            raise ValueError(f"Number of inserted chunks ({row_count}) doesn't match input chunks ({len(chunks)})")
        return True
    except sqlite3.Error as e:
        print(f"An error occurred: {e}")
        return False
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False
load_data
def load_data(chunks, db_path, chapter=1):
    """Insert one chapter's chunks into the ``fastbook_text`` FTS5 table.

    The table is created if it does not exist yet (even when the database
    file itself already exists). After committing the inserts, the number of
    rows stored for ``chapter`` is compared with ``len(chunks)``; a mismatch
    (for example, the chapter was already loaded) is reported as a failure,
    although the newly inserted rows stay committed.

    Args:
        chunks: List of chunk strings to insert.
        db_path: Path to the SQLite database file.
        chapter: Value stored in the ``chapter`` column for every chunk.

    Returns:
        True if the row count for ``chapter`` equals ``len(chunks)``;
        False if any error occurred (the error is printed, not raised).
    """
    try:
        conn = sqlite3.connect(db_path)
        try:
            # `with conn` only manages the transaction (commit/rollback);
            # the connection is closed explicitly in the `finally` below.
            with conn:
                conn.execute("""
                    CREATE VIRTUAL TABLE IF NOT EXISTS fastbook_text
                    USING FTS5(chapter, text);
                """)
                conn.executemany(
                    "INSERT INTO fastbook_text(chapter, text) VALUES (?, ?)",
                    [(chapter, chunk) for chunk in chunks],
                )
            (row_count,) = conn.execute(
                "SELECT count(*) FROM fastbook_text WHERE chapter = ?", (chapter,)
            ).fetchone()
        finally:
            conn.close()

        if row_count != len(chunks):
            raise ValueError(f"Number of inserted chunks ({row_count}) doesn't match input chunks ({len(chunks)})")
        return True
    except sqlite3.Error as e:
        print(f"An error occurred: {e}")
        return False
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False
db_search
def db_search(df, limit=1, db_path='fastbook.db'):
    """Run an FTS5 keyword search for every row of ``df``.

    Each row's ``keywords`` string is stripped of double quotes, split on
    whitespace, and the resulting words (with surrounding commas removed) are
    OR-ed together as quoted FTS5 terms. The search is restricted to the
    row's ``chapter`` and ordered by FTS5 ``rank``.

    Args:
        df: DataFrame with ``keywords`` (str) and ``chapter`` columns.
        limit: Maximum number of chunks to return per row.
        db_path: SQLite database to query; defaults to ``'fastbook.db'``.

    Returns:
        A flat list of matching ``text`` values in row order. Each row
        contributes between 0 and ``limit`` entries, so the list is not
        guaranteed to line up one-to-one with the rows of ``df``.
    """
    query = """
        SELECT text, rank
        FROM fastbook_text
        WHERE fastbook_text MATCH ?
        AND chapter = ?
        ORDER BY rank
        LIMIT ?
        """
    results = []
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        for _, row in df.iterrows():
            terms = (word.strip(",") for word in row['keywords'].replace('"', '').split())
            # Drop tokens that were only commas so no empty "" phrase is sent to FTS5.
            keywords = ' OR '.join(f'"{term}"' for term in terms if term)
            if not keywords:
                # No usable search terms for this row; nothing to match.
                continue
            # chapter is passed as a string, as in the original query.
            rows = cur.execute(query, (keywords, str(row['chapter']), limit)).fetchall()
            results.extend(item[0] for item in rows)
    finally:
        # The sqlite3 context manager does not close connections, so close explicitly.
        conn.close()
    return results
During this notebook I learned the implications of full text search using corpus-wide statistics. I wanted to test out Claude keywords for chapter 1 search and loaded only chapter 1 into the database. This retrieved different contexts for the same keywords than when I used FTS with all 7 chapters loaded in the database. In order to improve the keywords, I need to make sure the corpus is the same as the final “production” environment.
1 307
2 227
4 433
8 157
9 387
10 190
13 266
# load chunks into the database
# NOTE(review): `data` is defined outside this excerpt; presumably it maps each
# chapter number to that chapter's list of chunks — confirm against the cell that builds it.
# load_data returns True/False, so each printed line reports whether that chapter loaded.
for chapter, chunks in data.items():
    print(f"Chapter {chapter}:", load_data(chunks, 'fastbook.db', chapter))
Chapter 1: True
Chapter 2: True
Chapter 4: True
Chapter 8: True
Chapter 9: True
Chapter 10: True
Chapter 13: True
The keywords generated using the following prompt retrieved context that allowed me to answer 10/30 (33%) of the Chapter 1 Questionnaire questions:
I am working on a keyword search project and i need to create 3-6 keywords for each `question_text` that I provide you. Do not generate keywords that stray too far in meaning from the `question_text`. Only respond with the comma-separated list of keywords surrounded by double quotes. No yapping.
Examples:
question_text: Name five areas where deep learning is now the best in the world
keywords: “deep learning, state of the art, best, world”
question_text: Why is it hard to use a traditional computer program to recognize images in a photo?
keywords: “image, recognize, recognition, traditional, computer, program”
question_text: What were the two theoretical misunderstandings that held back the field of neural networks?
keywords: “theoretical, misunderstandings, held, back, field, neural network”
question_text: Why is it hard to understand why a deep learning model makes a particular prediction?
keywords:
I’ll start by creating a prompt that was recommended by Claude:
For the following question text, please generate 3-6 comma-separated keywords that capture the main concepts and are suitable for use in a SQLite full-text search query. The keywords should be concise, relevant, and help in retrieving appropriate text chunks from a database. Avoid using articles, prepositions, or other common words that don’t add significant meaning. Here’s the question text:
{question_text}
Please provide the keywords in the following format: keywords: “keyword1, keyword2, keyword3”
# Prompt B: the wording recommended by Claude. `{question_text}` is the only
# placeholder and is filled in with str.format below.
promptB = """For the following question text, please generate 3-6 comma-separated keywords that capture the main concepts and are suitable for use in a SQLite full-text search query. The keywords should be concise, relevant, and help in retrieving appropriate text chunks from a database. Avoid using articles, prepositions, or other common words that don't add significant meaning. Here's the question text:
{question_text}
Please provide the keywords in the following format:
keywords: "keyword1, keyword2, keyword3" """
# Render the prompt for one sample question and print it to inspect the final text.
formatted_prompt = promptB.format(question_text="Why is it hard to understand why a deep learning model makes a particular prediction?")
print(formatted_prompt)
For the following question text, please generate 3-6 comma-separated keywords that capture the main concepts and are suitable for use in a SQLite full-text search query. The keywords should be concise, relevant, and help in retrieving appropriate text chunks from a database. Avoid using articles, prepositions, or other common words that don't add significant meaning. Here's the question text:
Why is it hard to understand why a deep learning model makes a particular prediction?
Please provide the keywords in the following format:
keywords: "keyword1, keyword2, keyword3"
In: 0; Out: 0; Total: 0
keywords: “deep learning, model, prediction, understanding, interpretability”
msg_01FTbVhR4dwjNqoDmEmBexS1
[{'text': 'keywords: "deep learning, model, prediction, understanding, interpretability"', 'type': 'text'}]
claude-3-5-sonnet-20240620
assistant
end_turn
None
message
{'input_tokens': 141, 'output_tokens': 18, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}
Looks good! I’ll now use this prompt to generate keywords for all 30 Chapter 1 Questionnaire questions.
That was about 2 cents of tokens used.
['keywords: "deep learning, math, data, expensive computers, PhD"',
'keywords: "deep learning, best, areas, applications, achievements"',
'keywords: "device, artificial neuron, first"',
'keywords: "parallel distributed processing, PDP, book, requirements"',
'keywords: "theoretical misunderstandings, neural networks, field setbacks"']
Unfortunately I missed the fact that Claude includes ‘keywords’ in each response—oops! I’ll have to do some quick cleanup.
['"deep learning, math, data, expensive computers, PhD"',
'"deep learning, best, areas, applications, achievements"',
'"device, artificial neuron, first"',
'"parallel distributed processing, PDP, book, requirements"',
'"theoretical misunderstandings, neural networks, field setbacks"']
I’ll replace the existing keywords and run the full text search again:
keywords | |
---|---|
0 | "deep learning, math, data, expensive computer... |
1 | "deep learning, best, areas, applications, ach... |
2 | "device, artificial neuron, first" |
3 | "parallel distributed processing, PDP, book, r... |
4 | "theoretical misunderstandings, neural network... |
['"architecture, design, structure, building, framework"',
'"segmentation, market, division, targeting, customer groups"',
'"y_range, purpose, usage, necessity"',
'"hyperparameters, machine learning, model tuning, optimization"',
'"avoid failures, AI implementation, organization, best practices"']
keywords | |
---|---|
28 | "architecture, design, structure, building, fr... |
29 | "segmentation, market, division, targeting, cu... |
30 | "y_range, purpose, usage, necessity" |
31 | "hyperparameters, machine learning, model tuni... |
32 | "avoid failures, AI implementation, organizati... |
These keywords retrieved chunks that allowed me to answer 10/30 or 33% of the Chapter 1 Questionnaire. This was a different set of 10 questions than Prompt A.
After looking at the data, I’ll modify the prompt to provide some more flexibility in the keyword search by requesting Claude to do two things:
- Remove the `keywords:` string before the example keywords

Here’s the new prompt:
For the given question text, generate 3-6 comma-separated keywords that capture the main concepts for a SQLite full-text search query. Prefer single-word keywords when possible. Include both singular and plural forms for nouns if relevant. Avoid articles, prepositions, and common words. Use this format:
{question_text}
“keyword1, keyword2, keyword3”
# Prompt C: prefers single-word keywords and asks for singular and plural forms.
# NOTE: the literal opens with four double quotes, so the prompt text itself
# begins with a stray `"` character (visible in the printed output).
promptC = """"For the given question text, generate 3-6 comma-separated keywords that capture the main concepts for a SQLite full-text search query. Prefer single-word keywords when possible. Include both singular and plural forms for nouns if relevant. Avoid articles, prepositions, and common words. Use this format:
{question_text}
"keyword1, keyword2, keyword3" """
# Render the prompt for one sample question and print it to inspect the final text.
formatted_prompt = promptC.format(question_text="Why is it hard to understand why a deep learning model makes a particular prediction?")
print(formatted_prompt)
"For the given question text, generate 3-6 comma-separated keywords that capture the main concepts for a SQLite full-text search query. Prefer single-word keywords when possible. Include both singular and plural forms for nouns if relevant. Avoid articles, prepositions, and common words. Use this format:
Why is it hard to understand why a deep learning model makes a particular prediction?
"keyword1, keyword2, keyword3"
“deep learning, model, models, prediction, predictions, understand, understanding”
msg_01HNHpvbABHGxLzxacXLXuv1
[{'text': '"deep learning, model, models, prediction, predictions, understand, understanding"', 'type': 'text'}]
claude-3-5-sonnet-20240620
assistant
end_turn
None
message
{'input_tokens': 114, 'output_tokens': 19, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}
Nice! It’s following my instructions. I’ll run through the process of generating keywords, running full text search, and evaluating the results:
['"deep learning, math, data, computers, expensive, PhD"',
'"deep learning, areas, best, world, artificial intelligence, neural networks"',
'"neuron, neurons, device, artificial, first"',
'"parallel, distributed, processing, PDP, requirements, book"',
'"neural, networks, theoretical, misunderstandings, field"']
keywords | |
---|---|
0 | "deep learning, math, data, computers, expensi... |
1 | "deep learning, areas, best, world, artificial... |
2 | "neuron, neurons, device, artificial, first" |
3 | "parallel, distributed, processing, PDP, requi... |
4 | "neural, networks, theoretical, misunderstandi... |
The retrieved context allowed me to answer 11/30 or 36% of the questions. Moving in the right direction! One of the sets of keywords had some additional text and that would have likely increased the answer rate to 12/30:
'Here are the keywords for the given question:\n\n"hyperparameters, hyperparameter, parameter, parameters, machine learning, tuning"'
I’ll also ask it to include any numbers it finds in the question text as separate keywords, as that might have gotten me another correct answer:
17 | |
---|---|
chapter | 1 |
question_number | 18 |
question_text | ""Do we always have to use 224×224-pixel image... |
answer | ""No we do not. 224x224 is commonly used for h... |
is_answerable | 1 |
keywords | "cat, cats, recognition, model, image, images,... |
context | ### How Our Image Recognizer Works\n\nIn the t... |
I’ll update the prompt with the following observations:
Here’s the new prompt:
For the given question text, generate 3-6 comma-separated keywords that capture the main concepts for a SQLite full-text search query. Prefer single-word keywords when possible. Include both singular and plural forms for nouns if relevant. Include any numbers as keywords. Avoid articles, prepositions, and common words. Use this format. No yapping:
{question_text}
“keyword1, keyword2, keyword3”
# Prompt D (first attempt): Prompt C's wording plus "Include any numbers as keywords"
# and "No yapping".
promptD = """For the given question text, generate 3-6 comma-separated keywords that capture the main concepts for a SQLite full-text search query. Prefer single-word keywords when possible. Include both singular and plural forms for nouns if relevant. Include any numbers as keywords. Avoid articles, prepositions, and common words. Use this format. No yapping:
{question_text}
"keyword1, keyword2, keyword3" """
# Render the prompt for one sample question and print it to inspect the final text.
formatted_prompt = promptD.format(question_text="Why is it hard to understand why a deep learning model makes a particular prediction?")
print(formatted_prompt)
For the given question text, generate 3-6 comma-separated keywords that capture the main concepts for a SQLite full-text search query. Prefer single-word keywords when possible. Include both singular and plural forms for nouns if relevant. Include any numbers as keywords. Avoid articles, prepositions, and common words. Use this format. No yapping:
Why is it hard to understand why a deep learning model makes a particular prediction?
"keyword1, keyword2, keyword3"
“deep, learning, model, prediction, understand, hard”
msg_01QtPrjfRGNPmt8eLxDxZcZE
[{'text': '"deep, learning, model, prediction, understand, hard"', 'type': 'text'}]
claude-3-5-sonnet-20240620
assistant
end_turn
None
message
{'input_tokens': 123, 'output_tokens': 16, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}
It’s not including plural versions of nouns so I’ll remove the “if relevant” from the instruction and try again:
# Prompt D (revised): rebinds promptD with "when possible", "if relevant" and
# "Use this format." removed from the instructions.
promptD = """For the given question text, generate 3-6 comma-separated keywords that capture the main concepts for a SQLite full-text search query. Prefer single-word keywords. Include both singular and plural forms for nouns. Include any numbers as keywords. Avoid articles, prepositions, and common words. No yapping:
{question_text}
"keyword1, keyword2, keyword3" """
# Render the prompt for one sample question and print it to inspect the final text.
formatted_prompt = promptD.format(question_text="Why is it hard to understand why a deep learning model makes a particular prediction?")
print(formatted_prompt)
For the given question text, generate 3-6 comma-separated keywords that capture the main concepts for a SQLite full-text search query. Prefer single-word keywords. Include both singular and plural forms for nouns. Include any numbers as keywords. Avoid articles, prepositions, and common words. No yapping:
Why is it hard to understand why a deep learning model makes a particular prediction?
"keyword1, keyword2, keyword3"
deep, learning, model, models, prediction, predictions, understand, understanding
msg_01EQqyCJDRCc81cLL78E8sA7
[{'text': 'deep, learning, model, models, prediction, predictions, understand, understanding', 'type': 'text'}]
claude-3-5-sonnet-20240620
assistant
end_turn
None
message
{'input_tokens': 115, 'output_tokens': 18, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}
Nice! That fixed it at least for one example.
I’ll look at the full set of keywords to make sure there are no filler texts included:
['deep, learning, math, data, computers, phd',
'deep, learning, areas, best, world',
'"neuron, neurons, device, devices, artificial, first"',
'book, requirements, parallel, distributed, processing, pdp',
'theoretical, misunderstandings, held, back, field, neural, networks',
'"GPU, GPUs, graphics, processor, processors"',
'notebook, notebooks, execute, cell, cells, 1',
'image, images, photo, photos, recognize, recognition, computer, program, traditional',
'"Samuel, weight, weights, assignment, assignments"',
'deep, learning, weights, weight, Samuel, term',
'deep, learning, model, models, prediction, predictions, understand, understanding',
'theorem, neural, networks, mathematical, problem, accuracy',
'train, model, models, training, dataset, datasets, data',
'feedback, loops, rollout, rollouts, predictive, policing, model, models',
'cat, cats, recognition, model, 224, pixel, pixels, image, images',
'classification, classifications, regression, regressions, difference, differences',
'validation, validations, test, tests, set, sets',
'fastai, validation, set, sets',
'random, sample, samples, validation, set, sets',
'overfitting, overfits, example, examples, model, models',
'metric, metrics, loss, losses, differ, difference',
'pretrained, models, model, help',
'head, heads, model, models',
'cnn, cnns, layer, layers, feature, features, early, late',
'image, images, model, models, photo, photos',
'architecture, architectures',
'"segmentation, segment, segments"',
'"y_range, ranges, plotting, visualization, axis, limits"',
'hyperparameters, hyperparameter, machine, learning, model, parameter',
'"ai, artificial, intelligence, failures, avoid, organization, organizations"']
keywords | |
---|---|
0 | deep, learning, math, data, computers, phd |
1 | deep, learning, areas, best, world |
2 | "neuron, neurons, device, devices, artificial,... |
3 | book, requirements, parallel, distributed, pro... |
4 | theoretical, misunderstandings, held, back, fi... |
This prompt resulted in 10/30 or a 33% Answer Rate and didn’t improve the retrieved contexts in the way I thought it would! I’ll go back to Prompt C and only add “no yapping”.
For the given question text, generate 3-6 comma-separated keywords that capture the main concepts for a SQLite full-text search query. Prefer single-word keywords when possible. Include both singular and plural forms for nouns if relevant. Avoid articles, prepositions, and common words. Use this format. No yapping:
{question_text}
“keyword1, keyword2, keyword3”
# Prompt E: Prompt C's wording with "Use this format. No yapping:" at the end
# of the instructions.
# NOTE: the literal opens with four double quotes, so the prompt text itself
# begins with a stray `"` character (visible in the printed output).
promptE = """"For the given question text, generate 3-6 comma-separated keywords that capture the main concepts for a SQLite full-text search query. Prefer single-word keywords when possible. Include both singular and plural forms for nouns if relevant. Avoid articles, prepositions, and common words. Use this format. No yapping:
{question_text}
"keyword1, keyword2, keyword3" """
# Render the prompt for one sample question and print it to inspect the final text.
formatted_prompt = promptE.format(question_text="Why is it hard to understand why a deep learning model makes a particular prediction?")
print(formatted_prompt)
"For the given question text, generate 3-6 comma-separated keywords that capture the main concepts for a SQLite full-text search query. Prefer single-word keywords when possible. Include both singular and plural forms for nouns if relevant. Avoid articles, prepositions, and common words. Use this format. No yapping:
Why is it hard to understand why a deep learning model makes a particular prediction?
"keyword1, keyword2, keyword3"
“deep learning, model, prediction, understand, predictions”
msg_01G3w22vDXTpWryUwUpqouXN
[{'text': '"deep learning, model, prediction, understand, predictions"', 'type': 'text'}]
claude-3-5-sonnet-20240620
assistant
end_turn
None
message
{'input_tokens': 118, 'output_tokens': 15, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}
['"deep learning, math, data, computers, PhD"',
'deep learning, areas, best, world',
'"neuron, neurons, device, artificial, principle"',
'"parallel, distributed, processing, PDP, requirements, book"',
'"neural, networks, theoretical, misunderstandings, field"',
'"GPU, graphics, processor, processors, computing, hardware"',
'"notebook, execute, cell, calculation, result"',
'"computer, computers, image, images, recognition, photo, photos"',
'"Samuel, weight, weights, assignment, assignments"',
'deep learning, weights, Samuel, term',
'"deep learning, model, prediction, understand, predictions"',
'"theorem, neural, network, networks, mathematical, problem, accuracy"',
'"train, model, training, models, dataset, data"',
'"feedback, loop, rollout, predictive, policing, model, models"',
'"cat, cats, recognition, model, image, images, pixel, pixels"',
'"classification, regression, difference, differences, machine learning, models"',
'"validation, test, set, sets, need, purpose"',
'"fastai, validation, set, datasets"',
'"random, sample, samples, validation, set, sets"',
'"overfitting, overfit, example, model, data, machine learning"',
'"metric, metrics, loss, differ, difference, measurement"',
'"pretrained, models, help, pretraining, model"',
'head, model, models',
'"CNN, layers, features, early, later"',
'"image, images, model, models, photo, photos"',
'"architecture, architectures, design, structure, building"',
'"segmentation, segment, segments, divide, division, categorize"',
'"y_range, range, purpose, usage, need"',
'"hyperparameters, hyperparameter, machine learning, model, tuning, optimization"',
'"AI, failures, avoid, organization, organizations"']
keywords | |
---|---|
0 | "deep learning, math, data, computers, PhD" |
1 | deep learning, areas, best, world |
2 | "neuron, neurons, device, artificial, principle" |
3 | "parallel, distributed, processing, PDP, requi... |
4 | "neural, networks, theoretical, misunderstandi... |
The keywords generated by this prompt resulted in retrieved chunks that allowed me to answer 12/30 or 40% of the questions! This is comparable to the 40% achieved with my manually generated keywords.
I’ll go ahead and Claude-generate keywords for all 202 questions in my dataset.
# get the questions
# Reads the evaluation CSV from a pinned gist revision and keeps only the rows
# where is_answerable == 1.
url = 'https://gist.githubusercontent.com/vishalbakshi/309fb3abb222d32446b2c4e29db753fe/raw/5e41b9eb34f515f00321e55307cc4d5abbd75cb5/fastbookRAG_evals.csv'
questions = pd.read_csv(url).query('is_answerable == 1')
# Trailing expression: the notebook displays (rows, columns) of the filtered frame.
questions.shape
(202, 6)
Here are the results from this notebook. Answer Rate is the percentage of Chapter 1 Questionnaire questions answered using context retrieved from SQLite full-text search with Claude-generated keywords.
Prompt | Answer Rate |
---|---|
A | 33% |
B | 33% |
C | 36% |
D | 33% |
E | 40% |
I learned a few important lessons in this work:
I’m pretty excited about using these Claude-generated keywords as they resulted in the same answer rate (40%) as my manually generated keywords. That’s a promising start!
I hope you enjoyed this blog post. Follow me on Twitter @vishal_learner.