Setup
For illustration, we'll generate a toy dataset. We'll assume:
- 1,000 examples
- Each example has an inherent difficulty
- Each example is valid with some probability $p_e$; invalid examples are answered correctly only by chance
We could sample these quantities from a normal distribution, but let's keep things simple and use uniform distributions. Additionally, for valid items we'll assume the chance of a correct response grows with the gap between subject skill and item difficulty, so a subject whose skill exceeds an item's difficulty is more likely than not to respond correctly.
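Concretely, the simulation below draws a correct response for a valid item with probability given by the logistic function of the skill-difficulty gap:

$$P(\text{correct}) = \sigma(\text{skill} - \text{difficulty}) = \frac{1}{1 + e^{-(\text{skill} - \text{difficulty})}}$$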
In [ ]:
import sys
import os
import numpy as np
from pydantic import BaseModel
# Add similar code if using git cloned version of py-irt
# sys.path.append("/data/home/par/code/py-irt")
In [ ]:
class Item(BaseModel):
    valid: bool
    difficulty: float
    item_id: str
    category: str = 'all'


class Subject(BaseModel):
    subject_id: str
    skill: float
# We'll do a quick export to json to make reading the dataset easier. From the docs:
# Each row looks like this:
# {"subject_id": "<subject_id>", "responses": {"<item_id>": <response>}}
import random
from py_irt.io import write_jsonlines
from py_irt.dataset import Dataset
def write_irt_dataset(subjects: list[Subject], items: list[Item], path: str):
    print("Writing dataset with")
    print("N Subjects", len(subjects))
    print("N Items", len(items))
    rows = []
    score_by_subject = {}
    lookup = {}
    for subject in subjects:
        responses = {}
        correct = 0
        total = 0
        for item in items:
            if item.valid:
                # Valid item: correct with probability sigmoid(skill - difficulty)
                responses[item.item_id] = int(1 / (1 + np.exp(-(subject.skill - item.difficulty))) > random.random())
            else:
                # Invalid item: the response is a coin flip
                responses[item.item_id] = int(random.random() > .5)
            correct += responses[item.item_id]
            total += 1
        score_by_subject[subject.subject_id] = correct / total
        lookup[subject.subject_id] = responses
        rows.append({"subject_id": subject.subject_id, "responses": responses})
    write_jsonlines(path, rows)
    return score_by_subject, lookup
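# Per-category difficulty ranges (uniform bounds) and the probability
# that an item in each category is valid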
diff_by_cat = {
    'easy': (-4, 0),
    'moderate': (0, 3),
    'hard': (3, 4),
    'all': (-4, 4),
}
validity_rate = {
    'easy': .95,
    'moderate': .9,
    'hard': .8,
    'all': .95,
}
def create_item(item_id: str, category: str):
    # The item is valid with probability validity_rate[category]
    validity = np.random.uniform()
    if validity > validity_rate[category]:
        valid = 0
    else:
        valid = 1
    min_diff, max_diff = diff_by_cat[category]
    return Item(item_id=item_id, valid=valid, difficulty=np.random.uniform(low=min_diff, high=max_diff), category=category)
max_skill = 4
min_skill = -4


def create_subject(subject_id: str):
    return Subject(subject_id=subject_id, skill=np.random.uniform(low=min_skill, high=max_skill))
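As a quick spot check, we can sample a single item (the id here is just illustrative; the sampled values vary run to run):
In [ ]:
create_item('item_demo', 'easy')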
In [ ]:
items = [create_item(f'item_{idx}', 'all') for idx in range(1_000)]
subjects = [create_subject(f'subject_{idx}') for idx in range(10)]
In [ ]:
score_by_subject, subject_responses = write_irt_dataset(subjects, items, '/tmp/irt_dataset.jsonlines')
Writing dataset with
N Subjects 10
N Items 1000
In [ ]:
score_by_subject
Out[ ]:
{'subject_0': 0.827,
 'subject_1': 0.208,
 'subject_2': 0.39,
 'subject_3': 0.395,
 'subject_4': 0.119,
 'subject_5': 0.693,
 'subject_6': 0.256,
 'subject_7': 0.567,
 'subject_8': 0.425,
 'subject_9': 0.299}
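Accuracy should track the true skill we simulated; a minimal sanity check (not part of the original pipeline):
In [ ]:
# Sorted by true skill, accuracy should increase roughly monotonically
for subject in sorted(subjects, key=lambda s: s.skill):
    print(f"{subject.subject_id}: skill={subject.skill:+.2f}, acc={score_by_subject[subject.subject_id]:.3f}")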
In [ ]:
# We could use the CLI, but let's show the more manual way of doing things
import py_irt.models.tutorial_model
from py_irt.config import IrtConfig
from py_irt.dataset import Dataset
from py_irt.training import IrtModelTrainer
dataset = Dataset.from_jsonlines("/tmp/irt_dataset.jsonlines")
config = IrtConfig(model_type='tutorial', log_every=500, dropout=.2)
trainer = IrtModelTrainer(config=config, data_path=None, dataset=dataset)
trainer.train(epochs=5000, device='cuda')
[20:12:19] Vocab size: None
args: {'device': 'cuda', 'num_items': 1000, 'num_subjects': 10}
Parsed Model Args: {'device': 'cuda', 'num_items': 1000, 'num_subjects': 10, 'priors': 'vague', 'dropout': 0.2, 'hidden': 100, 'vocab_size': None}
torch.Size([10000]) torch.Size([10000])
Training Pyro IRT Model for 5000 epochs
In [ ]:
for subject, skill, acc in sorted(list(zip(subjects, trainer.last_params['ability'], score_by_subject.values())), key=lambda v: v[0].skill):
    print(subject.subject_id, "Real Skill", subject.skill, "Inferred Skill", skill, "Acc", acc)
subject_5 Real Skill -2.888685028673531 Inferred Skill -9.79281234741211 Acc 0.217
subject_7 Real Skill -1.8778236439071012 Inferred Skill -6.580785751342773 Acc 0.289
subject_6 Real Skill -1.6372365305681011 Inferred Skill -5.113847732543945 Acc 0.325
subject_1 Real Skill -1.0868332475296318 Inferred Skill -4.000438213348389 Acc 0.357
subject_8 Real Skill -0.8969338063093151 Inferred Skill -2.4728634357452393 Acc 0.413
subject_3 Real Skill 0.2819802458991951 Inferred Skill 0.6491830348968506 Acc 0.541
subject_0 Real Skill 1.0494960952393102 Inferred Skill 2.924746513366699 Acc 0.637
subject_9 Real Skill 1.7339647649676957 Inferred Skill 4.677668571472168 Acc 0.695
subject_4 Real Skill 2.2070139876293053 Inferred Skill 7.363603591918945 Acc 0.747
subject_2 Real Skill 3.374754881644466 Inferred Skill 11.831231117248535 Acc 0.844
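The inferred abilities sit on a different scale than the true skills, but the ordering agrees. A minimal check of the linear relationship (assuming trainer.last_params['ability'] aligns with subjects, as the loop above does):
In [ ]:
# Ability is only identified up to scale and shift, so check correlation
true_skill = np.array([s.skill for s in subjects])
inferred = np.array(trainer.last_params['ability'])
print("Pearson r:", np.corrcoef(true_skill, inferred)[0, 1])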
In [ ]:
trainer.last_params.keys()
Out[ ]:
dict_keys(['ability', 'diff', 'disc', 'irt_model', 'item_ids', 'subject_ids'])
In [ ]:
import pandas as pd

item_rows = []
for item, difficulty, disc in zip(items, trainer.last_params['diff'], trainer.last_params['disc']):
    item_rows.append({
        'true_diff': item.difficulty,
        'validity': 'Valid' if item.valid else 'Invalid',
        'irt_diff': difficulty,
        'irt_disc': disc
    })
item_df = pd.DataFrame(item_rows)
In [ ]:
item_df
Out[ ]:
| | true_diff | validity | irt_diff | irt_disc |
|---|---|---|---|---|
| 0 | 1.410782 | Valid | 0.683076 | -0.378941 |
| 1 | -2.139332 | Valid | -1.973820 | -5.279240 |
| 2 | -0.782896 | Valid | -0.595877 | -4.458278 |
| 3 | -1.245904 | Valid | -3.752348 | -1.301824 |
| 4 | -3.022048 | Valid | -15.655530 | 3.058197 |
| ... | ... | ... | ... | ... |
| 995 | -1.179835 | Valid | -2.595922 | -1.203499 |
| 996 | -1.759242 | Valid | -5.375353 | 2.075028 |
| 997 | -0.352382 | Invalid | -0.330320 | -7.033720 |
| 998 | 3.469678 | Valid | 5.712102 | 4.391975 |
| 999 | -1.430196 | Valid | -1.430953 | 3.788696 |
1000 rows × 4 columns
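Before plotting, a quick numeric summary of how estimated discriminability differs between valid and invalid items (a minimal sketch, not part of the original):
In [ ]:
# Distribution of estimated discriminability, split by item validity
item_df.groupby('validity')['irt_disc'].describe()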
In [ ]:
import altair as alt

disc_chart = alt.Chart(item_df).mark_bar().encode(
    x=alt.X('irt_disc', title='IRT Discriminability').bin(maxbins=20),
    y=alt.Y('count()', title='Count'),
    color=alt.Color('validity', title='Validity', legend=None).scale(domain=['Valid', 'Invalid'], range=['green', 'red']),
    row=alt.Row('validity', title='Validity')
).resolve_scale(y='independent').properties(width=600, height=200)
disc_chart.save('/data/home/par/code/tutorial_content/auto_figs/validity_disc.pdf')
disc_chart
Out[ ]:
In [ ]:
# Now let's see how this changes when items have different difficulty distributions
# We'll copy the dataset generation code, but make some modifications
n_by_cat = {
    'easy': 800,
    'moderate': 150,
    'hard': 50,
}
items = []
idx = 0
for cat, n in n_by_cat.items():
    for _ in range(n):
        items.append(create_item(f'item_{idx}', cat))
        idx += 1
subjects = [create_subject(f'subject_{idx}') for idx in range(10)]
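To confirm the category mix, a quick count (a minimal check, not part of the original):
In [ ]:
from collections import Counter
Counter(item.category for item in items)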
In [ ]:
score_by_subject, subject_responses = write_irt_dataset(subjects, items, '/tmp/irt_dataset_model_eval.jsonlines')
Writing dataset with
N Subjects 10
N Items 1000
In [ ]:
# We could use the CLI, but let's show the more manual way of doing things
import py_irt.models.tutorial_model
from py_irt.config import IrtConfig
from py_irt.training import IrtModelTrainer
dataset = Dataset.from_jsonlines("/tmp/irt_dataset_model_eval.jsonlines")
config = IrtConfig(model_type='tutorial', log_every=500, dropout=.2)
trainer = IrtModelTrainer(config=config, data_path=None, dataset=dataset)
trainer.train(epochs=5000, device='cuda')
[18:50:11] amortized: False
[18:50:11] Vocab size: None
args: {'device': 'cuda', 'num_items': 1000, 'num_subjects': 10}
Parsed Model Args: {'device': 'cuda', 'num_items': 1000, 'num_subjects': 10, 'priors': 'vague', 'dropout': 0.2, 'hidden': 100, 'vocab_size': None}
Training Pyro IRT Model for 5000 epochs
torch.Size([10000]) torch.Size([10000])
In [ ]:
tex_rows = []
for subject, skill, acc in sorted(list(zip(subjects, trainer.last_params['ability'], score_by_subject.values())), key=lambda v: v[0].skill):
    # Items were generated in category order: 800 easy, then 150 moderate, then 50 hard
    easy_acc = sum(list(subject_responses[subject.subject_id].values())[:800]) / 800
    mod_acc = sum(list(subject_responses[subject.subject_id].values())[800:950]) / 150
    hard_acc = sum(list(subject_responses[subject.subject_id].values())[950:]) / 50
    print(subject.subject_id, "Real Skill", subject.skill, "Inferred Skill", skill, "Acc", acc, easy_acc, mod_acc, hard_acc)
    tex_rows.append({
        'Subject': subject.subject_id,
        'True Skill': subject.skill,
        'IRT Skill': skill,
        "Total Accuracy": acc,
        "Easy Accuracy": easy_acc,
        "Moderate Accuracy": mod_acc,
        "Hard Accuracy": hard_acc
    })
tex_df = pd.DataFrame(tex_rows)
tex_df
subject_0 Real Skill -3.5063231302018663 Inferred Skill -12.085612297058105 Acc 0.194 0.21875 0.09333333333333334 0.1
subject_7 Real Skill -3.0009702493734345 Inferred Skill -7.614261150360107 Acc 0.256 0.30125 0.06666666666666667 0.1
subject_5 Real Skill -2.6451543432394153 Inferred Skill -4.8897600173950195 Acc 0.325 0.38 0.09333333333333334 0.14
subject_9 Real Skill -1.2143958761350433 Inferred Skill 0.3483102321624756 Acc 0.543 0.65 0.11333333333333333 0.12
subject_2 Real Skill -1.1566100082358544 Inferred Skill 1.4074431657791138 Acc 0.56 0.6675 0.12 0.16
subject_4 Real Skill -0.7486909676264784 Inferred Skill 2.6881115436553955 Acc 0.602 0.7125 0.14666666666666667 0.2
subject_6 Real Skill -0.4557575428455154 Inferred Skill 3.3689770698547363 Acc 0.631 0.74625 0.19333333333333333 0.1
subject_8 Real Skill 0.2328339248915965 Inferred Skill 5.7671217918396 Acc 0.729 0.84875 0.29333333333333333 0.12
subject_3 Real Skill 2.169872436645896 Inferred Skill 11.172179222106934 Acc 0.865 0.95625 0.5866666666666667 0.24
subject_1 Real Skill 2.5020633991553725 Inferred Skill 14.247663497924805 Acc 0.897 0.97125 0.6866666666666666 0.34
Out[ ]:
| | Subject | True Skill | IRT Skill | Total Accuracy | Easy Accuracy | Moderate Accuracy | Hard Accuracy |
|---|---|---|---|---|---|---|---|
| 0 | subject_0 | -3.506323 | -12.085612 | 0.194 | 0.21875 | 0.093333 | 0.10 |
| 1 | subject_7 | -3.000970 | -7.614261 | 0.256 | 0.30125 | 0.066667 | 0.10 |
| 2 | subject_5 | -2.645154 | -4.889760 | 0.325 | 0.38000 | 0.093333 | 0.14 |
| 3 | subject_9 | -1.214396 | 0.348310 | 0.543 | 0.65000 | 0.113333 | 0.12 |
| 4 | subject_2 | -1.156610 | 1.407443 | 0.560 | 0.66750 | 0.120000 | 0.16 |
| 5 | subject_4 | -0.748691 | 2.688112 | 0.602 | 0.71250 | 0.146667 | 0.20 |
| 6 | subject_6 | -0.455758 | 3.368977 | 0.631 | 0.74625 | 0.193333 | 0.10 |
| 7 | subject_8 | 0.232834 | 5.767122 | 0.729 | 0.84875 | 0.293333 | 0.12 |
| 8 | subject_3 | 2.169872 | 11.172179 | 0.865 | 0.95625 | 0.586667 | 0.24 |
| 9 | subject_1 | 2.502063 | 14.247663 | 0.897 | 0.97125 | 0.686667 | 0.34 |
In [ ]:
print(tex_df.to_latex())
\begin{tabular}{llrrrrrr}
\toprule
 & Subject & True Skill & IRT Skill & Total Accuracy & Easy Accuracy & Moderate Accuracy & Hard Accuracy \\
\midrule
0 & subject_0 & -3.506323 & -12.085612 & 0.194000 & 0.218750 & 0.093333 & 0.100000 \\
1 & subject_7 & -3.000970 & -7.614261 & 0.256000 & 0.301250 & 0.066667 & 0.100000 \\
2 & subject_5 & -2.645154 & -4.889760 & 0.325000 & 0.380000 & 0.093333 & 0.140000 \\
3 & subject_9 & -1.214396 & 0.348310 & 0.543000 & 0.650000 & 0.113333 & 0.120000 \\
4 & subject_2 & -1.156610 & 1.407443 & 0.560000 & 0.667500 & 0.120000 & 0.160000 \\
5 & subject_4 & -0.748691 & 2.688112 & 0.602000 & 0.712500 & 0.146667 & 0.200000 \\
6 & subject_6 & -0.455758 & 3.368977 & 0.631000 & 0.746250 & 0.193333 & 0.100000 \\
7 & subject_8 & 0.232834 & 5.767122 & 0.729000 & 0.848750 & 0.293333 & 0.120000 \\
8 & subject_3 & 2.169872 & 11.172179 & 0.865000 & 0.956250 & 0.586667 & 0.240000 \\
9 & subject_1 & 2.502063 & 14.247663 & 0.897000 & 0.971250 & 0.686667 & 0.340000 \\
\bottomrule
\end{tabular}
In [ ]:
import pandas as pd

item_rows = []
for item, difficulty, disc in zip(items, trainer.last_params['diff'], trainer.last_params['disc']):
    item_rows.append({
        'true_diff': item.difficulty,
        'validity': 'Valid' if item.valid else 'Invalid',
        'irt_diff': difficulty,
        'irt_disc': disc,
        'category': item.category,
    })
item_df = pd.DataFrame(item_rows)
item_df
Out[ ]:
| | true_diff | validity | irt_diff | irt_disc | category |
|---|---|---|---|---|---|
| 0 | -0.657178 | Valid | -2.489216 | 4.146462 | easy |
| 1 | -2.794370 | Valid | -5.867486 | 3.705094 | easy |
| 2 | -0.589497 | Valid | -0.755564 | -1.241606 | easy |
| 3 | -2.886102 | Valid | -3.598421 | -2.543126 | easy |
| 4 | -3.291845 | Valid | -5.617871 | 2.728650 | easy |
| ... | ... | ... | ... | ... | ... |
| 995 | 3.046604 | Valid | 17.771629 | 3.794744 | hard |
| 996 | 3.084157 | Invalid | 0.893561 | -8.122875 | hard |
| 997 | 3.371910 | Valid | 17.070395 | 2.952128 | hard |
| 998 | 3.146079 | Valid | 9.788247 | 4.310520 | hard |
| 999 | 3.819924 | Valid | 12.230515 | 2.920415 | hard |
1000 rows × 5 columns
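Again, a quick numeric view before the chart: mean estimated discriminability by validity and category (a minimal sketch, not part of the original):
In [ ]:
item_df.groupby(['validity', 'category'])['irt_disc'].mean()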
In [ ]:
import altair as alt

disc_chart = alt.Chart(item_df).mark_bar().encode(
    x=alt.X('irt_disc', title='IRT Discriminability').bin(maxbins=20),
    y=alt.Y('count()', title='Count'),
    color=alt.Color('category', title='Category'),
    row=alt.Row('validity', title='Validity')
).resolve_scale(y='independent').properties(width=600, height=200)
disc_chart.save('/data/home/par/code/tutorial_content/auto_figs/validity_diff_by_cat_disc.pdf')
disc_chart
Out[ ]: