Setup

For illustration, we'll generate a toy dataset. We'll assume:

  • 1,000 examples
  • Each example has an inherent difficulty
  • Each example is valid with some probability $p_e$

We could sample these quantities from a normal distribution, but let's keep things simple with a uniform one. Additionally, rather than making a response correct exactly when skill > difficulty, we'll sample correctness so that a correct response becomes more likely as skill exceeds difficulty.
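
Concretely, matching the sampling code below: subject $j$ with skill $s_j$ answers a valid item $i$ with difficulty $d_i$ correctly with probability $\sigma(s_j - d_i) = 1 / (1 + e^{-(s_j - d_i)})$, while an invalid item is answered correctly with probability $0.5$ regardless of skill.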

In [ ]:
import sys
import os
import numpy as np
from pydantic import BaseModel

# Add the repository to the path like this if using a git-cloned version of py-irt
# sys.path.append("/data/home/par/code/py-irt")
In [ ]:
class Item(BaseModel):
    valid: bool
    difficulty: float
    item_id: str
    category: str = 'all'

class Subject(BaseModel):
    subject_id: str
    skill: float

# We'll do a quick export to JSON lines to make reading the dataset easier.
# From the py-irt docs, each row looks like this:
# {"subject_id": "<subject_id>", "responses": {"<item_id>": <response>}}
import random
from py_irt.io import write_jsonlines
from py_irt.dataset import Dataset

def write_irt_dataset(subjects: list[Subject], items: list[Item], path: str):
    print("Writing dataset with")
    print("N Subjects", len(subjects))
    print("N Items", len(items))
    rows = []
    score_by_subject = {}
    lookup = {}
    for subject in subjects:
        responses = {}
        correct = 0
        total = 0
        for item in items:
            if item.valid:
                # Valid items: correct with probability sigmoid(skill - difficulty)
                responses[item.item_id] = int(1 / (1 + np.exp(-(subject.skill - item.difficulty))) > random.random())
            else:
                # Invalid items: a fair coin flip, independent of skill
                responses[item.item_id] = int(random.random() > .5)
            correct += responses[item.item_id]
            total += 1
        score_by_subject[subject.subject_id] = correct / total
        lookup[subject.subject_id] = responses
        rows.append({"subject_id": subject.subject_id, "responses": responses})

    write_jsonlines(path, rows)
    return score_by_subject, lookup


diff_by_cat = {
    'easy': (-4, 0),
    'moderate': (0, 3),
    'hard': (3, 4),
    'all': (-4, 4),
}

validity_rate = {
    'easy': .95,
    'moderate': .9,
    'hard': .8,
    'all': .95,
}

def create_item(item_id: str, category: str):
    # An item is valid with probability validity_rate[category]
    valid = bool(np.random.uniform() < validity_rate[category])
    min_diff, max_diff = diff_by_cat[category]
    return Item(
        item_id=item_id,
        valid=valid,
        difficulty=np.random.uniform(low=min_diff, high=max_diff),
        category=category,
    )


max_skill = 4
min_skill = -4

def create_subject(subject_id: str):
    return Subject(subject_id=subject_id, skill=np.random.uniform(low=min_skill, high=max_skill))
In [ ]:
items = [create_item(f'item_{idx}', 'all') for idx in range(1_000)]
subjects = [create_subject(f'subject_{idx}') for idx in range(10)]
In [ ]:
score_by_subject, subject_responses = write_irt_dataset(subjects, items, '/tmp/irt_dataset.jsonlines')
Writing dataset with
N Subjects 10
N Items 1000
In [ ]:
score_by_subject
Out[ ]:
{'subject_0': 0.827,
 'subject_1': 0.208,
 'subject_2': 0.39,
 'subject_3': 0.395,
 'subject_4': 0.119,
 'subject_5': 0.693,
 'subject_6': 0.256,
 'subject_7': 0.567,
 'subject_8': 0.425,
 'subject_9': 0.299}
In [ ]:
# We could use the py-irt CLI, but let's show the more manual way of doing things

import py_irt.models.tutorial_model  # importing this module registers the 'tutorial' model type
from py_irt.config import IrtConfig
from py_irt.dataset import Dataset
from py_irt.training import IrtModelTrainer

dataset = Dataset.from_jsonlines("/tmp/irt_dataset.jsonlines")
config = IrtConfig(model_type='tutorial', log_every=500, dropout=.2)
trainer = IrtModelTrainer(config=config, data_path=None, dataset=dataset)
trainer.train(epochs=5000, device='cuda')
[20:12:19] Vocab size: None                                                                          training.py:90
           args: {'device': 'cuda', 'num_items': 1000, 'num_subjects': 10}                          training.py:134
           Parsed Model Args: {'device': 'cuda', 'num_items': 1000, 'num_subjects': 10, 'priors':   training.py:147
           'vague', 'dropout': 0.2, 'hidden': 100, 'vocab_size': None}                                             
torch.Size([10000]) torch.Size([10000])
Training Pyro IRT Model for 5000 epochs

In [ ]:
for subject, skill, acc in sorted(list(zip(subjects, trainer.last_params['ability'], score_by_subject.values())), key=lambda v: v[0].skill):
    print(subject.subject_id, "Real Skill", subject.skill, "Inferred Skill", skill, "Acc", acc)
subject_5 Real Skill -2.888685028673531 Inferred Skill -9.79281234741211 Acc 0.217
subject_7 Real Skill -1.8778236439071012 Inferred Skill -6.580785751342773 Acc 0.289
subject_6 Real Skill -1.6372365305681011 Inferred Skill -5.113847732543945 Acc 0.325
subject_1 Real Skill -1.0868332475296318 Inferred Skill -4.000438213348389 Acc 0.357
subject_8 Real Skill -0.8969338063093151 Inferred Skill -2.4728634357452393 Acc 0.413
subject_3 Real Skill 0.2819802458991951 Inferred Skill 0.6491830348968506 Acc 0.541
subject_0 Real Skill 1.0494960952393102 Inferred Skill 2.924746513366699 Acc 0.637
subject_9 Real Skill 1.7339647649676957 Inferred Skill 4.677668571472168 Acc 0.695
subject_4 Real Skill 2.2070139876293053 Inferred Skill 7.363603591918945 Acc 0.747
subject_2 Real Skill 3.374754881644466 Inferred Skill 11.831231117248535 Acc 0.844
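
The inferred abilities are on a different scale than the true skills, but the ranking is preserved. For a quick numeric check of that, a rank correlation works well; a minimal sketch, assuming scipy is installed:

In [ ]:
from scipy.stats import spearmanr

# IRT recovers ability only up to a monotone transform, so compare ranks rather than raw values
rho, p_value = spearmanr([s.skill for s in subjects], trainer.last_params['ability'])
print(f"Spearman rho: {rho:.3f} (p={p_value:.3g})")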
In [ ]:
trainer.last_params.keys()
Out[ ]:
dict_keys(['ability', 'diff', 'disc', 'irt_model', 'item_ids', 'subject_ids'])
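
Here 'ability' holds the per-subject skills, while 'diff' and 'disc' hold per-item difficulty and discriminability; 'item_ids' and 'subject_ids' presumably map parameter indices back to the ids in the dataset. We use the first three below.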
In [ ]:
import pandas as pd

item_rows = []
for item, difficulty, disc in zip(items, trainer.last_params['diff'], trainer.last_params['disc']):
    item_rows.append({
        'true_diff': item.difficulty,
        'validity': 'Valid' if item.valid else 'Invalid',
        'irt_diff': difficulty,
        'irt_disc': disc
    })

item_df = pd.DataFrame(item_rows)
In [ ]:
item_df
Out[ ]:
true_diff validity irt_diff irt_disc
0 1.410782 Valid 0.683076 -0.378941
1 -2.139332 Valid -1.973820 -5.279240
2 -0.782896 Valid -0.595877 -4.458278
3 -1.245904 Valid -3.752348 -1.301824
4 -3.022048 Valid -15.655530 3.058197
... ... ... ... ...
995 -1.179835 Valid -2.595922 -1.203499
996 -1.759242 Valid -5.375353 2.075028
997 -0.352382 Invalid -0.330320 -7.033720
998 3.469678 Valid 5.712102 4.391975
999 -1.430196 Valid -1.430953 3.788696

1000 rows × 4 columns
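
Before plotting, it can help to summarize the inferred discriminability split by validity. A minimal sketch using the item_df we just built:

In [ ]:
# Distribution of inferred discriminability for valid vs. invalid items
item_df.groupby('validity')['irt_disc'].describe()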

In [ ]:
import altair as alt

disc_chart = alt.Chart(item_df).mark_bar().encode(
    x=alt.X('irt_disc', title='IRT Discriminability').bin(maxbins=20),
    y=alt.Y('count()', title='Count'),
    color=alt.Color('validity', title='Validity', legend=None).scale(domain=['Valid', 'Invalid'], range=['green', 'red']),
    row=alt.Row('validity', title='Validity')
).resolve_scale(y='independent').properties(width=600, height=200)
disc_chart.save('/data/home/par/code/tutorial_content/auto_figs/validity_disc.pdf')
disc_chart
Out[ ]:
In [ ]:
# Now let's see how this changes when items have different difficulty distributions
# We'll copy the dataset generation code, but make some modifications

n_by_cat = {
    'easy': 800,
    'moderate': 150,
    'hard': 50,
}

items = []
idx = 0
for cat, n in n_by_cat.items():
    for _ in range(n):
        items.append(create_item(f'item_{idx}', cat))
        idx += 1

subjects = [create_subject(f'subject_{idx}') for idx in range(10)]
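
As a quick sanity check that the stratified dataset has the intended composition, we can count items per category; a minimal sketch:

In [ ]:
from collections import Counter

# Should report 800 easy, 150 moderate, and 50 hard items
print(Counter(item.category for item in items))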
In [ ]:
score_by_subject, subject_responses = write_irt_dataset(subjects, items, '/tmp/irt_dataset_model_eval.jsonlines')
Writing dataset with
N Subjects 10
N Items 1000
In [ ]:
# We could use the py-irt CLI, but let's show the more manual way of doing things

import py_irt.models.tutorial_model  # importing this module registers the 'tutorial' model type
from py_irt.config import IrtConfig
from py_irt.training import IrtModelTrainer

dataset = Dataset.from_jsonlines("/tmp/irt_dataset_model_eval.jsonlines")
config = IrtConfig(model_type='tutorial', log_every=500, dropout=.2)
trainer = IrtModelTrainer(config=config, data_path=None, dataset=dataset)
trainer.train(epochs=5000, device='cuda')
[18:50:11] amortized: False                                                                          dataset.py:112
[18:50:11] Vocab size: None                                                                          training.py:90
           args: {'device': 'cuda', 'num_items': 1000, 'num_subjects': 10}                          training.py:134
           Parsed Model Args: {'device': 'cuda', 'num_items': 1000, 'num_subjects': 10, 'priors':   training.py:147
           'vague', 'dropout': 0.2, 'hidden': 100, 'vocab_size': None}                                             
Training Pyro IRT Model for 5000 epochs
torch.Size([10000]) torch.Size([10000])

In [ ]:
tex_rows = []
for subject, skill, acc in sorted(list(zip(subjects, trainer.last_params['ability'], score_by_subject.values())), key=lambda v: v[0].skill):
    # Items were created in order (800 easy, then 150 moderate, then 50 hard) and dicts
    # preserve insertion order, so we can slice the responses by position
    easy_acc = sum(list(subject_responses[subject.subject_id].values())[:800]) / 800
    mod_acc = sum(list(subject_responses[subject.subject_id].values())[800:950]) / 150
    hard_acc = sum(list(subject_responses[subject.subject_id].values())[950:]) / 50
    print(subject.subject_id, "Real Skill", subject.skill, "Inferred Skill", skill, "Acc", acc, easy_acc, mod_acc, hard_acc)
    tex_rows.append({
        'Subject': subject.subject_id,
        'True Skill': subject.skill,
        'IRT Skill': skill,
        "Total Accuracy": acc,
        "Easy Accuracy": easy_acc,
        "Moderate Accuracy": mod_acc,
        "Hard Accuracy": hard_acc
    })
tex_df = pd.DataFrame(tex_rows)
tex_df
subject_0 Real Skill -3.5063231302018663 Inferred Skill -12.085612297058105 Acc 0.194 0.21875 0.09333333333333334 0.1
subject_7 Real Skill -3.0009702493734345 Inferred Skill -7.614261150360107 Acc 0.256 0.30125 0.06666666666666667 0.1
subject_5 Real Skill -2.6451543432394153 Inferred Skill -4.8897600173950195 Acc 0.325 0.38 0.09333333333333334 0.14
subject_9 Real Skill -1.2143958761350433 Inferred Skill 0.3483102321624756 Acc 0.543 0.65 0.11333333333333333 0.12
subject_2 Real Skill -1.1566100082358544 Inferred Skill 1.4074431657791138 Acc 0.56 0.6675 0.12 0.16
subject_4 Real Skill -0.7486909676264784 Inferred Skill 2.6881115436553955 Acc 0.602 0.7125 0.14666666666666667 0.2
subject_6 Real Skill -0.4557575428455154 Inferred Skill 3.3689770698547363 Acc 0.631 0.74625 0.19333333333333333 0.1
subject_8 Real Skill 0.2328339248915965 Inferred Skill 5.7671217918396 Acc 0.729 0.84875 0.29333333333333333 0.12
subject_3 Real Skill 2.169872436645896 Inferred Skill 11.172179222106934 Acc 0.865 0.95625 0.5866666666666667 0.24
subject_1 Real Skill 2.5020633991553725 Inferred Skill 14.247663497924805 Acc 0.897 0.97125 0.6866666666666666 0.34
Out[ ]:
Subject True Skill IRT Skill Total Accuracy Easy Accuracy Moderate Accuracy Hard Accuracy
0 subject_0 -3.506323 -12.085612 0.194 0.21875 0.093333 0.10
1 subject_7 -3.000970 -7.614261 0.256 0.30125 0.066667 0.10
2 subject_5 -2.645154 -4.889760 0.325 0.38000 0.093333 0.14
3 subject_9 -1.214396 0.348310 0.543 0.65000 0.113333 0.12
4 subject_2 -1.156610 1.407443 0.560 0.66750 0.120000 0.16
5 subject_4 -0.748691 2.688112 0.602 0.71250 0.146667 0.20
6 subject_6 -0.455758 3.368977 0.631 0.74625 0.193333 0.10
7 subject_8 0.232834 5.767122 0.729 0.84875 0.293333 0.12
8 subject_3 2.169872 11.172179 0.865 0.95625 0.586667 0.24
9 subject_1 2.502063 14.247663 0.897 0.97125 0.686667 0.34
In [ ]:
print(tex_df.to_latex())
\begin{tabular}{llrrrrrr}
\toprule
 & Subject & True Skill & IRT Skill & Total Accuracy & Easy Accuracy & Moderate Accuracy & Hard Accuracy \\
\midrule
0 & subject_0 & -3.506323 & -12.085612 & 0.194000 & 0.218750 & 0.093333 & 0.100000 \\
1 & subject_7 & -3.000970 & -7.614261 & 0.256000 & 0.301250 & 0.066667 & 0.100000 \\
2 & subject_5 & -2.645154 & -4.889760 & 0.325000 & 0.380000 & 0.093333 & 0.140000 \\
3 & subject_9 & -1.214396 & 0.348310 & 0.543000 & 0.650000 & 0.113333 & 0.120000 \\
4 & subject_2 & -1.156610 & 1.407443 & 0.560000 & 0.667500 & 0.120000 & 0.160000 \\
5 & subject_4 & -0.748691 & 2.688112 & 0.602000 & 0.712500 & 0.146667 & 0.200000 \\
6 & subject_6 & -0.455758 & 3.368977 & 0.631000 & 0.746250 & 0.193333 & 0.100000 \\
7 & subject_8 & 0.232834 & 5.767122 & 0.729000 & 0.848750 & 0.293333 & 0.120000 \\
8 & subject_3 & 2.169872 & 11.172179 & 0.865000 & 0.956250 & 0.586667 & 0.240000 \\
9 & subject_1 & 2.502063 & 14.247663 & 0.897000 & 0.971250 & 0.686667 & 0.340000 \\
\bottomrule
\end{tabular}

In [ ]:
import pandas as pd

item_rows = []
for item, difficulty, disc in zip(items, trainer.last_params['diff'], trainer.last_params['disc']):
    item_rows.append({
        'true_diff': item.difficulty,
        'validity': 'Valid' if item.valid else 'Invalid',
        'irt_diff': difficulty,
        'irt_disc': disc,
        'category': item.category,
    })

item_df = pd.DataFrame(item_rows)
item_df
Out[ ]:
true_diff validity irt_diff irt_disc category
0 -0.657178 Valid -2.489216 4.146462 easy
1 -2.794370 Valid -5.867486 3.705094 easy
2 -0.589497 Valid -0.755564 -1.241606 easy
3 -2.886102 Valid -3.598421 -2.543126 easy
4 -3.291845 Valid -5.617871 2.728650 easy
... ... ... ... ... ...
995 3.046604 Valid 17.771629 3.794744 hard
996 3.084157 Invalid 0.893561 -8.122875 hard
997 3.371910 Valid 17.070395 2.952128 hard
998 3.146079 Valid 9.788247 4.310520 hard
999 3.819924 Valid 12.230515 2.920415 hard

1000 rows × 5 columns

In [ ]:
import altair as alt

disc_chart = alt.Chart(item_df).mark_bar().encode(
    x=alt.X('irt_disc', title='IRT Discriminability').bin(maxbins=20),
    y=alt.Y('count()', title='Count'),
    color=alt.Color('category', title='Category'),
    row=alt.Row('validity', title='Validity')
).resolve_scale(y='independent').properties(width=600, height=200)
disc_chart.save('/data/home/par/code/tutorial_content/auto_figs/validity_diff_by_cat_disc.pdf')
disc_chart
Out[ ]: