Randomized Controlled Experiments#

from datascience import *
from cs104 import *
import numpy as np
%matplotlib inline

1. Back Pain: Randomized Controlled Experiments#

rct = Table.read_table('data/bta.csv')
rct.show()
Group Result
Control 1
Control 1
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 0
Treatment 0
Treatment 0
Treatment 0
Treatment 0
Treatment 0
rct.pivot('Result', 'Group') #Recall: the default aggregation function counts the items 
Group 0.0 1.0
Control 14 2
Treatment 6 9
# This tells us the proportion in each group who improved.
rct.group('Group', np.mean)
Group Result mean
Control 0.125
Treatment 0.6

The Test Statistic#

def difference_of_proportions(table, group_label, value_label):
    """Takes: name of table,
    column label that indicates which group the row relates to
    Returns: Difference of proportions of 1's in the two groups"""
    
    # table containing group means
    proportions_table = table.group(group_label, np.mean)
    
    # array of group means
    proportions = proportions_table.column(value_label + ' mean')
    
    return abs(proportions.item(1) - proportions.item(0))
observed_diff = difference_of_proportions(rct, 'Group', 'Result')
observed_diff
0.475

Sampling#

shuffled_labels = rct.sample(with_replacement=False).column('Group')
original_and_shuffled = rct.with_column('Shuffled Label', shuffled_labels)
original_and_shuffled
Group Result Shuffled Label
Control 1 Control
Control 1 Treatment
Control 0 Treatment
Control 0 Control
Control 0 Treatment
Control 0 Treatment
Control 0 Control
Control 0 Control
Control 0 Treatment
Control 0 Treatment

... (21 rows omitted)

# Original RCT
original_and_shuffled.select('Result', 'Group').group('Group', np.mean)
Group Result mean
Control 0.125
Treatment 0.6
# Shuffled RCT from the permutation test
original_and_shuffled.select('Result', 'Shuffled Label').group('Shuffled Label', np.mean)
Shuffled Label Result mean
Control 0.4375
Treatment 0.266667
def permutation_sample(table, group_column_name):
    """
    Returns: The table with a new "Shuffled Label" column containing
    the shuffled values of the group column.
    """
    
    # array of shuffled labels
    shuffled_labels = table.sample(with_replacement=False).column(group_column_name)
    
    # table of numerical variable and shuffled labels
    shuffled_table = table.with_column('Shuffled Label', shuffled_labels)
    
    return shuffled_table

Simulation and Plotting#

def empirical_difference_of_proportions_distribution(num_iterations):
    simulated_diffs = make_array()

    for i in np.arange(num_iterations):
        
        one_sample = permutation_sample(rct, 'Group')
        statistic_one_sample = difference_of_proportions(one_sample, 
                                                             "Shuffled Label", 
                                                             'Result')
        
        simulated_diffs = np.append(simulated_diffs, statistic_one_sample)
    
    return simulated_diffs
simulated_diffs = empirical_difference_of_proportions_distribution(2000)
results = Table().with_columns('Absolute Difference between\nTreatment and Control', 
                               simulated_diffs)
results.hist(left_end = percentile(95, simulated_diffs), bins=np.arange(0,0.7,0.1))
plots.scatter(observed_diff, 0, color='red', s=60, clip_on=False, zorder=3)
plots.title('Back pain experiment');
../_images/22-randomized-controlled-experiments_16_0.png

P-value#

sum(simulated_diffs >= observed_diff) / len(simulated_diffs)
0.008

2. John Snow from scratch#

snows_data = Table().read_table('data/snow_data.csv')
snows_data.show(10)
Water Company Cholera death
Lambeth False
S&V False
S&V False
S&V False
S&V False
S&V False
S&V True
S&V False
Lambeth False
S&V False

... (66143 rows omitted)

# We have not prepared anything for this example. 
# Direct us and we will make mistakes together.