Randomized Controlled Experiments
Contents
Randomized Controlled Experiments#
from datascience import *
from cs104 import *
import numpy as np
%matplotlib inline
1. Back Pain: Randomized Controlled Experiments#
rct = Table.read_table('data/bta.csv')
rct.show()
| Group | Result | 
|---|---|
| Control | 1 | 
| Control | 1 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 0 | 
| Treatment | 0 | 
| Treatment | 0 | 
| Treatment | 0 | 
| Treatment | 0 | 
| Treatment | 0 | 
rct.pivot('Result', 'Group') #Recall: the default aggregation function counts the items 
| Group | 0.0 | 1.0 | 
|---|---|---|
| Control | 14 | 2 | 
| Treatment | 6 | 9 | 
# This tells us the proportion in each group who improved.
rct.group('Group', np.mean)
| Group | Result mean | 
|---|---|
| Control | 0.125 | 
| Treatment | 0.6 | 
The Test Statistic#
def difference_of_proportions(table, group_label, value_label):
    """Takes: name of table,
    column label that indicates which group the row relates to
    Returns: Difference of proportions of 1's in the two groups"""
    
    # table containing group means
    proportions_table = table.group(group_label, np.mean)
    
    # array of group means
    proportions = proportions_table.column(value_label + ' mean')
    
    return abs(proportions.item(1) - proportions.item(0))
observed_diff = difference_of_proportions(rct, 'Group', 'Result')
observed_diff
0.475
Sampling#
shuffled_labels = rct.sample(with_replacement=False).column('Group')
original_and_shuffled = rct.with_column('Shuffled Label', shuffled_labels)
original_and_shuffled
| Group | Result | Shuffled Label | 
|---|---|---|
| Control | 1 | Control | 
| Control | 1 | Treatment | 
| Control | 0 | Treatment | 
| Control | 0 | Control | 
| Control | 0 | Treatment | 
| Control | 0 | Treatment | 
| Control | 0 | Control | 
| Control | 0 | Control | 
| Control | 0 | Treatment | 
| Control | 0 | Treatment | 
... (21 rows omitted)
# Original RCT
original_and_shuffled.select('Result', 'Group').group('Group', np.mean)
| Group | Result mean | 
|---|---|
| Control | 0.125 | 
| Treatment | 0.6 | 
# Shuffled RCT from the permutation test
original_and_shuffled.select('Result', 'Shuffled Label').group('Shuffled Label', np.mean)
| Shuffled Label | Result mean | 
|---|---|
| Control | 0.4375 | 
| Treatment | 0.266667 | 
def permutation_sample(table, group_column_name):
    """
    Returns: The table with a new "Shuffled Label" column containing
    the shuffled values of the group column.
    """
    
    # array of shuffled labels
    shuffled_labels = table.sample(with_replacement=False).column(group_column_name)
    
    # table of numerical variable and shuffled labels
    shuffled_table = table.with_column('Shuffled Label', shuffled_labels)
    
    return shuffled_table
Simulation and Plotting#
def empirical_difference_of_proportions_distribution(num_iterations):
    simulated_diffs = make_array()
    for i in np.arange(num_iterations):
        
        one_sample = permutation_sample(rct, 'Group')
        statistic_one_sample = difference_of_proportions(one_sample, 
                                                             "Shuffled Label", 
                                                             'Result')
        
        simulated_diffs = np.append(simulated_diffs, statistic_one_sample)
    
    return simulated_diffs
simulated_diffs = empirical_difference_of_proportions_distribution(2000)
results = Table().with_columns('Absolute Difference between\nTreatment and Control', 
                               simulated_diffs)
results.hist(left_end = percentile(95, simulated_diffs), bins=np.arange(0,0.7,0.1))
plots.scatter(observed_diff, 0, color='red', s=60, clip_on=False, zorder=3)
plots.title('Back pain experiment');
 
P-value#
sum(simulated_diffs >= observed_diff) / len(simulated_diffs)
0.008
2. John Snow from scratch#
snows_data = Table().read_table('data/snow_data.csv')
snows_data.show(10)
| Water Company | Cholera death | 
|---|---|
| Lambeth | False | 
| S&V | False | 
| S&V | False | 
| S&V | False | 
| S&V | False | 
| S&V | False | 
| S&V | True | 
| S&V | False | 
| Lambeth | False | 
| S&V | False | 
... (66143 rows omitted)
# We have not prepared anything for this example. 
# Direct us and we will make mistakes together. 
