Bootstrapping¶

from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=np.VisibleDeprecationWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

1. Median and percentiles¶

# A tiny set of salaries, small enough to check percentiles by eye.

# These are sorted, but they don't need to be to use percentile.
tiny_salaries = make_array(1317,  3909,  6015,  7467, 18632, 20828, 20950)
# Display the array as the cell's output.
tiny_salaries

array([ 1317,  3909,  6015,  7467, 18632, 20828, 20950])

# 50th percentile of the tiny salary set, i.e. its median.
percentile(50, tiny_salaries)

# 75th percentile of the same salaries.
percentile(75, tiny_salaries)

2. Bootstrapping¶

# Read the 200-row sample of Boston city police officers from CSV.
# read_table is invoked on the Table class itself rather than on a
# throwaway empty Table instance.
boston_sample = Table.read_table('data/boston-earnings-small.csv')
boston_sample.show(5)

TITLE	REGULAR	TOTAL_GROSS
Police Officer	89969	180643
Police Officer	83065	148513
Police Officer	91784	150195
Police Officer	25923	90898
Police Officer	35882	36068

... (195 rows omitted)

# Number of officers (rows) in the sample.
boston_sample.num_rows

# Histogram of the REGULAR column within the sample.
boston_sample.hist('REGULAR')

# Median of the observed sample: the 50th percentile of the REGULAR column.
# Later cells plot this value as the red marker.
sample_median = percentile(50, boston_sample.column('REGULAR'))
sample_median

def bootstrap(observed_sample, num_trials, column_label='REGULAR'):
    """Simulate the bootstrap distribution of a sample's median.

    Each trial resamples ``observed_sample`` and records the median
    (50th percentile) of ``column_label`` in that resample.

    Args:
        observed_sample: Table holding the original sample to resample from.
        num_trials: Number of bootstrap resamples to draw.
        column_label: Label of the column whose median is computed.
            Defaults to 'REGULAR'.

    Returns:
        An array containing one resample median per trial, in trial order.
    """
    resample_medians = []

    for _ in np.arange(num_trials):
        # Key: in bootstrapping we must always sample with replacement.
        # Resample the table that was passed in, not a notebook global, so
        # the function works for any observed sample.
        simulated_resample = observed_sample.sample()

        # Median of this one resample.
        resample_medians.append(
            percentile(50, simulated_resample.column(column_label))
        )

    # Append once onto an empty array instead of once per trial: same
    # resulting array, without re-copying the results on every iteration.
    return np.append(make_array(), resample_medians)

# Run 10,000 bootstrap trials starting from the observed Boston sample.
bootstrap_statistics = bootstrap(boston_sample, 10000)

# Put the bootstrap medians in a Table and plot their distribution
results = Table().with_column('Bootstrap Samples Median', bootstrap_statistics)
results.hist()
# Plot the median of our original sample in red (on the x-axis, at height 0)
plots.scatter(sample_median, 0, color='red', s=100, zorder=10, clip_on=False);

3. “Oracle” Evaluation¶

Oracle: pretend we are an all-knowing being and can look at the true population (which our journalist does not have access to)

We can check against the population for the pedagogical purpose of understanding the bootstrap. However, in the real world we would most likely have only the sample. And if we did have the population, we wouldn’t need to bootstrap.

# Build the police-officer population in one chain: load every earnings
# record, keep the three columns of interest, then filter on job title.
population = (
    Table.read_table('data/boston-earnings.csv')
    .select('TITLE', 'REGULAR', 'TOTAL_GROSS')
    .where('TITLE', are.equal_to('Police Officer'))
)
population.show(5)

TITLE	REGULAR	TOTAL_GROSS
Police Officer	0	1264844
Police Officer	0	1252991
Police Officer	100963	399826
Police Officer	99102	306588
Police Officer	91784	304577

... (1316 rows omitted)

# Histogram of the REGULAR column across the whole population.
population.hist('REGULAR')

# Median of the population: the true parameter that the bootstrap is
# trying to estimate from the sample alone.
population_salaries = population.column('REGULAR')
true_parameter = percentile(50, population_salaries)
true_parameter

# Compare the true parameter to our bootstrap estimate by redrawing the
# histogram of bootstrap medians and marking both values on the x-axis.
results.hist()
# Plot the median of our original sample in red
plots.scatter(sample_median, 0, color='red', s=100, zorder=10, clip_on=False)
# Plot the true population parameter in green
plots.scatter(true_parameter, 0, color='lightgreen', marker='s', s=100, zorder=10, clip_on=False)
plots.title('Sample Median (Red Circle) and\nPop. Median (Green Square)');

Sensitivity to Sample Size and Number of Samples¶

Here’s a way to visualize how the bootstrap distribution converges to the same distribution as the one for samples from the population.

# random_seed lets us change the random numbers used to pick samples.
# Basically, changing the seed lets us generate different samples.
# random_seed 0 uses boston_sample from above.  All others create a new initial sample.
def visualize_bootstrap(random_seed, sample_size, num_samples):
    """Plot bootstrap medians against medians of fresh population samples.

    Draws two overlaid histograms of the median of 'REGULAR':
    "Realworld" medians come from new samples of the population, and
    "Bootstrap" medians come from resamples of a single initial sample.
    The population median and the initial sample's median are marked on
    the plot.

    Args:
        random_seed: Seed passed to np.random.seed. A seed of 0 reuses
            boston_sample as the initial sample (sample_size is then not
            used to pick it); any other seed draws a new initial sample
            from the population.
        sample_size: Number of rows in each population sample.
        num_samples: Number of medians to simulate for each histogram.
    """
    np.random.seed(random_seed)
    if random_seed == 0:
        first_sample = boston_sample
    else:
        # NOTE(review): drawing without replacement presumably fails when
        # sample_size exceeds population.num_rows (the widget offers 2000);
        # confirm against the datascience Table.sample documentation.
        first_sample = population.sample(sample_size, with_replacement=False)

    # Use the same median definition (percentile 50) as every simulated
    # statistic below. np.median averages the two middle values of an
    # even-sized sample, so it can mark a value that is not the statistic
    # actually being bootstrapped.
    first_sample_median = percentile(50, first_sample.column('REGULAR'))

    medians = Table(["Type", "Median"])

    # "Realworld": medians of fresh samples taken from the population.
    for _ in np.arange(num_samples):
        population_sample = population.sample(sample_size)
        realworld_median = percentile(50, population_sample.column('REGULAR'))
        medians.append(["Realworld", realworld_median])

    # "Bootstrap": medians of resamples of the one initial sample.
    for _ in np.arange(num_samples):
        bootstrap_sample = first_sample.sample()
        bootstrap_median = percentile(50, bootstrap_sample.column('REGULAR'))
        medians.append(["Bootstrap", bootstrap_median])

    median_bins = np.arange(75000, 95000, 1000)
    medians.hist(group="Type", bins=median_bins)

    # Plotting parameters; you can ignore this code
    plots.scatter(true_parameter, 0.000005, color='lightgreen', marker="s", s=100, zorder=12, clip_on=False)
    plots.scatter(first_sample_median, 0.000005, color='red', s=100, zorder=12, clip_on=False)
    plots.title('Bootstrap/Real World Medians\nSample Median (Red Dot); Pop Median (Green Square)\nsample size = ' + str(sample_size) + '; num samples = ' + str(num_samples));
    
# Hook visualize_bootstrap up to interactive controls, one per parameter.
# The return value is bound to _ and not used again in this cell.
# NOTE(review): presumably the (0, 100, 1) tuple becomes a slider and the
# arrays become fixed-choice selectors; confirm against the ipywidgets docs.
_ = widgets.interact(visualize_bootstrap, 
                     random_seed=(0,100,1),
                     sample_size=make_array(25, 50,100,200,500, 1000,2000), 
                     num_samples=make_array(20,200,2000))

CSCI 104: Understanding Data Through Computation

Bootstrapping¶

1. Median and percentiles¶

2. Bootstrapping¶

3. “Oracle” Evaluation¶

Sensitivity to Sample Size and Number of Samples¶