Interpreting Confidence#

from datascience import *
from cs104 import *
import numpy as np
%matplotlib inline

1. Pea plants#

Population: all 2nd generation plants

Sample: Mendel’s garden: 929 plants, 709 of which had purple flowers

Statistic: Percent Purple

Load Data#

mendel_garden = Table().read_table('data/mendel_garden_sample.csv')
mendel_garden.show(4)

Plant Number	Color
0	Purple
1	Purple
2	White
3	White

... (925 rows omitted)

mendel_garden.num_rows

color_array = mendel_garden.column("Color")

Our statistic is the percent purple.

def percent_purple(color):
    """
    Return the percent (from 0 to 100) of the entries in color that are
    equal to "Purple".

    * color: the flower colors of a sample, as an array. Any sequence of
             strings (for example a plain list) is accepted as well.
    """
    # Converting first makes the comparison below elementwise for plain
    # lists too, not only for arrays.
    colors = np.asarray(color)
    # Count the matches in a single vectorized pass instead of adding up
    # the booleans one at a time in Python.
    proportion = np.count_nonzero(colors == "Purple") / len(colors)
    return proportion * 100

observed_stat = percent_purple(color_array)
observed_stat

76.31862217438106

Bootstrapping#

Now we’re ready for our bootstrap_statistic function from our inference library.

results = bootstrap_statistic(color_array, percent_purple, 1000)

plot = Table().with_columns("Bootstrap Samples Percent Purple", results).hist("Bootstrap Samples Percent Purple")
plot.dot(observed_stat)

../_images/25-interpreting-confidence_14_0.png

2. Confidence Intervals#

Percentiles#

tiny_purple_stat = make_array(78, 70, 88, 82)
tiny_purple_stat

array([78, 70, 88, 82])

percentile(50, tiny_purple_stat)

percentile(75, tiny_purple_stat)

Confidence Intervals for Pea Plants#

ci_percent = 95

percent_in_each_tail = (100 - ci_percent) / 2
percent_in_each_tail

2.5

left_end = percentile(percent_in_each_tail, results)
left_end

73.41227125941873

right_end = percentile(100 - percent_in_each_tail, results)
right_end

79.00968783638321

def confidence_interval(ci_percent, statistics):
    """
    Compute the ci_percent confidence interval from an array of statistics
    using percentiles. The result is an array: [lower bound, upper bound].
    """
    # The interval is the middle ci_percent of the statistics, so the
    # remaining percent is split evenly between the left and right tails.
    tail_percent = (100 - ci_percent) / 2
    lower_bound, upper_bound = (
        percentile(cutoff, statistics)
        for cutoff in (tail_percent, 100 - tail_percent)
    )
    return make_array(lower_bound, upper_bound)

ci_95 = confidence_interval(95, results)
ci_95

array([73.41227126, 79.00968784])

confidence_interval(90, results)

array([74.05812702, 78.57911733])

confidence_interval(99, results)

array([72.55113025, 79.6555436 ])

plot = Table().with_columns("Bootstrap Samples Percent Purple", results).hist("Bootstrap Samples Percent Purple")
plot.interval(ci_95)
plot.dot(observed_stat)

../_images/25-interpreting-confidence_29_0.png

from datascience import *
from cs104 import *
import numpy as np
%matplotlib inline

1. Estimate percent purple in all 2nd generation plants#

Population: all 2nd generation plants

Sample: Mendel’s garden: 929 plants, 709 of which had purple flowers

Statistic: Percent Purple

# Table with Mendel's sample
mendel_garden = Table().read_table('data/mendel_garden_sample.csv')

mendel_garden

Plant Number	Color
0	Purple
1	Purple
2	White
3	White
4	Purple
5	Purple
6	Purple
7	Purple
8	Purple
9	Purple

... (919 rows omitted)

mendel_garden.num_rows

#Statistic: percent purple flowers 
def percent_purple(color):
    """
    Statistic: the percent (from 0 to 100) of flowers in the sample whose
    color is "Purple".

    * color: the flower colors of a sample, as an array. Any sequence of
             strings (for example a plain list) is accepted as well.
    """
    # np.asarray makes the comparison elementwise even for a plain list.
    colors = np.asarray(color)
    # One vectorized count is much cheaper than summing booleans one by
    # one, which matters when this runs once per bootstrap resample.
    num_purple = np.count_nonzero(colors == "Purple")
    return (num_purple / len(colors)) * 100

observed_statistic = percent_purple(mendel_garden.column('Color'))
observed_statistic

76.31862217438106

Review bootstrapping#

def bootstrap_statistic(observed_sample, compute_statistic, num_trials):
    """
    Creates num_trials resamples of the initial sample.
    Returns an array of the provided statistic for those samples.

    * observed_sample: the initial sample, as an array.

    * compute_statistic: a function that takes a sample as
                         an array and returns the statistic (a number)
                         for that sample.

    * num_trials: the number of bootstrap samples to create (an integer).

    """
    sample_size = len(observed_sample)

    # Gather the statistics in a list and build the array once at the end:
    # np.append copies the entire array on every call, which makes a loop
    # of appends quadratic in num_trials.
    statistics = []
    for _ in range(num_trials):
        # Key: in bootstrapping we must always sample with replacement
        # (the default for np.random.choice), drawing as many items as
        # there are in the observed sample.
        simulated_resample = np.random.choice(observed_sample, sample_size)
        statistics.append(compute_statistic(simulated_resample))

    # dtype=float gives a float array in every case, including the empty
    # array returned when num_trials is 0.
    return np.array(statistics, dtype=float)

bootstrap_statistics = bootstrap_statistic(mendel_garden.column('Color'), percent_purple, 10000)

# Helper function to plot our Mendel data
def mendel_plot(title, observed_statistic, bootstrap_statistics):
    """
    Draw a histogram of the bootstrap statistics for Mendel's garden with
    the given title and fixed axes, and hand the observed statistic to
    plot.dot so it is shown on the same plot.
    """
    # The x-range is fixed and shared by the histogram bins and the axis
    # limits, so the bins (0.5 wide) line up with the visible axis.
    x_low, x_high = 68, 82
    column_label = 'Bootstrap Samples Percent Purple'

    bootstrap_table = Table().with_column(column_label, bootstrap_statistics)
    histogram = bootstrap_table.hist(bins=np.arange(x_low, x_high, 0.5))
    histogram.dot(observed_statistic)
    histogram.set_title(title)
    histogram.set_xlim(x_low, x_high)
    histogram.set_ylim(0, 0.35)

# Put in Table and analyze results 
mendel_plot("Mendel's garden A", observed_statistic, bootstrap_statistics)

../_images/25-interpreting-confidence_56_0.png

2. Bootstrap Percentile Method for Confidence Interval#

The interval of estimates is the “middle 95%” of the bootstrap estimates.

# Get the endpoints of the 95% confidence interval
left = percentile(2.5, bootstrap_statistics)
right = percentile(97.5, bootstrap_statistics)

make_array(left, right)

array([73.51991389, 79.00968784])

def confidence_interval(ci_percent, bootstrap_statistics):
    """
    Bootstrap percentile method: return an array holding the lower and
    upper bound of the ci_percent confidence interval, taken as the
    middle ci_percent of the bootstrap statistics.
    """
    # Whatever is not inside the interval is shared equally by the two tails.
    lower_percent = (100 - ci_percent) / 2
    upper_percent = 100 - lower_percent
    return make_array(
        percentile(lower_percent, bootstrap_statistics),
        percentile(upper_percent, bootstrap_statistics),
    )

# Helper function to plot our Mendel data, now with 95% ci
def mendel_plot(title, observed_statistic, bootstrap_statistics, ci_percent):
    """
    Draw a histogram of the bootstrap statistics for Mendel's garden with
    the given title and fixed axes, together with the ci_percent
    confidence interval computed from those statistics and the observed
    statistic.
    """
    # The same fixed x-range is used for the histogram bins (0.5 wide)
    # and for the axis limits.
    x_low, x_high = 68, 82

    bootstrap_table = Table().with_column(
        'Bootstrap Samples Percent Purple', bootstrap_statistics
    )
    histogram = bootstrap_table.hist(bins=np.arange(x_low, x_high, 0.5))

    ci_bounds = confidence_interval(ci_percent, bootstrap_statistics)
    histogram.interval(ci_bounds)

    histogram.dot(observed_statistic)
    # NOTE(review): 75 is hard-coded here; presumably it is the 75% purple
    # expected for the whole population -- confirm.
    histogram.square(75)
    histogram.set_title(title)
    histogram.set_xlim(x_low, x_high)
    histogram.set_ylim(0, 0.35)

mendel_plot("Mendel's garden A", observed_statistic, bootstrap_statistics, 95)

../_images/25-interpreting-confidence_61_0.png

A visualization of confidence interval sizes for different confidence levels.

def visualize_ci(ci_percent):
    """
    Plot the bootstrap statistics for "Mendel's garden A" together with
    the ci_percent confidence interval. Only ci_percent varies; the
    observed statistic and the bootstrap statistics are the notebook's
    existing global values.
    """
    mendel_plot(
        "Mendel's garden A",
        observed_statistic,
        bootstrap_statistics,
        ci_percent,
    )
    
interact(visualize_ci, ci_percent=Slider(50,100,1))

Confidence intervals will be different for different samples and different runs of the bootstrap. How different?

The following cell contains an interactive visualization. You won’t see the visualization on this web page, but you can view and interact with it if you run this notebook on our server here.

def visualize_ci_with_different_samples(random_seed, ci_percent):
    """
    Draw a new sample of colors from mendel_garden, bootstrap it with 500
    trials, and plot the result with the ci_percent confidence interval.

    * random_seed: the seed given to np.random.seed before any sampling.

    * ci_percent: the confidence level of the interval to draw.
    """
    # Seed first so that a given random_seed reproduces the same plot
    # (assuming the sampling below draws from np.random).
    np.random.seed(random_seed)
    resampled_colors = mendel_garden.sample().column('Color')
    trial_statistics = bootstrap_statistic(resampled_colors, percent_purple, 500)
    resample_percent = percent_purple(resampled_colors)
    mendel_plot("Mendel's garden", resample_percent, trial_statistics, ci_percent)
    
interact(visualize_ci_with_different_samples, 
         random_seed=Slider(0,100),
         ci_percent=Slider(50,100,1))

Evaluating confidence intervals#

CI with Bootstrap percentiles procedure:

Take a random sample of the population
Take bootstrap resamples of the sample
Construct 95% CI via percentile method on the bootstrap resamples

What does a confidence interval actually mean? If we repeat this whole procedure 100 times, each time starting from a new random sample, we would expect about 95 of the 100 resulting confidence intervals to contain the true parameter.

We can evaluate this via an “oracle”–looking at the true parameter. But note, we would not see this parameter in the real world.

# A population with 10k plants and 75% purple as true parameter
population = Table().read_table('data/mendel_population.csv')

population.num_rows

true_parameter = percent_purple(population.column('Color'))
true_parameter

75.0

# Settings for the oracle evaluation
num_eval_repeats = 10
sample_size = 929 #size of Mendel's garden  
num_bootstrap_trials = 1000

# Number of repeats whose confidence interval contains the true parameter
count_contains_true_param = 0 

import sys

plot = Plot()

for i in np.arange(0, num_eval_repeats):
    # One repeat of the whole procedure: new random sample from the
    # population, bootstrap that sample, then build the 95% CI.
    one_random_sample = population.sample(sample_size, with_replacement=False).column('Color')
    bootstrap_statistics = bootstrap_statistic(one_random_sample, percent_purple, num_bootstrap_trials)
    left, right = confidence_interval(95,bootstrap_statistics)
        
    # Intervals that contain the true parameter are counted and drawn in
    # a different color from the ones that miss it.
    if left <= true_parameter <= right: 
        count_contains_true_param += 1 
        plot_color = 'C0'
    else: 
        plot_color = 'C3'
    
    # Draw this repeat's interval as a horizontal segment at height i
    plot.line([left, right], [i, i], color=plot_color, lw=1)
    
# Reference line at the true parameter itself, so it stays correct if the
# population (and therefore true_parameter) changes.
plot.line(x=true_parameter, lw=1, linestyle="dashed", color="black")
plot.set_xlabel('Confidence Interval for Percent Purple in Sample')
plot.set_ylabel('Evaluation repeat number')
plot.set_title('Oracle evaluation')
plot.set_xlim(68,82)
# One row per repeat: tie the y-range to the number of repeats actually run
plot.set_ylim(-1, num_eval_repeats)

../_images/25-interpreting-confidence_72_0.png

count_contains_true_param

CSCI 104: Data Science and Computing for All

Interpreting Confidence

Contents