Inference with Statistics

# Some code to set up our notebook for data science!

from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=np.VisibleDeprecationWarning)

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

1. Snoopy’s Fleet

We will create a simulation for the empirical distribution of our chosen statistic.

At true inference time, we do not know N.

However, in our simulation we can make an assumption about what N might be so we can evaluate how “good” our chosen models and statistics could be.

# Our population.  Only Snoopy knows the population.  We don't.  All we can do
# is call `sample_plane_fleet` to observe planes flying overhead.

N = 300
population = np.arange(1, N+1)

def sample_plane_fleet(sample_size):
    """
    Return a random sample of planes from Snoopy's fleet.
    """
    return np.random.choice(population, sample_size)
sample = sample_plane_fleet(10)
sample
array([239,   3, 112, 181, 105, 165, 116, 271, 293,  15])
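
One detail worth noting: np.random.choice samples with replacement by default, so the same plane number can show up more than once in a sample. If we wanted to model never observing the same plane twice, a small variation (a sketch, not part of the original setup) is to pass replace=False:

def sample_plane_fleet_no_repeats(sample_size):
    """
    Like sample_plane_fleet, but each plane can be observed at most once.
    """
    return np.random.choice(population, sample_size, replace=False)

sample_plane_fleet_no_repeats(10)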

Option 1: Sample Statistic is the Max Plane Number

# The outcome we care about is our chosen statistic evaluated on the sample 
# Chosen statistic option 1: max of sample 
outcome = max(sample)
outcome
293

Let’s put this all together using our simulation algorithm.

def simulate_plane_fleet(sample_size, num_trials): 
    all_outcomes = make_array()
    for i in np.arange(0, num_trials):
        sample = sample_plane_fleet(sample_size)
        outcome = max(sample)
        all_outcomes = np.append(all_outcomes, outcome)
    return all_outcomes
all_outcomes = simulate_plane_fleet(10, 1000)  # sample size is 10, num trials is 1000

results = Table().with_columns("Statistic Option 1: Max Sample", all_outcomes)
results.hist(bins=np.arange(1, 2 * N, 10))
[Figure: histogram of the empirical distribution of the max-of-sample statistic]
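
Notice that the max of a sample can never exceed the true N, since every plane number in the sample is at most N. As a quick numeric companion to the histogram (a sketch reusing the all_outcomes array from the cell above, not part of the original notebook), we can look at its center and spread relative to N = 300:

# Mean and middle 90% of the simulated max statistic
np.mean(all_outcomes), np.percentile(all_outcomes, 5), np.percentile(all_outcomes, 95)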


Option 2: Sample Statistic is Twice the Mean

Let’s generalize a bit and create a function that takes the statistic as an argument and runs the whole experiment.

def planes_empirical_statistic_distribution(sample_size, num_trials, statistic_function):
    """
    Simulates multiple trials of a statistic on our simulated fleet of N planes 
    
    Inputs 
        - sample_size: number of planes we see in our sample
        - num_trials: number of simulation trials 
        - statistic_function: a function that takes an array 
            and returns a summary statistic (e.g. max)
        
    Output 
        Histogram of the results 
    """
    all_outcomes = make_array()
    for i in np.arange(0, num_trials):
        sample = sample_plane_fleet(sample_size)
        outcome = statistic_function(sample)
        all_outcomes = np.append(all_outcomes, outcome)
        
    results = Table().with_columns('Empirical Distribution, Statistic: ' + statistic_function.__name__, 
                                   all_outcomes)
    results.hist(bins=np.arange(1, 2 * N, 10))
# sample size is 10, num trials is 1000, statistic is max
planes_empirical_statistic_distribution(10, 1000, max)
[Figure: histogram of the empirical distribution of the max statistic]

Our second option for the statistic:

def twice_mean(sample):
    """Twice the sample mean"""
    return 2 * np.mean(sample)

# sample size is 10, num trials is 1000, statistic is twice mean
planes_empirical_statistic_distribution(10, 1000, twice_mean)    
[Figure: histogram of the empirical distribution of the twice-the-mean statistic]
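
Which of the two statistics tracks N more closely? One rough way to compare them (a sketch that is not part of the original lecture; it reuses sample_plane_fleet, N, and twice_mean from above) is to measure how far each statistic typically lands from the true N = 300:

def average_absolute_error(statistic_function, sample_size=10, num_trials=1000):
    """Average of |statistic(sample) - N| over many simulated samples."""
    errors = make_array()
    for i in np.arange(num_trials):
        sample = sample_plane_fleet(sample_size)
        errors = np.append(errors, abs(statistic_function(sample) - N))
    return np.mean(errors)

# Smaller is better; both numbers will vary a little from run to run.
average_absolute_error(max), average_absolute_error(twice_mean)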

Effects of Sample Size and Simulation Rounds

def visualize_distributions(N, sample_size, num_trials):
    """A single function to run our simulation for a given N, sample_size, and num_trials."""
    population = np.arange(1, N+1)
    
    # Builds up our outcomes table one row at a time.  We do this to ensure
    # we can apply both statistics to the same samples.
    outcomes_table = Table(["Max", "2*Mean"])
    
    for i in np.arange(num_trials):
        sample = np.random.choice(population, sample_size)
        outcomes_table.append(make_array(max(sample), 2 * np.mean(sample)))
        
    outcomes_table.hist(bins=np.arange(1, 2 * N, 10))

_ = widgets.interact(visualize_distributions, N=(100, 500, 10), 
                     sample_size=(1, 100, 1), 
                     num_trials=(10, 5000, 100))
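
If the interactive widget is unavailable in your environment, the same function can also be called directly with fixed values (these particular numbers are chosen just for illustration):

visualize_distributions(N=300, sample_size=10, num_trials=1000)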

3. Mendel and Pea Flowers

# Mendel owned 929 plants, of which 709 had purple flowers
observed_sample_size = 929
observed_purples = 709 / observed_sample_size
observed_purples
0.7631862217438106
# 3:1 ratio of purple:white, or 75% purple, 25% white
hypothesized_proportions = make_array(0.75, 0.25)

In the Python library reference, we see that we can use the function sample_proportions(sample_size, model_proportions).

sample = sample_proportions(observed_sample_size, hypothesized_proportions)
sample
array([0.74058127, 0.25941873])

Each item in the returned array is the proportion of the sample_size draws that landed in the corresponding category. So the percent purple in the sample is:

percent_purple = 100 * sample.item(0)
percent_purple
74.05812701829925
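
We can think of sample_proportions as drawing sample_size items at random according to the model proportions and reporting the fraction that fell into each category. A rough numpy analogue (a sketch only, not necessarily how the library implements it) is:

# Draw 929 flowers where each is purple with chance 0.75, then convert counts to proportions
counts = np.random.multinomial(observed_sample_size, hypothesized_proportions)
counts / observed_sample_size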

Let’s use the same simulation algorithm we keep returning to.

def simulate_purple_flowers(observed_sample_size, hypothesized_proportions, num_trials):
    
    all_outcomes = make_array()
    
    for i in np.arange(0, num_trials):
        
        simulated_sample = sample_proportions(observed_sample_size, hypothesized_proportions)
        # outcome: we only want the percent purple; we can drop the percent white
        outcome = 100 * simulated_sample.item(0)
        
        all_outcomes = np.append(all_outcomes, outcome)
        
    return all_outcomes
all_outcomes =  simulate_purple_flowers(observed_sample_size, hypothesized_proportions, 1000)
results = Table().with_column('% of purple flowers', all_outcomes)
results.hist()
plots.title('Simulated Outcomes\n observed sample size=' + str(observed_sample_size));
[Figure: histogram of simulated percentages of purple flowers]

Connecting these pieces together:

  • In Mendel’s model, he hypothesized that getting a purple flower was like flipping a biased coin and getting heads 75 percent of the time (probability 0.75).

  • We simulated outcomes under this hypothesis.

  • Now let’s check whether the observed data (76.3% purple flowers in one sample, Mendel’s own garden) “fits” with the simulated outcomes under the model.

results.hist()
_ = plots.scatter(100 * observed_purples, 0.005, color='red', s=70, zorder=3)
plots.title('Simulated Outcomes\n observed sample size=' + str(observed_sample_size));
[Figure: simulated outcomes histogram with Mendel’s observed value (76.3%) marked in red]
differences_from_model = Table().with_column('abs(sample statistic - model parameter)', 
                                             abs(all_outcomes - 75))

differences_from_model.hist()
_ = plots.scatter(abs(100 * observed_purples - 75), 0.005, color='red', s=70, zorder=3);
plots.title('Differences from Model \n Parameter (75% Purple)');
[Figure: histogram of distances from the model parameter (75% purple), with the observed distance marked in red]
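
We can also put a number on the same comparison (a sketch, not part of the original notebook): out of the 1000 simulated samples, what fraction produced a percent purple at least as far from 75% as Mendel’s observed 76.3%?

# Fraction of simulated outcomes at least as far from the model parameter as the observed value
observed_distance = abs(100 * observed_purples - 75)
np.count_nonzero(abs(all_outcomes - 75) >= observed_distance) / len(all_outcomes)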

Once again, let’s use computation to separate the variables that we have the power to change from the parts of the code that never change.

def pea_plants_simulation(observed_sample_size, observed_purples_count, 
                          hypothesized_purple_proportion, num_trials): 
    """
    Parameters:
        - observed_sample_size: number of plants in our experiment
        - observed_purples_count: count of plants in our experiment w/ purple flowers
        - hypothesized_purple_proportion: our model parameter (hypothesized 
              proportion of plants w/ purple flowers).
        - num_trials: number of simulation rounds to run
    Outputs two plots:
        - Empirical distribution of the percent of plants w/ purple flowers 
          from our simulation trials
        - Empirical distribution of how far off each trial was from our hypothesized model
    """
    
    observed_purples = observed_purples_count / observed_sample_size
    hypothesized_proportions = make_array(hypothesized_purple_proportion, 
                                         1 - hypothesized_purple_proportion)
    
    all_outcomes = make_array()
    for i in np.arange(0, num_trials):
        simulated_sample = sample_proportions(observed_sample_size, hypothesized_proportions)
        outcome = 100 * simulated_sample.item(0)
        all_outcomes = np.append(all_outcomes, outcome)
    
    #plots
    percent_purple = Table().with_column('% of purple flowers', all_outcomes)
    percent_purple.hist()
    _ = plots.scatter(100 * observed_purples, 0.005, color='red', s=70, zorder=10)
    plots.title('Simulated Outcomes\n observed sample size=' + str(observed_sample_size));
    
    distance_from_model = Table().with_column('abs(sample statistic - model parameter)', 
                                              abs(all_outcomes - 100 * hypothesized_purple_proportion))
    distance_from_model.hist()
    _ = plots.scatter(100 * abs(observed_purples - hypothesized_purple_proportion), 
                      0.005, color='red', s=70, zorder=10)
    plots.title('Differences from Model \n Parameter (' + str(100 * hypothesized_purple_proportion) + '% Purple)');
pea_plants_simulation(929, 705, 0.75, 1000)
[Figures: simulated outcomes and distances from the model parameter for 705 purple flowers out of 929]
pea_plants_simulation(929, 506, 0.75, 1000)
[Figures: simulated outcomes and distances from the model parameter for 506 purple flowers out of 929]

As an interactive visualization:

_ = widgets.interact(pea_plants_simulation,  
                     observed_sample_size = fixed(929), 
                     observed_purples_count = (0,929), 
                     hypothesized_purple_proportion = (0,1,0.01),
                     num_trials=(10, 5000, 100))