Let's dig into areas where the Central Limit Theorem and the Law of Large Numbers break down.
import pandas as pd
import numpy as np
from scipy.stats import bernoulli, binom, norm, ks_2samp
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc, animation
from IPython.core.display import display, HTML
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from _plotly_future_ import v4_subplots
from plotly import tools
import plotly
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.figure_factory as ff
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(style="white", palette="husl")
sns.set_context("talk")
sns.set_style("ticks")
The Central Limit Theorem tells us that as the sample size tends to infinity, the distribution of sample means approaches the normal distribution. This is a statement about the SHAPE of the distribution: a normal distribution is bell shaped, so the distribution of sample means begins to look bell shaped as the sample size increases. https://ocw.mit.edu/courses/mathematics/18-05-introduction-to-probability-and-statistics-spring-2014/readings/MIT18_05S14_Reading6b.pdf. Importantly, the original variable does not need to be normally distributed; in the basketball example below it is a yes/no success outcome, i.e. each shot follows a Bernoulli distribution. https://www.youtube.com/watch?v=Pujol1yC1_A.
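As a quick sanity check (a minimal sketch, assuming the 72.3% success rate introduced below), we can sample individual shots with scipy's bernoulli and confirm the raw variable is anything but bell shaped; it takes only the values 0 and 1:
single_shots = bernoulli.rvs(p=0.723, size=10_000)
print(np.bincount(single_shots) / single_shots.size)  # roughly [0.277, 0.723]: two point masses, no bell curve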
The Law of Large Numbers tells us where the center (maximum point) of the bell is located: as the sample size approaches infinity, the center of the distribution of sample means becomes very close to the population mean. Very informally, it states that when you repeat an experiment many times, add up the outcomes, and take the average, the result looks like the mean of the distribution (link to: https://www.nathanieldake.com/Mathematics/04-Statistics-01-Introduction.html). In other words, as the number of trials increases, the average result approaches the average of the distribution.
In probability theory, the law of large numbers (LLN) is a theorem that describes the result of performing the same experiment a large number of times. According to the law, the average of the results obtained from a large number of trials should be close to the expected value, and will tend to become closer to the expected value as more trials are performed.
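Stated formally (for independent, identically distributed $X_i$ with finite mean $\mu$), the weak law says the sample mean converges in probability to $\mu$:
$$
\bar{X}_n = \frac{1}{n}\sum_{i=1}^{n} X_i \;\xrightarrow{P}\; \mu \quad \text{as } n \to \infty.
$$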
Let's look at an example where it applies (showing both trials vs. value, as well as histograms for a few different numbers of trials). Consider the following toy example: there is a basketball player who has a 72.3% free throw success rate. If he takes 50 free throws, how many do we expect him to make? The guess that maximizes the likelihood is 50 × 0.723 = 36.15. Now, if we consider a single trial to be our player taking 50 free throws, what happens to the average of our outcomes as the number of trials increases?
Keep in mind that basketball shots are not normally distributed; each shot here is drawn from a Bernoulli distribution.
def single_trial(shots_per_trial=50):
"""Returns the number of successful shots made in `shots_per_trial` attempts."""
success_rate = 0.723
outcome_prob = np.random.uniform(size=shots_per_trial)
outcome_value = outcome_prob < success_rate
return outcome_value.sum()
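As an aside (a sketch, not used below; the function name is hypothetical), the number of made shots in a trial is simply one draw from a binomial distribution, so the same trial could be simulated with numpy's binomial sampler:
def single_trial_binomial(shots_per_trial=50):
    """Equivalent draw: the count of successes in n Bernoulli(p) trials is Binomial(n, p)."""
    return np.random.binomial(n=shots_per_trial, p=0.723)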
def run_trials(num_trials=10_000, shots_per_trial=50):
"""Performs `num_trials` of single_trial() and returns an array of the outcomes."""
trial_outcomes = []
for i in range(num_trials):
baskets_made_in_single_trial = single_trial(shots_per_trial=shots_per_trial)
trial_outcomes.append(baskets_made_in_single_trial)
return np.asarray(trial_outcomes)
def calculate_cumulative_means(trial_outcomes):
    """Calculates cumulative means based on an array of trial outcomes."""
    cumulative_means = []
    for i in range(1, len(trial_outcomes) + 1):  # go up to len(trial_outcomes) so the final trial is included
        subset_mean = trial_outcomes[:i].mean()
        cumulative_means.append(subset_mean)
    return cumulative_means
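As an aside, the loop above recomputes each subset mean from scratch (O(n²) overall). The same cumulative means can be computed in one vectorized pass, which is exactly what the arrow experiment later in this section does; the helper name here is just for illustration:
def calculate_cumulative_means_vectorized(trial_outcomes):
    """Vectorized equivalent: running sums divided by running counts."""
    return np.cumsum(trial_outcomes) / np.arange(1, len(trial_outcomes) + 1)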
def run_experiment(num_trials=10_000, shots_per_trial=50):
"""Orchestrates experiment by running trials and calculating cumulative means."""
trial_outcomes = run_trials(num_trials=num_trials, shots_per_trial=shots_per_trial)
trials_index = np.arange(0, trial_outcomes.shape[0], 1)
cumulative_means = calculate_cumulative_means(trial_outcomes)
return trial_outcomes, trials_index, cumulative_means
trial_outcomes, trials_index, cumulative_means = run_experiment(num_trials=10_000, shots_per_trial=50)
First let's look at a plot that shows the cumulative average. Notice that it approaches the expected value, which is also the maximum likelihood estimate (derived just below). This is the Law of Large Numbers:
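For completeness, here is the standard derivation showing that the observed success frequency maximizes the Bernoulli likelihood. With $n$ independent shots $x_1, \dots, x_n \in \{0, 1\}$ and $k = \sum_i x_i$ successes, the likelihood and log-likelihood are
$$
L(p) = \prod_{i=1}^{n} p^{x_i}(1-p)^{1-x_i} = p^{k}(1-p)^{n-k}, \qquad \log L(p) = k \log p + (n-k)\log(1-p).
$$
Setting the derivative to zero,
$$
\frac{d}{dp}\log L(p) = \frac{k}{p} - \frac{n-k}{1-p} = 0 \implies \hat{p} = \frac{k}{n},
$$
so with $p = 0.723$ the likelihood-maximizing guess for 50 shots is $50 \times 0.723 = 36.15$.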
trace1 = go.Scatter(
x=trials_index,
y=cumulative_means,
marker = dict(color='crimson'),
fillcolor='rgba(0, 0, 255, 0.2)',
name="Sample Cumulative Average"
)
trace2 = go.Scatter(
x=trials_index,
y=np.ones_like(trials_index)*36.15,
marker = dict(color='blue'),
fillcolor='rgba(0, 0, 255, 0.2)',
name="Theoretical Expected Value"
)
data = [trace1, trace2]
layout = go.Layout(
width=800,
height=400,
title="Cumulative average # shots made in trials vs. number of trials",
xaxis=dict(title="Trials"),
yaxis=dict(title="Cumulative average # shots made"),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)'
)
fig = go.Figure(data=data, layout=layout)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
fig.update_layout(layout)
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
Now, this also relates to the Central Limit Theorem, which states that as our sample size increases, the distribution of our per-trial totals (each a sum of 50 Bernoulli draws) will approach the normal distribution:
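Stated formally (for i.i.d. variables with finite mean $\mu$ and finite variance $\sigma^2$):
$$
\frac{\bar{X}_n - \mu}{\sigma / \sqrt{n}} \;\xrightarrow{d}\; \mathcal{N}(0, 1) \quad \text{as } n \to \infty.
$$
In our case each trial total is a sum of 50 Bernoulli draws, so the totals are approximately $\mathcal{N}(np, \, np(1-p)) = \mathcal{N}(36.15, \approx 10.01)$.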
trial_outcomes, trials_index, cumulative_means = run_experiment(num_trials=100_000, shots_per_trial=50)
trace1 = go.Histogram(
x=trial_outcomes[:100],
nbinsx=30,
name="Sample 1",
histnorm='probability density',
marker_color='crimson',
opacity=0.7,
showlegend=False
)
trace2 = go.Histogram(
x=trial_outcomes[:1_000],
nbinsx=30,
name="Sample 2",
histnorm='probability density',
marker_color='crimson',
opacity=0.7,
showlegend=False
)
trace3 = go.Histogram(
x=trial_outcomes[:10_000],
nbinsx=30,
name="Sample 3",
histnorm='probability density',
marker_color='crimson',
opacity=0.7,
showlegend=False
)
trace4 = go.Histogram(
x=trial_outcomes[:100_000],
nbinsx=30,
name="Sample 4",
histnorm='probability density',
marker_color='crimson',
opacity=0.7,
showlegend=False
)
fig = plotly.subplots.make_subplots(
rows=1,
cols=4,
print_grid=False,
subplot_titles=("100 Trials", "1,000 Trials", "10,000 Trials", "100,000 Trials")
)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig.append_trace(trace3, 1, 3)
fig.append_trace(trace4, 1, 4)
fig.update_traces(marker=dict(line=dict(width=1, color='black')))
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
layout = go.Layout(
barmode='overlay',
width=950,
height=300,
xaxis1=dict(title="X"),
yaxis=dict(title="Probabilty Density"),
xaxis2=dict(title='X'),
xaxis3=dict(title="X"),
xaxis4=dict(title="X"),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)'
)
fig.update_layout(layout)
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
We can clearly see that as the number of trials increases, our distribution starts to look more and more like the traditional bell curve. Remember, the Law of Large Numbers stated that as the number of trials increases, the mean of the outcomes should approach the expected value of the distribution (which was demonstrated in the earlier plot).
We can make this more concrete by showing that as the number of trials increases we do indeed approach the normal distribution. Below is an overlay of a normal distribution (fit to the data) against our histograms:
def overlay_normal_dist(x_data, hist_trace):
    """Fits a normal distribution to `x_data` and returns an (x_axis, pdf) pair spanning the histogram's data range."""
mu, std = norm.fit(x_data)
x_min = hist_trace.x.min()
x_max = hist_trace.x.max()
x_axis = np.linspace(x_min, x_max, 100)
norm_pdf = norm.pdf(x_axis, mu, std)
return x_axis, norm_pdf
def experiment_sample_size(data=None, sample_size=10):
    """Plots histograms of trial outcomes with fitted normal overlays; runs a fresh experiment unless `data` is provided."""
    if data:
trial_outcomes = data['trial_outcomes']
trials_index = data['trials_index']
cumulative_means = data['cumulative_means']
else:
trial_outcomes, trials_index, cumulative_means = run_experiment(num_trials=100_000, shots_per_trial=sample_size)
trace1a = go.Histogram(
x=trial_outcomes[:100],
nbinsx=30,
name="Sample 1",
histnorm='probability density',
marker_color='crimson',
opacity=0.7,
showlegend=False
)
x_axis, norm_pdf = overlay_normal_dist(trial_outcomes[:100], trace1a)
trace1b = go.Scatter(
x=x_axis,
y=norm_pdf,
marker = dict(color='blue'),
fillcolor='rgba(0, 0, 255, 0.2)',
name="Normal Distribution (fit to data)",
showlegend=False
)
trace2a = go.Histogram(
x=trial_outcomes[:1_000],
nbinsx=30,
name="Sample 2",
histnorm='probability density',
marker_color='crimson',
opacity=0.7,
showlegend=False
)
x_axis, norm_pdf = overlay_normal_dist(trial_outcomes[:1_000], trace2a)
trace2b = go.Scatter(
x=x_axis,
y=norm_pdf,
marker = dict(color='blue'),
fillcolor='rgba(0, 0, 255, 0.2)',
name="Normal Distribution \n (fit to data)",
showlegend=False
)
trace3a = go.Histogram(
x=trial_outcomes[:10_000],
nbinsx=30,
name="Sample 3",
histnorm='probability density',
marker_color='crimson',
opacity=0.7,
showlegend=False
)
x_axis, norm_pdf = overlay_normal_dist(trial_outcomes[:10_000], trace3a)
trace3b = go.Scatter(
x=x_axis,
y=norm_pdf,
marker = dict(color='blue'),
fillcolor='rgba(0, 0, 255, 0.2)',
name="Normal Distribution (fit to data)",
showlegend=False
)
trace4a = go.Histogram(
x=trial_outcomes[:100_000],
nbinsx=30,
name="Sample 4",
histnorm='probability density',
marker_color='crimson',
opacity=0.7,
showlegend=False
)
x_axis, norm_pdf = overlay_normal_dist(trial_outcomes[:100_000], trace4a)
trace4b = go.Scatter(
x=x_axis,
y=norm_pdf,
marker = dict(color='blue'),
fillcolor='rgba(0, 0, 255, 0.2)',
name="Normal Distribution (fit to data)",
showlegend=False
)
fig = plotly.subplots.make_subplots(
rows=1,
cols=4,
print_grid=False,
subplot_titles=("100 Trials", "1,000 Trials", "10,000 Trials", "100,000 Trials")
)
fig.append_trace(trace1a, 1, 1)
fig.append_trace(trace1b, 1, 1)
fig.append_trace(trace2a, 1, 2)
fig.append_trace(trace2b, 1, 2)
fig.append_trace(trace3a, 1, 3)
fig.append_trace(trace3b, 1, 3)
fig.append_trace(trace4a, 1, 4)
fig.append_trace(trace4b, 1, 4)
fig.update_traces(marker=dict(line=dict(width=1, color='black')))
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
layout = go.Layout(
barmode='overlay',
width=950,
height=300,
xaxis1=dict(title="X"),
yaxis=dict(title="Probabilty Density"),
xaxis2=dict(title='X'),
xaxis3=dict(title="X"),
xaxis4=dict(title="X"),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)'
)
fig.update_layout(layout)
return plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
data_dict = {
'trial_outcomes': trial_outcomes,
'trials_index': trials_index,
'cumulative_means': cumulative_means,
}
experiment_sample_size(data_dict, sample_size=None)
We can see above that as the number of trials increases, our histogram looks more and more like the normal distribution (in other words, it is approaching the normal distribution), and the mean converges towards the expected value (if we extended the computation to 1,000,000 trials the convergence would be even more pronounced).
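To put a number on this (a rough sketch using the ks_2samp import from earlier; comparing against fresh draws from the fitted normal is just one of several reasonable choices), we can compute a Kolmogorov-Smirnov statistic for each subset of trials; smaller values mean a closer match to the fitted normal:
for n in [100, 1_000, 10_000, 100_000]:
    subset = trial_outcomes[:n]
    mu, std = norm.fit(subset)
    # Two-sample KS test: our trial outcomes vs. draws from the normal fit to them.
    statistic, p_value = ks_2samp(subset, norm.rvs(loc=mu, scale=std, size=n))
    print(f"{n:>7} trials: KS statistic = {statistic:.4f}")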
Why do the Law of Large Numbers and the CLT hold here? Because each shot is an independent Bernoulli random variable with a finite mean (p = 0.723) and a finite variance (p(1 - p) ≈ 0.2), which is all that the two theorems require. So far we have been using a sample size of 50 shots per trial; to probe the CLT further, we can hold the number of trials fixed and vary the sample size instead. What happens if our trial size (in reality, a sample size) decreases from 50 to 10? Let's take a look:
trial_outcomes, trials_index, cumulative_means = run_experiment(num_trials=100_000, shots_per_trial=10)
trace1 = go.Scatter(
x=trials_index,
y=cumulative_means,
marker = dict(color='crimson'),
fillcolor='rgba(0, 0, 255, 0.2)',
name="Sample Cumulative Average"
)
trace2 = go.Scatter(
x=trials_index,
y=np.ones_like(trials_index)*7.23,
marker = dict(color='blue'),
fillcolor='rgba(0, 0, 255, 0.2)',
name="Theoretical Expected Value"
)
data = [trace1, trace2]
layout = go.Layout(
width=800,
height=400,
title="Cumulative average # shots made in trials vs. number of trials (Trial size = 10)",
xaxis=dict(title="Trials"),
yaxis=dict(title="Cumulative average # shots made"),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)'
)
fig = go.Figure(data=data, layout=layout)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
fig.update_layout(layout)
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
data_dict = {
'trial_outcomes': trial_outcomes,
'trials_index': trials_index,
'cumulative_means': cumulative_means,
}
experiment_sample_size(data_dict, sample_size=None)
We can then run an experiment based purely on sample size: we just looked at a sample size (number of shots per trial) of 10. Let's try 20 now:
experiment_sample_size(sample_size=20)
And now we can look at a sample size of 30:
experiment_sample_size(sample_size=30)
Clearly, as the number of shots taken per trial increases (i.e. as our sample size increases), we approach the normal distribution more closely. It is worth being precise about which quantity produces the normality: the CLT applies to the sum of shots within a trial, so a larger sample size (more shots per trial) makes each trial total more bell shaped, while a larger number of trials simply gives us more draws from that fixed distribution, letting the histogram resolve its shape more clearly.
Arrow shot at a wall: why does it not apply? Picture an archer in front of an infinitely long wall, shooting at an angle chosen uniformly at random between 0 and 90 degrees; the arrow's distance along the wall is the tangent of that angle. The tangent of a uniform angle follows a (half-)Cauchy distribution, which has no finite mean, so the Law of Large Numbers has nothing to converge to. What we are doing here is essentially a Monte Carlo simulation, and we can reuse the same computational trick we ran into earlier: to show 1,000,000 trials we only need to simulate 1,000,000 arrows once, then step through the array, folding one additional arrow into the cumulative mean at each iteration.
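To see why no finite mean exists, compute the expected distance directly (the angle $\theta$ is uniform on $[0, \pi/2)$, with density $2/\pi$):
$$
\mathbb{E}[\tan\theta] = \frac{2}{\pi}\int_0^{\pi/2} \tan\theta \, d\theta = \frac{2}{\pi}\Big[-\ln\cos\theta\Big]_0^{\pi/2} = \infty.
$$
The integral diverges, so the expected value is infinite and the cumulative average has no finite target to settle on.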
TODO: Diagrams for arrow problem.
TODO: Consider relabeling graphs as just 'means' (the cumulative portion is helpful in the code to understand what is going on, but may just add visual noise on the graph).
def arrow_trials(arrows=50):
    """Returns an array of distances for `arrows` arrow shots at uniformly random angles."""
    angles = np.random.uniform(size=arrows) * (np.pi / 2)  # uniform angles in [0, pi/2)
    distances = np.tan(angles)  # distance along the wall
    return distances
def calculate_cumulative_means(trial_outcomes):
    """Calculates cumulative means from an array of trial outcomes (vectorized version of the earlier helper)."""
cumulative_means = np.cumsum(trial_outcomes) / np.arange(1, trial_outcomes.shape[0] + 1, 1)
return cumulative_means
def calculate_trial_stats(trial_outcomes):
"""Calculate mean, std, and max distance."""
mean_dist = trial_outcomes.mean()
std_dist = trial_outcomes.std()
max_dist = trial_outcomes.max()
return mean_dist, std_dist, max_dist
def run_arrow_experiment(arrows=100_000):
"""Coordinates arrow experiment. Calculates array of arrow distances, the cumulative mean, and stats."""
distances = arrow_trials(arrows=arrows)
cumulative_mean_distances = calculate_cumulative_means(distances)
mean_dist, std_dist, max_dist = calculate_trial_stats(distances)
distance_idx = np.arange(0, distances.shape[0], 1)
return cumulative_mean_distances, mean_dist, std_dist, max_dist, distance_idx
def plot_single_arrow_experiment(arrows=100_000):
num_plot_points = 1000
plot_step_size = int(arrows / num_plot_points)
cumulative_mean_distances, mean_dist, std_dist, max_dist, x_axis = run_arrow_experiment(arrows=arrows)
trace1 = go.Scatter(
x=x_axis[::plot_step_size],
y=cumulative_mean_distances[::plot_step_size],
marker = dict(color='red'),
name=f"Cumulative Average"
)
data = [trace1]
arrow_num_str = '{:,}'.format(arrows)
layout = go.Layout(
width=900,
height=400,
title=f"Cumulative mean distance vs. arrows shot ({arrow_num_str} arrows total)",
xaxis=dict(title="Number of arrows shot"),
yaxis=dict(title="Cumulative average arrow distance"),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)',
showlegend=True,
)
fig = go.Figure(data=data, layout=layout)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
mean_dist_str = '{:,}'.format(round(mean_dist, 2))
std_dist_str = '{:,}'.format(round(std_dist, 2))
max_dist_str = '{:,}'.format(round(max_dist, 2))
stats_string = f"""
Overall mean: {mean_dist_str}<br>
Overall Std: {std_dist_str}<br>
Overall Max: {max_dist_str}
"""
fig.add_annotation(
go.layout.Annotation(
x=1.34,
y=0.9,
text=stats_string,
showarrow=False,
align='left',
bgcolor='white',
borderpad=0,
)
)
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
plot_single_arrow_experiment(arrows=100_000)
plot_single_arrow_experiment(arrows=1_000_000)
plot_single_arrow_experiment(arrows=10_000_000)
plot_single_arrow_experiment(arrows=100_000_000)
What is more interesting to look at is keeping the number of arrows constant (at, say, 1,000,000) while performing multiple trials:
def plot_multiple_arrow_trials_experiment(arrows=1_000_000, num_trials=10):
num_plot_points = 1000
plot_step_size = int(arrows / num_plot_points)
data = []
data_dict = {}
for i in range(1, num_trials + 1):
cumulative_mean_distances, mean_dist, std_dist, max_dist, x_axis = run_arrow_experiment(arrows=arrows)
trial_str = f'trial{i}'
trace = go.Scatter(
x=x_axis[::plot_step_size],
y=cumulative_mean_distances[::plot_step_size],
name=trial_str,
)
data.append(trace)
data_dict[trial_str] = {
'mean': mean_dist,
'std': std_dist,
'max': max_dist,
}
arrow_num_str = '{:,}'.format(arrows)
layout = go.Layout(
width=900,
height=400,
title=f"Cumulative mean distance vs. arrows shot ({arrow_num_str} arrows total)",
xaxis=dict(title="Number of arrows shot"),
yaxis=dict(title="Cumulative average arrow distance"),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)',
showlegend=True,
)
fig = go.Figure(data=data, layout=layout)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
plotly.offline.iplot(fig)
return data_dict
data_dict = plot_multiple_arrow_trials_experiment(arrows=1_000_000, num_trials=10)
What we can see clearly above is that there is no guarantee of convergence: very large spikes keep occurring. This lack of convergence shows that the CLT and the Law of Large Numbers do not help us in this domain (well, technically, since the distances are nonnegative with an infinite expected value, the cumulative mean does "converge", but to infinity, which is not that useful).
display(pd.DataFrame(data_dict).T.sort_values('mean', ascending=False))
distance = arrow_trials(arrows=100_000)
trace1 = go.Histogram(
x=distance,
nbinsx=50,
name="Sample 1",
marker_color='crimson',
opacity=0.7,
showlegend=False
)
data = [trace1]
fig = go.Figure(data=data)
plotly.offline.iplot(fig)
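The histogram above is dominated by a handful of extreme outliers, which is exactly the heavy-tailed behavior we expect. As a final check (a sketch; halfcauchy is not imported above, and the clipping threshold of 20 is arbitrary), the distances should match the standard half-Cauchy distribution, whose density is 2 / (π(1 + x²)):
from scipy.stats import halfcauchy

distances = arrow_trials(arrows=100_000)
clipped = distances[distances < 20]  # clip the extreme tail so the bulk of the distribution is visible
x_axis = np.linspace(0, 20, 200)
plt.hist(clipped, bins=100, density=True, alpha=0.6, color='crimson')
plt.plot(x_axis, halfcauchy.pdf(x_axis), color='blue', label='Half-Cauchy pdf')
# Note: clipping renormalizes the histogram slightly (~3% of the mass lies beyond 20).
plt.xlabel("Distance")
plt.ylabel("Probability Density")
plt.legend()
plt.show()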