import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import quiz_tests
# Set plotting options
# IPython/Jupyter line magic (not plain Python syntax): selects the inline
# matplotlib backend so figures are shown in the notebook output.
%matplotlib inline
# Default size, in inches, for figures that do not pass their own figsize.
plt.rc('figure', figsize=(16, 9))
# Sample A: Normal distribution -- 1000 draws with mean 0 and standard deviation 1
sample_a = stats.norm(loc=0.0, scale=1.0).rvs(size=1000)
# Sample B: Non-normal distribution -- 1000 draws from a lognormal with shape s=0.5
sample_b = stats.lognorm(s=0.5, loc=0.0, scale=1.0).rvs(size=1000)
# Sample A: Normal distribution
# Two stacked panels that share the x-axis, so the boxplot and the histogram
# are drawn on the same horizontal scale.
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(16, 9))
# Top panel: horizontal boxplot
axes[0].boxplot(sample_a, vert=False)
# Bottom panel: histogram with 50 bins
axes[1].hist(sample_a, bins=50)
axes[0].set_title("Boxplot of a Normal Distribution");
# Sample B: Non-normal distribution
# Same layout as for sample A: horizontal boxplot on top, 50-bin histogram
# below, both on a shared x-axis.
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(16, 9))
axes[0].boxplot(sample_b, vert=False)
axes[1].hist(sample_b, bins=50)
axes[0].set_title("Boxplot of a Lognormal Distribution");
# Q-Q plot of normally-distributed sample
# Square figure with equal scaling on both axes.
plt.figure(figsize=(10, 10))
plt.axis('equal')
# Probability plot of the sample against the normal distribution, drawn with pyplot.
stats.probplot(sample_a, dist='norm', plot=plt);
# Q-Q plot of non-normally-distributed sample
# Square figure with equal scaling on both axes.
plt.figure(figsize=(10, 10))
plt.axis('equal')
# Probability plot of the sample against the normal distribution, drawn with pyplot.
stats.probplot(sample_b, dist='norm', plot=plt);
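检验样本是否服从正态分布的假设检验方法有很多,例如 scipy 库提供的 Shapiro-Wilk 检验(见下方链接);下面的 `is_normal_ks` 函数默认使用 Kolmogorov-Smirnov 检验(`stats.kstest`)。原假设是样本数据服从正态分布:如果 P 值大于选定的显著性水平 $\alpha$,则不能拒绝原假设(可认为样本近似服从正态分布);否则拒绝原假设。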
https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.stats.shapiro.html
def is_normal_ks(sample, test=stats.kstest, p_level=0.05, **kwargs):
    """
    Decide whether `sample` is plausibly drawn from a normal distribution.

    The sample is tested against a normal distribution whose location and
    scale are the sample's own mean and standard deviation (np.std default,
    i.e. ddof=0).

    sample: a sample distribution (array-like of numbers)
    test: a function that tests for normality; it is called as
        test(sample, 'norm', (mean, std), **kwargs) and must return a
        (test statistic, p-value) pair. Defaults to scipy.stats.kstest.
    p_level: if the test returns a p-value > than p_level, assume normality
    **kwargs: extra keyword arguments forwarded unchanged to `test`
    return: True if distribution is normal, False otherwise

    Side effect: prints the test statistic, the p-value and the verdict.
    """
    # Parameters of the reference normal distribution, estimated from the data.
    # NOTE(review): because they are estimated from the same sample that is
    # being tested, the Kolmogorov-Smirnov p-value should be read as
    # approximate -- confirm this is acceptable for the intended use.
    normal_args = (np.mean(sample), np.std(sample))

    t_stat, p_value = test(sample, 'norm', normal_args, **kwargs)

    # Evaluate the decision once so the printed and returned verdicts
    # can never diverge.
    likely_normal = p_value > p_level

    print("Test statistic: {}, p-value: {}".format(t_stat, p_value))
    print("Is the distribution Likely Normal? {}".format(likely_normal))
    return likely_normal
# Check is_normal_ks with the test helper from the quiz_tests module imported above.
quiz_tests.test_is_normal_ks(is_normal_ks)
# Using Kolmogorov-Smirnov test
# Run the check on each sample; is_normal_ks prints the statistic, the p-value
# and the verdict itself, so the returned booleans are not used here.
print("Sample A:-")
is_normal_ks(sample_a)
print("Sample B:-")
is_normal_ks(sample_b);