123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152 |
- from matplotlib import pyplot as plt
- import numpy as np
- import scipy.stats as stats
- import pymc as pm
- from IPython.core.pylabtools import figsize
- from bayes_book.common import *
- # https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers
- plt_init()
- # 先验概率、后验概率 概念直方图
- def code_1():
- figsize(12.5, 4)
- colors = ['#348ABD', '#A60628']
- prior = [1/21., 20/21.] # 先验概率,男性是图书管理员或农民的概率是 1:20
- posterior = [0.087, 1-0.087] # 后验概率,P(A|X) = P(X|A)P(A)/P(X),计算原理如下
- # P(X|A),即A为真时特征X符合的概率,假设为0.95
- # P(X) = P(X|A)P(A) + P(X|~A)P(~A) = 0.95 * 1/21 + 0.5 * 20/21 = 0.52,其中0.5为假设A为假时的特征概率
- # P(A|X) = 0.95 * 1/21 / 0.52 = 0.087
- plt.bar([0, .7], prior, label='先验概率', width=0.25, color=colors[0], alpha=0.7, lw=3, edgecolor=colors[0])
- plt.bar([0+0.25, .7+0.25], posterior, label='后验概率', width=0.25, color=colors[1], alpha=0.7, lw=3, edgecolor=colors[1])
- plt.xticks([0.20, 0.95], ['图书管理员', '农民'])
- plt.ylabel('概率')
- plt.title('Steve 的职业的先验概率及后验概率')
- plt.legend()
- plt.show()
- # 离散变量概率 - 质量函数(假设符合泊松分布) 概念直方图
- def code_2():
- figsize(12.5, 4)
- colors = ['#348ABD', '#A60628']
- lambda_ = [1.5, 4.25] # 泊松分布强度。离散概率分布质量函数,即值为k时的概率,P(Z=k) = λ exp k * e exp -λ / k!,k=0,1,2,...
- k = np.arange(16) # 离散变量
- poi = stats.poisson # 泊松分布
- plt.bar(k, poi.pmf(k, lambda_[0]), label='$\lambda = %.1f$' % lambda_[0], color=colors[0], alpha=0.6, lw=3, edgecolor=colors[0])
- plt.bar(k, poi.pmf(k, lambda_[1]), label='$\lambda = %.1f$' % lambda_[1], color=colors[1], alpha=0.6, lw=3, edgecolor=colors[1])
- plt.xticks(k+0.4, k)
- plt.xlabel('$k$')
- plt.ylabel('取值为 $k$ 的概率')
- plt.title('不同 $\lambda$ 强度情况下泊松分布随机变量的概率质量函数')
- plt.legend()
- plt.show()
- # 连续变量概率 - 密度函数(假设指数密度分布) 概念折线图
- def code_3():
- figsize(12.5, 4)
- colors = ['#348ABD', '#A60628']
- lambda_ = [0.5, 1] # 随机变量期望。连续概率密度函数,fz(z|λ) = λ e exp -λz,z>=0
- z = np.linspace(0, 4, 100) # 连续线性空间
- expo = stats.expon # 指数密度
- for l, c in zip(lambda_, colors):
- plt.plot(z, expo.pdf(z, scale=1./l), label='$\lambda = %.1f$' % l, color=c, lw=3)
- plt.fill_between(z, expo.pdf(z, scale=1./l), color=c, alpha=.33)
- plt.xlim(0, 4)
- plt.ylim(0, 1.2)
- plt.xlabel('$z$')
- plt.ylabel('取值为 $z$ 的概率密度函数结果')
- plt.title('不同 $\lambda$ 取值情况下指数分布随机变量的概率密度函数')
- plt.legend()
- plt.show()
- # 短信数据行为推断 - 原始值呈现
- def code_4():
- figsize(12.5, 4)
- colors = ['#348ABD', '#A60628']
- count_data = np.loadtxt("bayes/data/txtdata.csv")
- n_count_data = len(count_data)
- plt.bar(np.arange(n_count_data), count_data, color=colors[0])
- plt.xlim(0, n_count_data);
- plt.xlabel("时间(天)")
- plt.ylabel("短信接收数量")
- plt.title("用户的短信使用行为是否随着时间发生变化?")
- plt.legend()
- plt.show()
- # 短信数据行为推断 - pyMC
- def code_5():
- figsize(12.5, 10)
- colors = ['#A60628', '#7A68A6', '#467821']
- count_data = np.loadtxt("bayes/data/txtdata.csv")
- n_count_data = len(count_data)
- alpha = 1.0 / count_data.mean()
- lambda_1 = pm.Exponential('lambda_1', alpha)
- lambda_2 = pm.Exponential('lambda_2', alpha)
- tau = pm.DiscreteUniform('tau', lower=0, upper=n_count_data)
- @pm.deterministic
- def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
- out = np.zeros(n_count_data)
- out[:tau] = lambda_1
- out[tau:] = lambda_2
- return out
-
- observation = pm.Poisson('obs', lambda_, value=count_data, observed=True)
- model = pm.Model([observation, lambda_1, lambda_2, tau])
- mcmc = pm.MCMC(model)
- mcmc.sample(40000, 10000, 1)
-
- lambda_1_samples = mcmc.trace('lambda_1')[:]
- lambda_2_samples = mcmc.trace('lambda_2')[:]
- tau_samples = mcmc.trace('tau')[:]
-
- # 绘图部分
- ax = plt.subplot(311)
- ax.set_autoscaley_on(False)
- plt.title("参数 $\lambda_1$、$\lambda_2$、$\tau$ 的后验分布")
- plt.hist(lambda_1_samples, histtype='stepfilled', bins=30, alpha=0.85, label="$\lambda_1$ 的后验", color=colors[0], normed=True)
- plt.legend(loc="upper left")
- plt.xlim([15, 30])
- plt.xlabel("$\lambda_1$")
- plt.ylabel("密度")
- ax = plt.subplot(312)
- ax.set_autoscaley_on(False)
- plt.hist(lambda_2_samples, histtype='stepfilled', bins=30, alpha=0.85, label="$\lambda_2$ 的后验", color=colors[1], normed=True)
- plt.legend(loc="upper left")
- plt.xlim([15, 30])
- plt.xlabel("$\lambda_2$")
- plt.ylabel("密度")
- plt.subplot(313)
- w = 1.0 / tau_samples.shape[0] * np.ones_like(tau_samples)
- plt.hist(tau_samples, bins=n_count_data, alpha=1, label=r"$\tau$ 的后验", color=colors[2], weights=w, rwidth=2.)
- plt.legend(loc="upper left")
- plt.xticks(np.arange(n_count_data))
- plt.xlim([35, len(count_data)-20])
- plt.ylim([0, .75])
- plt.xlabel(r"$\tau$ (天)")
- plt.ylabel("概率")
- plt.show()
|