|
@@ -0,0 +1,150 @@
|
|
|
|
+from matplotlib import pyplot as plt
|
|
|
|
+import numpy as np
|
|
|
|
+import scipy.stats as stats
|
|
|
|
+import pymc as pm
|
|
|
|
+from IPython.core.pylabtools import figsize
|
|
|
|
+
|
|
|
|
+from bayes.common import *
|
|
|
|
+
|
|
|
|
+plt_init()
|
|
|
|
+
|
|
|
|
+# 先验概率、后验概率 概念直方图
|
|
|
|
+def code_1():
|
|
|
|
+ figsize(12.5, 4)
|
|
|
|
+ colors = ['#348ABD', '#A60628']
|
|
|
|
+
|
|
|
|
+ prior = [1/21., 20/21.] # 先验概率,男性是图书管理员或农民的概率是 1:20
|
|
|
|
+ posterior = [0.087, 1-0.087] # 后验概率,P(A|X) = P(X|A)P(A)/P(X),计算原理如下
|
|
|
|
+ # P(X|A),即A为真时特征X符合的概率,假设为0.95
|
|
|
|
+ # P(X) = P(X|A)P(A) + P(X|~A)P(~A) = 0.95 * 1/21 + 0.5 * 20/21 = 0.52,其中0.5为假设A为假时的特征概率
|
|
|
|
+ # P(A|X) = 0.95 * 1/21 / 0.52 = 0.087
|
|
|
|
+
|
|
|
|
+ plt.bar([0, .7], prior, label='先验概率', width=0.25, color=colors[0], alpha=0.7, lw=3, edgecolor=colors[0])
|
|
|
|
+ plt.bar([0+0.25, .7+0.25], posterior, label='后验概率', width=0.25, color=colors[1], alpha=0.7, lw=3, edgecolor=colors[1])
|
|
|
|
+
|
|
|
|
+ plt.xticks([0.20, 0.95], ['图书管理员', '农民'])
|
|
|
|
+ plt.ylabel('概率')
|
|
|
|
+ plt.title('Steve 的职业的先验概率及后验概率')
|
|
|
|
+ plt.legend()
|
|
|
|
+
|
|
|
|
+ plt.show()
|
|
|
|
+
|
|
|
|
+# 离散变量概率 - 质量函数(假设符合泊松分布) 概念直方图
|
|
|
|
+def code_2():
|
|
|
|
+ figsize(12.5, 4)
|
|
|
|
+ colors = ['#348ABD', '#A60628']
|
|
|
|
+
|
|
|
|
+ lambda_ = [1.5, 4.25] # 泊松分布强度。离散概率分布质量函数,即值为k时的概率,P(Z=k) = λ exp k * e exp -λ / k!,k=0,1,2,...
|
|
|
|
+ k = np.arange(16) # 离散变量
|
|
|
|
+ poi = stats.poisson # 泊松分布
|
|
|
|
+ plt.bar(k, poi.pmf(k, lambda_[0]), label='$\lambda = %.1f$' % lambda_[0], color=colors[0], alpha=0.6, lw=3, edgecolor=colors[0])
|
|
|
|
+ plt.bar(k, poi.pmf(k, lambda_[1]), label='$\lambda = %.1f$' % lambda_[1], color=colors[1], alpha=0.6, lw=3, edgecolor=colors[1])
|
|
|
|
+
|
|
|
|
+ plt.xticks(k+0.4, k)
|
|
|
|
+ plt.xlabel('$k$')
|
|
|
|
+ plt.ylabel('取值为 $k$ 的概率')
|
|
|
|
+ plt.title('不同 $\lambda$ 强度情况下泊松分布随机变量的概率质量函数')
|
|
|
|
+ plt.legend()
|
|
|
|
+
|
|
|
|
+ plt.show()
|
|
|
|
+
|
|
|
|
+# 连续变量概率 - 密度函数(假设指数密度分布) 概念折线图
|
|
|
|
+def code_3():
|
|
|
|
+ figsize(12.5, 4)
|
|
|
|
+ colors = ['#348ABD', '#A60628']
|
|
|
|
+
|
|
|
|
+ lambda_ = [0.5, 1] # 随机变量期望。连续概率密度函数,fz(z|λ) = λ e exp -λz,z>=0
|
|
|
|
+ z = np.linspace(0, 4, 100) # 连续线性空间
|
|
|
|
+ expo = stats.expon # 指数密度
|
|
|
|
+
|
|
|
|
+ for l, c in zip(lambda_, colors):
|
|
|
|
+ plt.plot(z, expo.pdf(z, scale=1./l), label='$\lambda = %.1f$' % l, color=c, lw=3)
|
|
|
|
+ plt.fill_between(z, expo.pdf(z, scale=1./l), color=c, alpha=.33)
|
|
|
|
+
|
|
|
|
+ plt.xlim(0, 4)
|
|
|
|
+ plt.ylim(0, 1.2)
|
|
|
|
+ plt.xlabel('$z$')
|
|
|
|
+ plt.ylabel('取值为 $z$ 的概率密度函数结果')
|
|
|
|
+ plt.title('不同 $\lambda$ 取值情况下指数分布随机变量的概率密度函数')
|
|
|
|
+ plt.legend()
|
|
|
|
+
|
|
|
|
+ plt.show()
|
|
|
|
+
|
|
|
|
+# 短信数据行为推断 - 原始值呈现
|
|
|
|
+def code_4():
|
|
|
|
+ figsize(12.5, 4)
|
|
|
|
+ colors = ['#348ABD', '#A60628']
|
|
|
|
+
|
|
|
|
+ count_data = np.loadtxt("bayes/data/txtdata.csv")
|
|
|
|
+ n_count_data = len(count_data)
|
|
|
|
+
|
|
|
|
+ plt.bar(np.arange(n_count_data), count_data, color=colors[0])
|
|
|
|
+
|
|
|
|
+ plt.xlim(0, n_count_data);
|
|
|
|
+ plt.xlabel("时间(天)")
|
|
|
|
+ plt.ylabel("短信接收数量")
|
|
|
|
+ plt.title("用户的短信使用行为是否随着时间发生变化?")
|
|
|
|
+ plt.legend()
|
|
|
|
+
|
|
|
|
+ plt.show()
|
|
|
|
+
|
|
|
|
+# 短信数据行为推断 - pyMC
|
|
|
|
+def code_5():
|
|
|
|
+ figsize(12.5, 10)
|
|
|
|
+ colors = ['#A60628', '#7A68A6', '#467821']
|
|
|
|
+
|
|
|
|
+ count_data = np.loadtxt("bayes/data/txtdata.csv")
|
|
|
|
+ n_count_data = len(count_data)
|
|
|
|
+
|
|
|
|
+ alpha = 1.0 / count_data.mean()
|
|
|
|
+ lambda_1 = pm.Exponential('lambda_1', alpha)
|
|
|
|
+ lambda_2 = pm.Exponential('lambda_2', alpha)
|
|
|
|
+ tau = pm.DiscreteUniform('tau', lower=0, upper=n_count_data)
|
|
|
|
+
|
|
|
|
+ @pm.deterministic
|
|
|
|
+ def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
|
|
|
|
+ out = np.zeros(n_count_data)
|
|
|
|
+ out[:tau] = lambda_1
|
|
|
|
+ out[tau:] = lambda_2
|
|
|
|
+ return out
|
|
|
|
+
|
|
|
|
+ observation = pm.Poisson('obs', lambda_, value=count_data, observed=True)
|
|
|
|
+ model = pm.Model([observation, lambda_1, lambda_2, tau])
|
|
|
|
+
|
|
|
|
+ mcmc = pm.MCMC(model)
|
|
|
|
+ mcmc.sample(40000, 10000, 1)
|
|
|
|
+
|
|
|
|
+ lambda_1_samples = mcmc.trace('lambda_1')[:]
|
|
|
|
+ lambda_2_samples = mcmc.trace('lambda_2')[:]
|
|
|
|
+ tau_samples = mcmc.trace('tau')[:]
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ # 绘图部分
|
|
|
|
+ ax = plt.subplot(311)
|
|
|
|
+ ax.set_autoscaley_on(False)
|
|
|
|
+ plt.title("参数 $\lambda_1$、$\lambda_2$、$\tau$ 的后验分布")
|
|
|
|
+ plt.hist(lambda_1_samples, histtype='stepfilled', bins=30, alpha=0.85, label="$\lambda_1$ 的后验", color=colors[0], normed=True)
|
|
|
|
+ plt.legend(loc="upper left")
|
|
|
|
+ plt.xlim([15, 30])
|
|
|
|
+ plt.xlabel("$\lambda_1$")
|
|
|
|
+ plt.ylabel("密度")
|
|
|
|
+
|
|
|
|
+ ax = plt.subplot(312)
|
|
|
|
+ ax.set_autoscaley_on(False)
|
|
|
|
+ plt.hist(lambda_2_samples, histtype='stepfilled', bins=30, alpha=0.85, label="$\lambda_2$ 的后验", color=colors[1], normed=True)
|
|
|
|
+ plt.legend(loc="upper left")
|
|
|
|
+ plt.xlim([15, 30])
|
|
|
|
+ plt.xlabel("$\lambda_2$")
|
|
|
|
+ plt.ylabel("密度")
|
|
|
|
+
|
|
|
|
+ plt.subplot(313)
|
|
|
|
+ w = 1.0 / tau_samples.shape[0] * np.ones_like(tau_samples)
|
|
|
|
+ plt.hist(tau_samples, bins=n_count_data, alpha=1, label=r"$\tau$ 的后验", color=colors[2], weights=w, rwidth=2.)
|
|
|
|
+ plt.legend(loc="upper left")
|
|
|
|
+ plt.xticks(np.arange(n_count_data))
|
|
|
|
+ plt.xlim([35, len(count_data)-20])
|
|
|
|
+ plt.ylim([0, .75])
|
|
|
|
+ plt.xlabel(r"$\tau$ (天)")
|
|
|
|
+ plt.ylabel("概率")
|
|
|
|
+
|
|
|
|
+ plt.show()
|