chapter_1.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. from matplotlib import pyplot as plt
  2. import numpy as np
  3. import scipy.stats as stats
  4. import pymc as pm
  5. from IPython.core.pylabtools import figsize
  6. from bayes.common import *
  7. # https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers
  8. plt_init()
  9. # 先验概率、后验概率 概念直方图
  10. def code_1():
  11. figsize(12.5, 4)
  12. colors = ['#348ABD', '#A60628']
  13. prior = [1/21., 20/21.] # 先验概率,男性是图书管理员或农民的概率是 1:20
  14. posterior = [0.087, 1-0.087] # 后验概率,P(A|X) = P(X|A)P(A)/P(X),计算原理如下
  15. # P(X|A),即A为真时特征X符合的概率,假设为0.95
  16. # P(X) = P(X|A)P(A) + P(X|~A)P(~A) = 0.95 * 1/21 + 0.5 * 20/21 = 0.52,其中0.5为假设A为假时的特征概率
  17. # P(A|X) = 0.95 * 1/21 / 0.52 = 0.087
  18. plt.bar([0, .7], prior, label='先验概率', width=0.25, color=colors[0], alpha=0.7, lw=3, edgecolor=colors[0])
  19. plt.bar([0+0.25, .7+0.25], posterior, label='后验概率', width=0.25, color=colors[1], alpha=0.7, lw=3, edgecolor=colors[1])
  20. plt.xticks([0.20, 0.95], ['图书管理员', '农民'])
  21. plt.ylabel('概率')
  22. plt.title('Steve 的职业的先验概率及后验概率')
  23. plt.legend()
  24. plt.show()
  25. # 离散变量概率 - 质量函数(假设符合泊松分布) 概念直方图
  26. def code_2():
  27. figsize(12.5, 4)
  28. colors = ['#348ABD', '#A60628']
  29. lambda_ = [1.5, 4.25] # 泊松分布强度。离散概率分布质量函数,即值为k时的概率,P(Z=k) = λ exp k * e exp -λ / k!,k=0,1,2,...
  30. k = np.arange(16) # 离散变量
  31. poi = stats.poisson # 泊松分布
  32. plt.bar(k, poi.pmf(k, lambda_[0]), label='$\lambda = %.1f$' % lambda_[0], color=colors[0], alpha=0.6, lw=3, edgecolor=colors[0])
  33. plt.bar(k, poi.pmf(k, lambda_[1]), label='$\lambda = %.1f$' % lambda_[1], color=colors[1], alpha=0.6, lw=3, edgecolor=colors[1])
  34. plt.xticks(k+0.4, k)
  35. plt.xlabel('$k$')
  36. plt.ylabel('取值为 $k$ 的概率')
  37. plt.title('不同 $\lambda$ 强度情况下泊松分布随机变量的概率质量函数')
  38. plt.legend()
  39. plt.show()
  40. # 连续变量概率 - 密度函数(假设指数密度分布) 概念折线图
  41. def code_3():
  42. figsize(12.5, 4)
  43. colors = ['#348ABD', '#A60628']
  44. lambda_ = [0.5, 1] # 随机变量期望。连续概率密度函数,fz(z|λ) = λ e exp -λz,z>=0
  45. z = np.linspace(0, 4, 100) # 连续线性空间
  46. expo = stats.expon # 指数密度
  47. for l, c in zip(lambda_, colors):
  48. plt.plot(z, expo.pdf(z, scale=1./l), label='$\lambda = %.1f$' % l, color=c, lw=3)
  49. plt.fill_between(z, expo.pdf(z, scale=1./l), color=c, alpha=.33)
  50. plt.xlim(0, 4)
  51. plt.ylim(0, 1.2)
  52. plt.xlabel('$z$')
  53. plt.ylabel('取值为 $z$ 的概率密度函数结果')
  54. plt.title('不同 $\lambda$ 取值情况下指数分布随机变量的概率密度函数')
  55. plt.legend()
  56. plt.show()
  57. # 短信数据行为推断 - 原始值呈现
  58. def code_4():
  59. figsize(12.5, 4)
  60. colors = ['#348ABD', '#A60628']
  61. count_data = np.loadtxt("bayes/data/txtdata.csv")
  62. n_count_data = len(count_data)
  63. plt.bar(np.arange(n_count_data), count_data, color=colors[0])
  64. plt.xlim(0, n_count_data);
  65. plt.xlabel("时间(天)")
  66. plt.ylabel("短信接收数量")
  67. plt.title("用户的短信使用行为是否随着时间发生变化?")
  68. plt.legend()
  69. plt.show()
  70. # 短信数据行为推断 - pyMC
  71. def code_5():
  72. figsize(12.5, 10)
  73. colors = ['#A60628', '#7A68A6', '#467821']
  74. count_data = np.loadtxt("bayes/data/txtdata.csv")
  75. n_count_data = len(count_data)
  76. alpha = 1.0 / count_data.mean()
  77. lambda_1 = pm.Exponential('lambda_1', alpha)
  78. lambda_2 = pm.Exponential('lambda_2', alpha)
  79. tau = pm.DiscreteUniform('tau', lower=0, upper=n_count_data)
  80. @pm.deterministic
  81. def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
  82. out = np.zeros(n_count_data)
  83. out[:tau] = lambda_1
  84. out[tau:] = lambda_2
  85. return out
  86. observation = pm.Poisson('obs', lambda_, value=count_data, observed=True)
  87. model = pm.Model([observation, lambda_1, lambda_2, tau])
  88. mcmc = pm.MCMC(model)
  89. mcmc.sample(40000, 10000, 1)
  90. lambda_1_samples = mcmc.trace('lambda_1')[:]
  91. lambda_2_samples = mcmc.trace('lambda_2')[:]
  92. tau_samples = mcmc.trace('tau')[:]
  93. # 绘图部分
  94. ax = plt.subplot(311)
  95. ax.set_autoscaley_on(False)
  96. plt.title("参数 $\lambda_1$、$\lambda_2$、$\tau$ 的后验分布")
  97. plt.hist(lambda_1_samples, histtype='stepfilled', bins=30, alpha=0.85, label="$\lambda_1$ 的后验", color=colors[0], normed=True)
  98. plt.legend(loc="upper left")
  99. plt.xlim([15, 30])
  100. plt.xlabel("$\lambda_1$")
  101. plt.ylabel("密度")
  102. ax = plt.subplot(312)
  103. ax.set_autoscaley_on(False)
  104. plt.hist(lambda_2_samples, histtype='stepfilled', bins=30, alpha=0.85, label="$\lambda_2$ 的后验", color=colors[1], normed=True)
  105. plt.legend(loc="upper left")
  106. plt.xlim([15, 30])
  107. plt.xlabel("$\lambda_2$")
  108. plt.ylabel("密度")
  109. plt.subplot(313)
  110. w = 1.0 / tau_samples.shape[0] * np.ones_like(tau_samples)
  111. plt.hist(tau_samples, bins=n_count_data, alpha=1, label=r"$\tau$ 的后验", color=colors[2], weights=w, rwidth=2.)
  112. plt.legend(loc="upper left")
  113. plt.xticks(np.arange(n_count_data))
  114. plt.xlim([35, len(count_data)-20])
  115. plt.ylim([0, .75])
  116. plt.xlabel(r"$\tau$ (天)")
  117. plt.ylabel("概率")
  118. plt.show()