郑毅 1 ヶ月 前
コミット
74ba0f99f8
9 ファイル変更239 行追加153 行削除
  1. 2 3
      __main__.py
  2. 0 0
      bayes/__init__.py
  3. 150 0
      bayes/chapter_1.py
  4. 13 0
      bayes/common.py
  5. 74 0
      bayes/data/txtdata.csv
  6. 0 36
      tensorflow/test01.py
  7. 0 18
      tensorflow/test02.py
  8. 0 26
      tensorflow/test03.py
  9. 0 70
      tensorflow/test04.py

+ 2 - 3
__main__.py

@@ -1,8 +1,7 @@
-from airport_codes.get_info import *
+from bayes.chapter_1 import *
 
 def main():
-    req_iata_for_city()
-
+    code_5()
 
 # 程序入口
 if __name__ == '__main__':

+ 0 - 0
tensorflow/__init__.py → bayes/__init__.py


+ 150 - 0
bayes/chapter_1.py

@@ -0,0 +1,150 @@
+from matplotlib import pyplot as plt
+import numpy as np
+import scipy.stats as stats
+import pymc as pm
+from IPython.core.pylabtools import figsize
+
+from bayes.common import *
+
+# Apply the shared matplotlib settings from bayes.common (font family and DPI)
+# as a side effect of importing this module, before any figure is drawn.
+plt_init()
+
+# Prior vs. posterior probability: conceptual bar chart
+def code_1():
+    """Draw side-by-side bars of the prior and posterior of Steve's occupation.
+
+    The numbers follow Bayes' rule, P(A|X) = P(X|A) P(A) / P(X), where A is
+    "Steve is a librarian" and X is the observed description:
+
+    * P(A) = 1/21: librarians to farmers among men is taken as 1:20.
+    * P(X|A) = 0.95: assumed chance that the description fits a librarian.
+    * P(X|~A) = 0.5: assumed chance that the description fits a farmer.
+    * P(X) = 0.95 * 1/21 + 0.5 * 20/21 ~= 0.52
+    * P(A|X) = 0.95 * (1/21) / 0.52 ~= 0.087
+    """
+    figsize(12.5, 4)
+    prior_color, posterior_color = '#348ABD', '#A60628'
+
+    # [librarian, farmer] probabilities before and after seeing the description.
+    prior = [1/21., 20/21.]
+    posterior = [0.087, 1-0.087]
+
+    # Each posterior bar sits one bar-width to the right of its prior bar.
+    prior_x = [0, .7]
+    posterior_x = [0+0.25, .7+0.25]
+
+    plt.bar(prior_x, prior, label='先验概率', width=0.25,
+            color=prior_color, alpha=0.7, lw=3, edgecolor=prior_color)
+    plt.bar(posterior_x, posterior, label='后验概率', width=0.25,
+            color=posterior_color, alpha=0.7, lw=3, edgecolor=posterior_color)
+
+    plt.xticks([0.20, 0.95], ['图书管理员', '农民'])
+    plt.ylabel('概率')
+    plt.title('Steve 的职业的先验概率及后验概率')
+    plt.legend()
+
+    plt.show()
+
+# Discrete random variable: probability mass function (Poisson assumed), conceptual bar chart
+def code_2():
+    """Plot the Poisson probability mass function for two intensities as overlaid bars.
+
+    For intensity lambda the mass function is
+    P(Z = k) = lambda**k * exp(-lambda) / k!   for k = 0, 1, 2, ...
+    """
+    figsize(12.5, 4)
+    colors = ['#348ABD', '#A60628']
+
+    lambda_ = [1.5, 4.25]  # Poisson intensities to compare
+    k = np.arange(16)      # values of the discrete variable shown: 0..15
+    poi = stats.poisson
+
+    # Raw strings keep the LaTeX backslash intact instead of relying on the
+    # invalid "\l" escape, which newer Python versions warn about.
+    for rate, color in zip(lambda_, colors):
+        plt.bar(k, poi.pmf(k, rate), label=r'$\lambda = %.1f$' % rate,
+                color=color, alpha=0.6, lw=3, edgecolor=color)
+
+    # NOTE(review): the +0.4 shift looks intended for edge-aligned bars; with
+    # centre-aligned bars the ticks end up right of the bar centres -- confirm
+    # against the matplotlib version in use.
+    plt.xticks(k+0.4, k)
+    plt.xlabel('$k$')
+    plt.ylabel('取值为 $k$ 的概率')
+    plt.title(r'不同 $\lambda$ 强度情况下泊松分布随机变量的概率质量函数')
+    plt.legend()
+
+    plt.show()
+
+# Continuous random variable: probability density function (exponential assumed), conceptual line plot
+def code_3():
+    """Plot the exponential probability density for two rate parameters.
+
+    For rate lambda the density is f(z | lambda) = lambda * exp(-lambda * z),
+    z >= 0. scipy parameterises the distribution by scale, hence
+    scale = 1 / lambda below.
+    """
+    figsize(12.5, 4)
+    colors = ['#348ABD', '#A60628']
+
+    lambda_ = [0.5, 1]            # rate parameters to compare
+    z = np.linspace(0, 4, 100)    # 100 evenly spaced points on [0, 4]
+    expo = stats.expon
+
+    for l, c in zip(lambda_, colors):
+        # Evaluate the density once and reuse it for both the line and the fill.
+        density = expo.pdf(z, scale=1./l)
+        # Raw string: keeps the LaTeX backslash without the invalid "\l" escape.
+        plt.plot(z, density, label=r'$\lambda = %.1f$' % l, color=c, lw=3)
+        plt.fill_between(z, density, color=c, alpha=.33)
+
+    plt.xlim(0, 4)
+    plt.ylim(0, 1.2)
+    plt.xlabel('$z$')
+    plt.ylabel('取值为 $z$ 的概率密度函数结果')
+    plt.title(r'不同 $\lambda$ 取值情况下指数分布随机变量的概率密度函数')
+    plt.legend()
+
+    plt.show()
+
+# Text-message behaviour inference: plot of the raw data
+def code_4():
+    """Show the values loaded from the text-message data file as a bar chart.
+
+    One bar is drawn per value in bayes/data/txtdata.csv, in file order. The
+    path is relative, so it is resolved against the current working directory.
+    """
+    figsize(12.5, 4)
+
+    count_data = np.loadtxt("bayes/data/txtdata.csv")
+    n_count_data = len(count_data)
+
+    plt.bar(np.arange(n_count_data), count_data, color='#348ABD')
+
+    plt.xlim(0, n_count_data)
+    plt.xlabel("时间(天)")
+    plt.ylabel("短信接收数量")
+    plt.title("用户的短信使用行为是否随着时间发生变化?")
+    # No legend: the single bar series carries no label, so a legend call
+    # would only produce a warning and an empty legend.
+
+    plt.show()
+
+# Text-message behaviour inference: PyMC switch-point model
+def code_5():
+    """Fit a two-rate switch-point model to the message counts and plot the posteriors.
+
+    Model built below:
+
+    * lambda_1, lambda_2 ~ Exponential(alpha), with alpha = 1 / mean(count_data)
+    * tau ~ DiscreteUniform(0, n_count_data)
+    * lambda_ is lambda_1 for indices before tau and lambda_2 from tau onward
+    * the loaded counts are observed values of Poisson(lambda_)
+
+    After sampling, the traces of lambda_1, lambda_2 and tau are drawn as
+    histograms in three stacked subplots.
+    """
+    figsize(12.5, 10)
+    colors = ['#A60628', '#7A68A6', '#467821']
+
+    count_data = np.loadtxt("bayes/data/txtdata.csv")
+    n_count_data = len(count_data)
+
+    # Both rates share one prior whose parameter is tied to the data mean.
+    alpha = 1.0 / count_data.mean()
+    lambda_1 = pm.Exponential('lambda_1', alpha)
+    lambda_2 = pm.Exponential('lambda_2', alpha)
+    tau = pm.DiscreteUniform('tau', lower=0, upper=n_count_data)
+
+    @pm.deterministic
+    def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
+        # Per-index rate: lambda_1 before the switch point, lambda_2 from it onward.
+        out = np.zeros(n_count_data)
+        out[:tau] = lambda_1
+        out[tau:] = lambda_2
+        return out
+
+    observation = pm.Poisson('obs', lambda_, value=count_data, observed=True)
+    model = pm.Model([observation, lambda_1, lambda_2, tau])
+
+    mcmc = pm.MCMC(model)
+    # NOTE(review): presumably iter=40000, burn=10000, thin=1 -- confirm
+    # against the PyMC 2 MCMC.sample signature.
+    mcmc.sample(40000, 10000, 1)
+
+    lambda_1_samples = mcmc.trace('lambda_1')[:]
+    lambda_2_samples = mcmc.trace('lambda_2')[:]
+    tau_samples = mcmc.trace('tau')[:]
+
+    # Plotting. All LaTeX labels are raw strings: in a normal string "\t" in
+    # "\tau" is a TAB character (which corrupted the title) and "\l" is an
+    # invalid escape.
+    # NOTE(review): `normed=` was removed from newer matplotlib releases in
+    # favour of `density=` -- confirm the installed version before changing it.
+    ax = plt.subplot(311)
+    ax.set_autoscaley_on(False)
+    plt.title(r"参数 $\lambda_1$、$\lambda_2$、$\tau$ 的后验分布")
+    plt.hist(lambda_1_samples, histtype='stepfilled', bins=30, alpha=0.85,
+             label=r"$\lambda_1$ 的后验", color=colors[0], normed=True)
+    plt.legend(loc="upper left")
+    plt.xlim([15, 30])
+    plt.xlabel(r"$\lambda_1$")
+    plt.ylabel("密度")
+
+    ax = plt.subplot(312)
+    ax.set_autoscaley_on(False)
+    plt.hist(lambda_2_samples, histtype='stepfilled', bins=30, alpha=0.85,
+             label=r"$\lambda_2$ 的后验", color=colors[1], normed=True)
+    plt.legend(loc="upper left")
+    plt.xlim([15, 30])
+    plt.xlabel(r"$\lambda_2$")
+    plt.ylabel("密度")
+
+    plt.subplot(313)
+    # Equal weights summing to 1 turn the tau histogram counts into probabilities.
+    w = 1.0 / tau_samples.shape[0] * np.ones_like(tau_samples)
+    plt.hist(tau_samples, bins=n_count_data, alpha=1, label=r"$\tau$ 的后验",
+             color=colors[2], weights=w, rwidth=2.)
+    plt.legend(loc="upper left")
+    plt.xticks(np.arange(n_count_data))
+    plt.xlim([35, len(count_data)-20])
+    plt.ylim([0, .75])
+    plt.xlabel(r"$\tau$ (天)")
+    plt.ylabel("概率")
+
+    plt.show()

+ 13 - 0
bayes/common.py

@@ -0,0 +1,13 @@
+import matplotlib
+import matplotlib.font_manager
+
+# List the fonts available to matplotlib on this system
+def get_fonts():
+    """Print the name of every entry in matplotlib's TTF font list, sorted, one per line."""
+    known_fonts = matplotlib.font_manager.fontManager.ttflist
+    font_names = sorted(font.name for font in known_fonts)
+    for font_name in font_names:
+        print(font_name)
+
+# Plotting setup
+def plt_init():
+    """Set the matplotlib rc options used for all plots: font family and DPI."""
+    rc_settings = (
+        ('font', {'family': 'Arial Unicode MS'}),  # font used so Chinese text renders
+        ('savefig', {'dpi': 300}),                 # resolution of saved figures
+        ('figure', {'dpi': 100}),                  # resolution of displayed figures (default 100)
+    )
+    for group, options in rc_settings:
+        matplotlib.rc(group, **options)

+ 74 - 0
bayes/data/txtdata.csv

@@ -0,0 +1,74 @@
+1.300000000000000000e+01
+2.400000000000000000e+01
+8.000000000000000000e+00
+2.400000000000000000e+01
+7.000000000000000000e+00
+3.500000000000000000e+01
+1.400000000000000000e+01
+1.100000000000000000e+01
+1.500000000000000000e+01
+1.100000000000000000e+01
+2.200000000000000000e+01
+2.200000000000000000e+01
+1.100000000000000000e+01
+5.700000000000000000e+01
+1.100000000000000000e+01
+1.900000000000000000e+01
+2.900000000000000000e+01
+6.000000000000000000e+00
+1.900000000000000000e+01
+1.200000000000000000e+01
+2.200000000000000000e+01
+1.200000000000000000e+01
+1.800000000000000000e+01
+7.200000000000000000e+01
+3.200000000000000000e+01
+9.000000000000000000e+00
+7.000000000000000000e+00
+1.300000000000000000e+01
+1.900000000000000000e+01
+2.300000000000000000e+01
+2.700000000000000000e+01
+2.000000000000000000e+01
+6.000000000000000000e+00
+1.700000000000000000e+01
+1.300000000000000000e+01
+1.000000000000000000e+01
+1.400000000000000000e+01
+6.000000000000000000e+00
+1.600000000000000000e+01
+1.500000000000000000e+01
+7.000000000000000000e+00
+2.000000000000000000e+00
+1.500000000000000000e+01
+1.500000000000000000e+01
+1.900000000000000000e+01
+7.000000000000000000e+01
+4.900000000000000000e+01
+7.000000000000000000e+00
+5.300000000000000000e+01
+2.200000000000000000e+01
+2.100000000000000000e+01
+3.100000000000000000e+01
+1.900000000000000000e+01
+1.100000000000000000e+01
+1.800000000000000000e+01
+2.000000000000000000e+01
+1.200000000000000000e+01
+3.500000000000000000e+01
+1.700000000000000000e+01
+2.300000000000000000e+01
+1.700000000000000000e+01
+4.000000000000000000e+00
+2.000000000000000000e+00
+3.100000000000000000e+01
+3.000000000000000000e+01
+1.300000000000000000e+01
+2.700000000000000000e+01
+0.000000000000000000e+00
+3.900000000000000000e+01
+3.700000000000000000e+01
+5.000000000000000000e+00
+1.400000000000000000e+01
+1.300000000000000000e+01
+2.200000000000000000e+01

+ 0 - 36
tensorflow/test01.py

@@ -1,36 +0,0 @@
-import tensorflow as tf
-import numpy as np
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
-
-
-def foo01():
-    x = np.array([[1, 1, 1], [1, -8, 1], [1, 1, 1]])
-    w = tf.Variable(initial_value=x)
-    sess = tf.Session()
-    sess.run(tf.global_variables_initializer())
-    print(sess.run(w))
-
-
-def foo02():
-    x = tf.Variable(3)
-    y = tf.Variable(5)
-    z = tf.add(x, y)
-    sess = tf.Session()
-    sess.run(tf.global_variables_initializer())
-    print(sess.run(z))
-
-
-def foo03():
-    a = tf.Variable(tf.ones([3, 2]))
-    b = tf.Variable(tf.ones([2, 3]))
-    product = tf.matmul(5.0 * a, 4.0 * b)
-    sess = tf.Session()
-    sess.run(tf.global_variables_initializer())
-    print(sess.run(product))
-
-
-print(tf.__path__)
-foo01()
-foo02()
-foo03()

+ 0 - 18
tensorflow/test02.py

@@ -1,18 +0,0 @@
-import tensorflow.examples.tutorials.mnist.input_data as ipt
-import tensorflow.contrib.learn.python.learn.datasets.base as base
-import tensorflow.models.image.cifar10.cifar10 as cifar10
-iris_data, iris_label = base.load_iris()
-house_data, house_label = base.load_boston()
-cifar10.maybe_download_and_extract()
-images, labels = cifar10.distorted_inputs()
-
-mnist = ipt.read_data_sets("MNIST_data/", one_hot=True)
-
-print(mnist.train.images.shape)
-print(mnist.train.labels.shape)
-print(mnist.validation.images.shape)
-print(mnist.validation.labels.shape)
-print(mnist.test.images.shape)
-print(mnist.test.labels.shape)
-print(images)
-print(labels)

+ 0 - 26
tensorflow/test03.py

@@ -1,26 +0,0 @@
-import tensorflow as tf
-import tensorflow.examples.tutorials.mnist.input_data as input_data
-import os
-
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
-
-mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
-x = tf.placeholder(tf.float32, [None, 784])
-y_actual = tf.placeholder(tf.float32, shape=[None, 10])
-W = tf.Variable(tf.zeros([784, 10]))  # 初始化权值W
-b = tf.Variable(tf.zeros([10]))  # 初始化偏置项b
-y_predict = tf.nn.softmax(tf.matmul(x, W) + b)  # 加权变换并进行softmax回归,得到预测概率
-cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_actual * tf.log(y_predict)))  # 求交叉熵
-train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)  # 用梯度下降法使得残差最小
-
-correct_prediction = tf.equal(tf.argmax(y_predict, 1), tf.argmax(y_actual, 1))  # 在测试阶段,测试准确度计算
-accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))  # 多个批次的准确度均值
-
-init = tf.global_variables_initializer()
-with tf.Session() as sess:
-    sess.run(init)
-    for i in range(1000):  # 训练阶段,迭代1000次
-        batch_xs, batch_ys = mnist.train.next_batch(100)  # 按批次训练,每批100行数据
-        sess.run(train_step, feed_dict={x: batch_xs, y_actual: batch_ys})  # 执行训练
-        if i % 100 == 0:  # 每训练100次,测试一次
-            print("accuracy:", sess.run(accuracy, feed_dict={x: mnist.test.images, y_actual: mnist.test.labels}))

+ 0 - 70
tensorflow/test04.py

@@ -1,70 +0,0 @@
-# -*- coding: utf-8 -*-
-import tensorflow as tf
-import tensorflow.examples.tutorials.mnist.input_data as input_data
-
-mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)  # 下载并加载mnist数据
-x = tf.placeholder(tf.float32, [None, 784])  # 输入的数据占位符
-y_actual = tf.placeholder(tf.float32, shape=[None, 10])  # 输入的标签占位符
-
-
-# 定义一个函数,用于初始化所有的权值 W
-def weight_variable(shape):
-    initial = tf.truncated_normal(shape, stddev=0.1)
-    return tf.Variable(initial)
-
-
-# 定义一个函数,用于初始化所有的偏置项 b
-def bias_variable(shape):
-    initial = tf.constant(0.1, shape=shape)
-    return tf.Variable(initial)
-
-
-# 定义一个函数,用于构建卷积层
-def conv2d(x, W):
-    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
-
-
-# 定义一个函数,用于构建池化层
-def max_pool(x):
-    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
-
-
-# 构建网络
-x_image = tf.reshape(x, [-1, 28, 28, 1])  # 转换输入数据shape,以便于用于网络中
-W_conv1 = weight_variable([5, 5, 1, 32])
-b_conv1 = bias_variable([32])
-h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)  # 第一个卷积层
-h_pool1 = max_pool(h_conv1)  # 第一个池化层
-
-W_conv2 = weight_variable([5, 5, 32, 64])
-b_conv2 = bias_variable([64])
-h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)  # 第二个卷积层
-h_pool2 = max_pool(h_conv2)  # 第二个池化层
-
-W_fc1 = weight_variable([7 * 7 * 64, 1024])
-b_fc1 = bias_variable([1024])
-h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])  # reshape成向量
-h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)  # 第一个全连接层
-
-keep_prob = tf.placeholder("float")
-h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)  # dropout层
-
-W_fc2 = weight_variable([1024, 10])
-b_fc2 = bias_variable([10])
-y_predict = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)  # softmax层
-
-cross_entropy = -tf.reduce_sum(y_actual * tf.log(y_predict))  # 交叉熵
-train_step = tf.train.GradientDescentOptimizer(1e-3).minimize(cross_entropy)  # 梯度下降法
-correct_prediction = tf.equal(tf.argmax(y_predict, 1), tf.argmax(y_actual, 1))
-accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))  # 精确度计算
-sess = tf.InteractiveSession()
-sess.run(tf.initialize_all_variables())
-for i in range(20000):
-    batch = mnist.train.next_batch(50)
-    if i % 100 == 0:  # 训练100次,验证一次
-        train_acc = accuracy.eval(feed_dict={x: batch[0], y_actual: batch[1], keep_prob: 1.0})
-        print('step', i, 'training accuracy', train_acc)
-        train_step.run(feed_dict={x: batch[0], y_actual: batch[1], keep_prob: 0.5})
-
-test_acc = accuracy.eval(feed_dict={x: mnist.test.images, y_actual: mnist.test.labels, keep_prob: 1.0})
-print("test accuracy", test_acc)