logistic_regression.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. import torch
  2. from torch.autograd import Variable
  3. from torch import nn, optim
  4. import torchvision.transforms as transforms
  5. import torchvision.datasets as dsets
  6. import numpy as np
  7. from skorch import NeuralNet
  8. from matplotlib import pyplot as plt
  9. from spacecutter.callbacks import AscensionCallback
  10. from spacecutter.losses import CumulativeLinkLoss
  11. from spacecutter.models import OrdinalLogisticModel
  12. class LogisticRegression(nn.Module):
  13. def __init__(self, input_dim, output_dim):
  14. super(LogisticRegression, self).__init__()
  15. self.linear = nn.Linear(input_dim, output_dim)
  16. def forward(self, x):
  17. outputs = self.linear(x)
  18. return outputs
  19. # pyTorch 逻辑回归 MNIST 数据
  20. def regression_on_mnist():
  21. batch_size = 100
  22. n_iters = 5000
  23. input_dim = 784
  24. output_dim = 10
  25. lr_rate = 0.001
  26. train_dataset = dsets.MNIST(root='./torch_test/data', train=True, transform=transforms.ToTensor(), download=False)
  27. test_dataset = dsets.MNIST(root='./torch_test/data', train=False, transform=transforms.ToTensor())
  28. train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
  29. test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
  30. model = LogisticRegression(input_dim, output_dim)
  31. criterion = nn.CrossEntropyLoss() # 计算 softmax 分布之上的交叉熵损失
  32. optimizer = optim.SGD(model.parameters(), lr=lr_rate)
  33. epochs = n_iters / (len(train_dataset) / batch_size)
  34. iter = 0
  35. for epoch in range(int(epochs)):
  36. for i, (images, labels) in enumerate(train_loader):
  37. images = Variable(images.view(-1, 28 * 28))
  38. labels = Variable(labels)
  39. optimizer.zero_grad()
  40. outputs = model(images)
  41. loss = criterion(outputs, labels)
  42. loss.backward()
  43. optimizer.step()
  44. iter+=1
  45. if iter%500==0:
  46. # 计算准确率
  47. correct = 0
  48. total = 0
  49. for images, labels in test_loader:
  50. images = Variable(images.view(-1, 28*28))
  51. outputs = model(images)
  52. _, predicted = torch.max(outputs.data, 1)
  53. total+= labels.size(0)
  54. # 如果用的是 GPU,则要把预测值和标签都取回 CPU,才能用 Python 来计算
  55. correct+= (predicted == labels).sum()
  56. accuracy = 100 * correct/total
  57. print("Iteration: {}. Loss: {}. Accuracy: {}.".format(iter, loss.item(), accuracy))
  58. # pyTorch + Skorch 有序逻辑回归
  59. def ordinal_regression():
  60. # 0. 数据准备
  61. X = np.array([
  62. [0.5, 0.1, -0.1],
  63. [1.0, 0.2, 0.6],
  64. [-2.0, 0.4, 0.8]
  65. ], dtype=np.float32)
  66. y = np.array([0, 1, 2]).reshape(-1, 1)
  67. num_features = X.shape[1]
  68. num_classes = len(np.unique(y))
  69. # 1. 有序逻辑回归
  70. predictor = nn.Sequential( # 预测器
  71. nn.Linear(num_features, num_features),
  72. nn.ReLU(),
  73. nn.Linear(num_features, 1)
  74. )
  75. # model = OrdinalLogisticModel(predictor, num_classes)
  76. # y_pred = model(torch.as_tensor(X))
  77. # print(y_pred)
  78. # tensor([[0.2325, 0.2191,
  79. # [0.2324, 0.2191, 0.5485],
  80. # [0.2607, 0.2287, 0.5106]], grad_fn=<CatBackward>)
  81. # 2. Skorch 训练模型
  82. skorch_model = NeuralNet(
  83. module=OrdinalLogisticModel,
  84. module__predictor=predictor,
  85. module__num_classes=num_classes,
  86. optimizer=torch.optim.Adam, # Adam收敛速度快,非SGD优化算法
  87. criterion=CumulativeLinkLoss, # 与OrdinalLogisticModel匹配的累计链接损失函数,常用交叉熵 torch.nn.CrossEntropyLoss 此处不适用
  88. train_split=None,
  89. max_epochs= 5000, # 训练次数(epoch=全部样本训练一次,iteration=取batchsize样本训练一次,SGD随机梯度下降优化算法才分iteration)
  90. callbacks=[
  91. ('ascension', AscensionCallback())
  92. ],
  93. )
  94. skorch_model.fit(X, y) # 训练
  95. # y_proba = skorch_model.predict_proba(X) # 预测
  96. # 3. Matplotlib 可视化
  97. train_loss = skorch_model.history[:, 'train_loss']
  98. plt.plot(train_loss, label='Train Loss')
  99. plt.xlabel('Epoch')
  100. plt.ylabel('Loss')
  101. plt.legend()
  102. plt.show()
  103. import pandas as pd
  104. from sklearn.model_selection import train_test_split
  105. from sklearn.linear_model import LogisticRegression
  106. def test():
  107. data = pd.read_stata('./torch_test/data/CGSS/CGSS2021_20240607.dta', convert_categoricals=False)
  108. data.to_csv('./torch_test/data/CGSS/CGSS2021_20240607.csv', encoding='UTF-8')
  109. feature_cols = ['id', 'provinces', 'type', 'A00','A1','A12_2','A12a']
  110. X = data[feature_cols] # Features
  111. y = data.label # Target variable
  112. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=16)
  113. logreg = LogisticRegression(random_state=16)
  114. # fit the model with data
  115. logreg.fit(X_train, y_train)
  116. y_pred = logreg.predict(X_test)
  117. print(y_pred)
  118. # 基于 sklearn 的有序逻辑回归