Last Updated on 2023-03-23 by Clay
PyTorch is a framework commonly used for building deep learning models; as the name suggests, it is driven with Python syntax. It wraps a wide range of functions, neural network layers, and model architectures commonly used in deep learning, which makes it very convenient to work with.
When we are just learning or casually testing models, we usually don't need to care about fixing the randomness so that a model can be reproduced. In the context of a scientific experiment, however, making a set of model experiments reproducible becomes very important.
At first I assumed it would be enough to fix the random seed inside PyTorch, but in practice the experiments still could not be reproduced. After digging through documentation for a while, I realized there are more places to set than I had imagined.
Below, I use the classic task of training a handwritten digit classifier (the MNIST dataset) as an example to test how to reproduce exactly the same experimental results.
Sample Code for Training a Handwritten Digit Classifier
The following is sample code for training a handwritten digit classifier. To keep the test quick, all the settings are deliberately simple; for example, the number of epochs is set to 1, which is hardly enough for the model to converge.
# coding: utf-8
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchvision import datasets, transforms


# Model architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(in_features=784, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=10),
            nn.LogSoftmax(dim=1)
        )

    def forward(self, input):
        return self.main(input)


# Train
def train(device, model, epochs, optimizer, loss_function, train_loader):
    for epoch in range(1, epochs+1):
        for times, data in enumerate(train_loader, 1):
            inputs = data[0].to(device)
            labels = data[1].to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward and backward propagation
            outputs = model(inputs.view(inputs.shape[0], -1))
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            # Show progress
            if times % 100 == 0 or times == len(train_loader):
                print('[{}/{}, {}/{}] loss: {:.8}'.format(epoch, epochs, times, len(train_loader), loss.item()))

    return model


# Test
def test(device, model, test_loader):
    # Settings
    model.eval()
    total = 0
    correct = 0

    with torch.no_grad():
        for data in test_loader:
            inputs = data[0].to(device)
            labels = data[1].to(device)

            outputs = model(inputs.view(inputs.shape[0], -1))
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('Accuracy:', correct / total)


def main():
    # GPU device
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print('Device state:', device)

    # Settings
    epochs = 1
    batch_size = 64
    lr = 0.002
    loss_function = nn.NLLLoss()
    model = Net().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Transform
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5,), (0.5,))]
    )

    # Data
    train_set = datasets.MNIST(root='MNIST', download=True, train=True, transform=transform)
    test_set = datasets.MNIST(root='MNIST', download=True, train=False, transform=transform)
    train_loader = data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

    # Train
    model = train(device, model, epochs, optimizer, loss_function, train_loader)

    # Test
    test(device, model, test_loader)


if __name__ == '__main__':
    main()
I then quickly ran the training twice and got the following two different results:
Device state: cuda:0
[1/1, 100/938] loss: 0.46569321
[1/1, 200/938] loss: 0.51824951
[1/1, 300/938] loss: 0.60570443
[1/1, 400/938] loss: 0.1711497
[1/1, 500/938] loss: 0.17937183
[1/1, 600/938] loss: 0.31248432
[1/1, 700/938] loss: 0.088681653
[1/1, 800/938] loss: 0.21637213
[1/1, 900/938] loss: 0.13165317
[1/1, 938/938] loss: 0.067537516
Accuracy: 0.9416
Device state: cuda:0
[1/1, 100/938] loss: 0.34765923
[1/1, 200/938] loss: 0.45036712
[1/1, 300/938] loss: 0.35380328
[1/1, 400/938] loss: 0.28400266
[1/1, 500/938] loss: 0.34874138
[1/1, 600/938] loss: 0.33628809
[1/1, 700/938] loss: 0.24677548
[1/1, 800/938] loss: 0.1359618
[1/1, 900/938] loss: 0.088348009
[1/1, 938/938] loss: 0.1509731
Accuracy: 0.9414
From the loss values during training, it is clear that the two runs did not proceed identically: running the script a second time failed to reproduce the first run's results.
Fixing the Seeds
To fix the results, the following seeds need to be set, preferably placed right below the import statements for the various packages at the top of the script:
# coding: utf-8
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchvision import datasets, transforms
# Even if you don't use them directly, you still have to import and seed them
import random
import numpy as np

# Seed
seed = 123
torch.manual_seed(seed)               # PyTorch RNG
torch.cuda.manual_seed(seed)          # RNG of the current GPU
torch.cuda.manual_seed_all(seed)      # RNGs of all GPUs
np.random.seed(seed)                  # NumPy RNG
random.seed(seed)                     # Python built-in RNG
torch.backends.cudnn.benchmark = False       # Don't auto-tune convolution algorithms
torch.backends.cudnn.deterministic = True    # Force deterministic cuDNN kernels
Note that the random and numpy modules must be imported and seeded even if your own code never uses them, because functions called by PyTorch may rely on them; if their seeds are not fixed, the model results still cannot be fixed.
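If you find yourself copying these lines into several scripts, it can be convenient to wrap them in a small helper and call it once at startup. Below is a minimal sketch of that idea; the function name set_seed is my own and not part of PyTorch:
# coding: utf-8
import random

import numpy as np
import torch


def set_seed(seed=123):
    """Fix all the RNGs that the training script relies on, directly or indirectly."""
    torch.manual_seed(seed)               # PyTorch RNG
    torch.cuda.manual_seed_all(seed)      # RNGs of all GPUs (silently ignored if CUDA is unavailable)
    np.random.seed(seed)                  # NumPy RNG
    random.seed(seed)                     # Python built-in RNG
    torch.backends.cudnn.benchmark = False       # Don't auto-tune convolution algorithms
    torch.backends.cudnn.deterministic = True    # Force deterministic cuDNN kernels


# Call it once, before building the model and the DataLoader
set_seed(123)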
The complete code is as follows:
# coding: utf-8
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchvision import datasets, transforms

# Even if you don't use them directly, you still have to import and seed them
import random
import numpy as np

# Seed
seed = 123
torch.manual_seed(seed)               # PyTorch RNG
torch.cuda.manual_seed(seed)          # RNG of the current GPU
torch.cuda.manual_seed_all(seed)      # RNGs of all GPUs
np.random.seed(seed)                  # NumPy RNG
random.seed(seed)                     # Python built-in RNG
torch.backends.cudnn.benchmark = False       # Don't auto-tune convolution algorithms
torch.backends.cudnn.deterministic = True    # Force deterministic cuDNN kernels


# Model architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(in_features=784, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=10),
            nn.LogSoftmax(dim=1)
        )

    def forward(self, input):
        return self.main(input)


# Train
def train(device, model, epochs, optimizer, loss_function, train_loader):
    for epoch in range(1, epochs+1):
        for times, data in enumerate(train_loader, 1):
            inputs = data[0].to(device)
            labels = data[1].to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward and backward propagation
            outputs = model(inputs.view(inputs.shape[0], -1))
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            # Show progress
            if times % 100 == 0 or times == len(train_loader):
                print('[{}/{}, {}/{}] loss: {:.8}'.format(epoch, epochs, times, len(train_loader), loss.item()))

    return model


# Test
def test(device, model, test_loader):
    # Settings
    model.eval()
    total = 0
    correct = 0

    with torch.no_grad():
        for data in test_loader:
            inputs = data[0].to(device)
            labels = data[1].to(device)

            outputs = model(inputs.view(inputs.shape[0], -1))
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('Accuracy:', correct / total)


def main():
    # GPU device
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print('Device state:', device)

    # Settings
    epochs = 1
    batch_size = 64
    lr = 0.002
    loss_function = nn.NLLLoss()
    model = Net().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Transform
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5,), (0.5,))]
    )

    # Data
    train_set = datasets.MNIST(root='MNIST', download=True, train=True, transform=transform)
    test_set = datasets.MNIST(root='MNIST', download=True, train=False, transform=transform)
    train_loader = data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

    # Train
    model = train(device, model, epochs, optimizer, loss_function, train_loader)

    # Test
    test(device, model, test_loader)


if __name__ == '__main__':
    main()
As before, I ran the program twice in a row; this time the results are exactly the same.
Device state: cuda:0
[1/1, 100/938] loss: 0.26784545
[1/1, 200/938] loss: 0.22377557
[1/1, 300/938] loss: 0.19579159
[1/1, 400/938] loss: 0.29825467
[1/1, 500/938] loss: 0.30673307
[1/1, 600/938] loss: 0.20264342
[1/1, 700/938] loss: 0.12821223
[1/1, 800/938] loss: 0.21547785
[1/1, 900/938] loss: 0.18268135
[1/1, 938/938] loss: 0.054493584
Accuracy: 0.9501
Device state: cuda:0
[1/1, 100/938] loss: 0.26784545
[1/1, 200/938] loss: 0.22377557
[1/1, 300/938] loss: 0.19579159
[1/1, 400/938] loss: 0.29825467
[1/1, 500/938] loss: 0.30673307
[1/1, 600/938] loss: 0.20264342
[1/1, 700/938] loss: 0.12821223
[1/1, 800/938] loss: 0.21547785
[1/1, 900/938] loss: 0.18268135
[1/1, 938/938] loss: 0.054493584
Accuracy: 0.9501
Postscript
The official PyTorch documentation notes that fully reproducible results are not guaranteed across different versions, platforms, or devices. I have tested this myself, and the results do differ across different GPU devices.
I have not compared results trained on a GPU with those trained on a CPU, but from the official documentation it seems that the same results cannot be reproduced between them either.
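For a stricter guarantee within a single, fixed environment, the randomness notes linked below additionally suggest enabling deterministic algorithms and seeding the DataLoader worker processes. The following is a sketch along those lines, not part of the experiment above: seed_worker follows the example in the documentation, and train_set stands in for the MNIST training set built in main(). Be aware that torch.use_deterministic_algorithms(True) raises an error when an operation has no deterministic implementation, and some CUDA operations also require the CUBLAS_WORKSPACE_CONFIG environment variable to be set.
import random

import numpy as np
import torch
from torch.utils.data import DataLoader

# Raise an error instead of silently falling back to a non-deterministic kernel
torch.use_deterministic_algorithms(True)


def seed_worker(worker_id):
    # Derive a per-worker seed so NumPy / random inside each worker process are also fixed
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


g = torch.Generator()
g.manual_seed(123)

train_loader = DataLoader(
    train_set,                   # the MNIST training set from main() above
    batch_size=64,
    shuffle=True,
    num_workers=2,
    worker_init_fn=seed_worker,  # re-seed the libraries inside each worker process
    generator=g,                 # make the shuffling order explicit and repeatable
)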
References
- https://pytorch.org/docs/stable/notes/randomness.html
- https://discuss.pytorch.org/t/why-same-model-in-cuda-and-cpu-got-different-result/56241