[PyTorch] Set Seed To Reproduce Model Training Results

PyTorch is a popular deep learning framework. As the name suggests, it is used from Python.

PyTorch encapsulates various functions, neural networks, and model architectures commonly used in deep learning, which is very convenient to use.

When we are just learning or casually testing models, we usually don't need to care about fixing the random state of the model so that a run can be reproduced. In the context of scientific experiments, however, making a set of model experiments reproducible is very important.

At first I thought I only needed to set the random seed in PyTorch, but in actual testing I could not reproduce the experiment. After searching for information for a while, I was surprised to find that far more places need to be set than I had expected.

Below, I take the classic handwritten digit recognition model (trained on the MNIST dataset) as an example to test how to reproduce exactly the same experimental results.

Sample Code For Training Handwritten Digit Recognition Model

For testing, the parameter settings are very simple (such as epochs = 1).

# coding: utf-8
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchvision import datasets, transforms


# Model architecture
class Net(nn.Module):
    """Fully connected classifier: 784 inputs -> 128 -> 64 -> 10 log-probabilities.

    The hidden layers use ReLU activations and the output layer applies
    ``LogSoftmax`` over dimension 1.
    """

    def __init__(self):
        super().__init__()
        # Layers are instantiated in input-to-output order so that weight
        # initialisation draws from the RNG in the same sequence every time.
        layers = [
            nn.Linear(in_features=784, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=10),
            nn.LogSoftmax(dim=1),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Return the output of the layer stack for ``input``."""
        log_probs = self.main(input)
        return log_probs


# Train
def train(device, model, epochs, optimizer, loss_function, train_loader):
    """Run the training loop and return the model that was passed in.

    For each of ``epochs`` passes over ``train_loader``, every batch is moved
    to ``device``, flattened to ``(batch_size, -1)``, fed through ``model`` and
    used for one ``optimizer`` step. The current loss is printed every 100
    batches and on the last batch of each epoch.
    """
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(train_loader, start=1):
            images = batch[0].to(device)
            targets = batch[1].to(device)

            # Clear gradients left over from the previous step
            optimizer.zero_grad()

            # Forward pass on the flattened batch, then backpropagate and update
            flat_images = images.view(images.shape[0], -1)
            loss = loss_function(model(flat_images), targets)
            loss.backward()
            optimizer.step()

            # Progress report
            if step % 100 == 0 or step == len(train_loader):
                print(f'[{epoch}/{epochs}, {step}/{len(train_loader)}] loss: {loss.item():.8}')

    return model


def test(device, model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print the fraction classified correctly.

    The model is switched to evaluation mode and gradients are disabled while
    the batches are scored. Nothing is returned; the result is only printed.
    """
    model.eval()
    n_seen = 0
    n_right = 0

    with torch.no_grad():
        for batch in test_loader:
            images = batch[0].to(device)
            targets = batch[1].to(device)

            scores = model(images.view(images.shape[0], -1))
            # Index of the highest score along dim 1 is the predicted class
            predicted = torch.max(scores.data, 1)[1]

            n_seen += targets.size(0)
            n_right += (predicted == targets).sum().item()

    print('Accurecy:', n_right / n_seen)


def main():
    """Train the digit classifier on MNIST for one epoch and print its test accuracy."""
    # Use the first CUDA GPU when one is available, otherwise the CPU
    if torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'
    print('Device state:', device)

    # Hyperparameters
    epochs = 1
    batch_size = 64
    lr = 0.002

    # Loss, model and optimizer
    loss_function = nn.NLLLoss()
    model = Net().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Preprocessing: convert images to tensors, then normalize with mean 0.5 / std 0.5
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize((0.5,), (0.5,))
    transform = transforms.Compose([to_tensor, normalize])

    # Datasets and loaders; only the training data is shuffled
    train_set = datasets.MNIST(root='MNIST', train=True, download=True, transform=transform)
    test_set = datasets.MNIST(root='MNIST', train=False, download=True, transform=transform)
    train_loader = data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

    # Train, then evaluate
    model = train(device, model, epochs, optimizer, loss_function, train_loader)
    test(device, model, test_loader)


# Entry point: run the experiment only when this file is executed as a script
if __name__ == '__main__':
    main()

# coding: utf-8
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchvision import datasets, transforms


# Model architecture
class Net(nn.Module):
    """Fully connected classifier: 784 inputs -> 128 -> 64 -> 10 log-probabilities.

    The hidden layers use ReLU activations and the output layer applies
    ``LogSoftmax`` over dimension 1.
    """

    def __init__(self):
        super().__init__()
        # Layers are instantiated in input-to-output order so that weight
        # initialisation draws from the RNG in the same sequence every time.
        layers = [
            nn.Linear(in_features=784, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=10),
            nn.LogSoftmax(dim=1),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Return the output of the layer stack for ``input``."""
        log_probs = self.main(input)
        return log_probs


# Train
def train(device, model, epochs, optimizer, loss_function, train_loader):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Each batch is moved to ``device`` and flattened to ``(batch_size, -1)``
    before the forward pass. The loss is printed every 100 batches and on the
    final batch of each epoch.

    Returns:
        The same ``model`` object that was passed in.
    """
    for epoch in range(1, epochs+1):
        # `batch` rather than `data`, so the local does not shadow the
        # `torch.utils.data as data` module alias used elsewhere in the file.
        for times, batch in enumerate(train_loader, 1):
            inputs = batch[0].to(device)
            labels = batch[1].to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward and backward propagation
            outputs = model(inputs.view(inputs.shape[0], -1))
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            # Show progress (string delimiters must be plain ASCII quotes)
            if times % 100 == 0 or times == len(train_loader):
                print('[{}/{}, {}/{}] loss: {:.8}'.format(epoch, epochs, times, len(train_loader), loss.item()))

    return model


def test(device, model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print its accuracy.

    The model is put in evaluation mode and scored with gradients disabled.
    The fraction of correctly classified samples is printed; nothing is
    returned.
    """
    # Settings
    model.eval()
    total = 0
    correct = 0

    with torch.no_grad():
        for batch in test_loader:
            inputs = batch[0].to(device)
            labels = batch[1].to(device)

            outputs = model(inputs.view(inputs.shape[0], -1))
            # torch.max over dim 1 returns (values, indices); the indices are
            # the predicted classes
            _, predicted = torch.max(outputs.data, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Label spelling kept as-is so the output matches the logs shown in the article
    print('Accurecy:', correct / total)


def main():
    """Train the digit classifier on MNIST for one epoch and print its test accuracy."""
    # GPU device: first CUDA device when available, otherwise CPU
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print('Device state:', device)

    # Settings
    epochs = 1
    batch_size = 64
    lr = 0.002
    loss_function = nn.NLLLoss()
    model = Net().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Transform: convert to tensor, then normalize with mean 0.5 / std 0.5
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5,), (0.5,))]
    )

    # Data: only the training loader shuffles
    train_set = datasets.MNIST(root='MNIST', download=True, train=True, transform=transform)
    test_set = datasets.MNIST(root='MNIST', download=True, train=False, transform=transform)
    train_loader = data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

    # Train
    model = train(device, model, epochs, optimizer, loss_function, train_loader)

    # Test
    test(device, model, test_loader)


# Entry point: run the experiment only when this file is executed as a script
if __name__ == '__main__':
    main()

Then I trained twice quickly, and got the following different experimental results:

Device state: cuda:0
[1/1, 100/938] loss: 0.46569321
[1/1, 200/938] loss: 0.51824951
[1/1, 300/938] loss: 0.60570443
[1/1, 400/938] loss: 0.1711497
[1/1, 500/938] loss: 0.17937183
[1/1, 600/938] loss: 0.31248432
[1/1, 700/938] loss: 0.088681653
[1/1, 800/938] loss: 0.21637213
[1/1, 900/938] loss: 0.13165317
[1/1, 938/938] loss: 0.067537516
Accurecy: 0.9416

Device state: cuda:0
[1/1, 100/938] loss: 0.34765923
[1/1, 200/938] loss: 0.45036712
[1/1, 300/938] loss: 0.35380328
[1/1, 400/938] loss: 0.28400266
[1/1, 500/938] loss: 0.34874138
[1/1, 600/938] loss: 0.33628809
[1/1, 700/938] loss: 0.24677548
[1/1, 800/938] loss: 0.1359618
[1/1, 900/938] loss: 0.088348009
[1/1, 938/938] loss: 0.1509731
Accurecy: 0.9414

It can be seen from the loss values that the two training runs are not consistent: the result of the first run cannot be reproduced when the same code is run a second time.

Fixed Seed

To make the results reproducible, you need to set the following seeds and flags, preferably right after the import statements at the top of the file:

# coding: utf-8
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchvision import datasets, transforms

# Even if you don't use them, you still have to import
import random
import numpy as np


# Seed: fix every random number generator the training run may use
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
# Turn off the cuDNN auto-tuner and request deterministic cuDNN algorithms
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# coding: utf-8
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchvision import datasets, transforms

# Even if you don’t use them, you still have to import
import random
import numpy as np


# Seed: fix every random number generator the training run may use
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
# Turn off the cuDNN auto-tuner and request deterministic cuDNN algorithms
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

Note that the random module and the numpy module need to be imported and seeded even if your own code never uses them, because functions called by PyTorch may use them. If their seeds are not fixed, the model results cannot be fixed either.

Complete code:

# coding: utf-8
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchvision import datasets, transforms

# Even if you don't use them, you still have to import
import random
import numpy as np


# Seed every random number generator with the same fixed value
seed = 123
random.seed(seed)                 # Python's built-in RNG
np.random.seed(seed)              # NumPy's global RNG
torch.manual_seed(seed)           # PyTorch default generator
torch.cuda.manual_seed(seed)      # current CUDA device
torch.cuda.manual_seed_all(seed)  # all CUDA devices
# Request deterministic cuDNN algorithms and turn off the cuDNN auto-tuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# Model architecture
class Net(nn.Module):
    """Fully connected classifier: 784 inputs -> 128 -> 64 -> 10 log-probabilities.

    The hidden layers use ReLU activations and the output layer applies
    ``LogSoftmax`` over dimension 1.
    """

    def __init__(self):
        super().__init__()
        # Layers are instantiated in input-to-output order so that weight
        # initialisation draws from the RNG in the same sequence every time.
        layers = [
            nn.Linear(in_features=784, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=10),
            nn.LogSoftmax(dim=1),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Return the output of the layer stack for ``input``."""
        log_probs = self.main(input)
        return log_probs


# Train
def train(device, model, epochs, optimizer, loss_function, train_loader):
    """Run the training loop and return the model that was passed in.

    For each of ``epochs`` passes over ``train_loader``, every batch is moved
    to ``device``, flattened to ``(batch_size, -1)``, fed through ``model`` and
    used for one ``optimizer`` step. The current loss is printed every 100
    batches and on the last batch of each epoch.
    """
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(train_loader, start=1):
            images = batch[0].to(device)
            targets = batch[1].to(device)

            # Clear gradients left over from the previous step
            optimizer.zero_grad()

            # Forward pass on the flattened batch, then backpropagate and update
            flat_images = images.view(images.shape[0], -1)
            loss = loss_function(model(flat_images), targets)
            loss.backward()
            optimizer.step()

            # Progress report
            if step % 100 == 0 or step == len(train_loader):
                print(f'[{epoch}/{epochs}, {step}/{len(train_loader)}] loss: {loss.item():.8}')

    return model


def test(device, model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print the fraction classified correctly.

    The model is switched to evaluation mode and gradients are disabled while
    the batches are scored. Nothing is returned; the result is only printed.
    """
    model.eval()
    n_seen = 0
    n_right = 0

    with torch.no_grad():
        for batch in test_loader:
            images = batch[0].to(device)
            targets = batch[1].to(device)

            scores = model(images.view(images.shape[0], -1))
            # Index of the highest score along dim 1 is the predicted class
            predicted = torch.max(scores.data, 1)[1]

            n_seen += targets.size(0)
            n_right += (predicted == targets).sum().item()

    print('Accurecy:', n_right / n_seen)


def main():
    """Train the digit classifier on MNIST for one epoch and print its test accuracy."""
    # Use the first CUDA GPU when one is available, otherwise the CPU
    if torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'
    print('Device state:', device)

    # Hyperparameters
    epochs = 1
    batch_size = 64
    lr = 0.002

    # Loss, model and optimizer
    loss_function = nn.NLLLoss()
    model = Net().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Preprocessing: convert images to tensors, then normalize with mean 0.5 / std 0.5
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize((0.5,), (0.5,))
    transform = transforms.Compose([to_tensor, normalize])

    # Datasets and loaders; only the training data is shuffled
    train_set = datasets.MNIST(root='MNIST', train=True, download=True, transform=transform)
    test_set = datasets.MNIST(root='MNIST', train=False, download=True, transform=transform)
    train_loader = data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

    # Train, then evaluate
    model = train(device, model, epochs, optimizer, loss_function, train_loader)
    test(device, model, test_loader)


# Entry point: run the experiment only when this file is executed as a script
if __name__ == '__main__':
    main()

# coding: utf-8
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchvision import datasets, transforms

# Even if you don’t use them, you still have to import
import random
import numpy as np


# Seed every random number generator with the same fixed value
seed = 123
random.seed(seed)                 # Python's built-in RNG
np.random.seed(seed)              # NumPy's global RNG
torch.manual_seed(seed)           # PyTorch default generator
torch.cuda.manual_seed(seed)      # current CUDA device
torch.cuda.manual_seed_all(seed)  # all CUDA devices
# Request deterministic cuDNN algorithms and turn off the cuDNN auto-tuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# Model architecture
class Net(nn.Module):
    """Fully connected classifier: 784 inputs -> 128 -> 64 -> 10 log-probabilities.

    The hidden layers use ReLU activations and the output layer applies
    ``LogSoftmax`` over dimension 1.
    """

    def __init__(self):
        super().__init__()
        # Layers are instantiated in input-to-output order so that weight
        # initialisation draws from the RNG in the same sequence every time.
        layers = [
            nn.Linear(in_features=784, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=10),
            nn.LogSoftmax(dim=1),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Return the output of the layer stack for ``input``."""
        log_probs = self.main(input)
        return log_probs


# Train
def train(device, model, epochs, optimizer, loss_function, train_loader):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Each batch is moved to ``device`` and flattened to ``(batch_size, -1)``
    before the forward pass. The loss is printed every 100 batches and on the
    final batch of each epoch.

    Returns:
        The same ``model`` object that was passed in.
    """
    for epoch in range(1, epochs+1):
        # `batch` rather than `data`, so the local does not shadow the
        # `torch.utils.data as data` module alias used elsewhere in the file.
        for times, batch in enumerate(train_loader, 1):
            inputs = batch[0].to(device)
            labels = batch[1].to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward and backward propagation
            outputs = model(inputs.view(inputs.shape[0], -1))
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            # Show progress (string delimiters must be plain ASCII quotes)
            if times % 100 == 0 or times == len(train_loader):
                print('[{}/{}, {}/{}] loss: {:.8}'.format(epoch, epochs, times, len(train_loader), loss.item()))

    return model


def test(device, model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print its accuracy.

    The model is put in evaluation mode and scored with gradients disabled.
    The fraction of correctly classified samples is printed; nothing is
    returned.
    """
    # Settings
    model.eval()
    total = 0
    correct = 0

    with torch.no_grad():
        for batch in test_loader:
            inputs = batch[0].to(device)
            labels = batch[1].to(device)

            outputs = model(inputs.view(inputs.shape[0], -1))
            # torch.max over dim 1 returns (values, indices); the indices are
            # the predicted classes
            _, predicted = torch.max(outputs.data, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Label spelling kept as-is so the output matches the logs shown in the article
    print('Accurecy:', correct / total)


def main():
    """Train the digit classifier on MNIST for one epoch and print its test accuracy."""
    # GPU device: first CUDA device when available, otherwise CPU
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print('Device state:', device)

    # Settings
    epochs = 1
    batch_size = 64
    lr = 0.002
    loss_function = nn.NLLLoss()
    model = Net().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Transform: convert to tensor, then normalize with mean 0.5 / std 0.5
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5,), (0.5,))]
    )

    # Data: only the training loader shuffles
    train_set = datasets.MNIST(root='MNIST', download=True, train=True, transform=transform)
    test_set = datasets.MNIST(root='MNIST', download=True, train=False, transform=transform)
    train_loader = data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

    # Train
    model = train(device, model, epochs, optimizer, loss_function, train_loader)

    # Test
    test(device, model, test_loader)


# Entry point: run the experiment only when this file is executed as a script
if __name__ == '__main__':
    main()

As shown below, both runs now produce the same results.

Device state: cuda:0
[1/1, 100/938] loss: 0.26784545
[1/1, 200/938] loss: 0.22377557
[1/1, 300/938] loss: 0.19579159
[1/1, 400/938] loss: 0.29825467
[1/1, 500/938] loss: 0.30673307
[1/1, 600/938] loss: 0.20264342
[1/1, 700/938] loss: 0.12821223
[1/1, 800/938] loss: 0.21547785
[1/1, 900/938] loss: 0.18268135
[1/1, 938/938] loss: 0.054493584
Accurecy: 0.9501

Device state: cuda:0
[1/1, 100/938] loss: 0.26784545
[1/1, 200/938] loss: 0.22377557
[1/1, 300/938] loss: 0.19579159
[1/1, 400/938] loss: 0.29825467
[1/1, 500/938] loss: 0.30673307
[1/1, 600/938] loss: 0.20264342
[1/1, 700/938] loss: 0.12821223
[1/1, 800/938] loss: 0.21547785
[1/1, 900/938] loss: 0.18268135
[1/1, 938/938] loss: 0.054493584
Accurecy: 0.9501

Supplement

The official PyTorch documentation mentions that completely reproducible results are not guaranteed across different versions, different platforms, and different devices. I have personally tested this, and the results do differ on different GPU devices.

I have not compared the results of GPU and CPU training, but according to the official documentation, it seems that the same results cannot be reproduced.

[PyTorch] Set Seed To Reproduce Model Training Results

Sample Code For Training Handwritten Digit Recognition Model

Fixed Seed

Supplement

References

References

Related

2 thoughts on “[PyTorch] Set Seed To Reproduce Model Training Results”

Leave a ReplyCancel reply

[PyTorch] Set Seed To Reproduce Model Training Results

Sample Code For Training Handwritten Digit Recognition Model

Fixed Seed

Supplement

References

References

Share this:

Related

2 thoughts on “[PyTorch] Set Seed To Reproduce Model Training Results”

Leave a ReplyCancel reply