
Error in custom train loop and torch backend with multi-GPU #20415

Open
caiuspetronius opened this issue Oct 26, 2024 · 1 comment
@caiuspetronius

I'm training a basic CIFAR10 classifier (two Dense layers) on multiple GPUs with the torch backend (see the code below). The code works fine when the net is written in PyTorch. When it is written in Keras, it fails at line 95 with the following error:

RuntimeError: Exception encountered when calling Dense.call().

Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)

Arguments received by Dense.call():
• inputs=torch.Tensor(shape=torch.Size([192, 3, 32, 32]), dtype=float32)
• training=None

The full code is below:

import os

os.environ[ "KERAS_BACKEND" ] = "torch"
os.environ[ "PYTORCH_CUDA_ALLOC_CONF" ] = "expandable_segments:True"

import time
import datetime

import torch
import torch.nn as nn
import torch.optim as optim

import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

from model import pyramidnet

import keras

num_epochs = 100
batch_size = 768
num_workers = torch.cuda.device_count()
print( 'Running on {} GPUs'.format( num_workers ) )
lr = 0.01

def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print( '==> Preparing data..' )
    transforms_train = transforms.Compose( [
        transforms.RandomCrop( 32, padding = 4 ),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize( ( 0.4914, 0.4822, 0.4465 ), ( 0.2023, 0.1994, 0.2010 ) ) ] )

    dataset_train = CIFAR10( root = '../data', train = True, download = True,
                             transform = transforms_train )

    train_loader = DataLoader( dataset_train, batch_size = batch_size,
                               shuffle = True, num_workers = num_workers )

    print( '==> Making model..' )
    # net = pyramidnet()

    # # Define Pytorch net
    # class TwoLayerPerceptron( nn.Module ) :
    #     def __init__( self ):
    #         super( TwoLayerPerceptron, self ).__init__()
    #         self.fc1 = nn.Linear( 32 * 32 * 3, 512 )
    #         self.fc2 = nn.Linear( 512, 10 )

    #     def forward( self, x ):
    #         x = x.view( x.size( 0 ), -1 )
    #         x = self.fc1( x )
    #         x = nn.functional.relu( x )
    #         x = self.fc2( x )
    #         x = nn.functional.softmax( x )
    #         return x
    # # Instantiate the model
    # net = TwoLayerPerceptron()

    # Define Keras net
    net = keras.Sequential( [
        keras.layers.Input( shape = ( 3, 32, 32 ) ),
        keras.layers.Dense( 512, activation = 'relu' ),
        keras.layers.Dense( 10, activation = 'softmax' ) ] )

    net = nn.DataParallel( net )
    net = net.to( device )
    num_params = sum( p.numel() for p in net.parameters() if p.requires_grad )
    print( 'The number of parameters of model is', num_params )

    # criterion = nn.CrossEntropyLoss()
    # optimizer = optim.Adam( net.parameters(), lr = lr )
    criterion = keras.losses.SparseCategoricalCrossentropy()
    optimizer = keras.optimizers.Adam( learning_rate = lr )

    train( net, criterion, optimizer, train_loader, device )

def train( net, criterion, optimizer, train_loader, device ):
    net.train()

    train_start = time.time()
    for epoch in range( num_epochs ) :
        train_loss = 0
        correct = 0
        total = 0

        for batch_idx, ( inputs, targets ) in enumerate( train_loader ) :
            start = time.time()

            inputs = inputs.to( device )
            targets = targets.to( device )
            outputs = net( inputs )
            loss = criterion( outputs, targets )

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max( 1 )
            total += targets.size( 0 )
            correct += predicted.eq( targets ).sum().item()

            acc = 100 * correct / total

            batch_time = time.time() - start

            if batch_idx % 20 == 0:
                print( 'Epoch: [{}/{}]\t| Batch: [{}/{}]\t| loss: {:.3f}\t| acc: {:.3f}\t| batch time: {:.3f}s '.format(
                    epoch, num_epochs, batch_idx, len( train_loader ), train_loss / ( batch_idx + 1 ), acc, batch_time ) )

    elapse_time = time.time() - train_start
    elapse_time = datetime.timedelta( seconds = elapse_time )
    print( "Training time {}".format( elapse_time ) )

if __name__ == '__main__':
    main()

@mehtamansi29 mehtamansi29 added type:Bug keras-team-review-pending Pending review by a Keras team member. labels Nov 7, 2024
@divyashreepathihalli
Collaborator

Instead of using nn.DataParallel, consider using torch.nn.parallel.DistributedDataParallel, which provides more robust multi-GPU training support.
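
For reference, here is a minimal, untested sketch of that suggestion applied to the model from this issue. It assumes the Keras model behaves as an ordinary torch.nn.Module under the torch backend (so that .to() and DDP wrapping move and replicate its variables correctly, which is part of what this issue is probing), uses the plain torch loss/optimizer from the commented-out PyTorch path, and adds a Flatten layer to mirror the x.view( x.size( 0 ), -1 ) in the torch forward(). The worker name and port are illustrative.

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10

os.environ[ "KERAS_BACKEND" ] = "torch"
import keras

def run_worker( rank, world_size ):
    # One process per GPU; each process owns exactly one device.
    os.environ.setdefault( "MASTER_ADDR", "localhost" )
    os.environ.setdefault( "MASTER_PORT", "29500" )
    dist.init_process_group( "nccl", rank = rank, world_size = world_size )
    torch.cuda.set_device( rank )

    # Build the Keras model on this rank's GPU and wrap it in DDP.
    # Flatten is added to match the flattening done in the torch version.
    net = keras.Sequential( [
        keras.layers.Input( shape = ( 3, 32, 32 ) ),
        keras.layers.Flatten(),
        keras.layers.Dense( 512, activation = 'relu' ),
        keras.layers.Dense( 10, activation = 'softmax' ) ] )
    net = net.to( rank )
    net = DDP( net, device_ids = [ rank ] )

    # Same normalization as above; download the dataset once beforehand so
    # that the ranks do not all try to download it concurrently.
    transform = transforms.Compose( [
        transforms.ToTensor(),
        transforms.Normalize( ( 0.4914, 0.4822, 0.4465 ), ( 0.2023, 0.1994, 0.2010 ) ) ] )
    dataset_train = CIFAR10( root = '../data', train = True, download = False,
                             transform = transform )

    # DistributedSampler gives each rank a disjoint shard of the data.
    sampler = DistributedSampler( dataset_train, num_replicas = world_size, rank = rank )
    train_loader = DataLoader( dataset_train, batch_size = 192, sampler = sampler )

    # Plain torch loss/optimizer, as in the commented-out PyTorch path above.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam( net.parameters(), lr = 0.01 )

    for epoch in range( 1 ):
        sampler.set_epoch( epoch )
        for inputs, targets in train_loader:
            inputs = inputs.to( rank )
            targets = targets.to( rank )
            optimizer.zero_grad()
            loss = criterion( net( inputs ), targets )
            loss.backward()
            optimizer.step()

    dist.destroy_process_group()

if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn( run_worker, args = ( world_size, ), nprocs = world_size )

Launching with torchrun and reading the rank from the environment is the more common pattern in practice; the mp.spawn version above just keeps everything in a single script.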
