
Error in custom train loop and torch backend with multi-GPU #20415

Open
caiuspetronius opened this issue Oct 26, 2024 · 1 comment
@caiuspetronius

I'm training a basic CIFAR10 classifier (two Dense layers) on multiple GPUs with the torch backend (see the code below). The code works fine when the net is written in PyTorch. When it is written in Keras, it fails at line 95 with the following error:

RuntimeError: Exception encountered when calling Dense.call().

Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)

Arguments received by Dense.call():
• inputs=torch.Tensor(shape=torch.Size([192, 3, 32, 32]), dtype=float32)
• training=None

The full code is below:

import os

os.environ[ "KERAS_BACKEND" ] = "torch"
os.environ[ "PYTORCH_CUDA_ALLOC_CONF" ] = "expandable_segments:True"

import time
import datetime

import torch
import torch.nn as nn
import torch.optim as optim

import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

from model import pyramidnet

import keras

num_epochs = 100
batch_size = 768
num_workers = torch.cuda.device_count()
print( 'Running on {} GPUs'.format( num_workers ) )
lr = 0.01

def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print( '==> Preparing data..' )
    transforms_train = transforms.Compose( [
        transforms.RandomCrop( 32, padding = 4 ),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize( ( 0.4914, 0.4822, 0.4465 ), ( 0.2023, 0.1994, 0.2010 ) ) ] )

    dataset_train = CIFAR10( root = '../data', train = True, download = True,
                             transform = transforms_train )

    train_loader = DataLoader( dataset_train, batch_size = batch_size,
                               shuffle = True, num_workers = num_workers )

    print( '==> Making model..' )
    # net = pyramidnet()

    # # Define Pytorch net
    # class TwoLayerPerceptron( nn.Module ) :
    #     def __init__( self ):
    #         super( TwoLayerPerceptron, self ).__init__()
    #         self.fc1 = nn.Linear( 32 * 32 * 3, 512 )
    #         self.fc2 = nn.Linear( 512, 10 )

    #     def forward( self, x ):
    #         x = x.view( x.size( 0 ), -1 )
    #         x = self.fc1( x )
    #         x = nn.functional.relu( x )
    #         x = self.fc2( x )
    #         x = nn.functional.softmax( x )
    #         return x
    # # Instantiate the model
    # net = TwoLayerPerceptron()

    # Define Keras net
    net = keras.Sequential( [
        keras.layers.Input( shape = ( 3, 32, 32 ) ),
        keras.layers.Dense( 512, activation = 'relu' ),
        keras.layers.Dense( 10, activation = 'softmax' ) ] )

    net = nn.DataParallel( net )
    net = net.to( device )
    num_params = sum( p.numel() for p in net.parameters() if p.requires_grad )
    print( 'The number of parameters of model is', num_params )

    # criterion = nn.CrossEntropyLoss()
    # optimizer = optim.Adam( net.parameters(), lr = lr )
    criterion = keras.losses.SparseCategoricalCrossentropy()
    optimizer = keras.optimizers.Adam( learning_rate = lr )

    train( net, criterion, optimizer, train_loader, device )

def train( net, criterion, optimizer, train_loader, device ):
    net.train()

    train_start = time.time()
    for epoch in range( num_epochs ) :
        train_loss = 0
        correct = 0
        total = 0

        for batch_idx, ( inputs, targets ) in enumerate( train_loader ) :
            start = time.time()

            inputs = inputs.to( device )
            targets = targets.to( device )
            outputs = net( inputs )
            loss = criterion( outputs, targets )

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max( 1 )
            total += targets.size( 0 )
            correct += predicted.eq( targets ).sum().item()

            acc = 100 * correct / total

            batch_time = time.time() - start

            if batch_idx % 20 == 0:
                print( 'Epoch: [{}/{}]\t| Batch: [{}/{}]\t| loss: {:.3f}\t| acc: {:.3f}\t| batch time: {:.3f}s '.format(
                    epoch, num_epochs, batch_idx, len( train_loader ), train_loss / ( batch_idx + 1 ), acc, batch_time ) )

    elapse_time = time.time() - train_start
    elapse_time = datetime.timedelta( seconds = elapse_time )
    print( "Training time {}".format( elapse_time ) )

if __name__ == '__main__':
    main()

@mehtamansi29 mehtamansi29 added type:Bug keras-team-review-pending Pending review by a Keras team member. labels Nov 7, 2024
@divyashreepathihalli
Collaborator

Instead of using nn.DataParallel, consider using torch.nn.parallel.DistributedDataParallel, which provides more robust multi-GPU training support.
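
For reference, here is a minimal, untested sketch of that suggestion applied to the model from this issue. It assumes the Keras model behaves as an ordinary torch.nn.Module under the torch backend (so that .to() and DDP wrapping move and replicate its variables correctly, which is part of what this issue is probing), uses the plain torch loss/optimizer from the commented-out PyTorch path, and adds a Flatten layer to mirror the x.view( x.size( 0 ), -1 ) in the torch forward(). The worker name and port are illustrative.

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10

os.environ[ "KERAS_BACKEND" ] = "torch"
import keras

def run_worker( rank, world_size ):
    # One process per GPU; each process owns exactly one device.
    os.environ.setdefault( "MASTER_ADDR", "localhost" )
    os.environ.setdefault( "MASTER_PORT", "29500" )
    dist.init_process_group( "nccl", rank = rank, world_size = world_size )
    torch.cuda.set_device( rank )

    # Build the Keras model on this rank's GPU and wrap it in DDP.
    # Flatten is added to match the flattening done in the torch version.
    net = keras.Sequential( [
        keras.layers.Input( shape = ( 3, 32, 32 ) ),
        keras.layers.Flatten(),
        keras.layers.Dense( 512, activation = 'relu' ),
        keras.layers.Dense( 10, activation = 'softmax' ) ] )
    net = net.to( rank )
    net = DDP( net, device_ids = [ rank ] )

    # Same normalization as above; download the dataset once beforehand so
    # that the ranks do not all try to download it concurrently.
    transform = transforms.Compose( [
        transforms.ToTensor(),
        transforms.Normalize( ( 0.4914, 0.4822, 0.4465 ), ( 0.2023, 0.1994, 0.2010 ) ) ] )
    dataset_train = CIFAR10( root = '../data', train = True, download = False,
                             transform = transform )

    # DistributedSampler gives each rank a disjoint shard of the data.
    sampler = DistributedSampler( dataset_train, num_replicas = world_size, rank = rank )
    train_loader = DataLoader( dataset_train, batch_size = 192, sampler = sampler )

    # Plain torch loss/optimizer, as in the commented-out PyTorch path above.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam( net.parameters(), lr = 0.01 )

    for epoch in range( 1 ):
        sampler.set_epoch( epoch )
        for inputs, targets in train_loader:
            inputs = inputs.to( rank )
            targets = targets.to( rank )
            optimizer.zero_grad()
            loss = criterion( net( inputs ), targets )
            loss.backward()
            optimizer.step()

    dist.destroy_process_group()

if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn( run_worker, args = ( world_size, ), nprocs = world_size )

Launching with torchrun and reading the rank from the environment is the more common pattern in practice; the mp.spawn version above just keeps everything in a single script.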
