PyTorch runs slow when data is pre-transferred to the GPU

I have a model written in PyTorch. Since my dataset is small, I can load all of the data onto the GPU up front. However, I found that the forward pass becomes slow when I do so. The following is a runnable example. Specifically, I have the model:

import numpy as np
from time import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

def knn(x, k):
    inner = -2*torch.matmul(x.transpose(2, 1), x)
    xx = torch.sum(x**2, dim=1, keepdim=True)
    pairwise_distance = -xx - inner - xx.transpose(2, 1)
    idx = pairwise_distance.topk(k=k, dim=-1)[1]   # (batch_size, num_points, k)
    return idx

def get_graph_feature(x, k=20, idx=None):
    batch_size = x.size(0)
    num_points = x.size(2)
    x = x.view(batch_size, -1, num_points)
    if idx is None:
        idx = knn(x, k=k)   # (batch_size, num_points, k)
    idx_base = torch.arange(0, batch_size, device=x.device).view(-1, 1, 1)*num_points
    idx = idx + idx_base
    idx = idx.view(-1)
    _, num_dims, _ = x.size()
    x = x.transpose(2, 1).contiguous()   # (batch_size, num_dims, num_points) -> (batch_size, num_points, num_dims)
    feature = x.view(batch_size*num_points, -1)[idx, :]   # flatten to (batch_size*num_points, num_dims), then gather each point's k neighbours
    feature = feature.view(batch_size, num_points, k, num_dims) 
    x = x.view(batch_size, num_points, 1, num_dims).repeat(1, 1, k, 1)
    feature = torch.cat((feature-x, x), dim=3).permute(0, 3, 1, 2).contiguous()
    return feature

class DGCNN(nn.Module):
    def __init__(self, k=25, output_channels=10):
        super(DGCNN, self).__init__()
        self.k = k
        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        self.bn4 = nn.BatchNorm2d(256)
        self.bn5 = nn.BatchNorm1d(1024)
        self.conv1 = nn.Sequential(nn.Conv2d(6, 64, kernel_size=1, bias=False),
                                   self.bn1,
                                   nn.LeakyReLU(negative_slope=0.2))
        self.conv2 = nn.Sequential(nn.Conv2d(64*2, 64, kernel_size=1, bias=False),
                                   self.bn2,
                                   nn.LeakyReLU(negative_slope=0.2))
        self.conv3 = nn.Sequential(nn.Conv2d(64*2, 128, kernel_size=1, bias=False),
                                   self.bn3,
                                   nn.LeakyReLU(negative_slope=0.2))
        self.conv4 = nn.Sequential(nn.Conv2d(128*2, 256, kernel_size=1, bias=False),
                                   self.bn4,
                                   nn.LeakyReLU(negative_slope=0.2))
        self.conv5 = nn.Sequential(nn.Conv1d(512, 1024, kernel_size=1, bias=False),
                                   self.bn5,
                                   nn.LeakyReLU(negative_slope=0.2))
        self.linear1 = nn.Linear(1024*2, 512, bias=False)
        self.bn6 = nn.BatchNorm1d(512)
        self.dp1 = nn.Dropout()
        self.linear2 = nn.Linear(512, 256)
        self.bn7 = nn.BatchNorm1d(256)
        self.dp2 = nn.Dropout()
        self.linear3 = nn.Linear(256, output_channels)

    def forward(self, x):
        x = x.transpose(2, 1)
        batch_size = x.size(0)
        x = get_graph_feature(x, k=self.k)
        x = self.conv1(x)
        x1 = x.max(dim=-1, keepdim=False)[0]
        x = get_graph_feature(x1, k=self.k)
        x = self.conv2(x)
        x2 = x.max(dim=-1, keepdim=False)[0]
        x = get_graph_feature(x2, k=self.k)
        x = self.conv3(x)
        x3 = x.max(dim=-1, keepdim=False)[0]
        x = get_graph_feature(x3, k=self.k)
        x = self.conv4(x)
        x4 = x.max(dim=-1, keepdim=False)[0]
        x = torch.cat((x1, x2, x3, x4), dim=1)
        x = self.conv5(x)
        x1 = F.adaptive_max_pool1d(x, 1).view(batch_size, -1)
        x2 = F.adaptive_avg_pool1d(x, 1).view(batch_size, -1)
        x = torch.cat((x1, x2), 1)
        x = F.leaky_relu(self.bn6(self.linear1(x)), negative_slope=0.2)
        x = self.dp1(x)
        x = F.leaky_relu(self.bn7(self.linear2(x)), negative_slope=0.2)
        x = self.dp2(x)
        x = self.linear3(x)
        return x
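
As a quick sanity check (the sizes here are illustrative), the model maps a (batch_size, num_points, 3) point cloud to (batch_size, output_channels) logits:

# Illustrative smoke test: 4 point clouds with 1024 points each, run on CPU.
model = DGCNN(k=20)
model.eval()
with torch.no_grad():
    out = model(torch.rand(4, 1024, 3))
print(out.shape)   # torch.Size([4, 10])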

Here is what the dataloader and test function look like:

class my_loader(Dataset):
    def __init__(self, device):
        self.data = torch.rand(256, 2048, 3).to(device).float()
        self.labels = torch.rand(256).to(device).long()

    def __getitem__(self, ind):
        return self.data[ind], self.labels[ind]

    def __len__(self):
        return len(self.data)

def test():
    device = torch.device('cuda:2')
    test_set = my_loader(device)
    test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
    model = DGCNN().to(device)
    model.eval()
    
    #---------- this one is 0.12s --------------#
    for inputs, labels in test_loader:
        tic = time()
        pred = model(inputs)
        print('time1 {}'.format(time() - tic))
    print('------------------')
   
    #---------- this one is 0.004s --------------#
    for inputs, labels in test_loader:
        inputs = inputs.detach().cpu().to(device)
        tic = time()
        pred = model(inputs)
        print('time2 {}'.format(time() - tic))
    print('------------------')

    #---------- this one is 0.12s --------------#
    for inputs, labels in test_loader:
        tic = time()
        inputs = inputs.detach().cpu().to(device)
        pred = model(inputs)
        print('time3 {}'.format(time() - tic))
    print('------------------')
  

Basically, if there is no explicit GPU-to-CPU transfer either before or after the forward pass, the forward pass costs noticeably more time. It looks as though the forward pass were implicitly performing a GPU-to-CPU transfer.
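
One thing worth noting when reading these numbers: CUDA kernels are launched asynchronously, so wrapping model(inputs) in time() calls may capture little more than the kernel launches unless something forces a synchronization, and a .cpu() copy is one such synchronization point. A minimal synchronized timing loop (a sketch, assuming the model, device, and test_loader defined above):

    #---------- synchronized timing sketch --------------#
    torch.cuda.synchronize(device)
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs)
    torch.cuda.synchronize(device)   # wait for all queued GPU work to finish
    print('synchronized time {}'.format(time() - tic))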

Question from: https://stackoverflow.com/questions/65642697/pytorch-runs-slow-when-data-are-pre-transported-to-gpu


1 Answer


I played around with the code a little, and I think the problem is that you are measuring the times for both cases in the same run. Here is my boiled-down version of your code, since your full model exhausted my GPU memory:

class DGCNN(nn.Module):
    def __init__(self, num_layers=1200):
        super(DGCNN, self).__init__()
        # A deep stack of identical linear layers, just to make the
        # forward pass expensive enough to time.
        self.layers = nn.ModuleList([nn.Linear(256, 256) for _ in range(num_layers)])

    def forward(self, x):
        x = x.view(-1, 256)
        for layer in self.layers:
            x = layer(x)
        return x

class my_loader(Dataset):
    def __init__(self, device):
        self.data = torch.rand(256, 2048, 3).to(device).float()
        self.labels = torch.rand(256).to(device).long()

    def __getitem__(self, ind):
        return self.data[ind], self.labels[ind]

    def __len__(self):
        return len(self.data)

Now, here are different versions of test().

Version #1:

def test():
    device = torch.device('cuda:0')
    test_set = my_loader(device)
    test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
    model = DGCNN().to(device)
    model.eval()

    #---------- this one is 0.12s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs)
    tac = time()    
    print(f'# First case -> Full forward pass: {tac - tic:.6f}')

    #---------- this one is 0.004s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs.detach().cpu().to(device))
    tac = time()
    print(f'# Second case -> Full forward pass: {tac - tic:.6f}')

>>> # First case -> Full forward pass: 3.105103, # Second case -> Full forward pass: 2.831652

Now I switched the order of timing calculations for the cases. Version #2:

def test():
    device = torch.device('cuda:0')
    test_set = my_loader(device)
    test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
    model = DGCNN().to(device)
    model.eval()

    #---------- this one is 0.004s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs.detach().cpu().to(device))
    tac = time()
    print(f'# Second case -> Full forward pass: {tac - tic:.6f}')

    #---------- this one is 0.12s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs)
    tac = time()
    print(f'# First case -> Full forward pass: {tac - tic:.6f}')

>>> # Second case -> Full forward pass: 3.288522, # First case -> Full forward pass: 2.583231

Apparently, whichever case is timed first ends up slower. So I measured the timings separately, in different runs with fresh kernels. Version #3:

def test():    
    device = torch.device('cuda:0')
    test_set = my_loader(device)
    test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
    model = DGCNN().to(device)
    model.eval()

    #---------- this one is 0.12s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs)
    tac = time()
    print(f'# First case -> Full forward pass: {tac - tic:.6f}')

>>> # First case -> Full forward pass: 3.091592

Version #4:

def test():
    device = torch.device('cuda:0')
    test_set = my_loader(device)
    test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
    model = DGCNN().to(device)
    model.eval()

    #---------- this one is 0.004s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs.detach().cpu().to(device))
    tac = time()
    print(f'# Second case -> Full forward pass: {tac - tic:.6f}')

>>> # Second case -> Full forward pass: 3.190248

So, testing one case at a time, pred = model(inputs) runs slightly faster than pred = model(inputs.detach().cpu().to(device)), which is the expected result: the second version pays for two extra device transfers per batch.
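
One way to make such comparisons order-independent is to run a few untimed warm-up batches before the measured loop, so the one-off costs (CUDA context setup, cuDNN autotuning, allocator growth) fall outside the timing. A minimal sketch with a hypothetical timed_pass helper; the warmup count is arbitrary:

def timed_pass(model, loader, device, warmup=3):
    # Run a few batches untimed so one-off startup costs are excluded,
    # then time a full pass, synchronizing before reading the clock.
    with torch.no_grad():
        for i, (inputs, _) in enumerate(loader):
            model(inputs)
            if i + 1 >= warmup:
                break
        torch.cuda.synchronize(device)
        tic = time()
        for inputs, _ in loader:
            model(inputs)
        torch.cuda.synchronize(device)
        return time() - tic

Calling timed_pass once per variant, in any order, should then give numbers that no longer depend on which case runs first.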

