Build PyTorch Extensions with CUDA and CFFI

Python is one of the most popular programming languages for deep learning today. However, as an interpreted language, it is often considered too slow for high-performance computing. Therefore, I incorporated CUDA into Python and built my own extension to accelerate the computation.

In this tutorial, I give an example of a CUDA extension for data augmentation in deep learning.

Create Build Scripts

First of all, we need to create a script named build.py for building the FFI extension.

import os
import torch
import torch.utils.ffi

this_folder = os.path.dirname(os.path.abspath(__file__)) + '/'

Headers = []
Sources = []
Defines = []
Objects = []

if torch.cuda.is_available():
    Headers += ['src/augmentation_cuda.h']
    Sources += ['src/augmentation_cuda.c']
    Defines += [('WITH_CUDA', None)]
    Objects += ['build/augmentation_cuda_kernel.o']

ffi = torch.utils.ffi.create_extension(
    name='_ext.augmentation',
    headers=Headers,
    sources=Sources,
    verbose=False,
    with_cuda=True,
    package=False,
    relative_to=this_folder,
    define_macros=Defines,
    extra_objects=[os.path.join(this_folder, Object) for Object in Objects]
)

if __name__ == '__main__':
    ffi.build()
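
Note that torch.utils.ffi was deprecated and then removed around PyTorch 1.0. On newer versions, the equivalent build is expressed with torch.utils.cpp_extension. A minimal sketch with the same file and module names as above (untested here; the THC API used in the sources below was itself removed in recent PyTorch, so the C sources would also need porting to ATen):

# setup.py -- a sketch for PyTorch >= 1.0 using torch.utils.cpp_extension
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='augmentation',
    ext_modules=[
        # CUDAExtension compiles the .c/.cu sources and links against CUDA,
        # replacing both the nvcc step and build.py shown in this tutorial
        CUDAExtension('augmentation', [
            'src/augmentation_cuda.c',
            'src/augmentation_cuda_kernel.cu',
        ]),
    ],
    cmdclass={'build_ext': BuildExtension},
)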

Create CUDA Interface

Then, we need to create a C interface for CUDA that can be invoked from Python. In this tutorial, we name this file augmentation_cuda.c.

#include <THC/THC.h>

#include "augmentation_cuda_kernel.h"

// symbol to be automatically resolved by PyTorch libs
extern THCState *state;

int color_transform_cuda(THCudaTensor* input, THCudaTensor* output, 
        float color_r, float color_g, float color_b, float brightness, 
        float contrast, float gamma) {
    int nChannels = input->size[0];
    int height = input->size[1];
    int width = input->size[2];

    THCudaTensor_resize3d(state, output, nChannels, height, width);
    THCudaTensor_fill(state, output, 0);

    int success = 0;
    success = color_transform_cuda_kernel(
        THCudaTensor_data(state, input),
        THCudaTensor_data(state, output),
        nChannels, height, width, 
        color_r, color_g, color_b, brightness, contrast, gamma);
    
    //Check for errors
    if ( !success ) {
        THError("aborting");
    }
    return 1;
}

Also, we need to create a header file augmentation_cuda.h and declare the corresponding prototype of the function color_transform_cuda(...).

int color_transform_cuda(THCudaTensor* input, THCudaTensor* output, 
    float color_r, float color_g, float color_b, float brightness, 
    float contrast, float gamma);

Create CUDA Core

We need to create a file named augmentation_cuda_kernel.cu for the core CUDA code. The following code passes the augmentation parameters from the host (C) to the device (CUDA).

#define N_CHANNELS 3    // number of color channels; the struct below depends on it

struct ChromaticParams {
    float color[N_CHANNELS];
    float brightness;
    float contrast;
    float gamma;
};

#define CUDA_NUM_THREADS 512

inline int GET_BLOCKS(const int N) {
    return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}

int color_transform_cuda_kernel(float* input, float* output, int n_channels, int height, int width, 
        float color_r, float color_g, float color_b, float brightness, float contrast, float gamma) {
    struct ChromaticParams cp;
    cp.color[0]     = color_r;
    cp.color[1]     = color_g;
    cp.color[2]     = color_b;
    cp.brightness   = brightness;
    cp.contrast     = contrast;
    cp.gamma        = gamma;

    // Device copy of the parameter struct
    ChromaticParams* p_cp;

    // Allocate space on the GPU
    cudaMalloc((void **)&p_cp, sizeof(ChromaticParams));
    // Copy the parameters to the GPU
    cudaMemcpy(p_cp, &cp, sizeof(ChromaticParams), cudaMemcpyHostToDevice);

    // Color Augmentation
    int n_threads = height * width;
    color_augmentation<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS>>>(
        n_threads, n_channels, height, width, input, output, p_cp);

    // Free the allocated GPU memory
    cudaFree(p_cp);

    // Check for errors
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Error in color_transform_cuda_kernel: %s\n", cudaGetErrorString(err));
        return 0;
    }
    return 1;
}
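
As a quick sanity check on the launch configuration: GET_BLOCKS() is a ceiling division, so the grid always contains at least n_threads threads, and the grid-stride loop in the kernel (shown in the next listing) handles any excess. A tiny Python sketch of the same arithmetic for a 512x512 image:

# Python mirror of GET_BLOCKS (ceiling division)
CUDA_NUM_THREADS = 512
n_threads = 512 * 512                # one thread per pixel
n_blocks = (n_threads + CUDA_NUM_THREADS - 1) // CUDA_NUM_THREADS
print(n_blocks)                      # 512 blocks of 512 threads each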

Then, we need to implement the function color_augmentation(), which is executed on the GPU. Note that in the assembled .cu file, the following macros and the kernel must appear above color_transform_cuda_kernel(), since the launcher references the kernel.

#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)
#define MAX_MULTIPLIER 1

inline __device__ __host__ float clamp(float f, float a, float b) {
    return fmaxf(a, fminf(f, b));
}

__global__ void color_augmentation(const int nthreads, const int channels, const int height, 
        const int width, float* src_data, float* dest_data, const ChromaticParams* cp) {
    CUDA_KERNEL_LOOP(index, nthreads) {
        int x = index % width;                  // w-pos
        int y = (index / width) % height;       // h-pos
        int n = index / width / height;         // num

        int data_index[N_CHANNELS];
        float rgb[N_CHANNELS];
        float mean_in = 0;
        float mean_out = 0;

        // do color change
        for ( int c = 0; c < channels; ++c ) {
            data_index[c] = width * (height * (channels * n + c) + y) + x;
            rgb[c] = src_data[data_index[c]];
            mean_in += rgb[c];
            rgb[c] *= cp->color[c];
            mean_out += rgb[c];
        }

        float brightness_coeff = mean_in / (mean_out + 0.01f);
        for ( int c = 0; c < channels; ++c ) {
            // compensate brightness
            rgb[c] = clamp(rgb[c] * brightness_coeff, 0.f, 1.f);

            // do gamma change
            rgb[c] = powf(rgb[c], cp->gamma);

            // do brightness change
            rgb[c] = rgb[c] + cp->brightness;

            // do contrast change
            rgb[c] = 0.5f + (rgb[c] - 0.5f) * cp->contrast;

            // write sample to destination
            dest_data[data_index[c]] = clamp(rgb[c], 0.f, MAX_MULTIPLIER);
        }
    }
}
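
To make the per-pixel math easier to verify, here is a plain-PyTorch CPU reference of the same transform (my own sketch for testing the kernel's output, not part of the extension; the function name is made up):

import torch

def color_transform_reference(img, color, brightness, contrast, gamma):
    # img: (3, H, W) float tensor with values in [0, 1]
    # color: length-3 tensor of per-channel multipliers
    mean_in = img.sum(dim=0)                    # per-pixel sum over channels
    out = img * color.view(3, 1, 1)             # do color change
    mean_out = out.sum(dim=0)
    coeff = mean_in / (mean_out + 0.01)         # compensate brightness
    out = (out * coeff).clamp(0.0, 1.0)
    out = out.pow(gamma)                        # do gamma change
    out = out + brightness                      # do brightness change
    out = 0.5 + (out - 0.5) * contrast          # do contrast change
    return out.clamp(0.0, 1.0)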

Similar to augmentation_cuda.h, we need to declare a prototype of the function color_transform_cuda_kernel() in augmentation_cuda_kernel.h.

int color_transform_cuda_kernel(float* input, float* output, int n_channels, int height, int width, 
    float color_r, float color_g, float color_b, float brightness, float contrast, float gamma);

Build & Run

And now, we can build this CUDA extension with the following script:

#!/usr/bin/env bash
# PyTorch install dir (not used below, but handy if extra -I/-L flags are needed)
TORCH=$(python3 -c "import os; import torch; print(os.path.dirname(torch.__file__))")

cd src

# Clean up artifacts from previous builds
mkdir -p ../build
rm -f ../build/augmentation_cuda_kernel.o
rm -rf ../_ext

# sm_61 targets Pascal GPUs; adjust -arch to match your hardware
nvcc -c -o ../build/augmentation_cuda_kernel.o augmentation_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_61

cd ../
python3 build.py
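
After a successful build, the project tree looks roughly like this (the shell script above is saved as build.sh here; the contents of _ext/ are generated by build.py, and exact file names may vary):

.
├── build.py
├── build.sh
├── __init__.py
├── build/
│   └── augmentation_cuda_kernel.o
├── src/
│   ├── augmentation_cuda.c
│   ├── augmentation_cuda.h
│   ├── augmentation_cuda_kernel.cu
│   └── augmentation_cuda_kernel.h
└── _ext/
    └── augmentation/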

To make the extension accessible from Python, we need to create an __init__.py in the package directory:

import torch
from torch.autograd import Function

import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), '_ext'))
import augmentation

class Augmentation(Function):
    def color_transform(self, input, rnd):
        output = input.new()
        augmentation.color_transform_cuda(input, output, rnd['color_r'], rnd['color_g'], \
            rnd['color_b'], rnd['brightness'], rnd['contrast'], rnd['gamma'])

        return output

And now, we can import the Augmentation class from the package directory (assumed here to be named augmentations; use your own folder name):

from augmentations import Augmentation
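
A quick usage sketch (the tensor shape and parameter values below are made up for illustration):

import torch
from augmentations import Augmentation

img = torch.rand(3, 256, 256).cuda()           # a random CHW image on the GPU
rnd = {'color_r': 1.1, 'color_g': 0.9, 'color_b': 1.0,
       'brightness': 0.05, 'contrast': 1.2, 'gamma': 0.8}
out = Augmentation().color_transform(img, rnd)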
