Build PyTorch Extensions with CUDA and CFFI

Deprecation warning: PyTorch 1.0+ uses ATen as its tensor library, so this article is no longer applicable to PyTorch 1.0+.

Python is one of the most popular programming languages for deep learning applications today. However, as an interpreted language, it is often considered too slow for high-performance computing. Therefore, I incorporated CUDA into Python and built my own extension to accelerate the computation.

In this tutorial, I give an example of a CUDA extension for data augmentation in deep learning.
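The project layout assumed throughout this tutorial (inferred from the paths used below) is:

  • build.py — the FFI build script
  • src/augmentation_cuda.c / src/augmentation_cuda.h — the C interface invoked from Python
  • src/augmentation_cuda_kernel.cu / src/augmentation_cuda_kernel.h — the CUDA core
  • build/ — holds the object file compiled by nvcc
  • _ext/ — holds the generated Python extension module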

Create Build Script

First of all, we need to create a script named build.py for building the FFI extension.

import os
import torch
import torch.utils.ffi
 
this_folder = os.path.dirname(os.path.abspath(__file__)) + '/'
 
Headers = []
Sources = []
Defines = []
Objects = []
 
if torch.cuda.is_available():
    Headers += ['src/augmentation_cuda.h']
    Sources += ['src/augmentation_cuda.c']
    Defines += [('WITH_CUDA', None)]
    # The CUDA kernel is compiled separately with nvcc and linked in as an object file
    Objects += ['build/augmentation_cuda_kernel.o']
 
ffi = torch.utils.ffi.create_extension(
    name='_ext.augmentation',
    headers=Headers,
    sources=Sources,
    verbose=False,
    with_cuda=True,
    package=False,
    relative_to=this_folder,
    define_macros=Defines,
    extra_objects=[os.path.join(this_folder, Object) for Object in Objects]
)
 
if __name__ == '__main__':
    ffi.build()
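Running python build.py compiles the C sources and links the precompiled CUDA object file into a Python extension module, which should appear under _ext/augmentation (matching the name argument above). Note that build/augmentation_cuda_kernel.o must exist first; we compile it with nvcc in the build script shown later.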

Create CUDA Interface

Then, we need to create an interface for CUDA that can be invoked from Python. In this tutorial, we name this file augmentation_cuda.c.

#include <THC/THC.h>
 
#include "augmentation_cuda_kernel.h"
 
// symbol to be automatically resolved by PyTorch libs
extern THCState *state;
 
int color_transform_cuda(THCudaTensor* input, THCudaTensor* output,
        float color_r, float color_g, float color_b, float brightness,
        float contrast, float gamma) {
    int nChannels = input->size[0];
    int height = input->size[1];
    int width = input->size[2];
 
    THCudaTensor_resize3d(state, output, nChannels, height, width);
    THCudaTensor_fill(state, output, 0);
 
    int success = 0;
    success = color_transform_cuda_kernel(
        THCudaTensor_data(state, input),
        THCudaTensor_data(state, output),
        nChannels, height, width,
        color_r, color_g, color_b, brightness, contrast, gamma);
     
    //Check for errors
    if ( !success ) {
        THError("aborting");
    }
    return 1;
}
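Once the extension is built, torch.utils.ffi wraps color_transform_cuda(...) so that it can be called from Python with torch.cuda.FloatTensor arguments directly; we will do this in the __init__.py below.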

Also, we need to create a header file augmentation_cuda.h and declare the corresponding prototype of the function color_transform_cuda(...).

int color_transform_cuda(THCudaTensor* input, THCudaTensor* output,
    float color_r, float color_g, float color_b, float brightness,
    float contrast, float gamma);

Create CUDA Core

We need to create a file named augmentation_cuda_kernel.cu for the core CUDA code. The following code passes the augmentation parameters from C to CUDA.

#include <cstdio>

#define N_CHANNELS 3

struct ChromaticParams {
    float color[N_CHANNELS];
    float brightness;
    float contrast;
    float gamma;
};

#define CUDA_NUM_THREADS 512

// Number of thread blocks needed to cover N elements
inline int GET_BLOCKS(const int N) {
    return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}

// Forward declaration; the kernel itself is implemented below
__global__ void color_augmentation(const int nthreads, const int channels, const int height,
        const int width, float* src_data, float* dest_data, const ChromaticParams* cp);

// extern "C" disables C++ name mangling so that augmentation_cuda.c can link against this function
extern "C" int color_transform_cuda_kernel(float* input, float* output, int n_channels, int height, int width,
        float color_r, float color_g, float color_b, float brightness, float contrast, float gamma) {
    struct ChromaticParams cp;
    cp.color[0]     = color_r;
    cp.color[1]     = color_g;
    cp.color[2]     = color_b;
    cp.brightness   = brightness;
    cp.contrast     = contrast;
    cp.gamma        = gamma;

    // Memory address of the parameter struct in GPU memory
    ChromaticParams* p_cp;

    // Alloc space for the GPU copy
    cudaMalloc((void **)&p_cp, sizeof(ChromaticParams));
    // Copy the parameters to the GPU
    cudaMemcpy(p_cp, &cp, sizeof(ChromaticParams), cudaMemcpyHostToDevice);

    // Color Augmentation: launch one thread per pixel;
    // each thread processes all channels of its pixel
    int n_threads = height * width;
    color_augmentation<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS>>>(
        n_threads, n_channels, height, width, input, output, p_cp);

    // Free alloced memory in GPU
    cudaFree(p_cp);

    // Check for errors
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Error in color_transform_cuda_kernel: %s\n", cudaGetErrorString(err));
        return 0;
    }
    return 1;
}

Then, we need to implement the function color_augmentation(), which is executed on the GPU by CUDA.

#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)
#define MAX_MULTIPLIER 1.f

inline __device__ __host__ float clamp(float f, float a, float b) {
    return fmaxf(a, fminf(f, b));
}

__global__ void color_augmentation(const int nthreads, const int channels, const int height,
        const int width, float* src_data, float* dest_data, const ChromaticParams* cp) {
    CUDA_KERNEL_LOOP(index, nthreads) {
        int x = index % width;            // w-pos
        int y = (index / width) % height; // h-pos
        int n = index / width / height;   // num

        int data_index[N_CHANNELS];
        float rgb[N_CHANNELS];
        float mean_in = 0;
        float mean_out = 0;

        // do color change
        for ( int c = 0; c < channels; ++c ) {
            // linear offset of pixel (x, y) in channel c (NCHW layout)
            data_index[c] = width * (height * (channels * n + c) + y) + x;
            rgb[c] = src_data[data_index[c]];
            mean_in += rgb[c];
            rgb[c] *= cp->color[c];
            mean_out += rgb[c];
        }

        float brightness_coeff = mean_in / (mean_out + 0.01f);
        for ( int c = 0; c < channels; ++c ) {
            // compensate brightness
            rgb[c] = clamp(rgb[c] * brightness_coeff, 0.f, 1.f);

            // do gamma change
            rgb[c] = powf(rgb[c], cp->gamma);

            // do brightness change
            rgb[c] = rgb[c] + cp->brightness;

            // do contrast change
            rgb[c] = 0.5f + (rgb[c] - 0.5f) * cp->contrast;

            // write sample to destination
            dest_data[data_index[c]] = clamp(rgb[c], 0.f, MAX_MULTIPLIER);
        }
    }
}

Similar to augmentation_cuda.h, we need to declare a prototype of the function color_transform_cuda_kernel() in augmentation_cuda_kernel.h. The extern "C" guards ensure that the C interface and the nvcc-compiled (C++) object file agree on the symbol name.

#ifdef __cplusplus
extern "C" {
#endif

int color_transform_cuda_kernel(float* input, float* output, int n_channels, int height, int width,
    float color_r, float color_g, float color_b, float brightness, float contrast, float gamma);

#ifdef __cplusplus
}
#endif

Build & Run

And now, we can build this CUDA extension with the following script:

#!/usr/bin/env bash
# Path to the installed PyTorch package (handy if you need its headers)
TORCH=$(python3 -c "import os; import torch; print(os.path.dirname(torch.__file__))")

cd src

# Clean up artifacts from previous builds
rm -f ../build/augmentation_cuda_kernel.o
rm -rf ../_ext
mkdir -p ../build

# Compile the CUDA kernel into an object file.
# sm_61 targets Pascal GPUs (e.g. GTX 1080); change -arch to match your GPU.
nvcc -c -o ../build/augmentation_cuda_kernel.o augmentation_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_61

cd ../
python build.py

To make it accessible from Python, we need to create an __init__.py:

import torch
from torch.autograd import Function

import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), '_ext'))
import augmentation

class Augmentation(Function):
    def color_transform(self, input, rnd):
        # input must be a CUDA float tensor in CHW layout;
        # rnd holds the augmentation parameters
        output = input.new()
        augmentation.color_transform_cuda(input, output, rnd['color_r'], rnd['color_g'], \
            rnd['color_b'], rnd['brightness'], rnd['contrast'], rnd['gamma'])

        return output

And now, we can import this class from the package. Assuming the folder containing this __init__.py is named augmentation_ext (replace it with your actual folder name):

from augmentation_ext import Augmentation
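A minimal usage sketch (assuming a pre-1.0 PyTorch with a CUDA device available, and the hypothetical augmentation_ext package name from above; the parameter values are arbitrary):

import torch
from augmentation_ext import Augmentation

# A dummy 3-channel image on the GPU (the extension expects CHW float tensors)
image = torch.rand(3, 256, 256).cuda()

# Augmentation parameters; during training these are usually drawn at random
rnd = {
    'color_r': 1.0, 'color_g': 0.9, 'color_b': 1.1,
    'brightness': 0.05, 'contrast': 1.2, 'gamma': 0.8,
}

augmented = Augmentation().color_transform(image, rnd)
print(augmented.size())  # torch.Size([3, 256, 256])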
