Build PyTorch Extensions with CUDA and CFFI

Python is one of the most popular programming languages for deep learning today. However, as an interpreted language, it is often considered too slow for high-performance computing. Therefore, I incorporated CUDA into Python and built my own extension to accelerate the computation.

In this tutorial, I give an example of a CUDA extension for data augmentation in deep learning.

Create Build Scripts

First of all, we need to create a script named build.py for building the FFI extension.

import os
import torch
import torch.utils.ffi

this_folder = os.path.dirname(os.path.abspath(__file__)) + '/'

Headers = []
Sources = []
Defines = []
Objects = []

if torch.cuda.is_available():
    Headers += ['src/augmentation_cuda.h']
    Sources += ['src/augmentation_cuda.c']
    Defines += [('WITH_CUDA', None)]
    Objects += ['build/augmentation_cuda_kernel.o']

ffi = torch.utils.ffi.create_extension(
    name='_ext.augmentation',
    headers=Headers,
    sources=Sources,
    verbose=False,
    with_cuda=True,
    package=False,
    relative_to=this_folder,
    define_macros=Defines,
    extra_objects=[os.path.join(this_folder, Object) for Object in Objects]
)

if __name__ == '__main__':
    ffi.build()
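
Note that torch.utils.ffi was deprecated and then removed around PyTorch 1.0. On newer versions, the equivalent build is expressed with torch.utils.cpp_extension. A minimal sketch with the same file and module names as above (untested here; the THC API used in the sources below was itself removed in recent PyTorch, so the C sources would also need porting to ATen):

# setup.py -- a sketch for PyTorch >= 1.0 using torch.utils.cpp_extension
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='augmentation',
    ext_modules=[
        # CUDAExtension compiles the .c/.cu sources and links against CUDA,
        # replacing both the nvcc step and build.py shown in this tutorial
        CUDAExtension('augmentation', [
            'src/augmentation_cuda.c',
            'src/augmentation_cuda_kernel.cu',
        ]),
    ],
    cmdclass={'build_ext': BuildExtension},
)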

Create CUDA Interface

Then, we need to create a C interface for CUDA that can be invoked from Python. In this tutorial, we name this file augmentation_cuda.c.

#include <THC/THC.h>

#include "augmentation_cuda_kernel.h"

// symbol to be automatically resolved by PyTorch libs
extern THCState *state;

int color_transform_cuda(THCudaTensor* input, THCudaTensor* output, 
        float color_r, float color_g, float color_b, float brightness, 
        float contrast, float gamma) {
    int nChannels = input->size[0];
    int height = input->size[1];
    int width = input->size[2];

    THCudaTensor_resize3d(state, output, nChannels, height, width);
    THCudaTensor_fill(state, output, 0);

    int success = 0;
    success = color_transform_cuda_kernel(
        THCudaTensor_data(state, input),
        THCudaTensor_data(state, output),
        nChannels, height, width, 
        color_r, color_g, color_b, brightness, contrast, gamma);
    
    //Check for errors
    if ( !success ) {
        THError("aborting");
    }
    return 1;
}

Also, we need to create a header file augmentation_cuda.h and declare the corresponding prototype of the function color_transform_cuda(...).

int color_transform_cuda(THCudaTensor* input, THCudaTensor* output, 
    float color_r, float color_g, float color_b, float brightness, 
    float contrast, float gamma);

Create CUDA Core

We need to create a file named augmentation_cuda_kernel.cu for the core CUDA code. The following code passes the augmentation parameters from the host (C) to the device (CUDA).

#define N_CHANNELS 3    // number of color channels; the struct below depends on it

struct ChromaticParams {
    float color[N_CHANNELS];
    float brightness;
    float contrast;
    float gamma;
};

#define CUDA_NUM_THREADS 512

inline int GET_BLOCKS(const int N) {
    return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}

int color_transform_cuda_kernel(float* input, float* output, int n_channels, int height, int width, 
        float color_r, float color_g, float color_b, float brightness, float contrast, float gamma) {
    struct ChromaticParams cp;
    cp.color[0]     = color_r;
    cp.color[1]     = color_g;
    cp.color[2]     = color_b;
    cp.brightness   = brightness;
    cp.contrast     = contrast;
    cp.gamma        = gamma;

    // Device copy of the parameter struct
    ChromaticParams* p_cp;

    // Allocate space on the GPU
    cudaMalloc((void **)&p_cp, sizeof(ChromaticParams));
    // Copy the parameters to the GPU
    cudaMemcpy(p_cp, &cp, sizeof(ChromaticParams), cudaMemcpyHostToDevice);

    // Color Augmentation
    int n_threads = height * width;
    color_augmentation<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS>>>(
        n_threads, n_channels, height, width, input, output, p_cp);

    // Free the allocated GPU memory
    cudaFree(p_cp);

    // Check for errors
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Error in color_transform_cuda_kernel: %s\n", cudaGetErrorString(err));
        return 0;
    }
    return 1;
}
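
As a quick sanity check on the launch configuration: GET_BLOCKS() is a ceiling division, so the grid always contains at least n_threads threads, and the grid-stride loop in the kernel (shown in the next listing) handles any excess. A tiny Python sketch of the same arithmetic for a 512x512 image:

# Python mirror of GET_BLOCKS (ceiling division)
CUDA_NUM_THREADS = 512
n_threads = 512 * 512                # one thread per pixel
n_blocks = (n_threads + CUDA_NUM_THREADS - 1) // CUDA_NUM_THREADS
print(n_blocks)                      # 512 blocks of 512 threads each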

Then, we need to implement the function color_augmentation(), which is executed on the GPU. Note that in the assembled .cu file, the following macros and the kernel must appear above color_transform_cuda_kernel(), since the launcher references the kernel.

#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)
#define MAX_MULTIPLIER 1

inline __device__ __host__ float clamp(float f, float a, float b) {
    return fmaxf(a, fminf(f, b));
}

__global__ void color_augmentation(const int nthreads, const int channels, const int height, 
        const int width, float* src_data, float* dest_data, const ChromaticParams* cp) {
    CUDA_KERNEL_LOOP(index, nthreads) {
        int x = index % width;                  // w-pos
        int y = (index / width) % height;       // h-pos
        int n = index / width / height;         // num

        int data_index[N_CHANNELS];
        float rgb[N_CHANNELS];
        float mean_in = 0;
        float mean_out = 0;

        // do color change
        for ( int c = 0; c < channels; ++c ) {
            data_index[c] = width * (height * (channels * n + c) + y) + x;
            rgb[c] = src_data[data_index[c]];
            mean_in += rgb[c];
            rgb[c] *= cp->color[c];
            mean_out += rgb[c];
        }

        float brightness_coeff = mean_in / (mean_out + 0.01f);
        for ( int c = 0; c < channels; ++c ) {
            // compensate brightness
            rgb[c] = clamp(rgb[c] * brightness_coeff, 0.f, 1.f);

            // do gamma change
            rgb[c] = powf(rgb[c], cp->gamma);

            // do brightness change
            rgb[c] = rgb[c] + cp->brightness;

            // do contrast change
            rgb[c] = 0.5f + (rgb[c] - 0.5f) * cp->contrast;

            // write sample to destination
            dest_data[data_index[c]] = clamp(rgb[c], 0.f, MAX_MULTIPLIER);
        }
    }
}
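
To make the per-pixel math easier to verify, here is a plain-PyTorch CPU reference of the same transform (my own sketch for testing the kernel's output, not part of the extension; the function name is made up):

import torch

def color_transform_reference(img, color, brightness, contrast, gamma):
    # img: (3, H, W) float tensor with values in [0, 1]
    # color: length-3 tensor of per-channel multipliers
    mean_in = img.sum(dim=0)                    # per-pixel sum over channels
    out = img * color.view(3, 1, 1)             # do color change
    mean_out = out.sum(dim=0)
    coeff = mean_in / (mean_out + 0.01)         # compensate brightness
    out = (out * coeff).clamp(0.0, 1.0)
    out = out.pow(gamma)                        # do gamma change
    out = out + brightness                      # do brightness change
    out = 0.5 + (out - 0.5) * contrast          # do contrast change
    return out.clamp(0.0, 1.0)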

Similar to augmentation_cuda.h, we need to declare a prototype of the function color_transform_cuda_kernel() in augmentation_cuda_kernel.h.

int color_transform_cuda_kernel(float* input, float* output, int n_channels, int height, int width, 
    float color_r, float color_g, float color_b, float brightness, float contrast, float gamma);

Build & Run

And now, we can build this CUDA extension with the following script:

#!/usr/bin/env bash
# PyTorch install dir (not used below, but handy if extra -I/-L flags are needed)
TORCH=$(python3 -c "import os; import torch; print(os.path.dirname(torch.__file__))")

cd src

# Clean up artifacts from previous builds
mkdir -p ../build
rm -f ../build/augmentation_cuda_kernel.o
rm -rf ../_ext

# sm_61 targets Pascal GPUs; adjust -arch to match your hardware
nvcc -c -o ../build/augmentation_cuda_kernel.o augmentation_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_61

cd ../
python3 build.py
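
After a successful build, the project tree looks roughly like this (the shell script above is saved as build.sh here; the contents of _ext/ are generated by build.py, and exact file names may vary):

.
├── build.py
├── build.sh
├── __init__.py
├── build/
│   └── augmentation_cuda_kernel.o
├── src/
│   ├── augmentation_cuda.c
│   ├── augmentation_cuda.h
│   ├── augmentation_cuda_kernel.cu
│   └── augmentation_cuda_kernel.h
└── _ext/
    └── augmentation/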

To make the extension accessible from Python, we need to create an __init__.py in the package directory:

import torch
from torch.autograd import Function

import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), '_ext'))
import augmentation

class Augmentation(Function):
    def color_transform(self, input, rnd):
        output = input.new()
        augmentation.color_transform_cuda(input, output, rnd['color_r'], rnd['color_g'], \
            rnd['color_b'], rnd['brightness'], rnd['contrast'], rnd['gamma'])

        return output

And now, we can import the Augmentation class from the package directory (assumed here to be named augmentations; use your own folder name):

from augmentations import Augmentation
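
A quick usage sketch (the tensor shape and parameter values below are made up for illustration):

import torch
from augmentations import Augmentation

img = torch.rand(3, 256, 256).cuda()           # a random CHW image on the GPU
rnd = {'color_r': 1.1, 'color_g': 0.9, 'color_b': 1.0,
       'brightness': 0.05, 'contrast': 1.2, 'gamma': 0.8}
out = Augmentation().color_transform(img, rnd)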
