Deprecation warning: PyTorch 1.0+ uses ATen as its tensor library, so this article no longer applies to PyTorch 1.0+. Please refer to this GitHub repository for more information.
Python is one of the most popular programming languages for deep learning today. However, as an interpreted language, it is often considered too slow for high-performance computing. Therefore, I incorporated CUDA into Python and built my own extension to accelerate the computation.
In this tutorial, I give an example of a CUDA extension for data augmentation in deep learning.
Create Build Scripts
First of all, we need to create a script named build.py for building the FFI extension.
import os
import torch
import torch.utils.ffi

this_folder = os.path.dirname(os.path.abspath(__file__)) + '/'

Headers = []
Sources = []
Defines = []
Objects = []

if torch.cuda.is_available():
    Headers += ['src/augmentation_cuda.h']
    Sources += ['src/augmentation_cuda.c']
    Defines += [('WITH_CUDA', None)]
    Objects += ['build/augmentation_cuda_kernel.o']

ffi = torch.utils.ffi.create_extension(
    name='_ext.augmentation',
    headers=Headers,
    sources=Sources,
    verbose=False,
    with_cuda=True,
    package=False,
    relative_to=this_folder,
    define_macros=Defines,
    extra_objects=[os.path.join(this_folder, Object) for Object in Objects]
)

if __name__ == '__main__':
    ffi.build()
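For reference, the paths used above assume a project layout roughly like the following (the src/, build/, _ext/ and file names come from the snippets in this article; the top-level folder name is arbitrary):
project/
├── build.py
├── __init__.py
├── build/        # nvcc writes augmentation_cuda_kernel.o here (see Build & Run)
├── _ext/         # generated by build.py
└── src/
    ├── augmentation_cuda.c
    ├── augmentation_cuda.h
    ├── augmentation_cuda_kernel.cu
    └── augmentation_cuda_kernel.h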
Create CUDA Interface
Then, we need to create an interface for CUDA that can be invoked from Python. In this tutorial, we name this file augmentation_cuda.c.
#include <THC/THC.h>
#include "augmentation_cuda_kernel.h"

// symbol to be automatically resolved by PyTorch libs
extern THCState *state;

int color_transform_cuda(THCudaTensor* input, THCudaTensor* output,
                         float color_r, float color_g, float color_b, float brightness,
                         float contrast, float gamma) {
    int nChannels = input->size[0];
    int height    = input->size[1];
    int width     = input->size[2];

    THCudaTensor_resize3d(state, output, nChannels, height, width);
    THCudaTensor_fill(state, output, 0);

    int success = 0;
    success = color_transform_cuda_kernel(
        THCudaTensor_data(state, input),
        THCudaTensor_data(state, output),
        nChannels, height, width,
        color_r, color_g, color_b, brightness, contrast, gamma);

    // Check for errors
    if (!success) {
        THError("aborting");
    }
    return 1;
}
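The interface above assumes that input is a contiguous 3-D (C x H x W) CUDA tensor. If you want to fail fast on unexpected input, a minimal sketch of such a check, assuming the THC helpers THCudaTensor_nDimension and THCudaTensor_isContiguous, could be placed at the top of color_transform_cuda(...):
// Optional defensive checks (sketch); relies on THC's nDimension/isContiguous helpers
if (THCudaTensor_nDimension(state, input) != 3 ||
    !THCudaTensor_isContiguous(state, input)) {
    THError("input must be a contiguous CxHxW THCudaTensor");
}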
Also, we need to create a header file augmentation_cuda.h (the one referenced in build.py) and declare the corresponding prototype of the function color_transform_cuda(...).
int color_transform_cuda(THCudaTensor* input, THCudaTensor* output,
                         float color_r, float color_g, float color_b, float brightness,
                         float contrast, float gamma);
Create CUDA Core
We need to create a file named augmentation_cuda_kernel.cu for the core CUDA code. The following code is used to pass the augmentation parameters from C to CUDA.
// N_CHANNELS is defined as 3 (see the macro definitions below)
struct ChromaticParams {
    float color[N_CHANNELS];
    float brightness;
    float contrast;
    float gamma;
};

#define CUDA_NUM_THREADS 512

inline int GET_BLOCKS(const int N) {
    return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
int color_transform_cuda_kernel(float* input, float* output, int n_channels, int height, int width,
                                float color_r, float color_g, float color_b, float brightness,
                                float contrast, float gamma) {
    struct ChromaticParams cp;
    cp.color[0] = color_r;
    cp.color[1] = color_g;
    cp.color[2] = color_b;
    cp.brightness = brightness;
    cp.contrast = contrast;
    cp.gamma = gamma;

    // Memory address in GPU
    ChromaticParams* p_cp;
    // Alloc space for the GPU copy
    cudaMalloc((void **)&p_cp, sizeof(ChromaticParams));
    // Copy data to GPU
    cudaMemcpy(p_cp, &cp, sizeof(ChromaticParams), cudaMemcpyHostToDevice);

    // Color Augmentation
    int n_threads = height * width;
    color_augmentation<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS>>>(
        n_threads, n_channels, height, width, input, output, p_cp);

    // Free allocated memory in GPU
    cudaFree(p_cp);

    // Check for errors
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Error in color_transform_cuda_kernel: %s\n", cudaGetErrorString(err));
        return 0;
    }
    return 1;
}
Then, we need to implement the function color_augmentation(), the kernel that will be executed on the GPU.
// Grid-stride loop: each thread starts at its global index and strides by the
// total number of launched threads, so all n elements are covered
#define CUDA_KERNEL_LOOP(i, n) \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
         i < (n); \
         i += blockDim.x * gridDim.x)

#define N_CHANNELS 3
#define MAX_MULTIPLIER 1

inline __device__ __host__ float clamp(float f, float a, float b) {
    return fmaxf(a, fminf(f, b));
}
__global__ void color_augmentation(const int nthreads, const int channels, const int height,
                                   const int width, float* src_data, float* dest_data,
                                   const ChromaticParams* cp) {
    CUDA_KERNEL_LOOP(index, nthreads) {
        int x = index % width;              // w-pos
        int y = (index / width) % height;   // h-pos
        int n = index / width / height;     // num

        int data_index[N_CHANNELS];
        float rgb[N_CHANNELS];
        float mean_in = 0;
        float mean_out = 0;

        // do color change
        for (int c = 0; c < channels; ++c) {
            // NCHW layout: offset of pixel (y, x) in channel c of sample n
            data_index[c] = width * (height * (channels * n + c) + y) + x;
            rgb[c] = src_data[data_index[c]];
            mean_in += rgb[c];
            rgb[c] *= cp->color[c];
            mean_out += rgb[c];
        }

        float brightness_coeff = mean_in / (mean_out + 0.01f);

        for (int c = 0; c < channels; ++c) {
            // compensate brightness
            rgb[c] = clamp(rgb[c] * brightness_coeff, 0.f, 1.f);
            // do gamma change
            rgb[c] = pow(rgb[c], cp->gamma);
            // do brightness change
            rgb[c] = rgb[c] + cp->brightness;
            // do contrast change
            rgb[c] = 0.5f + (rgb[c] - 0.5f) * cp->contrast;
            // write sample to destination
            dest_data[data_index[c]] = clamp(rgb[c], 0.f, MAX_MULTIPLIER);
        }
    }
}
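As a quick sanity check of the indexing above, here is a tiny worked example with assumed (hypothetical) sizes:
// Assumed sizes: channels = 3, height = 4, width = 5, single image (n = 0)
// Pixel (y = 2, x = 3) in channel c = 1:
//   data_index[c] = width * (height * (channels * n + c) + y) + x
//                 = 5 * (4 * (3 * 0 + 1) + 2) + 3
//                 = 5 * 6 + 3 = 33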
Similar to augmentation_cuda.h, we need to declare the prototype of the function color_transform_cuda_kernel in augmentation_cuda_kernel.h, which is included at the top of augmentation_cuda.c.
int color_transform_cuda_kernel(float* input, float* output, int n_channels, int height, int width,
float color_r, float color_g, float color_b, float brightness, float contrast, float gamma);
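One thing to watch out for: nvcc compiles augmentation_cuda_kernel.cu as C++, while augmentation_cuda.c is compiled as C, so the linker may fail to resolve color_transform_cuda_kernel because of C++ name mangling. A minimal sketch of a fix is to wrap the prototype above in extern "C" guards (and to #include this header from the .cu file so the definition inherits C linkage):
#ifdef __cplusplus
extern "C" {
#endif

int color_transform_cuda_kernel(float* input, float* output, int n_channels, int height, int width,
                                float color_r, float color_g, float color_b, float brightness, float contrast, float gamma);

#ifdef __cplusplus
}
#endif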
Build & Run
And now, we can build this CUDA extension with the following script:
#!/usr/bin/env bash
TORCH=$(python3 -c "import os; import torch; print(os.path.dirname(torch.__file__))")

cd src
# Clean previous build artifacts (if any) and make sure the output folder exists
mkdir -p ../build
rm -f ../build/augmentation_cuda_kernel.o
rm -rf ../_ext

# Compile the CUDA kernel; adjust -arch to match your GPU
nvcc -c -o ../build/augmentation_cuda_kernel.o augmentation_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_61
cd ../
python build.py
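Assuming the script above is saved as build.sh (the file name is arbitrary), it can be run with:
chmod +x build.sh
./build.sh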
To make the extension accessible from Python, we need to create an __init__.py:
import torch
from torch.autograd import Function, Variable
import sys, os

sys.path.append(os.path.join(os.path.dirname(__file__), '_ext'))
import augmentation


class Augmentation(Function):
    def color_transform(self, input, rnd):
        output = input.new()
        augmentation.color_transform_cuda(input, output, rnd['color_r'], rnd['color_g'],
                                          rnd['color_b'], rnd['brightness'], rnd['contrast'], rnd['gamma'])
        return output
And now, we can import the Augmentation class from the package containing this __init__.py (the package folder name is up to you; augmentation_package below is only a placeholder):
from augmentation_package import Augmentation
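To sketch how the extension might be used (the package name augmentation_package, the image size, and the parameter values below are assumptions for illustration, not part of the original code):
import torch
from augmentation_package import Augmentation  # hypothetical package name

# A dummy 3 x H x W image on the GPU with values in [0, 1]
image = torch.rand(3, 256, 256).cuda()

# Random chromatic parameters; the keys must match those read in color_transform()
rnd = {
    'color_r': 1.02, 'color_g': 0.98, 'color_b': 1.00,
    'brightness': 0.05, 'contrast': 1.10, 'gamma': 0.95,
}

augmented = Augmentation().color_transform(image, rnd)
print(augmented.size())  # torch.Size([3, 256, 256])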