PyCUDA
Introduction
Authors: Andreas Klöckner
PyCUDA can access Nvidia's CUDA parallel computation API from Python. Its main features:

Object cleanup tied to the lifetime of objects (RAII, Resource Acquisition Is Initialization). This makes it much easier to write correct, leak- and crash-free code; PyCUDA knows about dependencies (e.g., it won't detach from a context before all memory allocated in it has been freed).

Convenience: abstractions to compile CUDA code from Python (pycuda.driver.SourceModule) and a GPU memory buffer (pycuda.gpuarray.GPUArray); a short GPUArray sketch follows this list.

Completeness: bindings to all of CUDA's driver API.

Automatic error checking: all CUDA errors are automatically translated into Python exceptions.

Speed: PyCUDA's base layer is written in C++.

Helpful documentation.
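As a minimal illustration of the GPUArray abstraction mentioned above (a sketch assuming a working CUDA setup, not part of the original tutorial), the following copies a NumPy array to the GPU, doubles it there elementwise, and fetches the result back:

import numpy
import pycuda.autoinit          # sets up a CUDA context on the first available device
import pycuda.gpuarray as gpuarray

a = numpy.random.randn(4, 4).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)      # host -> device copy
a_doubled = (2 * a_gpu).get()   # elementwise multiply on the GPU, then device -> host copy
assert numpy.allclose(a_doubled, 2 * a)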
Example
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")
a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)
multiply_them(
    drv.Out(dest), drv.In(a), drv.In(b),
    block=(400, 1, 1), grid=(1, 1))
assert numpy.allclose(dest, a*b)
print(dest)
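A note on the launch configuration: block=(400,1,1) with grid=(1,1) starts a single block of 400 threads, so threadIdx.x alone indexes every element. For arrays larger than one block, the usual pattern (the same one used by the Theano example below) is to launch several blocks and compute a global index. A sketch, assuming the same imports and arrays as above:

mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b, int n)
{
  // one thread per element, spread across blocks
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")
multiply_them(drv.Out(dest), drv.In(a), drv.In(b), numpy.intc(a.size),
              block=(128, 1, 1), grid=(int(numpy.ceil(a.size / 128.)), 1))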
Exercise 6
Run the above example
Modify and execute it to work with a 20 x 10 matrix; one possible approach is sketched below
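One possible adaptation for the 20 x 10 case, given as a sketch rather than the official solution: treat the matrix as 200 contiguous floats and launch a single two-dimensional block, deriving a flat index from threadIdx.x and threadIdx.y:

a = numpy.random.randn(20, 10).astype(numpy.float32)
b = numpy.random.randn(20, 10).astype(numpy.float32)
dest = numpy.zeros_like(a)

mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  // flat index into the row-major 20 x 10 arrays
  const int i = threadIdx.x + threadIdx.y * blockDim.x;
  dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")
multiply_them(drv.Out(dest), drv.In(a), drv.In(b),
              block=(10, 20, 1), grid=(1, 1))
assert numpy.allclose(dest, a * b)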
Theano + PyCUDA
import numpy, theano
import theano.misc.pycuda_init
from pycuda.compiler import SourceModule
import theano.sandbox.cuda as cuda
class PyCUDADoubleOp(theano.Op):
    # Ops that compare and hash equal are merged by Theano's optimizer.
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, inp):
        # Wrap the input as a contiguous CudaNdarray variable and build the Apply node.
        inp = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(inp))
        assert inp.dtype == "float32"
        return theano.Apply(self, [inp], [inp.type()])

    def make_thunk(self, node, storage_map, _, _2):
        # Compile the kernel once, then return a closure that launches it.
        mod = SourceModule("""
        __global__ void my_fct(float *i0, float *o0, int size) {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i < size) {
                o0[i] = i0[i] * 2;
            }
        }""")
        pycuda_fct = mod.get_function("my_fct")
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            # (Re)allocate the output if needed, then launch one thread per element.
            z = outputs[0]
            if z[0] is None or z[0].shape != inputs[0][0].shape:
                z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
            pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                       block=(512, 1, 1), grid=grid)
        return thunk
Test it!
>>> x = theano.tensor.fmatrix() # doctest: +SKIP
>>> f = theano.function([x], PyCUDADoubleOp()(x)) # doctest: +SKIP
>>> xv=numpy.ones((4,5), dtype="float32") # doctest: +SKIP
>>> assert numpy.allclose(f(xv), xv*2) # doctest: +SKIP
>>> print(numpy.asarray(f(xv))) # doctest: +SKIP
Exercises 7
Run the above example
Modify and execute the example to multiply two matrices: x * y
Modify and execute the example to return two outputs: x + y and x - y (our current elemwise fusion only generates computations with a single output; a kernel sketch for the two-output case follows these exercises)
Modify and execute the example to support strides, i.e. don't force the input to be C contiguous
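As a starting point for the two-output exercise, here is a minimal sketch of how the kernel inside make_thunk might look; the names and layout are assumptions, and make_node would also have to take two input variables and return two output variables, with thunk allocating and passing both outputs:

mod = SourceModule("""
__global__ void my_fct(float *i0, float *i1, float *o0, float *o1, int size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        o0[i] = i0[i] + i1[i];  /* x + y */
        o1[i] = i0[i] - i1[i];  /* x - y */
    }
}""")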