Source Information¶
Created by: Abe Stern
Updated: October 25, 2024 by Gloria Seo
Resources: http://numba.pydata.org/
Goal¶
This notebook demonstrates the power of GPU acceleration for scientific computations and provides a hands-on example of CUDA programming with Numba.
CUDA-Python with Numba¶
2D Threadblocks¶
CUDA allows us to use 1-, 2-, or 3-dimensional grids and threadblocks. In this example, we'll use a two-dimensional threadblock to simplify our kernel definition.
The kernel that we'll write computes the distance between all pairs of atoms within single molecules. That is, we'll compute intramolecular distances for each molecule in a list of molecules.
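Before writing the GPU kernel, it helps to see what a single thread will compute. Here is the per-pair Euclidean distance in plain NumPy (a minimal CPU sketch; the `mol` array and `pair_distance` helper are illustrative, not part of the notebook's data):

```python
import numpy as np

# illustrative molecule: 4 atoms, each with 3 spatial coordinates
mol = np.array([[0.0, 0.0, 0.0],
                [1.0, 0.0, 0.0],
                [0.0, 2.0, 0.0],
                [0.0, 0.0, 2.0]])

def pair_distance(crds, i, j):
    # Euclidean distance: square root of the sum of squared coordinate differences
    return np.sqrt(np.sum((crds[i] - crds[j]) ** 2))

print(pair_distance(mol, 0, 1))  # 1.0
```

On the GPU, one thread will perform exactly this computation for one (i, j) pair of one molecule.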
Required Modules for the Jupyter Notebook¶
To execute this notebook, the following Python modules and packages are required.
Modules: numba (including numba.cuda), numpy, math
import numba
from numba import cuda
import numpy as np
import math
Input Data¶
For a list of N molecules with at most ATOM_MAX atoms, each of which has 3 spatial coordinates specified, our input data will be shaped (N, ATOM_MAX, 3). We'll generate some random coordinates and store these in the array crds.
The result array will be shaped (N, ATOM_MAX, ATOM_MAX).
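As a back-of-the-envelope sanity check on memory use, the float32 result array occupies N × ATOM_MAX × ATOM_MAX × 4 bytes (a quick sketch using the constants defined below):

```python
# size of the float32 result array, in bytes
N = int(1e4)
ATOM_MAX = 29
result_bytes = N * ATOM_MAX * ATOM_MAX * 4  # 4 bytes per float32
print(result_bytes / 1e6, 'MB')  # ~33.6 MB
```

This comfortably fits in GPU memory on any modern device.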
# CONSTANTS
ATOM_MAX = 29
N = int(1e4)
crds = np.random.random((N, ATOM_MAX, 3))
crds_natoms = np.array([ATOM_MAX]*N)
# create result array on the GPU
shape = (N, ATOM_MAX, ATOM_MAX)
distance_mat_gpu = cuda.device_array(shape, dtype=np.float32)
Kernel Design¶
We'll leverage the block-thread hierarchy in CUDA to make our kernel design simple. Specifically, we'll use a 2-D threadblock such that each thread will compute a distance between a unique pair of atoms in our array. In other words, threadIdx.x and threadIdx.y will index into atoms i and j of each molecule. It follows that blockIdx.x should be 1-D and index over molecules.
@cuda.jit
def compute_distance_mat(crds, natoms, dmat):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x

    # zero the entry so pairs involving padded atoms read as 0.0
    dmat[bx, tx, ty] = 0.0

    # bounds check: skip threads beyond this molecule's atom count
    n = natoms[bx]
    if tx >= n or ty >= n:
        return

    # accumulate squared differences over the 3 spatial coordinates
    d = 0.0
    for i in range(3):
        d += (crds[bx, tx, i] - crds[bx, ty, i]) ** 2
    dmat[bx, tx, ty] = math.sqrt(d)
Call CUDA Kernel¶
First, we'll move our data to the GPU. We only have to do this for the crds and crds_natoms arrays; space for the result array was already allocated on the GPU with cuda.device_array above.
Next, we want to launch the kernel. To do so, we'll have to include a launch configuration that specifies how many blocks and threads to use. As we discussed above, we want this kernel to use a 2-D threadblock so that threadIdx.x and threadIdx.y index into atoms i and j of each molecule. It follows that blockIdx.x should be 1-D and index over molecules.
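For reference, when the per-axis problem size can exceed the threadblock size, a common pattern is to size the grid with a ceiling division. That isn't needed here, since ATOM_MAX = 29 fits within a single block, but here is a hedged sketch (BLOCK_EDGE is a hypothetical block edge; 32 × 32 = 1024 is the CUDA per-block thread limit):

```python
import math

ATOM_MAX = 29
BLOCK_EDGE = 32  # hypothetical threads per block axis

# ceiling division: enough blocks per axis to cover every atom index
blocks_per_axis = math.ceil(ATOM_MAX / BLOCK_EDGE)
print(blocks_per_axis)  # 1
```

A kernel launched this way would also need a bounds check (as ours has) so that surplus threads return early.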
%%timeit -n1 -r1 -o
# transfer structure data (the coordinates) to the GPU
crds_gpu = cuda.to_device(crds)
crds_natoms_gpu = cuda.to_device(crds_natoms)
# launch kernel: N blocks, each a 2-D threadblock of ATOM_MAX x ATOM_MAX threads
compute_distance_mat[N, (ATOM_MAX, ATOM_MAX)](crds_gpu, crds_natoms_gpu, distance_mat_gpu)
# copy the data back
distance_mat_cpu = distance_mat_gpu.copy_to_host()
# store the timing result
GPU_TIMING = _
Naive Numpy Version¶
There are better ways to do this calculation in NumPy, but they require reshapes and broadcasts. For the sake of simplicity, and to make sure we're computing the correct result, we use explicit loops here.
dmat_np_cpu = np.zeros((N, ATOM_MAX, ATOM_MAX))
%%timeit -n1 -r1 -o
for m in range(N):
    for i in range(ATOM_MAX):
        for j in range(ATOM_MAX):
            dmat_np_cpu[m, i, j] = np.sqrt(np.sum((crds[m, i] - crds[m, j])**2))
# store the timing result
CPU_TIMING = _
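The "better way" with reshapes and broadcasts mentioned above can be sketched as follows. This untimed example uses a smaller random array (`crds_small` is illustrative) so the intermediate (N, ATOM_MAX, ATOM_MAX, 3) array stays small:

```python
import numpy as np

# smaller random data set, just for this sketch
N_small, ATOM_MAX = 10, 29
crds_small = np.random.random((N_small, ATOM_MAX, 3))

# singleton axes broadcast the difference to shape (N, ATOM_MAX, ATOM_MAX, 3)
diff = crds_small[:, :, None, :] - crds_small[:, None, :, :]
dmat_broadcast = np.sqrt((diff ** 2).sum(axis=-1))
print(dmat_broadcast.shape)  # (10, 29, 29)
```

Broadcasting trades memory (the temporary `diff` array) for speed; for large N a chunked version would be preferable.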
Speedup Factor¶
print('Speedup factor: ', CPU_TIMING.average / GPU_TIMING.average, 'X')
Checking Results¶
# copy the data back (again)
distance_mat_cpu = distance_mat_gpu.copy_to_host()
# check results
tol = 1e-4
error = 0
for m in range(N):
    for i in range(ATOM_MAX):
        for j in range(ATOM_MAX):
            # compare the absolute difference against the tolerance
            r = abs(dmat_np_cpu[m, i, j] - distance_mat_cpu[m, i, j])
            if r > tol:
                error = 1
if not error:
    print('results agree')
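The same comparison can be written more concisely with np.allclose. Here is a hedged sketch using illustrative stand-in arrays (`ref` and `approx` mimic the float64 CPU result and the float32 GPU result):

```python
import numpy as np

# illustrative stand-ins for the CPU and GPU result arrays
ref = np.random.random((5, 4, 4))
approx = ref.astype(np.float32).astype(np.float64)  # simulate float32 round-trip error

# atol matches the tol used in the loop-based check above
print(np.allclose(ref, approx, atol=1e-4))  # True
```

np.allclose avoids the triple Python loop entirely and is the idiomatic way to compare floating-point arrays.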
Submit Ticket¶
If you find anything that needs to be changed, edited, or if you would like to provide feedback or contribute to the notebook, please submit a ticket by contacting us at:
Email: consult@sdsc.edu
We appreciate your input and will review your suggestions promptly!