Critical code
# Per-element update: read one slot of the input storage, add 3, and store
# the result in one slot of the output.
out[o] = in_storage[j] + 3
in_storage[j]: __add__ or __radd__! Work
def my_code(x, y):
    """Fill the first 100 slots of ``x`` with ``y + 20``.

    Args:
        x: Mutable, index-assignable container with at least 100 slots;
            it is modified in place.
        y: Value to which 20 is added; the same sum is stored in every slot.

    Returns:
        None. The result is written into ``x``.
    """
    for i in range(100):
        x[i] = y + 20
    ...
# Baseline: run the plain-Python version once.
my_code(x, y)
# Wrap the same function with Numba's njit decorator to get a compiled variant.
fast_my_code = numba.njit()(my_code)
# Called twice -- presumably to contrast the first call (which also pays for
# compilation, since Numba compiles lazily) with a later call; TODO confirm.
fast_my_code(x, y)
fast_my_code(x, y)
for loops with parallel version
# Combine two matrix diagrams into one picture (presumably `hcat` places them
# side by side with 0.5 as the gap -- TODO confirm against chalk).
chalk.hcat(
[
# First diagram, matrix(2, 4, ...): the color depends only on j.
matrix(2, 4, colormap=lambda i, j: color(1, 4)(0, j)),
# Second diagram, matrix(4, 3, ...): the color depends only on i.
matrix(4, 3, colormap=lambda i, j: color(1, 4)(0, i)),
],
0.5,
)
# Presumably render the simple and the full matrix-multiply illustrations.
image_matmul_simple()
image_matmul_full()
# Shape contract for the matrix multiply sketched below.
A.shape == (I, J)
B.shape == (J, K)
out.shape == (I, K)
# For every output position, accumulate J products: A indexed by the output's
# first coordinate, B indexed by the output's second coordinate.
for outer_index in out.indices():
    for inner_val in range(J):
        out[outer_index] += A[outer_index[0], inner_val] * \
            B[inner_val, outer_index[1]]
What can be parallelized?
# Outer loop: one iteration per output position.
for outer_index in out.indices():
    # Inner loop: reduction over the shared dimension J into out[outer_index].
    for inner_val in range(J):
        out[outer_index] += A[outer_index[0], inner_val] * \
            B[inner_val, outer_index[1]]
save_for_backwards


def add(a, b):
    """Compute ``a + 10`` and bind the result to the local name ``b``.

    NOTE(review): the assignment rebinds the parameter ``b``; it does not
    write into the object the caller passed. Kept as-is because this looks
    like illustrative slide code -- confirm before reusing.
    """
    b = a + 10


# Explicit (non-decorator) form: wrap `add` with numba.cuda.jit, then launch
# it with configuration [1, 1] (blocks first, threads per block second).
cuda_add = numba.cuda.jit()(add)
cuda_add[1, 1](a, b)

@numba.cuda.jit()
def add(a, b):
    """CUDA kernel form of ``add``: compute ``a + 10`` into the local ``b``.

    NOTE(review): the assignment rebinds the parameter ``b``; it does not
    write into the object the caller passed. Kept as-is because this looks
    like illustrative slide code -- confirm before reusing.
    """
    b = a + 10


# Launch the kernel defined just above (it is named `add`, so launch `add`)
# with configuration [1, 10]: blocks first, threads per block second.
add[1, 10](a, b)

@numba.cuda.jit()
def cuda_add(a, b):
    """CUDA kernel: compute ``a + 10`` and bind it to the local name ``b``.

    NOTE(review): the assignment rebinds the parameter ``b``; it does not
    write into the object the caller passed. Kept as-is because this looks
    like illustrative slide code -- confirm before reusing.
    """
    b = a + 10


# Launch configuration [1, (10, 10)]: blocks first, then a two-dimensional
# threads-per-block shape.
cuda_add[1, (10, 10)](a, b)

@numba.cuda.jit()
def add(a, b):
    """CUDA kernel form of ``add``: compute ``a + 10`` into the local ``b``.

    NOTE(review): the assignment rebinds the parameter ``b``; it does not
    write into the object the caller passed. Kept as-is because this looks
    like illustrative slide code -- confirm before reusing.
    """
    b = a + 10


# Launch the kernel defined just above (it is named `add`, so launch `add`)
# with a two-dimensional block grid and a two-dimensional thread shape.
add[(10, 10), (10, 10)](a, b)
@numba.cuda.jit()
def printer(a):
    """CUDA kernel that prints a greeting and sets every element of ``a`` to 60.

    Args:
        a: Array written in place (``a[:] = 10 + 50``).
    """
    print("hello!")
    a[:] = 10 + 50


a = numpy.zeros(10)
# Launch configuration [10, 10]: blocks first, threads per block second.
printer[10, 10](a)
threads_per_block x total_blocks
Printing code
@numba.cuda.jit()
def printer(a):
    """CUDA kernel that prints its thread index and sets ``a`` to 60.

    Prints ``threadIdx.x`` and ``threadIdx.y``, then writes ``10 + 50`` into
    every element of ``a``.

    Args:
        a: Array written in place.
    """
    print(numba.cuda.threadIdx.x, numba.cuda.threadIdx.y)
    a[:] = 10 + 50


a = numpy.zeros(10)
# Launch configuration [1, (10, 10)]: blocks first, then a two-dimensional
# threads-per-block shape.
printer[1, (10, 10)](a)

@numba.cuda.jit()
def printer(a):
    """CUDA kernel that prints its block and thread indices and sets ``a`` to 60.

    Prints ``blockIdx.x``, ``threadIdx.x`` and ``threadIdx.y``, then writes
    ``10 + 50`` into every element of ``a``.

    Args:
        a: Array written in place.
    """
    print(numba.cuda.blockIdx.x,
          numba.cuda.threadIdx.x, numba.cuda.threadIdx.y)
    a[:] = 10 + 50


a = numpy.zeros(10)
# Launch configuration [10, (10, 10)]: blocks first, then a two-dimensional
# threads-per-block shape.
printer[10, (10, 10)](a)
Name
# Launch geometry: number of blocks and threads per block along each axis.
BLOCKS_X = 32
BLOCKS_Y = 32
THREADS_X = 10
THREADS_Y = 10


@numba.cuda.jit()
def fn(a):
    """CUDA kernel skeleton that derives this thread's global (x, y) position.

    Each coordinate is the block index times the threads per block along
    that axis, plus the thread's index inside its block.

    Args:
        a: Kernel argument; unused in the part of the body shown here.
    """
    x = numba.cuda.blockIdx.x * THREADS_X + numba.cuda.threadIdx.x
    y = numba.cuda.blockIdx.y * THREADS_Y + numba.cuda.threadIdx.y
    ...


# Launch with the same constants the kernel uses for its index arithmetic;
# the thread shape must be (THREADS_X, THREADS_Y) for x and y to be correct.
fn[(BLOCKS_X, BLOCKS_Y), (THREADS_X, THREADS_Y)](a)
# Launch geometry: number of blocks and threads per block.
BLOCKS_X = 32
THREADS_X = 32


@numba.cuda.jit()
def fn(out, a):
    """CUDA kernel: write ``a[x] + 10`` into ``out[x]`` for this thread's ``x``.

    Args:
        out: Array receiving the results; written at index ``x``.
        a: Input array; read at index ``x``.
    """
    # Global position: block index times threads per block, plus the thread's
    # index inside its block.
    x = numba.cuda.blockIdx.x * THREADS_X + numba.cuda.threadIdx.x
    # Guard: the launch may start more threads than `a` has elements, so only
    # positions inside the array are touched.
    if x >=0 and x < a.size:
        out[x] = a[x] + 10


# The kernel takes two arguments, so both `out` and `a` are passed.
fn[BLOCKS_X, THREADS_X](out, a)
Guards
# Global position: the block index is scaled by the threads per block
# (THREADS_X), not by the number of blocks.
x = numba.cuda.blockIdx.x * THREADS_X + numba.cuda.threadIdx.x
if x >=0 and x < a.size: