Python-for-HPC · ggeorgakoudis · Oct 20, 2023
diff --git a/heatsim/Makefile b/heatsim/Makefile
@@ -0,0 +1,12 @@
+# Build the OpenMP-offload heat benchmarks with clang.
+CC := clang
+CFLAGS := -O3 -fopenmp -fopenmp-targets=nvptx64 -fopenmp-target-new-runtime
+LDLIBS := -lm
+
+# Neither target names a real file, so both are phony; without this a stray
+# file called "clean" would stop `make clean` from running.
+.PHONY: all clean
+
+all: heat_data_reg.x heat_data_reg-transpose.x
+
+# One pattern rule builds each benchmark binary from its same-named C source.
+%.x: %.c
+	$(CC) $(CFLAGS) $< -o $@ $(LDLIBS)
+
+clean:
+	rm -f *.ll *.o *.bc *.s *.i *.cubin a.out* *.x
diff --git a/heatsim/cudaprofile.py b/heatsim/cudaprofile.py
@@ -0,0 +1,19 @@
+import ctypes
+
+# Handle to the CUDA runtime library, loaded once at import time; start() and
+# stop() call the profiler entry points through it.
+_cudart = ctypes.CDLL('libcudart.so')
+
+
+def start():
+    """Call cudaProfilerStart() in the loaded CUDA runtime.
+
+    Raises:
+        RuntimeError: if the call returns a non-zero status.
+    """
+    # Per http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__PROFILER.html
+    # the return value will unconditionally be 0. The check is kept only in
+    # case that changes in the future.
+    ret = _cudart.cudaProfilerStart()
+    if ret != 0:
+        raise RuntimeError("cudaProfilerStart() returned %d" % ret)
+
+def stop():
+    """Call cudaProfilerStop() in the loaded CUDA runtime.
+
+    Raises:
+        RuntimeError: if the call returns a non-zero status.
+    """
+    ret = _cudart.cudaProfilerStop()
+    if ret != 0:
+        raise RuntimeError("cudaProfilerStop() returned %d" % ret)
+
+
diff --git a/heatsim/heat-jit-noomp.py b/heatsim/heat-jit-noomp.py
@@ -0,0 +1,111 @@
+from numba import njit
+from numba.openmp import openmp_context as openmp
+from numba.openmp import omp_get_wtime
+import numpy as np
+import sys
+import math
+
+@njit
+def initial_value(n, dx, length, u):
+    """Fill u[0:n, 0:n] with sin(pi*x/length) * sin(pi*y/length).
+
+    Both coordinates start at dx and advance by dx per index; the second
+    index of u follows x and the first follows y.
+    """
+    ypos = dx
+    for row in range(n):
+        xpos = dx
+        for col in range(n):
+            sin_x = math.sin(math.pi * xpos / length)
+            sin_y = math.sin(math.pi * ypos / length)
+            u[row, col] = sin_x * sin_y
+            xpos += dx
+        ypos += dx
+
+@njit
+def solution(t, x, y, alpha, length):
+    """Return exp(-2*alpha*pi^2*t/length^2) * sin(pi*x/length) * sin(pi*y/length).
+
+    l2norm() uses this as the reference value at time t and point (x, y).
+    """
+    decay = math.exp(-2.0 * alpha * (math.pi ** 2) * t / (length ** 2))
+    wave_x = math.sin(math.pi * x / length)
+    wave_y = math.sin(math.pi * y / length)
+    return decay * wave_x * wave_y
+
+
+@njit
+def l2norm(n, u, nsteps, dt, alpha, dx, length):
+    """Return the square root of the summed squared difference between u and solution().
+
+    The reference is evaluated at time dt * nsteps on the same dx-spaced
+    coordinates that initial_value() walks.
+    """
+    t_end = dt * nsteps
+    sq_err = 0.0
+
+    ypos = dx
+    for row in range(n):
+        xpos = dx
+        for col in range(n):
+            diff = u[row, col] - solution(t_end, xpos, ypos, alpha, length)
+            sq_err += diff ** 2
+            xpos += dx
+        ypos += dx
+
+    return math.sqrt(sq_err)
+
+
+@njit
+def solve(n, alpha, dx, dt, u, u_tmp):
+    """Write one stencil update of u into u_tmp.
+
+    Each u_tmp[j, i] is r2 * u[j, i] plus r times each of the four direct
+    neighbours; a neighbour outside the n x n grid contributes 0.0.
+    """
+    r = alpha * dt / (dx ** 2)
+    r2 = 1.0 - 4.0 * r
+    last = n - 1
+    for i in range(n):
+        for j in range(n):
+            centre = r2 * u[j, i]
+            i_plus = r * u[j, i+1] if i < last else 0.0
+            i_minus = r * u[j, i-1] if i > 0 else 0.0
+            j_plus = r * u[j+1, i] if j < last else 0.0
+            j_minus = r * u[j-1, i] if j > 0 else 0.0
+            # Summed left to right in the same order as the terms above.
+            u_tmp[j, i] = centre + i_plus + i_minus + j_plus + j_minus
+
+@njit
+def core(nsteps, n, alpha, dx, dt, u, u_tmp):
+    """Advance the grid by nsteps calls to solve(), leaving the result in u.
+
+    Args:
+        nsteps: number of time steps to take.
+        n, alpha, dx, dt: forwarded unchanged to solve().
+        u: array holding the current state; holds the final state on return.
+        u_tmp: scratch array with the same shape as u; it is overwritten.
+    """
+    cur = u
+    nxt = u_tmp
+    for t in range(nsteps):
+        solve(n, alpha, dx, dt, cur, nxt)
+        cur, nxt = nxt, cur
+    # Swapping only rebinds local names. After an odd number of steps the
+    # newest state sits in the caller's u_tmp, so copy it back into the
+    # caller's u, which is the array the caller reads afterwards.
+    if nsteps > 0 and nsteps % 2 == 1:
+        u[:, :] = u_tmp
+
+if __name__ == "__main__":
+    # Wall-clock start for the "total time" line printed at the end.
+    start = omp_get_wtime()
+
+    # Defaults; replaced only when exactly two command-line arguments are given.
+    n, nsteps = 1000, 10
+    if len(sys.argv) == 3:
+        n, nsteps = int(sys.argv[1]), int(sys.argv[2])
+
+    alpha, length = 0.1, 1000.0
+    dx = length / (n + 1)
+    dt = 0.5 / nsteps
+    r = alpha * dt / (dx ** 2)
+
+    print(" MMS heat equation")
+    print("Problem input")
+    print(f" Grid size: {n} x {n}")
+    print(f" Cell width: {dx}")
+    print(f" Grid length: {length} x {length}\n")
+    print(f" Alpha: {alpha}\n")
+    print(f" Steps: {nsteps}")
+    print(f" Total time: {dt*nsteps}")
+    print(f" Time step: {dt}")
+
+    # Stability check
+    # NOTE(review): solve() weights the centre by 1 - 4*r; the usual bound for
+    # that 2-D update is r <= 0.25 -- confirm whether 0.5 is intended.
+    print("Stability")
+    print(f" r value: {r}")
+    if r > 0.5:
+        print("Warning: unstable")
+
+    # Compile core() for this explicit signature and report how long it took.
+    compile_begin = omp_get_wtime()
+    core.compile("none(int64, int64, float64, float64, float64, Array(float64, 2, 'C'), Array(float64, 2, 'C'))")
+    compile_end = omp_get_wtime()
+    print('core compile', compile_end-compile_begin)
+    print('COMPILED')
+
+    u = np.zeros((n,n))
+    u_tmp = np.zeros((n,n))
+    initial_value(n, dx, length, u)
+
+    # Only the core() call is counted as solve time.
+    solve_begin = omp_get_wtime()
+    core(nsteps, n, alpha, dx, dt, u, u_tmp)
+    solve_end = omp_get_wtime()
+
+    norm = l2norm(n, u, nsteps, dt, alpha, dx, length)
+
+    stop = omp_get_wtime()
+
+    print("Error (L2norm):", norm)
+    print("Solve time (s):", solve_end-solve_begin)
+    print("total time:", stop-start)
+
+print("DONE")
diff --git a/heatsim/heat-jit-omp.py b/heatsim/heat-jit-omp.py
@@ -0,0 +1,111 @@
+from numba import njit, prange
+from numba.openmp import openmp_context as openmp
+from numba.openmp import omp_get_wtime, omp_set_num_threads, omp_get_num_threads, omp_get_num_devices, omp_is_initial_device, omp_get_thread_num
+import numpy as np
+import sys
+import math
+
+@njit
+def initial_value(n, dx, length, u):
+    """Fill u[0:n, 0:n] with sin(pi*x/length) * sin(pi*y/length).
+
+    Both coordinates start at dx and advance by dx per index; the second
+    index of u follows x and the first follows y.
+    """
+    ypos = dx
+    for row in range(n):
+        xpos = dx
+        for col in range(n):
+            sin_x = math.sin(math.pi * xpos / length)
+            sin_y = math.sin(math.pi * ypos / length)
+            u[row, col] = sin_x * sin_y
+            xpos += dx
+        ypos += dx
+
+@njit
+def solution(t, x, y, alpha, length):
+    """Return exp(-2*alpha*pi^2*t/length^2) * sin(pi*x/length) * sin(pi*y/length).
+
+    l2norm() uses this as the reference value at time t and point (x, y).
+    """
+    decay = math.exp(-2.0 * alpha * (math.pi ** 2) * t / (length ** 2))
+    wave_x = math.sin(math.pi * x / length)
+    wave_y = math.sin(math.pi * y / length)
+    return decay * wave_x * wave_y
+
+@njit
+def l2norm(n, u, nsteps, dt, alpha, dx, length):
+    """Return the square root of the summed squared difference between u and solution().
+
+    The reference is evaluated at time dt * nsteps on the same dx-spaced
+    coordinates that initial_value() walks.
+    """
+    t_end = dt * nsteps
+    sq_err = 0.0
+
+    ypos = dx
+    for row in range(n):
+        xpos = dx
+        for col in range(n):
+            diff = u[row, col] - solution(t_end, xpos, ypos, alpha, length)
+            sq_err += diff ** 2
+            xpos += dx
+        ypos += dx
+
+    return math.sqrt(sq_err)
+
+@njit
+def solve(n, alpha, dx, dt, u, u_tmp):
+    """Write one stencil update of u into u_tmp inside an OpenMP 'parallel for' region.
+
+    Each u_tmp[j, i] is r2 * u[j, i] plus r times each of the four direct
+    neighbours; a neighbour outside the n x n grid contributes 0.0.
+    """
+    r = alpha * dt / (dx ** 2)
+    r2 = 1.0 - 4.0 * r
+    # Every iteration reads only u and writes only its own u_tmp[j, i].
+    with openmp('parallel for'):
+        for i in range(n):
+            for j in range(n):
+                u_tmp[j, i] = (r2 * u[j, i] +
+                               (r * u[j, i+1] if i < n-1 else 0.0) +
+                               (r * u[j, i-1] if i > 0   else 0.0) +
+                               (r * u[j+1, i] if j < n-1 else 0.0) +
+                               (r * u[j-1, i] if j > 0 else 0.0))
+
+@njit
+def core(nsteps, n, alpha, dx, dt, u, u_tmp):
+    """Advance the grid by nsteps calls to solve(), leaving the result in u.
+
+    Args:
+        nsteps: number of time steps to take.
+        n, alpha, dx, dt: forwarded unchanged to solve().
+        u: array holding the current state; holds the final state on return.
+        u_tmp: scratch array with the same shape as u; it is overwritten.
+    """
+    cur = u
+    nxt = u_tmp
+    for t in range(nsteps):
+        solve(n, alpha, dx, dt, cur, nxt)
+        cur, nxt = nxt, cur
+    # Swapping only rebinds local names. After an odd number of steps the
+    # newest state sits in the caller's u_tmp, so copy it back into the
+    # caller's u, which is the array the caller reads afterwards.
+    if nsteps > 0 and nsteps % 2 == 1:
+        u[:, :] = u_tmp
+
+
+if __name__ == "__main__":
+    # Wall-clock start for the "total time" line printed at the end.
+    start = omp_get_wtime()
+
+    # Defaults; replaced only when exactly two command-line arguments are given.
+    n, nsteps = 1000, 10
+    if len(sys.argv) == 3:
+        n, nsteps = int(sys.argv[1]), int(sys.argv[2])
+
+    alpha, length = 0.1, 1000.0
+    dx = length / (n + 1)
+    dt = 0.5 / nsteps
+    r = alpha * dt / (dx ** 2)
+
+    print(" MMS heat equation")
+    print("Problem input")
+    print(f" Grid size: {n} x {n}")
+    print(f" Cell width: {dx}")
+    print(f" Grid length: {length} x {length}\n")
+    print(f" Alpha: {alpha}\n")
+    print(f" Steps: {nsteps}")
+    print(f" Total time: {dt*nsteps}")
+    print(f" Time step: {dt}")
+
+    # Stability check
+    # NOTE(review): solve() weights the centre by 1 - 4*r; the usual bound for
+    # that 2-D update is r <= 0.25 -- confirm whether 0.5 is intended.
+    print("Stability")
+    print(f" r value: {r}")
+    if r > 0.5:
+        print("Warning: unstable")
+
+    # Compile core() for this explicit signature and report how long it took.
+    compile_begin = omp_get_wtime()
+    core.compile("none(int64, int64, float64, float64, float64, Array(float64, 2, 'C'), Array(float64, 2, 'C'))")
+    compile_end = omp_get_wtime()
+    print('core compile', compile_end-compile_begin)
+    print('COMPILED')
+
+    u = np.zeros((n,n))
+    u_tmp = np.zeros((n,n))
+    initial_value(n, dx, length, u)
+
+    # Only the core() call is counted as solve time.
+    solve_begin = omp_get_wtime()
+    core(nsteps, n, alpha, dx, dt, u, u_tmp)
+    solve_end = omp_get_wtime()
+
+    norm = l2norm(n, u, nsteps, dt, alpha, dx, length)
+
+    stop = omp_get_wtime()
+
+    print("Error (L2norm):", norm)
+    print("Solve time (s):", solve_end-solve_begin)
+    print("total time:", stop-start)
+
+print("DONE")
diff --git a/heatsim/heat-jit-omptarget-collapse.py b/heatsim/heat-jit-omptarget-collapse.py
@@ -0,0 +1,134 @@
+from numba import njit, prange
+from numba.core.typing.typeof import Purpose, typeof
+from numba.openmp import openmp_context as openmp
+from numba.openmp import omp_get_wtime, omp_set_num_threads, omp_get_num_threads, omp_get_num_devices, omp_is_initial_device, omp_get_thread_num
+import numpy as np
+import sys
+import math
+import cudaprofile
+
+@njit
+def initial_value(n, dx, length, u):
+    """Fill u[0:n, 0:n] with sin(pi*x/length) * sin(pi*y/length).
+
+    Both coordinates start at dx and advance by dx per index; the second
+    index of u follows x and the first follows y.
+    """
+    ypos = dx
+    for row in range(n):
+        xpos = dx
+        for col in range(n):
+            sin_x = math.sin(math.pi * xpos / length)
+            sin_y = math.sin(math.pi * ypos / length)
+            u[row, col] = sin_x * sin_y
+            xpos += dx
+        ypos += dx
+
+@njit
+def solution(t, x, y, alpha, length):
+    """Return exp(-2*alpha*pi^2*t/length^2) * sin(pi*x/length) * sin(pi*y/length).
+
+    l2norm() uses this as the reference value at time t and point (x, y).
+    """
+    decay = math.exp(-2.0 * alpha * (math.pi ** 2) * t / (length ** 2))
+    wave_x = math.sin(math.pi * x / length)
+    wave_y = math.sin(math.pi * y / length)
+    return decay * wave_x * wave_y
+
+@njit
+def l2norm(n, u, nsteps, dt, alpha, dx, length):
+    """Return the square root of the summed squared difference between u and solution().
+
+    The reference is evaluated at time dt * nsteps on the same dx-spaced
+    coordinates that initial_value() walks.
+    """
+    t_end = dt * nsteps
+    sq_err = 0.0
+
+    ypos = dx
+    for row in range(n):
+        xpos = dx
+        for col in range(n):
+            diff = u[row, col] - solution(t_end, xpos, ypos, alpha, length)
+            sq_err += diff ** 2
+            xpos += dx
+        ypos += dx
+
+    return math.sqrt(sq_err)
+
+@njit
+def solve(n, alpha, dx, dt, u, u_tmp):
+    """Write one stencil update of u into u_tmp inside an OpenMP target region.
+
+    The two grid loops are written as a single loop over n*n, with j and i
+    recovered from the flat index. Each u_tmp[j, i] is r2 * u[j, i] plus r
+    times each of the four direct neighbours; a neighbour outside the n x n
+    grid contributes 0.0.
+    """
+    r = alpha * dt / (dx ** 2)
+    r2 = 1.0 - 4.0 * r
+    # u and u_tmp are not named in any clause here; core() issues a
+    # "target enter data map(to: u, u_tmp)" before calling this function.
+    #with openmp ("target teams distribute parallel for map(to: n, r, r2) device(1)"):
+    with openmp ("target teams distribute parallel for firstprivate(n, r, r2) device(1)"):
+        for ji in range(n*n):
+            # ji/n is a true division; int() truncates it to give j.
+            j = int(ji/n)
+            i = int(ji - j*n)
+        # Nested-loop form that the flat loop above replaces:
+        #for j in range(n):
+        #    for i in range(n):
+            u_tmp[j, i] = (r2 * u[j, i] +
+                    (r * u[j, i+1] if i < n-1 else 0.0) +
+                    (r * u[j, i-1] if i > 0   else 0.0) +
+                    (r * u[j+1, i] if j < n-1 else 0.0) +
+                    (r * u[j-1, i] if j > 0 else 0.0))
+
+
+@njit
+def core(nsteps, n, alpha, dx, dt, u, u_tmp):
+    """Run nsteps calls to solve() with u and u_tmp mapped to device 1.
+
+    Args:
+        nsteps: number of time steps to take.
+        n, alpha, dx, dt: forwarded unchanged to solve().
+        u: array holding the current state; holds the final state on return.
+        u_tmp: scratch array with the same shape as u; it is overwritten.
+
+    Returns:
+        The omp_get_wtime() difference measured around the time-step loop
+        only; the enter/exit data regions are outside the measurement.
+    """
+    with openmp ("target enter data map(to: u, u_tmp) device(1)"):
+        pass
+
+    tic = omp_get_wtime()
+    for t in range(nsteps):
+        solve(n, alpha, dx, dt, u, u_tmp)
+        u, u_tmp = u_tmp, u
+    toc = omp_get_wtime()
+
+    # After the swaps, the local name u refers to the array that received the
+    # newest state, so that is the one copied back from the device.
+    with openmp ("target exit data map(from: u) device(1)"):
+        pass
+
+    # Why do we need this use?
+    # NOTE(review): kept from the original, which marks it as a workaround
+    # whose reason is unknown.
+    u = u
+
+    # Swapping only rebinds local names. After an odd number of steps the
+    # local u is the caller's u_tmp and the local u_tmp is the caller's u, so
+    # copy the result into the array the caller reads afterwards.
+    if nsteps > 0 and nsteps % 2 == 1:
+        u_tmp[:, :] = u
+    return toc-tic
+
+if __name__ == "__main__":
+    # Wall-clock start for the "total time" line printed at the end.
+    start = omp_get_wtime()
+
+    # Defaults; replaced only when exactly two command-line arguments are given.
+    n, nsteps = 1000, 10
+    if len(sys.argv) == 3:
+        n, nsteps = int(sys.argv[1]), int(sys.argv[2])
+
+    alpha, length = 0.1, 1000.0
+    dx = length / (n + 1)
+    dt = 0.5 / nsteps
+    r = alpha * dt / (dx ** 2)
+
+    print(" MMS heat equation")
+    print("Problem input")
+    print(f" Grid size: {n} x {n}")
+    print(f" Cell width: {dx}")
+    print(f" Grid length: {length} x {length}\n")
+    print(f" Alpha: {alpha}\n")
+    print(f" Steps: {nsteps}")
+    print(f" Total time: {dt*nsteps}")
+    print(f" Time step: {dt}")
+
+    # Stability check
+    # NOTE(review): solve() weights the centre by 1 - 4*r; the usual bound for
+    # that 2-D update is r <= 0.25 -- confirm whether 0.5 is intended.
+    print("Stability")
+    print(f" r value: {r}")
+    if r > 0.5:
+        print("Warning: unstable")
+
+    # First allocation and initial_value() call, made before core() is compiled.
+    u = np.zeros((n,n))
+    u_tmp = np.zeros((n,n))
+    initial_value(n, dx, length, u)
+
+    # Compile core() for this explicit signature and report how long it took.
+    compile_begin = omp_get_wtime()
+    core.compile("float64(int64, int64, float64, float64, float64, Array(float64, 2, 'C'), Array(float64, 2, 'C'))")
+    compile_end = omp_get_wtime()
+    print('core compile', compile_end-compile_begin)
+    print('COMPILED')
+
+    # Fresh arrays for the run that is bracketed by the CUDA profiler calls.
+    u = np.zeros((n,n))
+    u_tmp = np.zeros((n,n))
+    initial_value(n, dx, length, u)
+    cudaprofile.start()
+    exetime = core(nsteps, n, alpha, dx, dt, u, u_tmp)
+    cudaprofile.stop()
+
+    norm = l2norm(n, u, nsteps, dt, alpha, dx, length)
+
+    stop = omp_get_wtime()
+
+    print("Error (L2norm):", norm)
+    print("Solve time (s):", exetime)
+    print("total time:", stop-start)
+
+print("DONE")