diff --git a/.gitignore b/.gitignore index 59d905f..5284f58 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ cov_html .vscode .pytest_cache .coverage +.idea *.c diff --git a/Architecture.md b/Architecture.md index 2615a0a..5e92f48 100644 --- a/Architecture.md +++ b/Architecture.md @@ -40,19 +40,19 @@ Buffer: Types can be composed of: - scalar: numbers, String -- compound: Struct, Array, Ref, UnionRef +- compound: Struct, Array, Ref, UnionRef ### Scalars - examples: Float64, Int64, ... - create: Float64(3.14) - memory layout - - data + - data ### String: - create: String(string_or_int) - memory layout - size - - data + - data ### Struct diff --git a/LICENSE b/LICENSE index f49a4e1..261eeb9 100644 --- a/LICENSE +++ b/LICENSE @@ -198,4 +198,4 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. diff --git a/pyproject.toml b/pyproject.toml index 7959e2a..e5df678 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,43 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "xobjects" +dynamic = ["version"] +description = "In-memory serialization and code generator for CPU and GPU" +readme = "" +authors = [ + { name = "Riccardo De Maria", email = "riccardo.de.maria@cern.ch" } +] +license = { text = "Apache 2.0" } +requires-python = ">=3.7" +dependencies = [ + "numpy", + "cffi", + "scipy" +] +[project.urls] +Homepage = "https://xsuite.readthedocs.io/" +"Bug Tracker" = "https://github.com/xsuite/xsuite/issues" +Documentation = "https://xsuite.readthedocs.io/" +"Source Code" = "https://github.com/xsuite/xobjects" +"Download" = "https://pypi.python.org/pypi/xobjects" + +[project.optional-dependencies] +tests = ["pytest", "pytest-mock"] + +[tool.setuptools.packages.find] +where = ["."] +include = ["xobjects"] + +[tool.setuptools.dynamic] +version = {attr = "xobjects._version.__version__"} + [tool.black] line-length = 79 -target-version = ['py36', 'py37', 'py38'] +target-version = ['py310', 'py311', 'py312'] include = '\.pyi?$' + +[project.entry-points.xobjects] +include = "xobjects" diff --git a/tests/test_capi.py b/tests/test_capi.py index df2fcbb..096750d 100644 --- a/tests/test_capi.py +++ b/tests/test_capi.py @@ -11,7 +11,7 @@ import cffi import xobjects as xo - +from xobjects.test_helpers import for_all_test_contexts ffi = cffi.FFI() @@ -158,6 +158,70 @@ def test_array_dynamic_type_init_get_set(array_cls, example_shape): assert arr[ii].field1[idx_in_field] == 13 * vv +@for_all_test_contexts +@pytest.mark.parametrize( + "array_type", + [ + xo.UInt64[3, 5, 7], + xo.UInt64[:, :, :], + xo.UInt64[:, 5, :], + ], +) +def test_array_get_shape(test_context, array_type): + source = """ + #include "xobjects/headers/common.h" + + GPUKERN void get_nd_and_shape( + ARRAY_TYPE arr, + GPUGLMEM int64_t* out_nd, + GPUGLMEM int64_t* out_shape + ) { + *out_nd = ARRAY_TYPE_nd(arr); + ARRAY_TYPE_shape(arr, out_shape); + } + """.replace( + "ARRAY_TYPE", array_type.__name__ + ) + + kernels = { + "get_nd_and_shape": xo.Kernel( + c_name="get_nd_and_shape", + args=[ + xo.Arg(array_type, name="arr"), + xo.Arg(xo.Int64, pointer=True, name="out_nd"), + xo.Arg(xo.Int64, pointer=True, name="out_shape"), + ], + ), + } + + test_context.add_kernels( + sources=[source], + kernels=kernels, + ) + + instance = 
array_type( + np.array(range(3 * 5 * 7)).reshape((3, 5, 7)), + _context=test_context, + ) + + expected_nd = 3 + result_nd = test_context.zeros((1,), dtype=np.int64) + + expected_shape = [3, 5, 7] + result_shape = test_context.zeros((expected_nd,), dtype=np.int64) + + test_context.kernels.get_nd_and_shape( + arr=instance, + out_nd=result_nd, + out_shape=result_shape, + ) + + assert result_nd[0] == expected_nd + assert result_shape[0] == expected_shape[0] + assert result_shape[1] == expected_shape[1] + assert result_shape[2] == expected_shape[2] + + def test_struct1(): kernels = Struct1._gen_kernels() ctx = xo.ContextCpu() @@ -539,45 +603,190 @@ def test_getp1_dyn_length_dyn_type_string_array(): assert ord(ffi.cast("char *", s2)[8 + ii]) == ch -def test_gpu_api(): - for ctx in xo.context.get_test_contexts(): - src_code = """ - /*gpufun*/ - void myfun(double x, double y, - double* z){ - z[0] = x * y; - } +@for_all_test_contexts +def test_gpu_api(test_context): + src_code = """ + /*gpufun*/ + void myfun(double x, double y, + double* z){ + z[0] = x * y; + } - /*gpukern*/ - void my_mul(const int n, - /*gpuglmem*/ const double* x1, - /*gpuglmem*/ const double* x2, - /*gpuglmem*/ double* y) { - int tid = 0 //vectorize_over tid n - double z; - myfun(x1[tid], x2[tid], &z); - y[tid] = z; - //end_vectorize + /*gpukern*/ + void my_mul(const int n, + /*gpuglmem*/ const double* x1, + /*gpuglmem*/ const double* x2, + /*gpuglmem*/ double* y) { + int tid = 0 //vectorize_over tid n + double z; + myfun(x1[tid], x2[tid], &z); + y[tid] = z; + //end_vectorize + } + """ + + kernel_descriptions = { + "my_mul": xo.Kernel( + args=[ + xo.Arg(xo.Int32, name="n"), + xo.Arg(xo.Float64, pointer=True, const=True, name="x1"), + xo.Arg(xo.Float64, pointer=True, const=True, name="x2"), + xo.Arg(xo.Float64, pointer=True, const=False, name="y"), + ], + n_threads="n", + ), + } + + test_context.add_kernels( + sources=[src_code], + kernels=kernel_descriptions, + save_source_as=None, + compile=True, + extra_classes=[xo.String[:]], + ) + + +@for_all_test_contexts +def test_array_of_arrays(test_context): + cell_ids = [3, 5, 7] + particle_per_cell = [ + [1, 8], + [9, 3, 2], + [4, 5, 6, 7], + ] + + class Cells(xo.Struct): + ids = xo.Int64[:] + particles = xo.Int64[:][:] + + cells = Cells( + ids=cell_ids, particles=particle_per_cell, _context=test_context + ) + + # Data layout (displayed as uint64): + # + # [0] 216 (cells size) + # [8] 56 (offset field 2 -- particles field) + # [16] cell_ids data: + # [0] 40 (cell_ids size) + # [8] 3 (cell_ids length) + # [16] {3, 5, 7} (cell_ids elements) + # [56] particles data: + # [0] 160 (particles size) + # [8] 3 (particles length) + # [16] 40 (offset particles[0]) + # [24] 72 (offset particles[1]) + # [32] 112 (offset particles[2]) + # [40] particles[0] data: + # [0] 32 (particles[0] size) + # [8] 2 (particles[0] length) + # [16] {1, 8} (particles[0] elements) + # [72] particles[1] data: + # [0] 40 (particles[1] size) + # [8] 3 (particles[1] length) + # [16] {9, 3, 2} (particles[1 + # [112] particles[2] data: + # [0] 48 (particles[2] size) + # [8] 4 (particles[2] length) + # [16] {4, 5, 6, 7} (particles[2] elements) + + src = r""" + #include "xobjects/headers/common.h" + + static const int MAX_PARTICLES = 4; + static const int MAX_CELLS = 3; + + GPUKERN void loop_over( + Cells cells, + GPUGLMEM uint64_t* out_counts, + GPUGLMEM uint64_t* out_vals, + GPUGLMEM uint8_t* success + ) + { + int64_t num_cells = Cells_len_ids(cells); + + for (int64_t i = 0; i < num_cells; i++) { + int64_t id = 
Cells_get_ids(cells, i); + int64_t count = Cells_len1_particles(cells, i); + + if (i >= MAX_CELLS) { + *success = 0; + continue; } - """ - - kernel_descriptions = { - "my_mul": xo.Kernel( - args=[ - xo.Arg(xo.Int32, name="n"), - xo.Arg(xo.Float64, pointer=True, const=True, name="x1"), - xo.Arg(xo.Float64, pointer=True, const=True, name="x2"), - xo.Arg(xo.Float64, pointer=True, const=False, name="y"), - ], - n_threads="n", - ), + + out_counts[i] = count; + + ArrNInt64 particles = Cells_getp1_particles(cells, i); + uint32_t num_particles = ArrNInt64_len(particles); + + VECTORIZE_OVER(j, num_particles); + int64_t val = ArrNInt64_get(particles, j); + + if (j >= MAX_PARTICLES) { + *success = 0; + } else { + out_vals[i * MAX_PARTICLES + j] = val; + } + END_VECTORIZE; } + } + + GPUKERN void kernel_Cells_get_particles( + Cells obj, + int64_t i0, + int64_t i1, + GPUGLMEM int64_t* out + ) { + *out = Cells_get_particles(obj, i0, i1); + } + """ + + kernels = { + "loop_over": xo.Kernel( + args=[ + xo.Arg(Cells, name="cells"), + xo.Arg(xo.UInt64, pointer=True, name="out_counts"), + xo.Arg(xo.UInt64, pointer=True, name="out_vals"), + xo.Arg(xo.UInt8, pointer=True, name="success"), + ], + n_threads=4, + ), + "kernel_Cells_get_particles": xo.Kernel( + args=[ + xo.Arg(Cells, name="obj"), + xo.Arg(xo.Int64, name="i0"), + xo.Arg(xo.Int64, name="i1"), + xo.Arg(xo.Int64, pointer=True, name="out"), + ], + ), + } + + test_context.add_kernels( + sources=[src], + kernels=kernels, + ) - ctx.add_kernels( - sources=[src_code], - kernels=kernel_descriptions, - # save_src_as=f'_test_{name}.c') - save_source_as=None, - compile=True, - extra_classes=[xo.String[:]], - ) + counts = test_context.zeros(len(cell_ids), dtype=np.uint64) + vals = test_context.zeros(12, dtype=np.uint64) + success = test_context.zeros((1,), dtype=np.uint8) + 1 + + for i, _ in enumerate(particle_per_cell): + for j, expected in enumerate(particle_per_cell[i]): + result = test_context.zeros(shape=(1,), dtype=np.int64) + test_context.kernels.kernel_Cells_get_particles( + obj=cells, i0=i, i1=j, out=result + ) + assert result[0] == expected + + test_context.kernels.loop_over( + cells=cells, + out_counts=counts, + out_vals=vals, + success=success, + ) + counts = test_context.nparray_from_context_array(counts) + vals = test_context.nparray_from_context_array(vals) + + assert success[0] == 1 + assert np.all(counts == [2, 3, 4]) + assert np.all(vals == [1, 8, 0, 0, 9, 3, 2, 0, 4, 5, 6, 7]) diff --git a/tests/test_common.py b/tests/test_common.py new file mode 100644 index 0000000..c1a741c --- /dev/null +++ b/tests/test_common.py @@ -0,0 +1,179 @@ +# copyright ################################# # +# This file is part of the Xobjects Package. # +# Copyright (c) CERN, 2025. 
# +# ########################################### # + +import xobjects as xo +import numpy as np +import pytest + +from xobjects.test_helpers import for_all_test_contexts + + +@for_all_test_contexts +def test_common_atomicadd(test_context): + src = r""" + #include "xobjects/headers/common.h" + #include "xobjects/headers/atomicadd.h" + + GPUKERN void test_atomic_add(GPUGLMEM double* out, int32_t iterations) + { + VECTORIZE_OVER(i, iterations); + // If on CPU do some work to avoid the loop being optimized out + #if defined(XO_CONTEXT_CPU_OPENMP) + usleep(10); + #endif + atomicAdd(out, 1.0); + END_VECTORIZE; + } + """ + + n_threads = 1 + if type(test_context).__name__ in {"ContextCupy", "ContextPyopencl"}: + n_threads = 1000 + elif ( + test_context.omp_num_threads == "auto" + or test_context.omp_num_threads > 1 + ): + n_threads = 8 + + test_context.add_kernels( + sources=[src], + kernels={ + "test_atomic_add": xo.Kernel( + c_name="test_atomic_add", + args=[ + xo.Arg(xo.Float64, pointer=True, name="out"), + xo.Arg(xo.Int32, name="iterations"), + ], + n_threads=n_threads, + ) + }, + ) + + expected = 1000 + result = np.array([0], dtype=np.float64) + result_ctx = test_context.nparray_to_context_array(result) + test_context.kernels.test_atomic_add(out=result_ctx, iterations=expected) + result = test_context.nparray_from_context_array(result_ctx) + + assert result == expected + + +@for_all_test_contexts +@pytest.mark.parametrize( + "overload", [True, False], ids=["overload", "no_overload"] +) +@pytest.mark.parametrize( + "ctype", + [ + xo.Int8, + xo.Int16, + xo.Int32, + xo.Int64, + xo.UInt8, + xo.UInt16, + xo.UInt32, + xo.UInt64, + xo.Float32, + xo.Float64, + ], +) +def test_atomic(overload, ctype, test_context): + if overload: + func_name = "atomicAdd" + else: + func_name = f'atomicAdd_{ctype.__name__.lower()[0]}{ctype.__name__.split("t")[1]}' + + class TestAtomic(xo.Struct): + val = ctype + _extra_c_sources = [ + f""" + #include "xobjects/headers/common.h" + #include "xobjects/headers/atomicadd.h" + + GPUKERN + void run_atomic_test(TestAtomic el, GPUGLMEM {ctype._c_type}* increments, + GPUGLMEM {ctype._c_type}* retvals, int length) {{ + VECTORIZE_OVER(ii, length); + GPUGLMEM {ctype._c_type}* val = TestAtomic_getp_val(el); + {ctype._c_type} ret = {func_name}(val, increments[ii]); + retvals[ii] = ret; + END_VECTORIZE; + }} + """ + ] + + kernels = { + "run_atomic_test": xo.Kernel( + c_name="run_atomic_test", + args=[ + xo.Arg(TestAtomic, name="el"), + xo.Arg(ctype, pointer=True, name="increments"), + xo.Arg(ctype, pointer=True, name="retvals"), + xo.Arg(xo.Int32, name="length"), + ], + n_threads="length", + ) + } + + atomic = TestAtomic(_context=test_context, val=0) + assert atomic.val == 0 + + test_context.add_kernels(kernels=kernels) + + # Test with all increments = 1, so we can check the return values easily. 
+ num_steps = 10000 + if ctype.__name__.startswith("Int") or ctype.__name__.startswith("Uint"): + # Less steps to avoid overflow + num_steps = min(num_steps, 2 ** (8 * ctype._size - 1) - 1) + increments = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) + 1 + retvals = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) + test_context.kernels.run_atomic_test( + el=atomic, increments=increments, retvals=retvals, length=num_steps + ) + assert atomic.val == num_steps + retvals = np.sort(test_context.nparray_from_context_array(retvals)) + assert np.allclose( + retvals, + np.arange(num_steps, dtype=ctype._dtype), + atol=1.0e-15, + rtol=1.0e-15, + ) + + # Test with random increments, where we now only can check the total sum + # (retvals can be anything). Watch out: overflow is undefined behaviour, + # except for unsigned integers, so we skip this test for signed integers. + atomic.val = 0 + retvals = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) + if ctype.__name__.startswith("Uint"): + low = 0 + high = 2 ** (8 * ctype._size) - 1 + increments = np.random.randint( + low, high + 1, size=num_steps, dtype=ctype._dtype + ) + increments = test_context.nparray_to_context_array(increments) + test_context.kernels.run_atomic_test( + el=atomic, increments=increments, retvals=retvals, length=num_steps + ) + increments = test_context.nparray_from_context_array(increments) + assert atomic.val == ( + np.sum(increments).item() % (2 ** (8 * ctype._size)) + ) + + elif ctype.__name__.startswith("Float"): + increments = np.zeros(shape=(num_steps,), dtype=ctype._dtype) + increments += np.random.uniform(0, 10, size=num_steps) + increments = test_context.nparray_to_context_array(increments) + test_context.kernels.run_atomic_test( + el=atomic, increments=increments, retvals=retvals, length=num_steps + ) + increments = test_context.nparray_from_context_array(increments) + if ctype == xo.Float32: + assert np.isclose( + atomic.val, np.sum(increments), atol=10.0, rtol=1.0e-4 + ) + else: + assert np.isclose( + atomic.val, np.sum(increments), atol=1.0e-6, rtol=1.0e-12 + ) diff --git a/tests/test_shared_memory.py b/tests/test_shared_memory.py index 4f2e6fc..31156d6 100644 --- a/tests/test_shared_memory.py +++ b/tests/test_shared_memory.py @@ -53,7 +53,7 @@ class TestElement(xo.HybridClass): // sum s[0] += s[1] if (tid == 0){ - sdata[tid] += sdata[tid + 1]; + sdata[tid] += sdata[tid + 1]; // write sum from shared to global mem atomicAdd(&result[tid], sdata[tid]); diff --git a/update_cprght_statement.py b/update_cprght_statement.py index c3db6d0..0e9c049 100644 --- a/update_cprght_statement.py +++ b/update_cprght_statement.py @@ -6,8 +6,8 @@ import os copyright_statement = """copyright ################################# -This file is part of the Xobjects Package. -Copyright (c) CERN, 2021. +This file is part of the Xobjects Package. +Copyright (c) CERN, 2021. 
###########################################""" config = [ diff --git a/xobjects/capi.py b/xobjects/capi.py index 622803f..ec09ea6 100644 --- a/xobjects/capi.py +++ b/xobjects/capi.py @@ -5,7 +5,7 @@ from .context import Kernel, Arg -from .scalar import Int64, Void, Int8, is_scalar +from .scalar import UInt32, Int64, Void, is_scalar from .struct import is_field, is_struct from .array import is_index, is_array from .ref import is_unionref, is_ref @@ -103,9 +103,10 @@ def get_layers(parts): def int_from_obj(offset, conf): - inttype = gen_pointer(conf.get("inttype", "int64_t") + "*", conf) - chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) - return f"*({inttype})(({chartype}) obj+{offset})" + """Generate code to read the integer at location obj + offset (bytes).""" + int_pointer_type = gen_pointer(conf.get("inttype", "int64_t") + "*", conf) + char_pointer_type = gen_pointer(conf.get("chartype", "char") + "*", conf) + return f"*({int_pointer_type})(({char_pointer_type}) obj+{offset})" def Field_get_c_offset(self, conf): @@ -146,7 +147,7 @@ def Index_get_c_offset(part, conf, icount): out.append(f" offset+={soffset};") else: lookup_field_offset = f"offset+{soffset}" - out.append(f" offset={int_from_obj(lookup_field_offset, conf)};") + out.append(f" offset+={int_from_obj(lookup_field_offset, conf)};") return out @@ -206,16 +207,17 @@ def gen_fun_kernel(cls, path, action, const, extra, ret, add_nindex=False): def gen_c_pointed(target: Arg, conf): size = gen_c_size_from_arg(target, conf) ret = gen_c_type_from_arg(target, conf) + if target.pointer or is_compound(target.atype) or is_string(target.atype): chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) return f"({ret})(({chartype}) obj+offset)" - else: - rettype = gen_pointer(ret + "*", conf) - if size == 1: - return f"*(({rettype}) obj+offset)" - else: - chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) - return f"*({rettype})(({chartype}) obj+offset)" + + rettype = gen_pointer(ret + "*", conf) + if size == 1: + return f"*(({rettype}) obj+offset)" + + chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) + return f"*({rettype})(({chartype}) obj+offset)" def gen_method_get(cls, path, conf): @@ -364,13 +366,74 @@ def gen_method_size(cls, path, conf): def gen_method_shape(cls, path, conf): - "return shape in an array" - return None, None + array_type = path[-1] + + out_shape_arg = Arg(Int64, name="out_shape", pointer=True) + + kernel = gen_fun_kernel( + cls, + path, + const=False, + action="shape", + extra=[out_shape_arg], + ret=None, + add_nindex=True, + ) + decl = gen_c_decl_from_kernel(kernel, conf) + + lst = [decl + "{"] + + nd = len(array_type._shape) + + if array_type._is_static_shape: + terms = [str(dim) for dim in array_type._shape] + else: + lst.append(gen_method_offset(path, conf)) + arrarg = Arg(Int64, pointer=True) + pointed = gen_c_pointed(arrarg, conf) + typearr = gen_pointer("int64_t*", conf) + lst.append(f" {typearr} arr = {pointed};") + dim_len_idx = 1 + terms = [] + for dim_len in array_type._shape: + if dim_len: + terms.append(str(dim_len)) + else: + terms.append(f"arr[{dim_len_idx}]") + dim_len_idx += 1 + + for dim_idx in range(nd): + lst.append(f" out_shape[{dim_idx}] = {terms[dim_idx]};") + + lst.append("}") + + return "\n".join(lst), kernel def gen_method_nd(cls, path, conf): "return length of shape" - return None, None + array_type = path[-1] + + retarg = Arg(UInt32) + + kernel = gen_fun_kernel( + cls, + path, + const=False, + action="nd", + extra=[], + ret=retarg, + 
add_nindex=True, + ) + decl = gen_c_decl_from_kernel(kernel, conf) + + lst = [decl + "{"] + + nd = len(array_type._shape) + lst.append(f" return {nd};") + lst.append("}") + + return "\n".join(lst), kernel def gen_method_strides(cls, path, conf): @@ -507,8 +570,8 @@ def methods_from_path(cls, path, conf): if is_array(lasttype): out.append(gen_method_len(cls, path, conf)) - # out.append(gen_method_shape(cls, path, conf)) - # out.append(gen_method_nd(cls, path, conf)) + out.append(gen_method_shape(cls, path, conf)) + out.append(gen_method_nd(cls, path, conf)) # out.append(gen_method_strides(cls, path, conf)) # out.append(gen_method_getpos(cls, path, conf)) diff --git a/xobjects/context.py b/xobjects/context.py index bcd03db..0ae2b54 100644 --- a/xobjects/context.py +++ b/xobjects/context.py @@ -375,12 +375,30 @@ def get_installed_c_source_paths(self) -> List[str]: return sources @abstractmethod - def nparray_to_context_array(self, arr): - return arr + def nparray_to_context_array(self, arr, copy=False): + """Obtain an array on the context, given a numpy array. + + Args: + arr: numpy array + copy: if True, always create a copy in the context. If False, + try to avoid copy if possible (not guaranteed). + + Returns: + array on the context + """ @abstractmethod - def nparray_from_context_array(self, dev_arr): - return dev_arr + def nparray_from_context_array(self, dev_arr, copy=False): + """Obtain a numpy array, given an array on the context. + + Args: + arr: array on the context + copy: if True, always create a copy in the context. If False, + try to avoid copy if possible (not guaranteed). + + Returns: + Numpy array + """ @property @abstractmethod diff --git a/xobjects/context_cpu.py b/xobjects/context_cpu.py index 4104ed0..c85bbcd 100644 --- a/xobjects/context_cpu.py +++ b/xobjects/context_cpu.py @@ -543,7 +543,7 @@ def cffi_module_for_c_types(c_types, containing_dir="."): return None - def nparray_to_context_array(self, arr): + def nparray_to_context_array(self, arr, copy=False): """ Moves a numpy array to the device memory. No action is performed by this function in the CPU context. The method is provided @@ -551,14 +551,16 @@ def nparray_to_context_array(self, arr): Args: arr (numpy.ndarray): Array to be transferred + copy (bool): If True, a copy of the array is made. Returns: - numpy.ndarray: The same array (no copy!). - + numpy.ndarray: Numpy array with the same data, original or a copy. """ + if copy: + arr = np.copy(arr) return arr - def nparray_from_context_array(self, dev_arr): + def nparray_from_context_array(self, dev_arr, copy=False): """ Moves an array to the device to a numpy array. No action is performed by this function in the CPU context. The method is provided so that the CPU @@ -566,10 +568,13 @@ def nparray_from_context_array(self, dev_arr): Args: dev_arr (numpy.ndarray): Array to be transferred - Returns: - numpy.ndarray: The same array (no copy!) + copy (bool): If True, a copy of the array is made. + Returns: + numpy.ndarray: Numpy array with the same data, original or a copy. """ + if copy: + dev_arr = np.copy(dev_arr) return dev_arr @property @@ -579,7 +584,6 @@ def nplike_lib(self): through ``nplike_lib`` to keep compatibility with the other contexts. """ - return np @property @@ -589,7 +593,6 @@ def splike_lib(self): through ``splike_lib`` to keep compatibility with the other contexts. 
""" - return sp def synchronize(self): diff --git a/xobjects/context_cupy.py b/xobjects/context_cupy.py index 8b93372..3838376 100644 --- a/xobjects/context_cupy.py +++ b/xobjects/context_cupy.py @@ -487,29 +487,32 @@ def build_kernels( def __str__(self): return f"{type(self).__name__}:{cupy.cuda.get_device_id()}" - def nparray_to_context_array(self, arr): + def nparray_to_context_array(self, arr, copy=False): """ Copies a numpy array to the device memory. Args: arr (numpy.ndarray): Array to be transferred + copy (bool): This parameter is ignored for CUDA, as the data lives + on a different device. Returns: cupy.ndarray:The same array copied to the device. - """ dev_arr = cupy.array(arr) return dev_arr - def nparray_from_context_array(self, dev_arr): + def nparray_from_context_array(self, dev_arr, copy=False): """ Copies an array to the device to a numpy array. Args: dev_arr (cupy.ndarray): Array to be transferred. + copy (bool): This parameter is ignored for CUDA, as the data lives + on a different device. + Returns: numpy.ndarray: The same data copied to a numpy array. - """ return dev_arr.get() diff --git a/xobjects/context_pyopencl.py b/xobjects/context_pyopencl.py index 843bf85..b48e027 100644 --- a/xobjects/context_pyopencl.py +++ b/xobjects/context_pyopencl.py @@ -253,28 +253,30 @@ def __str__(self): device_id = self.platform.get_devices().index(self.device) return f"{type(self).__name__}:{platform_id}.{device_id}" - def nparray_to_context_array(self, arr): - """ - Copies a numpy array to the device memory. + def nparray_to_context_array(self, arr, copy=False): + """Copies a numpy array to the device memory. + Args: arr (numpy.ndarray): Array to be transferred + copy (bool): This parameter is ignored for OpenCL, as the data lives + on a different device. Returns: pyopencl.array.Array:The same array copied to the device. - """ dev_arr = cla.to_device(self.queue, arr) return dev_arr - def nparray_from_context_array(self, dev_arr): - """ - Copies an array to the device to a numpy array. + def nparray_from_context_array(self, dev_arr, copy=False): + """Copies an array to the device to a numpy array. Args: dev_arr (pyopencl.array.Array): Array to be transferred. + copy (bool): This parameter is ignored for OpenCL, as the data lives + on a different device. + Returns: numpy.ndarray: The same data copied to a numpy array. - """ return dev_arr.get() diff --git a/xobjects/headers/atomicadd.h b/xobjects/headers/atomicadd.h new file mode 100644 index 0000000..6782705 --- /dev/null +++ b/xobjects/headers/atomicadd.h @@ -0,0 +1,350 @@ +// copyright ################################# // +// This file is part of the Xtrack Package. // +// Copyright (c) CERN, 2025. // +// ########################################### // + +#ifndef _ATOMICADD_H_ +#define _ATOMICADD_H_ + +#include "xobjects/headers/common.h" + + +// Ensure no conflicting atomicAdd macro is defined earlier for CPU or OpenCL +#ifndef XO_CONTEXT_CUDA +#ifdef atomicAdd + #warning "Warning: atomicAdd macro already defined, undefining it." 
+ #undef atomicAdd +#endif // atomicAdd +#endif // XO_CONTEXT_CUDA + + +// ################################# // +// ### CPU Contexts ### // +// ################################# // + +#ifdef XO_CONTEXT_CPU +#include + +#ifdef XO_CONTEXT_CPU_OPENMP + #ifndef _OPENMP + #error "XO_CONTEXT_CPU_OPENMP set, but compiled without -fopenmp" + #endif + // OpenMP atomic capture gives us read+write atomically + #define OMP_ATOMIC_CAPTURE _Pragma("omp atomic capture") +#else + #define OMP_ATOMIC_CAPTURE // No OpenMP: non-atomic fallback +#endif // XO_CONTEXT_CPU_OPENMP + +// Macro to define atomicAdd for different types, will be overloaded via _Generic. +#define DEF_ATOMIC_ADD(T, SUF) \ +GPUFUN T atomicAdd_##SUF(GPUGLMEM T *addr, T val) { \ + T old; \ + const T inc = (T)val; \ + OMP_ATOMIC_CAPTURE \ + { old = *addr; *addr = *addr + inc; } \ + return old; \ +} + +DEF_ATOMIC_ADD(int8_t , i8) +DEF_ATOMIC_ADD(int16_t, i16) +DEF_ATOMIC_ADD(int32_t, i32) +DEF_ATOMIC_ADD(int64_t, i64) +DEF_ATOMIC_ADD(uint8_t , u8) +DEF_ATOMIC_ADD(uint16_t, u16) +DEF_ATOMIC_ADD(uint32_t, u32) +DEF_ATOMIC_ADD(uint64_t, u64) +DEF_ATOMIC_ADD(float , f32) +DEF_ATOMIC_ADD(double , f64) + +// Using _Generic to select the right function based on type (since C11). +// See https://en.cppreference.com/w/c/language/generic.html +#define atomicAdd(addr, val) _Generic((addr), \ + int8_t*: atomicAdd_i8, \ + int16_t*: atomicAdd_i16, \ + int32_t*: atomicAdd_i32, \ + int64_t*: atomicAdd_i64, \ + uint8_t*: atomicAdd_u8, \ + uint16_t*: atomicAdd_u16, \ + uint32_t*: atomicAdd_u32, \ + uint64_t*: atomicAdd_u64, \ + float*: atomicAdd_f32, \ + double*: atomicAdd_f64 \ +)(addr, (val)) +#endif // XO_CONTEXT_CPU + + +// ################################# // +// ### GPU Contexts ### // +// ################################# // + +/* -------------- Design notes for GPU atomics --------------------------------- +* We will go for a unified approach for CUDA and OpenCL, with macros where needed. +* We implement 8/16-bit atomic add by atomically CAS'ing the *containing* + * 32-bit word. Only the target byte/halfword lane is modified; the neighbor + * lane is preserved. This is linearisable: each successful CAS is one atomic + * RMW on the 32-bit word. + * Assumptions: little-endian lane layout (true on NVIDIA) and natural alignment + * of the 16-bit addresses (addr % 2 == 0). 8-bit has no alignment requirement. + * Return value matches CUDA semantics: the **old** value at *addr* (fetch-add). + * ---------------------------------------------------------------------------*/ + +// Info: a CAS function (Compare-And-Swap) takes three arguments: a pointer to +// the value to be modified, the expected old value, and the new value. The +// second argument helps to recognise if another thread has modified the value +// in the meantime. The function compares the value at the address with the +// expected values, and if they are the same, it writes the new value at the +// address. In any case, the function returns the actual old value at the +// address. If the returned value is different from the expected value, it +// means that another thread has modified the value in the meantime. + +// Define types and macros for CUDA and OpenCL +// ------------------------------------------- +#if defined(XO_CONTEXT_CUDA) + // CUDA compiler may not have , so define the types if needed. 
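+    // (The header referred to above is <stdint.h>; the typedefs below provide
+    // the same fixed-width integer types when it cannot be included.)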
+ #ifdef __CUDACC_RTC__ + // NVRTC (CuPy RawModule default) can’t see , so detect it via __CUDACC_RTC__ + typedef signed char int8_t; + typedef short int16_t; + typedef int int32_t; + typedef long long int64_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + typedef unsigned long long uint64_t; + #else + // Alternatively, NVCC path is fine with host headers + #include + #endif // __CUDACC_RTC__ + #define GPUVOLATILE GPUGLMEM + #define __XT_CAS_U32(ptr, exp, val) atomicCAS((GPUVOLATILE uint32_t*)(ptr), (uint32_t)(exp), (uint32_t)(val)) + #define __XT_CAS_U64(ptr, exp, val) atomicCAS((GPUVOLATILE uint64_t*)(ptr), (uint64_t)(exp), (uint64_t)(val)) + #define __XT_AS_U32_FROM_F32(x) __float_as_uint(x) + #define __XT_AS_F32_FROM_U32(x) __uint_as_float(x) + #define __XT_AS_U64_FROM_F64(x) __double_as_longlong(x) + #define __XT_AS_F64_FROM_U64(x) __longlong_as_double(x) +#elif defined(XO_CONTEXT_CL) + // It seems OpenCL already has the types from defined. + // typedef char int8_t; + // typedef short int16_t; + // typedef int int32_t; + // typedef long int64_t; + // typedef unsigned char uint8_t; + // typedef unsigned short uint16_t; + // typedef unsigned int uint32_t; + // typedef unsigned long uint64_t; + #define GPUVOLATILE GPUGLMEM volatile + #if __OPENCL_C_VERSION__ < 110 + // Map 1.0 "atom_*" names to 1.1+ "atomic_*" + #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + #define atomic_add atom_add + #define atomic_cmpxchg atom_cmpxchg + #endif + #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable + #pragma OPENCL EXTENSION cl_khr_fp64 : enable + #define __XT_CAS_U32(ptr, exp, val) atomic_cmpxchg((GPUVOLATILE uint32_t*)(ptr), (uint32_t)(exp), (uint32_t)(val)) + #define __XT_CAS_U64(ptr, exp, val) atom_cmpxchg((GPUVOLATILE uint64_t*)(ptr), (uint64_t)(exp), (uint64_t)(val)) + #define __XT_AS_U32_FROM_F32(x) as_uint(x) + #define __XT_AS_F32_FROM_U32(x) as_float(x) + #define __XT_AS_U64_FROM_F64(x) as_ulong(x) + #define __XT_AS_F64_FROM_U64(x) as_double(x) +#endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + +// Define atomic functions per type +// -------------------------------- +#if defined(XO_CONTEXT_CUDA) || defined(XO_CONTEXT_CL) + // Helper: compute (base 32-bit word pointer, shift, mask) for a byte in that word. + GPUFUN inline void __xt_lane8(GPUVOLATILE void* addr, GPUVOLATILE uint32_t** w, uint32_t* sh, uint32_t* mask){ + size_t a = (size_t)addr; + *w = (GPUVOLATILE uint32_t*)(a & ~(size_t)3); // word: align down to 4-byte boundary + *sh = (uint32_t)((a & (size_t)3) * 8U); // shift 0,8,16,24 depending on byte lane + *mask = (uint32_t)(0xFFu << *sh); + } + // Helper: same for a halfword (16-bit) in the containing 32-bit word. + GPUFUN inline void __xt_lane16(GPUVOLATILE void* addr, GPUVOLATILE uint32_t** w, uint32_t* sh, uint32_t* mask){ + size_t a = (size_t)addr; + *w = (GPUVOLATILE uint32_t*)(a & ~(size_t)3); // word: align down to 4-byte boundary + *sh = (uint32_t)((a & (size_t)2) ? 
16U : 0U); // shift0 or 16 depending on halfword + *mask = (uint32_t)(0xFFFFU << *sh); + } + + // ---------------- 8-bit: int8_t / uint8_t (CAS on 32-bit word) -------------- + GPUFUN int8_t atomicAdd_i8(GPUVOLATILE int8_t* addr, int8_t val){ + GPUVOLATILE uint32_t *w; + uint32_t sh, mask; + __xt_lane8(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; // byte, newbyte, newword + do { + assumed = old; + b = (assumed & mask) >> sh; // Extract current 8-bit lane + nb = (uint32_t)((uint8_t)b + (uint8_t)val); // Add in modulo-256 (two's complement) + nw = (assumed & ~mask) | ((nb & 0xFFU) << sh); // Merge back updated lane; leave neighbor lanes intact + // Try to publish; if someone raced us, retry with their value + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (int8_t)((assumed & mask) >> sh); + } + GPUFUN uint8_t atomicAdd_u8(GPUVOLATILE uint8_t* addr, uint8_t val){ + GPUVOLATILE uint32_t *w; + uint32_t sh, mask; + __xt_lane8(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; + do { + assumed = old; + b = (assumed & mask) >> sh; + nb = (uint32_t)(b + val); /* mod 256 */ + nw = (assumed & ~mask) | ((nb & 0xFFU) << sh); + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (uint8_t)((assumed & mask) >> sh); + } + + // ---------------- 16-bit: int16_t / uint16_t (CAS on 32-bit word) ----------- + GPUFUN int16_t atomicAdd_i16(GPUVOLATILE int16_t* addr, int16_t val){ + GPUVOLATILE uint32_t* w; + uint32_t sh, mask; + __xt_lane16(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; + do { + assumed = old; + b = (assumed & mask) >> sh; + nb = (uint32_t)((uint16_t)b + (uint16_t)val); + nw = (assumed & ~mask) | ((nb & 0xFFFFU) << sh); + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (int16_t)((assumed & mask) >> sh); + } + GPUFUN uint16_t atomicAdd_u16(GPUVOLATILE uint16_t* addr, uint16_t val){ + GPUVOLATILE uint32_t* w; + uint32_t sh, mask; + __xt_lane16(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; + do { + assumed = old; + b = (assumed & mask) >> sh; + nb = (uint32_t)(b + val); + nw = (assumed & ~mask) | ((nb & 0xFFFFU) << sh); + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (uint16_t)((assumed & mask) >> sh); + } + + // ---------------- 32-bit: int32_t / uint32_t (built-in) ----------- + GPUFUN int32_t atomicAdd_i32(GPUVOLATILE int32_t* addr, int32_t val){ + #ifdef XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + return atomic_add(addr, val); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + GPUFUN uint32_t atomicAdd_u32(GPUVOLATILE uint32_t* addr, uint32_t val){ + #ifdef XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + return atomic_add(addr, val); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + + // ---------------- 64-bit: int64_t / uint64_t (built-in or CAS on 64-bit word) ----------- + GPUFUN int64_t atomicAdd_i64(GPUVOLATILE int64_t* addr, int64_t val){ + uint64_t old, nw; + do { + old = *addr; + nw = old + val; + } while (__XT_CAS_U64((GPUVOLATILE uint64_t*)addr, old, nw) != old); + return old; + } + GPUFUN uint64_t atomicAdd_u64(GPUVOLATILE uint64_t* addr, uint64_t val){ + #ifdef XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + return atom_add(addr, val); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + + // ---------------- 32-bit: float (built-in or CAS on 32-bit word) ----------- + GPUFUN float atomicAdd_f32(GPUVOLATILE float* addr, float val){ + #ifdef 
XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + uint32_t old, nw; + do { + old = __XT_AS_U32_FROM_F32(*addr); + nw = __XT_AS_U32_FROM_F32(__XT_AS_F32_FROM_U32(old) + val); + } while (__XT_CAS_U32((GPUVOLATILE uint32_t*)addr, old, nw) != old); + return __XT_AS_F32_FROM_U32(old); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + + // ---------------- 64-bit: float (built-in or CAS on 64-bit word) ----------- + GPUFUN double atomicAdd_f64(GPUVOLATILE double* addr, double val){ + #if __CUDA_ARCH__ >= 600 + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL || __CUDA_ARCH__ < 600 + uint64_t old, nw; + do { + old = __XT_AS_U64_FROM_F64(*addr); + nw = __XT_AS_U64_FROM_F64(__XT_AS_F64_FROM_U64(old) + val); + } while (__XT_CAS_U64((GPUVOLATILE uint64_t*)addr, old, nw) != old); + return __XT_AS_F64_FROM_U64(old); + #endif // __CUDA_ARCH__ >= 600 / XO_CONTEXT_CL + } +#endif // defined(XO_CONTEXT_CUDA) || defined(XO_CONTEXT_CL) + +// Define the overloaded function +// ------------------------------ +#ifdef XO_CONTEXT_CUDA + // NVRTC (CuPy RawModule) usually compiles under extern "C". + // In C, function overloading is not possible, but we can cheat by doing it in + // C++ (with a different name to avoid clashes with the built-in atomicAdd). + // This function will then be remapped to atomicAdd() via a macro in C. + #ifdef __cplusplus + extern "C++" { + + GPUFUN int8_t xt_atomicAdd(GPUVOLATILE int8_t* p, int8_t v) { return atomicAdd_i8 (p, v); } + GPUFUN uint8_t xt_atomicAdd(GPUVOLATILE uint8_t* p, uint8_t v) { return atomicAdd_u8 (p, v); } + GPUFUN int16_t xt_atomicAdd(GPUVOLATILE int16_t* p, int16_t v) { return atomicAdd_i16(p, v); } + GPUFUN uint16_t xt_atomicAdd(GPUVOLATILE uint16_t*p, uint16_t v) { return atomicAdd_u16(p, v); } + GPUFUN int64_t xt_atomicAdd(GPUVOLATILE int64_t*p, int64_t v) { return atomicAdd_i64(p, v); } + + // Existing type definitions: forward to CUDA built-ins + GPUFUN int32_t xt_atomicAdd(GPUVOLATILE int32_t* p, int32_t v) { return ::atomicAdd(p, v); } + GPUFUN uint32_t xt_atomicAdd(GPUVOLATILE uint32_t* p, uint32_t v) { return ::atomicAdd(p, v); } + GPUFUN uint64_t xt_atomicAdd(GPUVOLATILE uint64_t* p, uint64_t v) { return ::atomicAdd(p, v); } + GPUFUN float xt_atomicAdd(GPUVOLATILE float* p, float v) { return ::atomicAdd(p, v); } + #if __CUDA_ARCH__ >= 600 + GPUFUN double xt_atomicAdd(GPUVOLATILE double* p, double v) { return ::atomicAdd(p, v); } + #else + GPUFUN double xt_atomicAdd(GPUVOLATILE double* p, double v) { return atomicAdd_f64(p, v); } + #endif + + } + #endif // __cplusplus + + // ---------- Global remap of the public name on device code ---------- + // Define AFTER the wrappers so we don't macro-rewrite our own calls. 
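+    // From this point on, device code that calls atomicAdd(ptr, val) is
+    // dispatched to the xt_atomicAdd overload matching the pointer type of ptr.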
+ #ifdef atomicAdd + #undef atomicAdd + #endif + #define atomicAdd(ptr, val) xt_atomicAdd((ptr), (val)) +#endif /* XO_CONTEXT_CUDA */ + +#ifdef XO_CONTEXT_CL + #if !__has_attribute(overloadable) + #error "The current OpenCL compiler/architecture does not support __attribute__((overloadable))" + #endif + #define OCL_OVERLOAD __attribute__((overloadable)) + GPUFUN int8_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int8_t* p, int8_t v) { return atomicAdd_i8 (p, v); } + GPUFUN uint8_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint8_t* p, uint8_t v) { return atomicAdd_u8 (p, v); } + GPUFUN int16_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int16_t* p, int16_t v) { return atomicAdd_i16(p, v); } + GPUFUN uint16_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint16_t*p, uint16_t v) { return atomicAdd_u16(p, v); } + GPUFUN int64_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int64_t*p, int64_t v) { return atomicAdd_i64(p, v); } + GPUFUN float OCL_OVERLOAD atomicAdd(GPUVOLATILE float* p, float v) { return atomicAdd_f32(p, v); } + GPUFUN double OCL_OVERLOAD atomicAdd(GPUVOLATILE double* p, double v) { return atomicAdd_f64(p, v); } + + // Existing type definitions: forward to OpenCL built-ins + GPUFUN int32_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int32_t* p, int32_t v) { return atomic_add(p, v); } + GPUFUN uint32_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint32_t* p, uint32_t v) { return atomic_add(p, v); } + GPUFUN uint64_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint64_t* p, uint64_t v) { return atom_add(p, v); } +#endif // XO_CONTEXT_CL + +#endif //_ATOMICADD_H_ diff --git a/xobjects/headers/common.h b/xobjects/headers/common.h new file mode 100644 index 0000000..6c7f04c --- /dev/null +++ b/xobjects/headers/common.h @@ -0,0 +1,107 @@ +// copyright ################################# // +// This file is part of the Xtrack Package. // +// Copyright (c) CERN, 2025. // +// ########################################### // + + +#ifndef XOBJECTS_COMMON_H +#define XOBJECTS_COMMON_H + +/* + Common macros for vectorization and parallelization, as well as common + arithmetic operations. 
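+  One of XO_CONTEXT_CPU_SERIAL, XO_CONTEXT_CPU_OPENMP, XO_CONTEXT_CUDA or
+  XO_CONTEXT_CL must be defined by the compiling context; otherwise the
+  #error at the bottom of this file is triggered.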
+*/ + +#ifdef XO_CONTEXT_CPU_SERIAL + // We are on CPU, without OpenMP + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ + for (int64_t INDEX_NAME = 0; INDEX_NAME < (COUNT); INDEX_NAME++) { + + #define END_VECTORIZE \ + } +#endif // XO_CONTEXT_CPU_SERIAL + +#ifdef XO_CONTEXT_CPU_OPENMP + // We are on CPU with the OpenMP context switched on + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ + _Pragma("omp parallel for") \ + for (int64_t INDEX_NAME = 0; INDEX_NAME < (COUNT); INDEX_NAME++) { + + #define END_VECTORIZE \ + } +#endif // XO_CONTEXT_CPU_OPENMP + + +#ifdef XO_CONTEXT_CUDA + // We are on a CUDA GPU + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) { \ + int64_t INDEX_NAME = blockDim.x * blockIdx.x + threadIdx.x; \ + if (INDEX_NAME < (COUNT)) { + + #define END_VECTORIZE \ + } \ + } +#endif // XO_CONTEXT_CUDA + + +#ifdef XO_CONTEXT_CL + // We are on an OpenCL GPU + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ + { \ + int64_t INDEX_NAME = get_global_id(0); \ + if (INDEX_NAME < (COUNT)) { \ + + #define END_VECTORIZE \ + } \ + } +#endif // XO_CONTEXT_CL + + +/* + Qualifier keywords for GPU and optimisation +*/ + +#ifdef XO_CONTEXT_CPU // for both serial and OpenMP + #define GPUKERN + #define GPUFUN static inline + #define GPUGLMEM + #define RESTRICT restrict +#endif + + +#ifdef XO_CONTEXT_CUDA + #define GPUKERN __global__ + #define GPUFUN __device__ + #define GPUGLMEM + #define RESTRICT +#endif // XO_CONTEXT_CUDA + + +#ifdef XO_CONTEXT_CL + #define GPUKERN __kernel + #define GPUFUN + #define GPUGLMEM __global + #define RESTRICT +#endif // XO_CONTEXT_CL + + +/* + Common maths-related macros +*/ + +#define POW2(X) ((X)*(X)) +#define POW3(X) ((X)*(X)*(X)) +#define POW4(X) ((X)*(X)*(X)*(X)) +#define NONZERO(X) ((X) != 0.0) +#define NONZERO_TOL(X, TOL) (fabs((X)) > (TOL)) + + +#ifndef VECTORIZE_OVER +#error "Unknown context, or the expected context (XO_CONTEXT_*) flag undefined. Try updating Xobjects?" +#endif + +#endif // XOBJECTS_COMMON_H
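
Usage sketch (illustrative, not part of the patch): a minimal example of how the two new headers are meant to be used together from kernel source, mirroring the kernels in the tests above. The kernel name sum_values is invented for the example; it would be registered through xo.Kernel / context.add_kernels with n_threads="n", as done in tests/test_common.py.

    #include "xobjects/headers/common.h"
    #include "xobjects/headers/atomicadd.h"

    // Accumulate all input values into *total, one element per "thread".
    // VECTORIZE_OVER expands to a plain loop on the serial CPU context, an
    // OpenMP-parallel loop on the OpenMP context, and a thread-index guard on
    // CUDA/OpenCL, so the same source builds on every context; atomicAdd keeps
    // the shared accumulator consistent when iterations run concurrently.
    GPUKERN void sum_values(
        GPUGLMEM const double* values,
        GPUGLMEM double* total,
        int64_t n
    ) {
        VECTORIZE_OVER(i, n);
        atomicAdd(total, values[i]);
        END_VECTORIZE;
    }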