Move some macros that should be in xobjects to it; black

szymonlopaciuk · szymonlopaciuk · commit 1ec795bf79d5 · 2025-08-07T10:18:18.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,5 @@ cov_html
 .vscode
 .pytest_cache
 .coverage
+.idea
 *.c
diff --git a/Architecture.md b/Architecture.md
@@ -40,19 +40,19 @@ Buffer:
 
 Types can be composed of:
 - scalar: numbers, String
-- compound: Struct, Array, Ref, UnionRef 
+- compound: Struct, Array, Ref, UnionRef
 
 ### Scalars
 - examples: Float64, Int64, ...
 - create: Float64(3.14)
 - memory layout
-    - data 
+    - data
 
 ### String:
 - create: String(string_or_int)
 - memory layout
     - size
-    - data 
+    - data
 
 
 ### Struct
diff --git a/LICENSE b/LICENSE
@@ -198,4 +198,4 @@
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
-   limitations under the License.
+   limitations under the License.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,4 +1,43 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "xobjects"
+dynamic = ["version"]
+description = "In-memory serialization and code generator for CPU and GPU"
+readme = ""
+authors = [
+    { name = "Riccardo De Maria", email = "riccardo.de.maria@cern.ch" }
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.7"
+dependencies = [
+    "numpy",
+    "cffi",
+    "scipy"
+]
+[project.urls]
+Homepage = "https://xsuite.readthedocs.io/"
+"Bug Tracker" = "https://github.com/xsuite/xsuite/issues"
+Documentation = "https://xsuite.readthedocs.io/"
+"Source Code" = "https://github.com/xsuite/xobjects"
+"Download" = "https://pypi.python.org/pypi/xobjects"
+
+[project.optional-dependencies]
+tests = ["pytest", "pytest-mock"]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["xobjects"]
+
+[tool.setuptools.dynamic]
+version = {attr = "xobjects._version.__version__"}
+
 [tool.black]
 line-length = 79
-target-version = ['py36', 'py37', 'py38']
+target-version = ['py310', 'py311', 'py312']
 include = '\.pyi?$'
+
+[project.entry-points.xobjects]
+include = "xobjects"
diff --git a/tests/test_common.py b/tests/test_common.py
@@ -0,0 +1,54 @@
+# copyright ################################# #
+# This file is part of the Xobjects Package.  #
+# Copyright (c) CERN, 2025.                   #
+# ########################################### #
+
+import xobjects as xo
+from xobjects.test_helpers import for_all_test_contexts
+
+
+@for_all_test_contexts
+def test_common_atomicadd(test_context):
+    """Compile a kernel doing 1000 ``atomicAdd`` increments; check the sum."""
+    src = r"""
+    #include <xobjects/headers/common.h>
+
+    GPUKERN
+    double test_atomic_add()
+    {
+        int iterations = 1000;
+        double sum = 0;
+        VECTORIZE_OVER(i, iterations);
+            // If on CPU do some work to avoid the loop being optimized out
+            #if defined(XO_CONTEXT_CPU_OPENMP)
+                usleep(10);
+            #endif
+            atomicAdd(&sum, 1.0);
+        END_VECTORIZE;
+        return sum;
+    }
+    """
+
+    # Threads requested for the kernel: 1000 on the Cupy and PyOpenCL
+    # contexts; otherwise 8 when ``omp_num_threads`` is "auto" or above 1,
+    # and a single thread in every other case.
+    ctx_name = type(test_context).__name__
+    if ctx_name in ("ContextCupy", "ContextPyopencl"):
+        n_threads = 1000
+    else:
+        omp_threads = test_context.omp_num_threads
+        n_threads = 8 if omp_threads == "auto" or omp_threads > 1 else 1
+
+    atomic_add_kernel = xo.Kernel(
+        args=[],
+        n_threads=n_threads,
+        ret=xo.Arg(xo.Float64),
+    )
+    test_context.add_kernels(
+        sources=[src],
+        kernels={"test_atomic_add": atomic_add_kernel},
+    )
+
+    # One unit increment per iteration, so the kernel must hand back the
+    # iteration count set in the C source above.
+    assert test_context.kernels.test_atomic_add() == 1000
diff --git a/tests/test_shared_memory.py b/tests/test_shared_memory.py
@@ -53,7 +53,7 @@ class TestElement(xo.HybridClass):
 
                   // sum s[0] += s[1]
                   if (tid == 0){
-                    sdata[tid] += sdata[tid + 1]; 
+                    sdata[tid] += sdata[tid + 1];
 
                     // write sum from shared to global mem
                     atomicAdd(&result[tid], sdata[tid]);
diff --git a/update_cprght_statement.py b/update_cprght_statement.py
@@ -6,8 +6,8 @@
 import os
 
 copyright_statement = """copyright #################################
-This file is part of the Xobjects Package. 
-Copyright (c) CERN, 2021.                  
+This file is part of the Xobjects Package.
+Copyright (c) CERN, 2021.
 ###########################################"""
 
 config = [
diff --git a/xobjects/context_cupy.py b/xobjects/context_cupy.py
@@ -457,7 +457,11 @@ def build_kernels(
 
         extra_include_paths = self.get_installed_c_source_paths()
         include_flags = [f"-I{path}" for path in extra_include_paths]
-        extra_compile_args = (*extra_compile_args, *include_flags, "-DXO_CONTEXT_CUDA")
+        extra_compile_args = (
+            *extra_compile_args,
+            *include_flags,
+            "-DXO_CONTEXT_CUDA",
+        )
 
         module = cupy.RawModule(
             code=specialized_source, options=extra_compile_args
diff --git a/xobjects/headers/atomicadd.h b/xobjects/headers/atomicadd.h
@@ -0,0 +1,55 @@
+// copyright ################################# //
+// This file is part of the Xobjects Package.  //
+// Copyright (c) CERN, 2021.                   //
+// ########################################### //
+
+#ifndef _ATOMICADD_H_
+#define _ATOMICADD_H_
+
+/*
+    Atomic add function (double type) for different contexts.
+    Following the blueprint of CUDA's atomicAdd function, the return
+    value is the old value of the address before the addition.
+*/
+
+#if defined(XO_CONTEXT_CPU_SERIAL)
+    inline double atomicAdd(double *addr, double val)
+    {   // Single-threaded context: plain read-modify-write, returns old value
+        const double previous = *addr;
+        *addr = previous + val;
+        return previous;
+    }
+#elif defined(XO_CONTEXT_CPU_OPENMP)
+    inline double atomicAdd(double *addr, double val)
+    {   // Read-and-add as ONE atomic step so the returned old value is exact
+        double old_val;
+        #pragma omp atomic capture
+        { old_val = *addr; *addr += val; }
+        return old_val;
+    }
+#elif defined(XO_CONTEXT_CL)
+    #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+    inline double atomicAdd(volatile __global double *addr, double val)
+    {
+        union {  // one storage slot, accessed as integer (for CAS) or double
+            long u64;
+            double f64;
+        } next, expected, current;
+        current.f64 = *addr;
+        do {
+            expected.f64 = current.f64;
+            next.f64 = expected.f64 + val;
+            current.u64 = atom_cmpxchg(  // result: what was found at addr
+                (volatile __global long *)addr,
+                (long) expected.u64,
+                (long) next.u64);
+        } while( current.u64 != expected.u64 );  // addr changed: retry
+        return current.f64;  // equals expected here, i.e. the pre-add value
+    }
+#elif defined(XO_CONTEXT_CUDA)
+    // CUDA already provides this
+#else
+    #error "Atomic add not implemented for this context"
+#endif
+
+#endif // _ATOMICADD_H_
diff --git a/xobjects/headers/common.h b/xobjects/headers/common.h
@@ -0,0 +1,104 @@
+#ifndef XOBJECTS_COMMON_H
+#define XOBJECTS_COMMON_H
+
+#include "xobjects/headers/atomicadd.h"
+
+/*
+    Common macros for vectorization and parallelization, as well as common
+    arithmetic operations.
+*/
+
+#ifdef XO_CONTEXT_CPU_SERIAL
+    // Serial CPU (no OpenMP): VECTORIZE_OVER opens a plain for-loop running
+    // INDEX_NAME over 0 .. COUNT-1; END_VECTORIZE supplies the closing brace.
+    #define VECTORIZE_OVER(INDEX_NAME, COUNT) \
+        for (int64_t INDEX_NAME = 0; INDEX_NAME < (COUNT); INDEX_NAME++) {
+
+    #define END_VECTORIZE \
+        }
+#endif  // XO_CONTEXT_CPU_SERIAL
+
+#ifdef XO_CONTEXT_CPU_OPENMP
+    // CPU with OpenMP: a for-loop over 0 .. COUNT-1 preceded by an
+    // "omp parallel for" pragma (emitted through _Pragma inside the macro).
+    #define VECTORIZE_OVER(INDEX_NAME, COUNT) \
+        _Pragma("omp parallel for") \
+        for (int64_t INDEX_NAME = 0; INDEX_NAME < (COUNT); INDEX_NAME++) {
+
+    #define END_VECTORIZE \
+        }
+
+#endif  // XO_CONTEXT_CPU_OPENMP
+
+
+#ifdef XO_CONTEXT_CUDA
+    // CUDA: no loop. INDEX_NAME is computed from the block/thread x-indices,
+    // and the code between the two macros runs only if it is below COUNT.
+    #define VECTORIZE_OVER(INDEX_NAME, COUNT) { \
+            int64_t INDEX_NAME = blockDim.x * blockIdx.x + threadIdx.x; \
+            if (INDEX_NAME < (COUNT)) {
+
+    #define END_VECTORIZE \
+            } \
+        }
+#endif  // XO_CONTEXT_CUDA
+
+
+#ifdef XO_CONTEXT_CL
+    // OpenCL: no loop. INDEX_NAME is get_global_id(0), and the code between
+    // the two macros runs only if that index is below COUNT.
+    #define VECTORIZE_OVER(INDEX_NAME, COUNT) \
+        { \
+            int64_t INDEX_NAME = get_global_id(0); \
+            if (INDEX_NAME < (COUNT)) {
+
+    #define END_VECTORIZE \
+            } \
+        }
+#endif  // XO_CONTEXT_CL
+
+
+/*
+    Qualifier keywords for GPU and optimisation
+*/
+
+#ifdef XO_CONTEXT_CPU // for both serial and OpenMP
+    #define GPUKERN                 // expands to nothing on CPU
+    #define GPUFUN      static inline
+    #define GPUGLMEM                // expands to nothing on CPU
+    #define RESTRICT    restrict
+#endif
+// NOTE(review): the branch above needs XO_CONTEXT_CPU itself to be defined,
+// not only XO_CONTEXT_CPU_SERIAL / XO_CONTEXT_CPU_OPENMP; confirm the flags.
+#ifdef XO_CONTEXT_CUDA
+    #define GPUKERN     __global__
+    #define GPUFUN      __device__
+    #define GPUGLMEM                // expands to nothing on CUDA
+    #define RESTRICT                // expands to nothing on CUDA
+#endif // XO_CONTEXT_CUDA
+
+
+#ifdef XO_CONTEXT_CL
+    #define GPUKERN     __kernel
+    #define GPUFUN                  // expands to nothing on OpenCL
+    #define GPUGLMEM    __global
+    #define RESTRICT                // expands to nothing on OpenCL
+#endif // XO_CONTEXT_CL
+
+
+/*
+    Common maths-related macros. POWn expands X n times: pass side-effect-free arguments.
+*/
+
+#define POW2(X) ((X)*(X))
+#define POW3(X) ((X)*(X)*(X))
+#define POW4(X) ((X)*(X)*(X)*(X))
+#define NONZERO(X) ((X) != 0.0)
+#define NONZERO_TOL(X, TOL) (fabs((X)) > (TOL))
+
+
+#ifndef VECTORIZE_OVER
+#error "Unknown context, or the expected context (XO_CONTEXT_*) flag undefined. Try updating Xobjects?"
+#endif
+
+#endif  // XOBJECTS_COMMON_H
diff --git a/xobjects/hybrid_class.py b/xobjects/hybrid_class.py
@@ -40,7 +40,6 @@ def __set__(self, container, value):
         if self.isnplikearray:
             self.__get__(container=container)[:] = value
         elif hasattr(value, "_xobject"):  # value is a dressed xobject
-
             # Copy xobject data from value inside self._xobject
             # (unless same memory area or Ref and same buffer,
             #  in the latter case reference mechanism is used)
@@ -381,7 +380,6 @@ def compile_kernels(self, *args, **kwargs):
         return self._xobject.compile_kernels(*args, **kwargs)
 
     def __repr__(self):
-
         if hasattr(self, "_repr_fields"):
             fnames = self._repr_fields
         else: