diff --git a/examples/element_internal_record/000_internal_record.py b/examples/element_internal_record/000_internal_record.py index 71c272137..0c6c5719f 100644 --- a/examples/element_internal_record/000_internal_record.py +++ b/examples/element_internal_record/000_internal_record.py @@ -36,20 +36,22 @@ class TestElementRecord(xo.HybridClass): # element number. track_method_source = r''' -/*gpufun*/ -void TestElement_track_local_particle(TestElementData el, LocalParticle* part0){ +#include "xtrack/headers/track.h" +GPUFUN +void TestElement_track_local_particle(TestElementData el, LocalParticle* part0) +{ // Extract the record and record_index TestElementRecordData record = TestElementData_getp_internal_record(el, part0); RecordIndex record_index = NULL; - if (record){ + if (record) { record_index = TestElementRecordData_getp__index(record); } int64_t n_kicks = TestElementData_get_n_kicks(el); printf("n_kicks %d\n", (int)n_kicks); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); for (int64_t i = 0; i < n_kicks; i++) { double rr = 1e-6 * RandomUniform_generate(part); @@ -72,8 +74,7 @@ class TestElementRecord(xo.HybridClass): } } - - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } ''' diff --git a/examples/element_internal_record/001_multirecord.py b/examples/element_internal_record/001_multirecord.py index 2d081187c..d4f21212c 100644 --- a/examples/element_internal_record/001_multirecord.py +++ b/examples/element_internal_record/001_multirecord.py @@ -41,7 +41,9 @@ class TestElementRecord(xo.HybridClass): # code of the beam element. TestElement_track_method_source = r''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle(TestElementData el, LocalParticle* part0){ // Extract the record and record_index @@ -60,7 +62,7 @@ class TestElementRecord(xo.HybridClass): int64_t n_kicks = TestElementData_get_n_kicks(el); printf("n_kicks %d\n", (int)n_kicks); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); // Record in table1 info about the ingoing particle if (record){ @@ -106,7 +108,7 @@ class TestElementRecord(xo.HybridClass): } } - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } ''' diff --git a/examples/element_internal_record/002_record_in_individual_element.py b/examples/element_internal_record/002_record_in_individual_element.py index 57cc4581f..a144876b9 100644 --- a/examples/element_internal_record/002_record_in_individual_element.py +++ b/examples/element_internal_record/002_record_in_individual_element.py @@ -40,7 +40,9 @@ class TestElementRecord(xo.HybridClass): # code of the beam element. 
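# --- Illustrative sketch, not from the diff above --------------------------------
# The hunks above migrate elements from the old `/*gpufun*/` and
# `//start_per_particle_block` comment markers to the `GPUFUN` and
# `START_PER_PARTICLE_BLOCK` / `END_PER_PARTICLE_BLOCK` macros pulled in via
# "xtrack/headers/track.h". A minimal element written directly in the new style
# could look as follows (the class name `MyShift` and the field `dx` are made up
# for illustration; the macro usage mirrors the examples in this patch).
import xobjects as xo
import xtrack as xt

class MyShift(xt.BeamElement):
    _xofields = {'dx': xo.Float64}
    _extra_c_sources = ['''
        #include "xtrack/headers/track.h"

        GPUFUN
        void MyShift_track_local_particle(MyShiftData el, LocalParticle* part0)
        {
            double const dx = MyShiftData_get_dx(el);
            START_PER_PARTICLE_BLOCK(part0, part);
                // Apply the same shift to every active particle
                LocalParticle_add_to_x(part, dx);
            END_PER_PARTICLE_BLOCK;
        }
    ''']
# ---------------------------------------------------------------------------------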
TestElement_track_method_source = r''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle(TestElementData el, LocalParticle* part0){ // Extract the record and record_index @@ -59,7 +61,7 @@ class TestElementRecord(xo.HybridClass): int64_t n_kicks = TestElementData_get_n_kicks(el); // printf("n_kicks %d\n", (int)n_kicks); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); // Record in table1 info about the ingoing particle if (record){ @@ -105,7 +107,7 @@ class TestElementRecord(xo.HybridClass): } } - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } ''' diff --git a/examples/random_number_generator/001_usage_in_beam_element.py b/examples/random_number_generator/001_usage_in_beam_element.py index 32cc71e71..e91908af8 100644 --- a/examples/random_number_generator/001_usage_in_beam_element.py +++ b/examples/random_number_generator/001_usage_in_beam_element.py @@ -27,12 +27,15 @@ class TestElement(xt.BeamElement): _extra_c_sources = [ ''' - /*gpufun*/ - void TestElement_track_local_particle(TestElementData el, LocalParticle* part0){ - //start_per_particle_block (part0->part) + #include "xtrack/headers/track.h" + + GPUFUN + void TestElement_track_local_particle(TestElementData el, LocalParticle* part0) + { + START_PER_PARTICLE_BLOCK(part0, part); double rr = !!GENERATOR!!_generate(part); LocalParticle_set_x(part, rr); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''.replace('!!GENERATOR!!', generator_to_test) ] diff --git a/examples/spin_lep/chirp_kicker.py b/examples/spin_lep/chirp_kicker.py index e0a0b377d..de8079743 100644 --- a/examples/spin_lep/chirp_kicker.py +++ b/examples/spin_lep/chirp_kicker.py @@ -19,22 +19,22 @@ class VerticalChirpKicker(xt.BeamElement): _extra_c_sources =[''' - #include - #include + #include "xtrack/headers/track.h" + #include "xtrack/beam_elements/elements_src/track_magnet.h" - /*gpufun*/ + GPUFUN void VerticalChirpKicker_track_local_particle( - VerticalChirpKickerData el, LocalParticle* part0){ - + VerticalChirpKickerData el, LocalParticle* part0) + { double const k0sl = VerticalChirpKickerData_get_k0sl(el); double const q_start = VerticalChirpKickerData_get_q_start(el); double const q_end = q_start + VerticalChirpKickerData_get_q_span(el); double const num_turns = VerticalChirpKickerData_get_num_turns(el); double const length = VerticalChirpKickerData_get_length(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); double const at_turn = LocalParticle_get_at_turn(part); - if (at_turn < num_turns){ + if (at_turn < num_turns) { // integrating to get the instantaneous phase double const phi = 2 * PI * q_start * at_turn + PI * (q_end - q_start) / ((double) num_turns) * ((double) at_turn * at_turn); @@ -60,9 +60,9 @@ class VerticalChirpKicker(xt.BeamElement): 0, // frame curvature length, length // lpath - same for a thin element - ); - #endif + ); + #endif } - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''] diff --git a/pyproject.toml b/pyproject.toml index 966ba860e..3516aed9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,62 @@ [build-system] build-backend = 'setuptools.build_meta' requires = [ - 'setuptools >= 43.0.0', - 'numpy', + 'setuptools >= 61.0.0', ] + +[project] +name = "xtrack" +dynamic = ["version"] +description = "Tracking library for particle accelerators" +readme = "README.md" +requires-python = ">=3.9" +license = { text = "Apache-2.0" } +authors = [ + { name = "G. Iadarola et al." 
} +] +dependencies = [ + "numpy>=1.0", + "pandas>=2.0", + "scipy", + "tqdm", + "requests", + "xobjects", + "xdeps", +] + +[project.urls] +"Homepage" = "https://xsuite.readthedocs.io/" +"Bug Tracker" = "https://github.com/xsuite/xsuite/issues" +"Documentation" = "https://xsuite.readthedocs.io/" +"Source Code" = "https://github.com/xsuite/xtrack" +"Download" = "https://pypi.python.org/pypi/xtrack" + +[project.optional-dependencies] +tests = [ + "cpymad", + "nafflib", + "PyHEADTAIL", + "pytest", + "pytest-mock", + "pymadng", + "requests-mock", + "tfs-pandas", +] +notebooks = [ + "jupyter", + "ipympl", + "xplt" +] + +[tool.setuptools.packages.find] +where = ["."] +include = ["xtrack", "ducktrack"] + +[tool.setuptools.dynamic] +version = { attr = "xtrack._version.__version__" } + +[tool.setuptools] +include-package-data = true + +[project.entry-points.xobjects] +include = "xtrack" diff --git a/setup.py b/setup.py deleted file mode 100644 index ff4151360..000000000 --- a/setup.py +++ /dev/null @@ -1,56 +0,0 @@ -# copyright ############################### # -# This file is part of the Xtrack Package. # -# Copyright (c) CERN, 2021. # -# ######################################### # - -from setuptools import setup, find_packages, Extension -from pathlib import Path - -####################################### -# Prepare list of compiled extensions # -####################################### - -extensions = [] - -######### -# Setup # -######### - -version_file = Path(__file__).parent / 'xtrack/_version.py' -dd = {} -with open(version_file.absolute(), 'r') as fp: - exec(fp.read(), dd) -__version__ = dd['__version__'] - -setup( - name='xtrack', - version=__version__, - description='Tracking library for particle accelerators', - long_description='Tracking library for particle accelerators', - url='https://xsuite.readthedocs.io/', - author='G. Iadarola et al.', - license='Apache 2.0', - download_url="https://pypi.python.org/pypi/xtrack", - project_urls={ - "Bug Tracker": "https://github.com/xsuite/xsuite/issues", - "Documentation": 'https://xsuite.readthedocs.io/', - "Source Code": "https://github.com/xsuite/xtrack", - }, - packages=find_packages(), - ext_modules = extensions, - include_package_data=True, - install_requires=[ - 'numpy>=1.0', - "pandas>=2.0", - 'scipy', - 'tqdm', - 'requests', - 'xobjects', - 'xdeps' - ], - extras_require={ - 'tests': ['cpymad', 'nafflib', 'PyHEADTAIL', 'pytest', 'pytest-mock', - 'pymadng', 'requests-mock', 'tfs-pandas'], - 'notebooks': ['jupyter', 'ipympl', 'xplt'], - }, -) diff --git a/tests/test_atomic_add.py b/tests/test_atomic_add.py deleted file mode 100644 index 97a5afbca..000000000 --- a/tests/test_atomic_add.py +++ /dev/null @@ -1,90 +0,0 @@ -# copyright ############################### # -# This file is part of the Xtrack Package. # -# Copyright (c) CERN, 2025. 
# -# ######################################### # - -import pytest -import numpy as np - -import xtrack as xt -import xobjects as xo -from xobjects.test_helpers import for_all_test_contexts - - -@for_all_test_contexts -@pytest.mark.parametrize("overload", [True, False], ids=['overload', 'no_overload']) -@pytest.mark.parametrize("ctype", [xo.Int8, xo.Int16, xo.Int32, xo.Int64, - xo.UInt8, xo.UInt16, xo.UInt32, - xo.UInt64, xo.Float32, xo.Float64]) -def test_atomic(overload, ctype, test_context): - if overload: - func_name = 'atomicAdd' - else: - func_name = f'atomicAdd_{ctype.__name__.lower()[0]}{ctype.__name__.split("t")[1]}' - class TestAtomic(xt.BeamElement): - _xofields = {f'val': ctype} - allow_track = False - _extra_c_sources = [f''' - #include - #include - - GPUKERN - void run_atomic_test(TestAtomicData el, GPUGLMEM {ctype._c_type}* increments, - GPUGLMEM {ctype._c_type}* retvals, int length) {{ - VECTORIZE_OVER(ii, length); - GPUGLMEM {ctype._c_type}* val = TestAtomicData_getp_val(el); - {ctype._c_type} ret = {func_name}(val, increments[ii]); - retvals[ii] = ret; - END_VECTORIZE; - }} - '''] - - _kernels = { - 'run_atomic_test': xo.Kernel( - c_name='run_atomic_test', - args=[xo.Arg(xo.ThisClass, name='el'), - xo.Arg(ctype, pointer=True, name='increments'), - xo.Arg(ctype, pointer=True, name='retvals'), - xo.Arg(xo.Int32, name='length')], - n_threads='length') - } - - atomic = TestAtomic(_context=test_context, val=0) - assert atomic.val == 0 - - # Test with all increments = 1, so we can check the return values easily. - num_steps = 10000 - if ctype.__name__.startswith('Int') or ctype.__name__.startswith('Uint'): - # Less steps to avoid overflow - num_steps = min(num_steps, 2**(8*ctype._size - 1) - 1) - increments = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) + 1 - retvals = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) - atomic.run_atomic_test(increments=increments, retvals=retvals, length=num_steps) - assert atomic.val == num_steps - retvals = np.sort(test_context.nparray_from_context_array(retvals)) - assert np.allclose(retvals, np.arange(num_steps, dtype=ctype._dtype), atol=1.e-15, rtol=1.e-15) - - # Test with random increments, where we now only can check the total sum - # (retvals can be anything). Watch out: overflow is undefined behaviour, - # except for unsigned integers, so we skip this test for signed integers. 
- atomic.val = 0 - retvals = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) - if ctype.__name__.startswith('Uint'): - low = 0 - high = 2**(8*ctype._size) - 1 - increments = np.random.randint(low, high+1, size=num_steps, dtype=ctype._dtype) - increments = test_context.nparray_to_context_array(increments) - atomic.run_atomic_test(increments=increments, retvals=retvals, length=num_steps) - increments = test_context.nparray_from_context_array(increments) - assert atomic.val == (np.sum(increments).item() % (2**(8*ctype._size))) - - elif ctype.__name__.startswith('Float'): - increments = np.zeros(shape=(num_steps,), dtype=ctype._dtype) - increments += np.random.uniform(0, 10, size=num_steps) - increments = test_context.nparray_to_context_array(increments) - atomic.run_atomic_test(increments=increments, retvals=retvals, length=num_steps) - increments = test_context.nparray_from_context_array(increments) - if ctype == xo.Float32: - assert np.isclose(atomic.val, np.sum(increments), atol=10., rtol=1.e-4) - else: - assert np.isclose(atomic.val, np.sum(increments), atol=1.e-6, rtol=1.e-12) diff --git a/tests/test_element_internal_record.py b/tests/test_element_internal_record.py index c61a6d555..1034872ea 100644 --- a/tests/test_element_internal_record.py +++ b/tests/test_element_internal_record.py @@ -26,7 +26,9 @@ class TestElementRecord(xo.HybridClass): extra_src = [] extra_src.append(r''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle(TestElementData el, LocalParticle* part0){ // Extract the record and record_index @@ -39,8 +41,7 @@ class TestElementRecord(xo.HybridClass): int64_t n_kicks = TestElementData_get_n_kicks(el); //printf("n_kicks %d\n", (int)n_kicks); - //start_per_particle_block (part0->part) - + START_PER_PARTICLE_BLOCK(part0, part); for (int64_t i = 0; i < n_kicks; i++) { double rr = 1e-6 * RandomUniform_generate(part); LocalParticle_add_to_px(part, rr); @@ -61,9 +62,7 @@ class TestElementRecord(xo.HybridClass): } } } - - - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } ''') @@ -205,7 +204,9 @@ class TestElementRecord(xo.HybridClass): extra_src = [] extra_src.append(r''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle(TestElementData el, LocalParticle* part0){ // Extract the record and record_index @@ -218,8 +219,7 @@ class TestElementRecord(xo.HybridClass): int64_t n_kicks = TestElementData_get_n_kicks(el); //printf("n_kicks %d\n", (int)n_kicks); - //start_per_particle_block (part0->part) - + START_PER_PARTICLE_BLOCK(part0, part); for (int64_t i = 0; i < n_kicks; i++) { // We don't apply the kick, otherwise the twiss fails double rr = 1e-6 * RandomUniform_generate(part); @@ -240,9 +240,7 @@ class TestElementRecord(xo.HybridClass): } } } - - - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } ''') @@ -308,7 +306,9 @@ class TestElementRecord(xo.HybridClass): extra_src = [] extra_src.append(r''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle(TestElementData el, LocalParticle* part0){ // Extract the record and record_index @@ -327,8 +327,7 @@ class TestElementRecord(xo.HybridClass): int64_t n_kicks = TestElementData_get_n_kicks(el); // printf("n_kicks %d\n", (int)n_kicks); - //start_per_particle_block (part0->part) - + START_PER_PARTICLE_BLOCK(part0, part); // Record in table1 info about the ingoing particle if (record){ // Get a slot in table1 @@ -372,8 +371,7 @@ class TestElementRecord(xo.HybridClass): } } } - - 
//end_per_particle_block + END_PER_PARTICLE_BLOCK; } ''') @@ -565,7 +563,9 @@ class TestElementRecord(xo.HybridClass): extra_src = [] extra_src.append(r''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle(TestElementData el, LocalParticle* part0){ // Extract the record and record_index @@ -584,8 +584,7 @@ class TestElementRecord(xo.HybridClass): int64_t n_kicks = TestElementData_get_n_kicks(el); // printf("n_kicks %d\n", (int)n_kicks); - //start_per_particle_block (part0->part) - + START_PER_PARTICLE_BLOCK(part0, part); // Record in table1 info about the ingoing particle if (record){ // Get a slot in table1 @@ -629,8 +628,7 @@ class TestElementRecord(xo.HybridClass): } } } - - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } ''') diff --git a/tests/test_elements.py b/tests/test_elements.py index 8d82f13e4..ef337229c 100644 --- a/tests/test_elements.py +++ b/tests/test_elements.py @@ -542,34 +542,36 @@ def test_exciter(test_context): test_source = r""" -/*gpufun*/ +#include "xtrack/headers/track.h" + +GPUFUN void test_function(TestElementData el, LocalParticle* part0, - /*gpuglmem*/ double* b){ + GPUGLMEM double* b){ double const a = TestElementData_get_a(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); const int64_t ipart = part->ipart; double const val = b[ipart]; LocalParticle_add_to_s(part, val + a); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } -/*gpufun*/ +GPUFUN void TestElement_track_local_particle(TestElementData el, LocalParticle* part0){ double const a = TestElementData_get_a(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); LocalParticle_set_s(part, a); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } """ diff --git a/tests/test_particles.py b/tests/test_particles.py index 5827b04de..372fd2939 100644 --- a/tests/test_particles.py +++ b/tests/test_particles.py @@ -20,18 +20,19 @@ class TestElement(xt.BeamElement): } _extra_c_sources = [""" + #include "xtrack/headers/track.h" #define XT_OMP_SKIP_REORGANIZE - /*gpufun*/ + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0 ) { - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); int64_t state = check_is_active(part); int64_t id = LocalParticle_get_particle_id(part); TestElementData_set_states(el, id, state); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } """] @@ -96,16 +97,18 @@ class TestElement(xt.BeamElement): } _extra_c_sources = [""" - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0 ) { - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); int64_t state = check_is_active(part); int64_t id = LocalParticle_get_particle_id(part); TestElementData_set_states(el, id, state); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } """] diff --git a/tests/test_particles_basics.py b/tests/test_particles_basics.py index 79e537ad3..4217937c1 100644 --- a/tests/test_particles_basics.py +++ b/tests/test_particles_basics.py @@ -345,14 +345,16 @@ class TestElement(xt.BeamElement): 'pz_only': xo.Int64, } _extra_c_sources = [''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0){ double const value = TestElementData_get_value(el); int const pz_only = (int) TestElementData_get_pz_only(el); - //start_per_particle_block 
(part0->part) + START_PER_PARTICLE_BLOCK(part0, part); LocalParticle_add_to_energy(part, value, pz_only); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''] @@ -419,13 +421,15 @@ class TestElement(xt.BeamElement): } _extra_c_sources =[''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0){ double const value = TestElementData_get_value(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); LocalParticle_update_delta(part, value); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''] @@ -460,13 +464,15 @@ class TestElement(xt.BeamElement): } _extra_c_sources = [''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0){ double const value = TestElementData_get_value(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); LocalParticle_update_ptau(part, value); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''] @@ -500,14 +506,16 @@ class TestElement(xt.BeamElement): 'value': xo.Float64, } _extra_c_sources = [''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0){ double const value = TestElementData_get_value(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); double const pzeta = LocalParticle_get_pzeta(part); LocalParticle_update_pzeta(part, pzeta+value); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''] @@ -543,13 +551,15 @@ class TestElement(xt.BeamElement): 'value': xo.Float64, } _extra_c_sources = [''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0){ double const value = TestElementData_get_value(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); LocalParticle_update_p0c(part, value); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''] @@ -590,18 +600,20 @@ class ScaleAng(xt.BeamElement): 'scale_y2': xo.Float64, } _extra_c_sources = [''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void ScaleAng_track_local_particle( ScaleAngData el, LocalParticle* part0){ double const scale_x = ScaleAngData_get_scale_x(el); double const scale_y = ScaleAngData_get_scale_y(el); double const scale_x2 = ScaleAngData_get_scale_x2(el); double const scale_y2 = ScaleAngData_get_scale_y2(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); LocalParticle_scale_xp(part, scale_x); LocalParticle_scale_yp(part, scale_y); LocalParticle_scale_xp_yp(part, scale_x2, scale_y2); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''] @@ -613,18 +625,20 @@ class KickAng(xt.BeamElement): 'kick_y2': xo.Float64, } _extra_c_sources = [''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void KickAng_track_local_particle( KickAngData el, LocalParticle* part0){ double const kick_x = KickAngData_get_kick_x(el); double const kick_y = KickAngData_get_kick_y(el); double const kick_x2 = KickAngData_get_kick_x2(el); double const kick_y2 = KickAngData_get_kick_y2(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); LocalParticle_add_to_xp(part, kick_x); LocalParticle_add_to_yp(part, kick_y); LocalParticle_add_to_xp_yp(part, kick_x2, kick_y2); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''] 
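# --- Illustrative sketch, not from the diff above --------------------------------
# The ScaleAng/KickAng test elements above are ordinary beam elements, so the
# usual tracking workflow applies to them unchanged by this patch. A minimal
# usage sketch, assuming the KickAng class defined above is in scope; unspecified
# fields default to zero.
import xtrack as xt

kicker = KickAng(kick_x=1e-6)
line = xt.Line(elements=[kicker])
line.build_tracker()

particles = xt.Particles(p0c=7e12, x=[0., 1e-3])
line.track(particles)  # applies the transverse kicks defined in the C source above
# ---------------------------------------------------------------------------------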
@@ -648,18 +662,20 @@ class ScaleAngExact(xt.BeamElement): 'scale_y2': xo.Float64, } _extra_c_sources = [''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void ScaleAngExact_track_local_particle( ScaleAngExactData el, LocalParticle* part0){ double const scale_x = ScaleAngExactData_get_scale_x(el); double const scale_y = ScaleAngExactData_get_scale_y(el); double const scale_x2 = ScaleAngExactData_get_scale_x2(el); double const scale_y2 = ScaleAngExactData_get_scale_y2(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); LocalParticle_scale_exact_xp(part, scale_x); LocalParticle_scale_exact_yp(part, scale_y); LocalParticle_scale_exact_xp_yp(part, scale_x2, scale_y2); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''] @@ -671,18 +687,20 @@ class KickAngExact(xt.BeamElement): 'kick_y2': xo.Float64, } _extra_c_sources = [''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void KickAngExact_track_local_particle( KickAngExactData el, LocalParticle* part0){ double const kick_x = KickAngExactData_get_kick_x(el); double const kick_y = KickAngExactData_get_kick_y(el); double const kick_x2 = KickAngExactData_get_kick_x2(el); double const kick_y2 = KickAngExactData_get_kick_y2(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); LocalParticle_add_to_exact_xp(part, kick_x); LocalParticle_add_to_exact_yp(part, kick_y); LocalParticle_add_to_exact_xp_yp(part, kick_x2, kick_y2); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''] diff --git a/tests/test_random_gen.py b/tests/test_random_gen.py index b3a9d3f2f..378a8ce07 100644 --- a/tests/test_random_gen.py +++ b/tests/test_random_gen.py @@ -31,13 +31,15 @@ class TestElement(xt.BeamElement): _extra_c_sources = [ ''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0){ - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); double rr = !!GENERATOR!!_generate(part); LocalParticle_set_x(part, rr); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } '''.replace('!!GENERATOR!!', generator) ] diff --git a/tests/test_random_gen_exp.py b/tests/test_random_gen_exp.py index ea6b3ec01..3614fbd1d 100644 --- a/tests/test_random_gen_exp.py +++ b/tests/test_random_gen_exp.py @@ -29,13 +29,15 @@ class TestElement(xt.BeamElement): _extra_c_sources = [ ''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0){ - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); double rr = RandomExponential_generate(part); LocalParticle_set_x(part, rr); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } ''' ] diff --git a/tests/test_random_gen_gauss.py b/tests/test_random_gen_gauss.py index 1e92441cf..5579124d9 100644 --- a/tests/test_random_gen_gauss.py +++ b/tests/test_random_gen_gauss.py @@ -28,13 +28,15 @@ class TestElement(xt.BeamElement): _extra_c_sources = [ ''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0){ - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); double rr = RandomNormal_generate(part); LocalParticle_set_x(part, rr); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } ''' ] diff --git a/tests/test_random_gen_ruth.py b/tests/test_random_gen_ruth.py index 7139444d1..bd624190b 100644 --- 
a/tests/test_random_gen_ruth.py +++ b/tests/test_random_gen_ruth.py @@ -36,14 +36,16 @@ class TestElement(xt.BeamElement): _extra_c_sources = [ ''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0){ RandomRutherfordData rng = TestElementData_getp_rng(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); double rr = RandomRutherford_generate(rng, part); LocalParticle_set_x(part, rr); - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } ''' ] diff --git a/tests/test_tracker.py b/tests/test_tracker.py index 0b726955d..4c9a17679 100644 --- a/tests/test_tracker.py +++ b/tests/test_tracker.py @@ -381,12 +381,14 @@ class TestElement(xt.BeamElement): 'dummy': xo.Float64, } _extra_c_sources = [""" - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle( TestElementData el, LocalParticle* part0) { - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); #if TEST_FLAG == 2 LocalParticle_set_x(part, 7); @@ -396,7 +398,7 @@ class TestElement(xt.BeamElement): LocalParticle_set_y(part, 42); #endif - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } """] @@ -745,7 +747,9 @@ class TestElement(xt.BeamElement): _extra_c_sources = [ r''' - /*gpufun*/ + #include "xtrack/headers/track.h" + + GPUFUN void TestElement_track_local_particle(TestElementData el, LocalParticle* part0){ // Extract the record and record_index TestElementRecordData record = TestElementData_getp_internal_record(el, part0); @@ -756,7 +760,7 @@ class TestElement(xt.BeamElement): int64_t elem_field = TestElementData_get_element_field(el); - //start_per_particle_block (part0->part) + START_PER_PARTICLE_BLOCK(part0, part); if (record) { // Record exists // Get a slot in the record (this is thread safe) int64_t i_slot = RecordIndex_get_slot(record_index); @@ -774,7 +778,7 @@ class TestElement(xt.BeamElement): ); } } - //end_per_particle_block + END_PER_PARTICLE_BLOCK; } ''' ] diff --git a/xtrack/base_element.py b/xtrack/base_element.py index 3df0677e6..1cddf66db 100644 --- a/xtrack/base_element.py +++ b/xtrack/base_element.py @@ -43,7 +43,7 @@ } """ -def _handle_per_particle_blocks(sources, local_particle_src): +def _handle_per_particle_blocks(sources): if isinstance(sources, str): sources = (sources, ) @@ -59,8 +59,6 @@ def _handle_per_particle_blocks(sources, local_particle_src): else: strss = ss - strss = strss.replace('/*placeholder_for_local_particle_src*/', - local_particle_src) if '//start_per_particle_block' in strss: lines = strss.splitlines() @@ -73,7 +71,7 @@ def _handle_per_particle_blocks(sources, local_particle_src): # TODO: this is very dirty, just for check!!!!! out.append('\n'.join(lines)) else: - out.append(ss) + out.append(strss) if wasstring: @@ -440,9 +438,7 @@ def init_pipeline(self, pipeline_manager, name, partners_names=[]): def compile_kernels(self, extra_classes=(), *args, **kwargs): if 'apply_to_source' not in kwargs.keys(): kwargs['apply_to_source'] = [] - kwargs['apply_to_source'].append( - partial(_handle_per_particle_blocks, - local_particle_src=Particles.gen_local_particle_api())) + kwargs['apply_to_source'].append(_handle_per_particle_blocks) context = self._context cls = self.__class__ diff --git a/xtrack/headers/atomicadd.h b/xtrack/headers/atomicadd.h index 360dda64d..0a44925b3 100644 --- a/xtrack/headers/atomicadd.h +++ b/xtrack/headers/atomicadd.h @@ -3,348 +3,11 @@ // Copyright (c) CERN, 2025. 
// // ########################################### // -#ifndef _ATOMICADD_H_ -#define _ATOMICADD_H_ +#ifndef XTRACK_ATOMICADD_H_DEPRECATED +#define XTRACK_ATOMICADD_H_DEPRECATED -#include +// The xtrack/headers/atomicadd.h header is deprecated. Use xobjects/headers/atomicadd.h instead. +#include "xobjects/headers/atomicadd.h" -// Ensure no conflicting atomicAdd macro is defined earlier for CPU or OpenCL -#ifndef XO_CONTEXT_CUDA -#ifdef atomicAdd - #warning "Warning: atomicAdd macro already defined, undefining it." - #undef atomicAdd -#endif // atomicAdd -#endif // XO_CONTEXT_CUDA - - -// ################################# // -// ### CPU Contexts ### // -// ################################# // - -#ifdef XO_CONTEXT_CPU -#include - -#ifdef XO_CONTEXT_CPU_OPENMP - #ifndef _OPENMP - #error "XO_CONTEXT_CPU_OPENMP set, but compiled without -fopenmp" - #endif - // OpenMP atomic capture gives us read+write atomically - #define OMP_ATOMIC_CAPTURE _Pragma("omp atomic capture") -#else - #define OMP_ATOMIC_CAPTURE // No OpenMP: non-atomic fallback -#endif // XO_CONTEXT_CPU_OPENMP - -// Macro to define atomicAdd for different types, will be overloaded via _Generic. -#define DEF_ATOMIC_ADD(T, SUF) \ -GPUFUN T atomicAdd_##SUF(GPUGLMEM T *addr, T val) { \ - T old; \ - const T inc = (T)val; \ - OMP_ATOMIC_CAPTURE \ - { old = *addr; *addr = *addr + inc; } \ - return old; \ -} - -DEF_ATOMIC_ADD(int8_t , i8) -DEF_ATOMIC_ADD(int16_t, i16) -DEF_ATOMIC_ADD(int32_t, i32) -DEF_ATOMIC_ADD(int64_t, i64) -DEF_ATOMIC_ADD(uint8_t , u8) -DEF_ATOMIC_ADD(uint16_t, u16) -DEF_ATOMIC_ADD(uint32_t, u32) -DEF_ATOMIC_ADD(uint64_t, u64) -DEF_ATOMIC_ADD(float , f32) -DEF_ATOMIC_ADD(double , f64) - -// Using _Generic to select the right function based on type (since C11). -// See https://en.cppreference.com/w/c/language/generic.html -#define atomicAdd(addr, val) _Generic((addr), \ - int8_t*: atomicAdd_i8, \ - int16_t*: atomicAdd_i16, \ - int32_t*: atomicAdd_i32, \ - int64_t*: atomicAdd_i64, \ - uint8_t*: atomicAdd_u8, \ - uint16_t*: atomicAdd_u16, \ - uint32_t*: atomicAdd_u32, \ - uint64_t*: atomicAdd_u64, \ - float*: atomicAdd_f32, \ - double*: atomicAdd_f64 \ -)(addr, (val)) -#endif // XO_CONTEXT_CPU - - -// ################################# // -// ### GPU Contexts ### // -// ################################# // - -/* -------------- Design notes for GPU atomics --------------------------------- -* We will go for a unified approach for CUDA and OpenCL, with macros where needed. -* We implement 8/16-bit atomic add by atomically CAS'ing the *containing* - * 32-bit word. Only the target byte/halfword lane is modified; the neighbor - * lane is preserved. This is linearisable: each successful CAS is one atomic - * RMW on the 32-bit word. - * Assumptions: little-endian lane layout (true on NVIDIA) and natural alignment - * of the 16-bit addresses (addr % 2 == 0). 8-bit has no alignment requirement. - * Return value matches CUDA semantics: the **old** value at *addr* (fetch-add). - * ---------------------------------------------------------------------------*/ - -// Info: a CAS function (Compare-And-Swap) takes three arguments: a pointer to -// the value to be modified, the expected old value, and the new value. The -// second argument helps to recognise if another thread has modified the value -// in the meantime. The function compares the value at the address with the -// expected values, and if they are the same, it writes the new value at the -// address. In any case, the function returns the actual old value at the -// address. 
If the returned value is different from the expected value, it -// means that another thread has modified the value in the meantime. - -// Define types and macros for CUDA and OpenCL -// ------------------------------------------- -#if defined(XO_CONTEXT_CUDA) - // CUDA compiler may not have , so define the types if needed. - #ifdef __CUDACC_RTC__ - // NVRTC (CuPy RawModule default) can’t see , so detect it via __CUDACC_RTC__ - typedef signed char int8_t; - typedef short int16_t; - typedef int int32_t; - typedef long long int64_t; - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; - typedef unsigned long long uint64_t; - #else - // Alternatively, NVCC path is fine with host headers - #include - #endif // __CUDACC_RTC__ - #define GPUVOLATILE GPUGLMEM - #define __XT_CAS_U32(ptr, exp, val) atomicCAS((GPUVOLATILE uint32_t*)(ptr), (uint32_t)(exp), (uint32_t)(val)) - #define __XT_CAS_U64(ptr, exp, val) atomicCAS((GPUVOLATILE uint64_t*)(ptr), (uint64_t)(exp), (uint64_t)(val)) - #define __XT_AS_U32_FROM_F32(x) __float_as_uint(x) - #define __XT_AS_F32_FROM_U32(x) __uint_as_float(x) - #define __XT_AS_U64_FROM_F64(x) __double_as_longlong(x) - #define __XT_AS_F64_FROM_U64(x) __longlong_as_double(x) -#elif defined(XO_CONTEXT_CL) - // It seems OpenCL already has the types from defined. - // typedef char int8_t; - // typedef short int16_t; - // typedef int int32_t; - // typedef long int64_t; - // typedef unsigned char uint8_t; - // typedef unsigned short uint16_t; - // typedef unsigned int uint32_t; - // typedef unsigned long uint64_t; - #define GPUVOLATILE GPUGLMEM volatile - #if __OPENCL_C_VERSION__ < 110 - // Map 1.0 "atom_*" names to 1.1+ "atomic_*" - #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable - #define atomic_add atom_add - #define atomic_cmpxchg atom_cmpxchg - #endif - #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #define __XT_CAS_U32(ptr, exp, val) atomic_cmpxchg((GPUVOLATILE uint32_t*)(ptr), (uint32_t)(exp), (uint32_t)(val)) - #define __XT_CAS_U64(ptr, exp, val) atom_cmpxchg((GPUVOLATILE uint64_t*)(ptr), (uint64_t)(exp), (uint64_t)(val)) - #define __XT_AS_U32_FROM_F32(x) as_uint(x) - #define __XT_AS_F32_FROM_U32(x) as_float(x) - #define __XT_AS_U64_FROM_F64(x) as_ulong(x) - #define __XT_AS_F64_FROM_U64(x) as_double(x) -#endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL - -// Define atomic functions per type -// -------------------------------- -#if defined(XO_CONTEXT_CUDA) || defined(XO_CONTEXT_CL) - // Helper: compute (base 32-bit word pointer, shift, mask) for a byte in that word. - GPUFUN inline void __xt_lane8(GPUVOLATILE void* addr, GPUVOLATILE uint32_t** w, uint32_t* sh, uint32_t* mask){ - size_t a = (size_t)addr; - *w = (GPUVOLATILE uint32_t*)(a & ~(size_t)3); // word: align down to 4-byte boundary - *sh = (uint32_t)((a & (size_t)3) * 8U); // shift 0,8,16,24 depending on byte lane - *mask = (uint32_t)(0xFFu << *sh); - } - // Helper: same for a halfword (16-bit) in the containing 32-bit word. - GPUFUN inline void __xt_lane16(GPUVOLATILE void* addr, GPUVOLATILE uint32_t** w, uint32_t* sh, uint32_t* mask){ - size_t a = (size_t)addr; - *w = (GPUVOLATILE uint32_t*)(a & ~(size_t)3); // word: align down to 4-byte boundary - *sh = (uint32_t)((a & (size_t)2) ? 
16U : 0U); // shift0 or 16 depending on halfword - *mask = (uint32_t)(0xFFFFU << *sh); - } - - // ---------------- 8-bit: int8_t / uint8_t (CAS on 32-bit word) -------------- - GPUFUN int8_t atomicAdd_i8(GPUVOLATILE int8_t* addr, int8_t val){ - GPUVOLATILE uint32_t *w; - uint32_t sh, mask; - __xt_lane8(addr, &w, &sh, &mask); - uint32_t old = *w, assumed, b, nb, nw; // byte, newbyte, newword - do { - assumed = old; - b = (assumed & mask) >> sh; // Extract current 8-bit lane - nb = (uint32_t)((uint8_t)b + (uint8_t)val); // Add in modulo-256 (two's complement) - nw = (assumed & ~mask) | ((nb & 0xFFU) << sh); // Merge back updated lane; leave neighbor lanes intact - // Try to publish; if someone raced us, retry with their value - old = __XT_CAS_U32(w, assumed, nw); - } while (old != assumed); - return (int8_t)((assumed & mask) >> sh); - } - GPUFUN uint8_t atomicAdd_u8(GPUVOLATILE uint8_t* addr, uint8_t val){ - GPUVOLATILE uint32_t *w; - uint32_t sh, mask; - __xt_lane8(addr, &w, &sh, &mask); - uint32_t old = *w, assumed, b, nb, nw; - do { - assumed = old; - b = (assumed & mask) >> sh; - nb = (uint32_t)(b + val); /* mod 256 */ - nw = (assumed & ~mask) | ((nb & 0xFFU) << sh); - old = __XT_CAS_U32(w, assumed, nw); - } while (old != assumed); - return (uint8_t)((assumed & mask) >> sh); - } - - // ---------------- 16-bit: int16_t / uint16_t (CAS on 32-bit word) ----------- - GPUFUN int16_t atomicAdd_i16(GPUVOLATILE int16_t* addr, int16_t val){ - GPUVOLATILE uint32_t* w; - uint32_t sh, mask; - __xt_lane16(addr, &w, &sh, &mask); - uint32_t old = *w, assumed, b, nb, nw; - do { - assumed = old; - b = (assumed & mask) >> sh; - nb = (uint32_t)((uint16_t)b + (uint16_t)val); - nw = (assumed & ~mask) | ((nb & 0xFFFFU) << sh); - old = __XT_CAS_U32(w, assumed, nw); - } while (old != assumed); - return (int16_t)((assumed & mask) >> sh); - } - GPUFUN uint16_t atomicAdd_u16(GPUVOLATILE uint16_t* addr, uint16_t val){ - GPUVOLATILE uint32_t* w; - uint32_t sh, mask; - __xt_lane16(addr, &w, &sh, &mask); - uint32_t old = *w, assumed, b, nb, nw; - do { - assumed = old; - b = (assumed & mask) >> sh; - nb = (uint32_t)(b + val); - nw = (assumed & ~mask) | ((nb & 0xFFFFU) << sh); - old = __XT_CAS_U32(w, assumed, nw); - } while (old != assumed); - return (uint16_t)((assumed & mask) >> sh); - } - - // ---------------- 32-bit: int32_t / uint32_t (built-in) ----------- - GPUFUN int32_t atomicAdd_i32(GPUVOLATILE int32_t* addr, int32_t val){ - #ifdef XO_CONTEXT_CUDA - return atomicAdd(addr, val); - #else // XO_CONTEXT_CL - return atomic_add(addr, val); - #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL - } - GPUFUN uint32_t atomicAdd_u32(GPUVOLATILE uint32_t* addr, uint32_t val){ - #ifdef XO_CONTEXT_CUDA - return atomicAdd(addr, val); - #else // XO_CONTEXT_CL - return atomic_add(addr, val); - #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL - } - - // ---------------- 64-bit: int64_t / uint64_t (built-in or CAS on 64-bit word) ----------- - GPUFUN int64_t atomicAdd_i64(GPUVOLATILE int64_t* addr, int64_t val){ - uint64_t old, nw; - do { - old = *addr; - nw = old + val; - } while (__XT_CAS_U64((GPUVOLATILE uint64_t*)addr, old, nw) != old); - return old; - } - GPUFUN uint64_t atomicAdd_u64(GPUVOLATILE uint64_t* addr, uint64_t val){ - #ifdef XO_CONTEXT_CUDA - return atomicAdd(addr, val); - #else // XO_CONTEXT_CL - return atom_add(addr, val); - #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL - } - - // ---------------- 32-bit: float (built-in or CAS on 32-bit word) ----------- - GPUFUN float atomicAdd_f32(GPUVOLATILE float* addr, float val){ - #ifdef 
XO_CONTEXT_CUDA - return atomicAdd(addr, val); - #else // XO_CONTEXT_CL - uint32_t old, nw; - do { - old = __XT_AS_U32_FROM_F32(*addr); - nw = __XT_AS_U32_FROM_F32(__XT_AS_F32_FROM_U32(old) + val); - } while (__XT_CAS_U32((GPUVOLATILE uint32_t*)addr, old, nw) != old); - return __XT_AS_F32_FROM_U32(old); - #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL - } - - // ---------------- 64-bit: float (built-in or CAS on 64-bit word) ----------- - GPUFUN double atomicAdd_f64(GPUVOLATILE double* addr, double val){ - #if __CUDA_ARCH__ >= 600 - return atomicAdd(addr, val); - #else // XO_CONTEXT_CL || __CUDA_ARCH__ < 600 - uint64_t old, nw; - do { - old = __XT_AS_U64_FROM_F64(*addr); - nw = __XT_AS_U64_FROM_F64(__XT_AS_F64_FROM_U64(old) + val); - } while (__XT_CAS_U64((GPUVOLATILE uint64_t*)addr, old, nw) != old); - return __XT_AS_F64_FROM_U64(old); - #endif // __CUDA_ARCH__ >= 600 / XO_CONTEXT_CL - } -#endif // defined(XO_CONTEXT_CUDA) || defined(XO_CONTEXT_CL) - -// Define the overloaded function -// ------------------------------ -#ifdef XO_CONTEXT_CUDA - // NVRTC (CuPy RawModule) usually compiles under extern "C". - // In C, function overloading is not possible, but we can cheat by doing it in - // C++ (with a different name to avoid clashes with the built-in atomicAdd). - // This function will then be remapped to atomicAdd() via a macro in C. - #ifdef __cplusplus - extern "C++" { - - GPUFUN int8_t xt_atomicAdd(GPUVOLATILE int8_t* p, int8_t v) { return atomicAdd_i8 (p, v); } - GPUFUN uint8_t xt_atomicAdd(GPUVOLATILE uint8_t* p, uint8_t v) { return atomicAdd_u8 (p, v); } - GPUFUN int16_t xt_atomicAdd(GPUVOLATILE int16_t* p, int16_t v) { return atomicAdd_i16(p, v); } - GPUFUN uint16_t xt_atomicAdd(GPUVOLATILE uint16_t*p, uint16_t v) { return atomicAdd_u16(p, v); } - GPUFUN int64_t xt_atomicAdd(GPUVOLATILE int64_t*p, int64_t v) { return atomicAdd_i64(p, v); } - - // Existing type definitions: forward to CUDA built-ins - GPUFUN int32_t xt_atomicAdd(GPUVOLATILE int32_t* p, int32_t v) { return ::atomicAdd(p, v); } - GPUFUN uint32_t xt_atomicAdd(GPUVOLATILE uint32_t* p, uint32_t v) { return ::atomicAdd(p, v); } - GPUFUN uint64_t xt_atomicAdd(GPUVOLATILE uint64_t* p, uint64_t v) { return ::atomicAdd(p, v); } - GPUFUN float xt_atomicAdd(GPUVOLATILE float* p, float v) { return ::atomicAdd(p, v); } - #if __CUDA_ARCH__ >= 600 - GPUFUN double xt_atomicAdd(GPUVOLATILE double* p, double v) { return ::atomicAdd(p, v); } - #else - GPUFUN double xt_atomicAdd(GPUVOLATILE double* p, double v) { return atomicAdd_f64(p, v); } - #endif - - } - #endif // __cplusplus - - // ---------- Global remap of the public name on device code ---------- - // Define AFTER the wrappers so we don't macro-rewrite our own calls. 
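// --- Illustrative sketch, not from the diff above --------------------------------
// The deleted header above implements fetch-add for types without a native atomic
// add by looping on a compare-and-swap, as described in the "Info: a CAS function"
// note. The same idea in portable host-side C11 (stdatomic, no CUDA/OpenCL
// involved; `my_atomic_add_f64` is a made-up name used only for this sketch):
#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

static double my_atomic_add_f64(_Atomic uint64_t* addr, double val)
{
    uint64_t old_bits = atomic_load(addr);
    for (;;) {
        double old_val;
        memcpy(&old_val, &old_bits, sizeof old_val);   // reinterpret bits as double
        double new_val = old_val + val;
        uint64_t new_bits;
        memcpy(&new_bits, &new_val, sizeof new_bits);
        // On failure, old_bits is refreshed with the current value and we retry.
        if (atomic_compare_exchange_weak(addr, &old_bits, new_bits))
            return old_val;                            // fetch-add semantics: return the old value
    }
}
// ----------------------------------------------------------------------------------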
- #ifdef atomicAdd - #undef atomicAdd - #endif - #define atomicAdd(ptr, val) xt_atomicAdd((ptr), (val)) -#endif /* XO_CONTEXT_CUDA */ - -#ifdef XO_CONTEXT_CL - #if !__has_attribute(overloadable) - #error "The current OpenCL compiler/architecture does not support __attribute__((overloadable))" - #endif - #define OCL_OVERLOAD __attribute__((overloadable)) - GPUFUN int8_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int8_t* p, int8_t v) { return atomicAdd_i8 (p, v); } - GPUFUN uint8_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint8_t* p, uint8_t v) { return atomicAdd_u8 (p, v); } - GPUFUN int16_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int16_t* p, int16_t v) { return atomicAdd_i16(p, v); } - GPUFUN uint16_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint16_t*p, uint16_t v) { return atomicAdd_u16(p, v); } - GPUFUN int64_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int64_t*p, int64_t v) { return atomicAdd_i64(p, v); } - GPUFUN float OCL_OVERLOAD atomicAdd(GPUVOLATILE float* p, float v) { return atomicAdd_f32(p, v); } - GPUFUN double OCL_OVERLOAD atomicAdd(GPUVOLATILE double* p, double v) { return atomicAdd_f64(p, v); } - - // Existing type definitions: forward to OpenCL built-ins - GPUFUN int32_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int32_t* p, int32_t v) { return atomic_add(p, v); } - GPUFUN uint32_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint32_t* p, uint32_t v) { return atomic_add(p, v); } - GPUFUN uint64_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint64_t* p, uint64_t v) { return atom_add(p, v); } -#endif // XO_CONTEXT_CL - -#endif //_ATOMICADD_H_ +#endif // XTRACK_ATOMICADD_H_DEPRECATED diff --git a/xtrack/headers/constants.h b/xtrack/headers/constants.h index 9159b38ff..d876cc4df 100644 --- a/xtrack/headers/constants.h +++ b/xtrack/headers/constants.h @@ -58,4 +58,20 @@ #define MASS_ELECTRON (9.1093837e-31) #endif /* !defined( MASS_ELECTRON ) */ +#if !defined( SQRT3 ) + #define SQRT3 1.732050807568877 +#endif /* !defined( SQRT3 ) */ + +#if !defined( SQRT2 ) + #define SQRT2 (1.414213562373095048801688724209698078569671875376948073176679738) +#endif /* !defined( SQRT2 ) */ + +#if !defined( TWO_OVER_SQRT_PI ) + #define TWO_OVER_SQRT_PI (1.128379167095512573896158903121545171688101258657997713688171443418) +#endif /* !defined( TWO_OVER_SQRT_PI ) */ + +#if !defined( ALPHA_EM ) + #define ALPHA_EM 0.0072973525693 +#endif /* !defined( ALPHA_EM ) */ + #endif /* XTRACK_CONSTANTS_H */ diff --git a/xtrack/headers/track.h b/xtrack/headers/track.h index d803bf46b..b55b4b6bc 100644 --- a/xtrack/headers/track.h +++ b/xtrack/headers/track.h @@ -1,7 +1,14 @@ +// copyright ############################### // +// This file is part of the Xtrack Package. // +// Copyright (c) CERN, 2025. // +// ######################################### // + #ifndef XTRACK_TRACK_H #define XTRACK_TRACK_H -#include +#include "xobjects/headers/common.h" +#include "xobjects/headers/atomicadd.h" +#include "xtrack/headers/constants.h" /* The particle tracking "decorators" for all the contexts. 
@@ -22,12 +29,6 @@ #define END_PER_PARTICLE_BLOCK \ } \ } - - #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ - for (int INDEX_NAME = 0; INDEX_NAME < (COUNT); INDEX_NAME++) { - - #define END_VECTORIZE \ - } #endif // XO_CONTEXT_CPU_SERIAL #ifdef XO_CONTEXT_CPU_OPENMP @@ -48,14 +49,6 @@ } \ } \ } - - #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ - _Pragma("omp parallel for") \ - for (int INDEX_NAME = 0; INDEX_NAME < (COUNT); INDEX_NAME++) { - - #define END_VECTORIZE \ - } - #endif // XO_CONTEXT_CPU_OPENMP @@ -67,14 +60,6 @@ #define END_PER_PARTICLE_BLOCK \ } - - #define VECTORIZE_OVER(INDEX_NAME, COUNT) { \ - int INDEX_NAME = blockDim.x * blockIdx.x + threadIdx.x; \ - if (INDEX_NAME < (COUNT)) { - - #define END_VECTORIZE \ - } \ - } #endif // XO_CONTEXT_CUDA @@ -86,54 +71,9 @@ #define END_PER_PARTICLE_BLOCK \ } - - #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ - { \ - int INDEX_NAME = get_global_id(0); - - #define END_VECTORIZE \ - } #endif // XO_CONTEXT_CL -/* - Qualifier keywords for GPU and optimisation -*/ - -#ifdef XO_CONTEXT_CPU // for both serial and OpenMP - #define GPUKERN - #define GPUFUN static inline - #define GPUGLMEM - #define RESTRICT restrict -#endif - - -#ifdef XO_CONTEXT_CUDA - #define GPUKERN __global__ - #define GPUFUN __device__ - #define GPUGLMEM - #define RESTRICT -#endif // XO_CONTEXT_CUDA - - -#ifdef XO_CONTEXT_CL - #define GPUKERN __kernel - #define GPUFUN - #define GPUGLMEM __global - #define RESTRICT -#endif // XO_CONTEXT_CL - - -/* - Common maths-related macros -*/ - -#define POW2(X) ((X)*(X)) -#define POW3(X) ((X)*(X)*(X)) -#define POW4(X) ((X)*(X)*(X)*(X)) -#define NONZERO(X) ((X) != 0.0) - - #ifndef START_PER_PARTICLE_BLOCK #error "Unknown context, or the expected context (XO_CONTEXT_*) flag undefined. Try updating Xobjects?" #endif diff --git a/xtrack/internal_record.py b/xtrack/internal_record.py index 4aa1d78b6..fe4f1560f 100644 --- a/xtrack/internal_record.py +++ b/xtrack/internal_record.py @@ -33,8 +33,8 @@ ''' _RecordIndex_get_slot_source = r''' -#include -#include +#include "xobjects/headers/common.h" +#include "xobjects/headers/atomicadd.h" GPUFUN int64_t RecordIndex_get_slot(RecordIndex record_index){ diff --git a/xtrack/monitors/beam_position_monitor.h b/xtrack/monitors/beam_position_monitor.h index 81e50569e..f8821eb6c 100644 --- a/xtrack/monitors/beam_position_monitor.h +++ b/xtrack/monitors/beam_position_monitor.h @@ -9,7 +9,7 @@ #ifndef XTRACK_BEAM_POSITION_MONITOR_H #define XTRACK_BEAM_POSITION_MONITOR_H -#include +#include "xtrack/headers/track.h" GPUFUN diff --git a/xtrack/monitors/beam_position_monitor.py b/xtrack/monitors/beam_position_monitor.py index 65b3e8ba9..3780261ab 100644 --- a/xtrack/monitors/beam_position_monitor.py +++ b/xtrack/monitors/beam_position_monitor.py @@ -12,7 +12,6 @@ from ..base_element import BeamElement from ..beam_elements import Marker from ..internal_record import RecordIndex -from ..general import _pkg_root class BeamPositionMonitorRecord(xo.Struct): @@ -40,8 +39,7 @@ class BeamPositionMonitor(BeamElement): properties = [field.name for field in BeamPositionMonitorRecord._fields] _extra_c_sources = [ - _pkg_root.joinpath('headers/atomicadd.h'), - _pkg_root.joinpath('monitors/beam_position_monitor.h') + '#include "xtrack/monitors/beam_position_monitor.h"', ] def __init__(self, *, particle_id_range=None, particle_id_start=None, num_particles=None, diff --git a/xtrack/monitors/beam_profile_monitor.h b/xtrack/monitors/beam_profile_monitor.h index 37daff113..dd720c7e8 100644 --- a/xtrack/monitors/beam_profile_monitor.h +++ 
b/xtrack/monitors/beam_profile_monitor.h @@ -9,7 +9,7 @@ #ifndef XTRACK_BEAM_PROFILE_MONITOR_H #define XTRACK_BEAM_PROFILE_MONITOR_H -#include +#include "xtrack/headers/track.h" GPUFUN diff --git a/xtrack/monitors/beam_profile_monitor.py b/xtrack/monitors/beam_profile_monitor.py index 24c7853ed..d23727010 100644 --- a/xtrack/monitors/beam_profile_monitor.py +++ b/xtrack/monitors/beam_profile_monitor.py @@ -12,7 +12,6 @@ from ..base_element import BeamElement from ..beam_elements import Marker from ..internal_record import RecordIndex -from ..general import _pkg_root class BeamProfileMonitorRecord(xo.Struct): counts_x = xo.Float64[:] @@ -45,8 +44,7 @@ class BeamProfileMonitor(BeamElement): properties = [field.name for field in BeamProfileMonitorRecord._fields] _extra_c_sources = [ - _pkg_root.joinpath('headers/atomicadd.h'), - _pkg_root.joinpath('monitors/beam_profile_monitor.h') + '#include "xtrack/monitors/beam_profile_monitor.h"', ] diff --git a/xtrack/monitors/beam_size_monitor.h b/xtrack/monitors/beam_size_monitor.h index 66bfa23cf..cb4102bad 100644 --- a/xtrack/monitors/beam_size_monitor.h +++ b/xtrack/monitors/beam_size_monitor.h @@ -13,8 +13,8 @@ GPUFUN -void BeamSizeMonitor_track_local_particle(BeamSizeMonitorData el, LocalParticle* part0){ - +void BeamSizeMonitor_track_local_particle(BeamSizeMonitorData el, LocalParticle* part0) +{ // get parameters int64_t const start_at_turn = BeamSizeMonitorData_get_start_at_turn(el); int64_t particle_id_start = BeamSizeMonitorData_get_particle_id_start(el); @@ -44,19 +44,19 @@ void BeamSizeMonitor_track_local_particle(BeamSizeMonitorData el, LocalParticle* double x = LocalParticle_get_x(part); double y = LocalParticle_get_y(part); - /*gpuglmem*/ double* count = BeamSizeMonitorRecord_getp1_count(record, slot); + GPUGLMEM double* count = BeamSizeMonitorRecord_getp1_count(record, slot); atomicAdd(count, 1); - /*gpuglmem*/ double * x_sum = BeamSizeMonitorRecord_getp1_x_sum(record, slot); + GPUGLMEM double * x_sum = BeamSizeMonitorRecord_getp1_x_sum(record, slot); atomicAdd(x_sum, x); - /*gpuglmem*/ double * y_sum = BeamSizeMonitorRecord_getp1_y_sum(record, slot); + GPUGLMEM double * y_sum = BeamSizeMonitorRecord_getp1_y_sum(record, slot); atomicAdd(y_sum, y); - /*gpuglmem*/ double * x2_sum = BeamSizeMonitorRecord_getp1_x2_sum(record, slot); + GPUGLMEM double * x2_sum = BeamSizeMonitorRecord_getp1_x2_sum(record, slot); atomicAdd(x2_sum, x*x); - /*gpuglmem*/ double * y2_sum = BeamSizeMonitorRecord_getp1_y2_sum(record, slot); + GPUGLMEM double * y2_sum = BeamSizeMonitorRecord_getp1_y2_sum(record, slot); atomicAdd(y2_sum, y*y); } } diff --git a/xtrack/monitors/beam_size_monitor.py b/xtrack/monitors/beam_size_monitor.py index dff81bbb7..be66c4010 100644 --- a/xtrack/monitors/beam_size_monitor.py +++ b/xtrack/monitors/beam_size_monitor.py @@ -8,11 +8,10 @@ import numpy as np import xobjects as xo - from ..base_element import BeamElement from ..beam_elements import Marker from ..internal_record import RecordIndex -from ..general import _pkg_root + class BeamSizeMonitorRecord(xo.Struct): count = xo.Float64[:] @@ -40,8 +39,7 @@ class BeamSizeMonitor(BeamElement): properties = [field.name for field in BeamSizeMonitorRecord._fields] _extra_c_sources = [ - _pkg_root.joinpath('headers/atomicadd.h'), - _pkg_root.joinpath('monitors/beam_size_monitor.h') + '#include "xtrack/monitors/beam_size_monitor.h"', ] def __init__(self, *, particle_id_range=None, particle_id_start=None, num_particles=None, diff --git a/xtrack/monitors/particles_monitor.h 
b/xtrack/monitors/particles_monitor.h index d10faba96..5c889def4 100644 --- a/xtrack/monitors/particles_monitor.h +++ b/xtrack/monitors/particles_monitor.h @@ -48,8 +48,11 @@ void ParticlesMonitor_track_local_particle(ParticlesMonitorData el, } else if (n_repetitions > 1){ if (at_turn < start_at_turn){ - return; //only_for_context cuda opencl - break; //only_for_context cpu_serial cpu_openmp + #ifdef XO_CONTEXT_CPU + break; + #else + return; + #endif } int64_t const i_frame = (at_turn - start_at_turn) / repetition_period; if (i_frame < n_repetitions diff --git a/xtrack/multisetter/multisetter.h b/xtrack/multisetter/multisetter.h new file mode 100644 index 000000000..ce15d8bf0 --- /dev/null +++ b/xtrack/multisetter/multisetter.h @@ -0,0 +1,76 @@ +// copyright ############################### // +// This file is part of the Xtrack Package. // +// Copyright (c) CERN, 2025. // +// ######################################### // + +#ifndef XTRACK_MULTISETTER_H +#define XTRACK_MULTISETTER_H + +#include "xobjects/headers/common.h" + + +GPUKERN +void get_values_at_offsets_float64( + MultiSetterData data, + GPUGLMEM int8_t* buffer, + GPUGLMEM double* out){ + + int64_t num_offsets = MultiSetterData_len_offsets(data); + + VECTORIZE_OVER(ii, num_offsets); + int64_t offs = MultiSetterData_get_offsets(data, ii); + + double val = *((GPUGLMEM double*)(buffer + offs)); + out[ii] = val; + END_VECTORIZE; +} + +GPUKERN +void get_values_at_offsets_int64( + MultiSetterData data, + GPUGLMEM int8_t* buffer, + GPUGLMEM int64_t* out){ + + int64_t num_offsets = MultiSetterData_len_offsets(data); + + VECTORIZE_OVER(ii, num_offsets); + int64_t offs = MultiSetterData_get_offsets(data, ii); + + int64_t val = *((GPUGLMEM int64_t*)(buffer + offs)); + out[ii] = val; + END_VECTORIZE; +} + +GPUKERN +void set_values_at_offsets_float64( + MultiSetterData data, + GPUGLMEM int8_t* buffer, + GPUGLMEM double* input){ + + int64_t num_offsets = MultiSetterData_len_offsets(data); + + VECTORIZE_OVER(ii, num_offsets); + int64_t offs = MultiSetterData_get_offsets(data, ii); + + double val = input[ii]; + *((GPUGLMEM double*)(buffer + offs)) = val; + END_VECTORIZE; +} + +GPUKERN +void set_values_at_offsets_int64( + MultiSetterData data, + GPUGLMEM int8_t* buffer, + GPUGLMEM int64_t* input){ + + int64_t num_offsets = MultiSetterData_len_offsets(data); + + VECTORIZE_OVER(ii, num_offsets); + int64_t offs = MultiSetterData_get_offsets(data, ii); + + int64_t val = input[ii]; + *((GPUGLMEM int64_t*)(buffer + offs)) = val; + END_VECTORIZE; +} + +#endif /* XTRACK_MULTISETTER_H */ \ No newline at end of file diff --git a/xtrack/multisetter/multisetter.py b/xtrack/multisetter/multisetter.py index e963eece6..b3cdb1b1c 100644 --- a/xtrack/multisetter/multisetter.py +++ b/xtrack/multisetter/multisetter.py @@ -3,75 +3,6 @@ import xobjects as xo import xtrack as xt -source = """ - -/*gpukern*/ -void get_values_at_offsets_float64( - MultiSetterData data, - /*gpuglmem*/ int8_t* buffer, - /*gpuglmem*/ double* out){ - - int64_t num_offsets = MultiSetterData_len_offsets(data); - - for (int64_t ii = 0; ii < num_offsets; ii++) { //vectorize_over ii num_offsets - int64_t offs = MultiSetterData_get_offsets(data, ii); - - double val = *((/*gpuglmem*/ double*)(buffer + offs)); - out[ii] = val; - } //end_vectorize -} - -/*gpukern*/ -void get_values_at_offsets_int64( - MultiSetterData data, - /*gpuglmem*/ int8_t* buffer, - /*gpuglmem*/ int64_t* out){ - - int64_t num_offsets = MultiSetterData_len_offsets(data); - - for (int64_t ii = 0; ii < num_offsets; ii++) { 
//vectorize_over ii num_offsets - int64_t offs = MultiSetterData_get_offsets(data, ii); - - int64_t val = *((/*gpuglmem*/ int64_t*)(buffer + offs)); - out[ii] = val; - } //end_vectorize -} - -/*gpukern*/ -void set_values_at_offsets_float64( - MultiSetterData data, - /*gpuglmem*/ int8_t* buffer, - /*gpuglmem*/ double* input){ - - int64_t num_offsets = MultiSetterData_len_offsets(data); - - for (int64_t ii = 0; ii < num_offsets; ii++) { //vectorize_over ii num_offsets - int64_t offs = MultiSetterData_get_offsets(data, ii); - - double val = input[ii]; - *((/*gpuglmem*/ double*)(buffer + offs)) = val; - } //end_vectorize -} - -/*gpukern*/ -void set_values_at_offsets_int64( - MultiSetterData data, - /*gpuglmem*/ int8_t* buffer, - /*gpuglmem*/ int64_t* input){ - - int64_t num_offsets = MultiSetterData_len_offsets(data); - - for (int64_t ii = 0; ii < num_offsets; ii++) { //vectorize_over ii num_offsets - int64_t offs = MultiSetterData_get_offsets(data, ii); - - int64_t val = input[ii]; - *((/*gpuglmem*/ int64_t*)(buffer + offs)) = val; - } //end_vectorize -} - - - -""" class MultiSetter(xo.HybridClass): _xofields = { @@ -79,7 +10,7 @@ class MultiSetter(xo.HybridClass): } _extra_c_sources = [ - source, + '#include "xtrack/multisetter/multisetter.h"', ] _kernels = { diff --git a/xtrack/particles/local_particle_custom_api.h b/xtrack/particles/local_particle_custom_api.h new file mode 100644 index 000000000..19b9ce078 --- /dev/null +++ b/xtrack/particles/local_particle_custom_api.h @@ -0,0 +1,289 @@ +// copyright ############################### // +// This file is part of the Xtrack Package. // +// Copyright (c) CERN, 2025. // +// ######################################### // + +#ifndef XTRACK_LOCAL_PARTICLE_CUSTOM_API_H +#define XTRACK_LOCAL_PARTICLE_CUSTOM_API_H + +#include "xtrack/headers/track.h" + + +GPUFUN +double LocalParticle_get_energy0(LocalParticle* part) { + double const p0c = LocalParticle_get_p0c(part); + double const m0 = LocalParticle_get_mass0(part); + return sqrt(p0c * p0c + m0 * m0); +} + + +GPUFUN +void LocalParticle_update_ptau(LocalParticle* part, double new_ptau_value) { + double const beta0 = LocalParticle_get_beta0(part); + double const ptau = new_ptau_value; + double const irpp = sqrt(ptau * ptau + 2 * ptau / beta0 + 1); + double const new_rpp = 1. 
/ irpp; + double const new_rvv = irpp / (1 + beta0 * ptau); + + LocalParticle_set_delta(part, irpp - 1); + LocalParticle_set_rvv(part, new_rvv); + LocalParticle_set_ptau(part, ptau); + LocalParticle_set_rpp(part, new_rpp); +} + + +GPUFUN +void LocalParticle_update_delta(LocalParticle* part, double new_delta_value) { + double const beta0 = LocalParticle_get_beta0(part); + double const delta_beta0 = new_delta_value * beta0; + double const ptau_beta0 = sqrt(delta_beta0 * delta_beta0 + 2 * delta_beta0 * beta0 + 1) - 1; + double const one_plus_delta = 1 + new_delta_value; + double const rvv = (one_plus_delta) / (1 + ptau_beta0); + double const rpp = 1 / one_plus_delta; + double const ptau = ptau_beta0 / beta0; + + LocalParticle_set_delta(part, new_delta_value); + LocalParticle_set_rvv(part, rvv); + LocalParticle_set_rpp(part, rpp); + LocalParticle_set_ptau(part, ptau); +} + + +GPUFUN +double LocalParticle_get_pzeta(LocalParticle* part) { + double const ptau = LocalParticle_get_ptau(part); + double const beta0 = LocalParticle_get_beta0(part); + return ptau / beta0; +} + + +GPUFUN +void LocalParticle_update_pzeta(LocalParticle* part, double new_pzeta_value) { + double const beta0 = LocalParticle_get_beta0(part); + LocalParticle_update_ptau(part, beta0 * new_pzeta_value); +} + + +GPUFUN +void increment_at_element(LocalParticle* part0, int64_t const increment) { + START_PER_PARTICLE_BLOCK(part0, part); + LocalParticle_add_to_at_element(part, increment); + END_PER_PARTICLE_BLOCK; +} + + +GPUFUN +void increment_at_turn(LocalParticle* part0, int flag_reset_s){ + START_PER_PARTICLE_BLOCK(part0, part); + LocalParticle_add_to_at_turn(part, 1); + LocalParticle_set_at_element(part, 0); + if (flag_reset_s > 0) { + LocalParticle_set_s(part, 0.); + } + END_PER_PARTICLE_BLOCK; +} + + +GPUFUN +void increment_at_turn_backtrack( + LocalParticle* part0, + int flag_reset_s, + double const line_length, + int64_t const num_elements +) { + START_PER_PARTICLE_BLOCK(part0, part); + LocalParticle_add_to_at_turn(part, -1); + LocalParticle_set_at_element(part, num_elements); + if (flag_reset_s > 0) { + LocalParticle_set_s(part, line_length); + } + END_PER_PARTICLE_BLOCK; +} + + +/* check_is_active has different implementation on CPU and GPU */ +#if defined(XO_CONTEXT_CPU_SERIAL) + + GPUFUN + int64_t check_is_active(LocalParticle* part) { + int64_t ipart = 0; + while (ipart < part->_num_active_particles){ + #ifdef XSUITE_RESTORE_LOSS + ipart++; + #else + if (part->state[ipart] < 1) { + LocalParticle_exchange(part, ipart, part->_num_active_particles - 1); + part->_num_active_particles--; + part->_num_lost_particles++; + } else { + ipart++; + } + #endif + } + + if (part->_num_active_particles == 0){ + return 0; //All particles lost + } else { + return 1; //Some stable particles are still present + } + } + +#elif defined(XO_CONTEXT_CPU_OPENMP) + + GPUFUN + int64_t check_is_active(LocalParticle* part) { + #ifndef SKIP_SWAPS + int64_t ipart = part->ipart; + int64_t endpart = part->endpart; + int64_t left = ipart; + int64_t right = endpart - 1; + int64_t swap_made = 0; + int64_t has_alive = 0; + + if (left == right) + return part->state[left] > 0; + + while (left < right) { + if (part->state[left] > 0) { + left++; + has_alive = 1; + } + else if (part->state[right] <= 0) + right--; + else { + LocalParticle_exchange(part, left, right); + left++; + right--; + swap_made = 1; + } + } + return swap_made || has_alive; + #else + return 1; + #endif + } + + + GPUFUN + void count_reorganized_particles(LocalParticle* part) { + int64_t 
num_active = 0; + int64_t num_lost = 0; + + for (int64_t i = part->ipart; i < part->endpart; i++) { + if (part->state[i] <= -999999999) + break; + else if (part->state[i] > 0) + num_active++; + else + num_lost++; + } + + part->_num_active_particles = num_active; + part->_num_lost_particles = num_lost; + } + +#else // not XO_CONTEXT_CPU_SERIAL and not XO_CONTEXT_CPU_OPENMP + + GPUFUN + int64_t check_is_active(LocalParticle* part) { + return LocalParticle_get_state(part) > 0; + }; + +#endif + + +GPUFUN +void LocalParticle_add_to_energy(LocalParticle* part, double delta_energy, int pz_only ) +{ + double ptau = LocalParticle_get_ptau(part); + double const p0c = LocalParticle_get_p0c(part); + double const charge_ratio = LocalParticle_get_charge_ratio(part); + double const chi = LocalParticle_get_chi(part); + double const mass_ratio = charge_ratio / chi; + + ptau += delta_energy / p0c / mass_ratio; + + double const old_rpp = LocalParticle_get_rpp(part); + + LocalParticle_update_ptau(part, ptau); + + if (!pz_only) { + double const new_rpp = LocalParticle_get_rpp(part); + double const f = old_rpp / new_rpp; + LocalParticle_scale_px(part, f); + LocalParticle_scale_py(part, f); + } +} + + +GPUFUN +void LocalParticle_update_p0c(LocalParticle* part, double new_p0c_value) +{ + double const mass0 = LocalParticle_get_mass0(part); + double const old_p0c = LocalParticle_get_p0c(part); + double const old_delta = LocalParticle_get_delta(part); + double const old_beta0 = LocalParticle_get_beta0(part); + + double const ppc = old_p0c * old_delta + old_p0c; + double const new_delta = (ppc - new_p0c_value) / new_p0c_value; + + double const new_energy0 = sqrt(new_p0c_value * new_p0c_value + mass0 * mass0); + double const new_beta0 = new_p0c_value / new_energy0; + double const new_gamma0 = new_energy0 / mass0; + + LocalParticle_set_p0c(part, new_p0c_value); + LocalParticle_set_gamma0(part, new_gamma0); + LocalParticle_set_beta0(part, new_beta0); + + LocalParticle_update_delta(part, new_delta); + + LocalParticle_scale_px(part, old_p0c / new_p0c_value); + LocalParticle_scale_py(part, old_p0c / new_p0c_value); + + LocalParticle_scale_zeta(part, new_beta0 / old_beta0); + +} + +GPUFUN +void LocalParticle_kill_particle(LocalParticle* part, int64_t kill_state) { + LocalParticle_set_x(part, 1e30); + LocalParticle_set_px(part, 1e30); + LocalParticle_set_y(part, 1e30); + LocalParticle_set_py(part, 1e30); + LocalParticle_set_zeta(part, 1e30); + LocalParticle_update_delta(part, -1); // zero energy + LocalParticle_set_state(part, kill_state); +} + + +#ifdef XTRACK_GLOBAL_XY_LIMIT + + GPUFUN + void global_aperture_check(LocalParticle* part0) + { + if (LocalParticle_check_track_flag(part0, XS_FLAG_IGNORE_GLOBAL_APERTURE)) + { + return; + } + + START_PER_PARTICLE_BLOCK(part0, part); + double const x = LocalParticle_get_x(part); + double const y = LocalParticle_get_y(part); + + int64_t const is_alive = (int64_t)( + (x >= -XTRACK_GLOBAL_XY_LIMIT) && + (x <= XTRACK_GLOBAL_XY_LIMIT) && + (y >= -XTRACK_GLOBAL_XY_LIMIT) && + (y <= XTRACK_GLOBAL_XY_LIMIT) + ); + + // I assume that if I am in the function is because + if (!is_alive) { + LocalParticle_set_state(part, -1); + } + END_PER_PARTICLE_BLOCK; + } + +#endif + +#endif /* XTRACK_LOCAL_PARTICLE_CUSTOM_API_H */ diff --git a/xtrack/particles/particles.py b/xtrack/particles/particles.py index eca5b324a..b7a5438b9 100644 --- a/xtrack/particles/particles.py +++ b/xtrack/particles/particles.py @@ -23,69 +23,398 @@ LAST_INVALID_STATE = -999999999 +size_vars = ( + (xo.Int64, '_capacity'), + 
(xo.Int64, '_num_active_particles'), + (xo.Int64, '_num_lost_particles'), + (xo.Int64, 'start_tracking_at_element'), +) +# Capacity is always kept up to date +# the other two are placeholders to be used if needed +# i.e. on ContextCpu + +scalar_vars = ( + (xo.Float64, 'q0'), + (xo.Float64, 'mass0'), + (xo.Float64, 't_sim'), +) + +part_energy_vars = ( + (xo.Float64, 'ptau'), + (xo.Float64, 'delta'), + (xo.Float64, 'rpp'), + (xo.Float64, 'rvv'), +) + +per_particle_vars = ( + ( + (xo.Float64, 'p0c'), + (xo.Float64, 'gamma0'), + (xo.Float64, 'beta0'), + (xo.Float64, 's'), + (xo.Float64, 'zeta'), + (xo.Float64, 'x'), + (xo.Float64, 'y'), + (xo.Float64, 'px'), + (xo.Float64, 'py'), + ) + + part_energy_vars + + ( + (xo.Float64, 'chi'), + (xo.Float64, 'charge_ratio'), + (xo.Float64, 'weight'), + (xo.Float64, 'ax'), + (xo.Float64, 'ay'), + (xo.Float64, 'spin_x'), + (xo.Float64, 'spin_y'), + (xo.Float64, 'spin_z'), + (xo.Float64, 'anomalous_magnetic_moment'), + (xo.Int64, 'pdg_id'), + (xo.Int64, 'particle_id'), + (xo.Int64, 'at_element'), + (xo.Int64, 'at_turn'), + (xo.Int64, 'state'), + (xo.Int64, 'parent_particle_id'), + (xo.UInt32, '_rng_s1'), + (xo.UInt32, '_rng_s2'), + (xo.UInt32, '_rng_s3'), + (xo.UInt32, '_rng_s4') + ) +) -class Particles(xo.HybridClass): - _cname = 'ParticlesData' - size_vars = ( - (xo.Int64, '_capacity'), - (xo.Int64, '_num_active_particles'), - (xo.Int64, '_num_lost_particles'), - (xo.Int64, 'start_tracking_at_element'), +def gen_local_particle_api(): + src_lines = [ + '#include "xobjects/headers/common.h"', + '#include "xtrack/particles/rng_src/base_rng.h"', + '#include "xtrack/particles/rng_src/particles_rng.h"', + ] + for name, mass in mass__dict__.items(): + if name.endswith('_MASS_EV'): + src_lines.append(f'#define {name} {mass}') + + src_lines.append('typedef struct {') + + for tt, vv in size_vars + scalar_vars: + src_lines.append(' ' + tt._c_type + ' ' + vv + ';') + + for tt, vv in per_particle_vars: + src_lines.append(' GPUGLMEM ' + tt._c_type + '* ' + vv + ';') + + src_lines.append(' int64_t ipart;') + src_lines.append(' int64_t endpart;') + src_lines.append(' uint64_t track_flags;') + src_lines.append(' GPUGLMEM int8_t* io_buffer;') + src_lines.append('} LocalParticle;') + src_typedef = '\n'.join(src_lines) + + # Get io buffer + src_lines = [] + src_lines.append( + ''' + GPUFUN + GPUGLMEM int8_t* LocalParticle_get_io_buffer(LocalParticle* part){ + return part->io_buffer; + } + + ''' ) - # Capacity is always kept up to date - # the other two are placeholders to be used if needed - # i.e. 
on ContextCpu - - scalar_vars = ( - (xo.Float64, 'q0'), - (xo.Float64, 'mass0'), - (xo.Float64, 't_sim'), + + # Get track flag + src_lines.append( + ''' + GPUFUN + uint64_t LocalParticle_check_track_flag(LocalParticle* part, uint8_t index){ + return (part->track_flags >> index) & 1; + } + ''' ) - part_energy_vars = ( - (xo.Float64, 'ptau'), - (xo.Float64, 'delta'), - (xo.Float64, 'rpp'), - (xo.Float64, 'rvv'), + # Particles_to_LocalParticle + src_lines.append( + ''' + GPUFUN + void Particles_to_LocalParticle(ParticlesData source, + LocalParticle* dest, + int64_t id, + int64_t eid){''' + ) + for _, vv in size_vars + scalar_vars: + src_lines.append( + f' dest->{vv} = ParticlesData_get_' + vv + '(source);' + ) + + for _, vv in per_particle_vars: + src_lines.append( + f' dest->{vv} = ParticlesData_getp1_' + vv + '(source, 0);' + ) + + src_lines.append(' dest->ipart = id;') + src_lines.append(' dest->endpart = eid;') + src_lines.append('}') + src_particles_to_local = '\n'.join(src_lines) + + # LocalParticle_to_Particles + src_lines = [] + src_lines.append( + ''' + GPUFUN + void LocalParticle_to_Particles(LocalParticle* source, + ParticlesData dest, + int64_t id, + int64_t set_scalar){''' ) + src_lines.append('if (set_scalar){') + for _, vv in size_vars + scalar_vars: + src_lines.append( + ' ParticlesData_set_' + vv + '(dest,' + f' LocalParticle_get_{vv}(source));' + ) + src_lines.append('}') + + for _, vv in per_particle_vars: + src_lines.append( + ' ParticlesData_set_' + vv + '(dest, id, ' + f' LocalParticle_get_{vv}(source));' + ) + src_lines.append('}') + src_local_to_particles = '\n'.join(src_lines) + + # Adders + src_lines = [] + for tt, vv in per_particle_vars: + src_lines.append( + ''' + GPUFUN + void LocalParticle_add_to_''' + vv + f'(LocalParticle* part, {tt._c_type} value)' + + '{' + ) + src_lines.append(f'#ifndef FREEZE_VAR_{vv}') + src_lines.append(f' part->{vv}[part->ipart] += value;') + src_lines.append('#endif') + src_lines.append('}\n') + src_adders = '\n'.join(src_lines) + + # Scalers + src_lines = [] + for tt, vv in per_particle_vars: + src_lines.append( + ''' + GPUFUN + void LocalParticle_scale_''' + vv + f'(LocalParticle* part, {tt._c_type} value)' + + '{' + ) + src_lines.append(f'#ifndef FREEZE_VAR_{vv}') + src_lines.append(f' part->{vv}[part->ipart] *= value;') + src_lines.append('#endif') + src_lines.append('}\n') + src_scalers = '\n'.join(src_lines) + + # Setters + src_lines = [] + for tt, vv in per_particle_vars: + src_lines.append( + ''' + GPUFUN + void LocalParticle_set_''' + vv + f'(LocalParticle* part, {tt._c_type} value)' + + '{' + ) + src_lines.append(f'#ifndef FREEZE_VAR_{vv}') + src_lines.append(f' part->{vv}[part->ipart] = value;') + src_lines.append('#endif') + src_lines.append('}') + src_setters = '\n'.join(src_lines) + + # Getters + src_lines = [] - per_particle_vars = ( - ( - (xo.Float64, 'p0c'), - (xo.Float64, 'gamma0'), - (xo.Float64, 'beta0'), - (xo.Float64, 's'), - (xo.Float64, 'zeta'), - (xo.Float64, 'x'), - (xo.Float64, 'y'), - (xo.Float64, 'px'), - (xo.Float64, 'py'), + for tt, vv in size_vars + scalar_vars: + src_lines.append('GPUFUN') + src_lines.append( + f'{tt._c_type} LocalParticle_get_' + vv + + '(LocalParticle* part)' + + '{' + ) + src_lines.append(f' return part->{vv};') + src_lines.append('}') + + for tt, vv in per_particle_vars: + src_lines.append('GPUFUN') + src_lines.append( + f'{tt._c_type} LocalParticle_get_' + vv + + '(LocalParticle* part)' + + '{' + ) + src_lines.append(f' return part->{vv}[part->ipart];') + src_lines.append('}') + + 
src_getters = '\n'.join(src_lines) + + # Angles + src_angles_lines = [] + for exact in ['', 'exact_']: + # xp (py as transverse) and vice versa + for xx, yy in [['x', 'y'], ['y', 'x']]: + # Getter + src_angles_lines.append('GPUFUN') + src_angles_lines.append( + f'double LocalParticle_get_{exact}{xx}p(LocalParticle* part){{' + ) + src_angles_lines.append( + f' double const p{xx} = LocalParticle_get_p{xx}(part);' + ) + if exact == 'exact_': + src_angles_lines.append( + f' double const p{yy} = LocalParticle_get_p{yy}(part);' + ) + src_angles_lines.append( + ' double const one_plus_delta = 1. + LocalParticle_get_delta(part);' + ) + src_angles_lines.append( + ' double const rpp = 1./sqrt(one_plus_delta*one_plus_delta - px*px - py*py);' + ) + else: + src_angles_lines.append( + ' double const rpp = LocalParticle_get_rpp(part);' + ) + src_angles_lines.append( + ' // INFO: this is not the angle, but sin(angle)' + ) + src_angles_lines.append(f' return p{xx}*rpp;') + src_angles_lines.append('}') + src_angles_lines.append('') + + for xx, yy in [['x', 'y'], ['y', 'x']]: + # Setter + src_angles_lines.append('GPUFUN') + src_angles_lines.append( + f'void LocalParticle_set_{exact}{xx}p(LocalParticle* part, double {xx}p){{' + ) + src_angles_lines.append(f'#ifndef FREEZE_VAR_p{xx}') + src_angles_lines.append( + ' double rpp = LocalParticle_get_rpp(part);' + ) + if exact == 'exact_': + src_angles_lines.append( + f' // Careful! If {yy}p also changes, use LocalParticle_set_{exact}xp_yp!' + ) + src_angles_lines.append( + f' double const {yy}p = LocalParticle_get_{exact}{yy}p(part);' + ) + src_angles_lines.append(' rpp *= sqrt(1 + xp*xp + yp*yp);') + src_angles_lines.append( + f' // INFO: {xx}p is not the angle, but sin(angle)' + ) + src_angles_lines.append( + f' LocalParticle_set_p{xx}(part, {xx}p/rpp);' + ) + src_angles_lines.append('#endif') + src_angles_lines.append('}') + src_angles_lines.append('') + + for xx, yy in [['x', 'y'], ['y', 'x']]: + # Adder + src_angles_lines.append('GPUFUN') + src_angles_lines.append( + f'void LocalParticle_add_to_{exact}{xx}p(LocalParticle* part, double {xx}p){{' + ) + src_angles_lines.append(f'#ifndef FREEZE_VAR_p{xx}') + src_angles_lines.append( + f' LocalParticle_set_{exact}{xx}p(part, ' + + f'LocalParticle_get_{exact}{xx}p(part) + {xx}p);' + ) + src_angles_lines.append('#endif') + src_angles_lines.append('}') + src_angles_lines.append('') + # Scaler + src_angles_lines.append('GPUFUN') + src_angles_lines.append( + f'void LocalParticle_scale_{exact}{xx}p(LocalParticle* part, double value){{' + ) + src_angles_lines.append(f'#ifndef FREEZE_VAR_p{xx}') + src_angles_lines.append( + f' LocalParticle_set_{exact}{xx}p(part, ' + + f'LocalParticle_get_{exact}{xx}p(part) * value);' + ) + src_angles_lines.append('#endif') + src_angles_lines.append('}') + src_angles_lines.append('') + # Double setter, adder, scaler + src_angles_lines.append('GPUFUN') + src_angles_lines.append( + f'void LocalParticle_set_{exact}xp_yp(LocalParticle* part, double xp, double yp){{' + ) + src_angles_lines.append(' double rpp = LocalParticle_get_rpp(part);') + if exact == 'exact_': + src_angles_lines.append(' rpp *= sqrt(1 + xp*xp + yp*yp);') + for xx in ['x', 'y']: + src_angles_lines.append(f'#ifndef FREEZE_VAR_p{xx}') + src_angles_lines.append( + f' LocalParticle_set_p{xx}(part, {xx}p/rpp);' + ) + src_angles_lines.append('#endif') + src_angles_lines.append('}') + src_angles_lines.append('') + src_angles_lines.append('GPUFUN') + src_angles_lines.append( + f'void 
LocalParticle_add_to_{exact}xp_yp(LocalParticle* part, double xp, double yp){{' + ) + src_angles_lines.append( + f' LocalParticle_set_{exact}xp_yp(part, ' + + f'LocalParticle_get_{exact}xp(part) + xp, ' + + f'LocalParticle_get_{exact}yp(part) + yp);' + ) + src_angles_lines.append('}') + src_angles_lines.append('') + src_angles_lines.append('GPUFUN') + src_angles_lines.append( + f'void LocalParticle_scale_{exact}xp_yp(LocalParticle* part, double value_x, double value_y){{' ) - + part_energy_vars + - ( - (xo.Float64, 'chi'), - (xo.Float64, 'charge_ratio'), - (xo.Float64, 'weight'), - (xo.Float64, 'ax'), - (xo.Float64, 'ay'), - (xo.Float64, 'spin_x'), - (xo.Float64, 'spin_y'), - (xo.Float64, 'spin_z'), - (xo.Float64, 'anomalous_magnetic_moment'), - (xo.Int64, 'pdg_id'), - (xo.Int64, 'particle_id'), - (xo.Int64, 'at_element'), - (xo.Int64, 'at_turn'), - (xo.Int64, 'state'), - (xo.Int64, 'parent_particle_id'), - (xo.UInt32, '_rng_s1'), - (xo.UInt32, '_rng_s2'), - (xo.UInt32, '_rng_s3'), - (xo.UInt32, '_rng_s4') + src_angles_lines.append( + f' LocalParticle_set_{exact}xp_yp(part, ' + + f'LocalParticle_get_{exact}xp(part) * value_x, ' + + f'LocalParticle_get_{exact}yp(part) * value_y);' ) + src_angles_lines.append('}') + src_angles = '\n'.join(src_angles_lines) + + # Particle exchangers + src_exchange = ''' + GPUFUN + void LocalParticle_exchange(LocalParticle* part, int64_t i1, int64_t i2){ + ''' + for tt, vv in per_particle_vars: + src_exchange += '\n'.join( + [ + '\n {', + f' {tt._c_type} temp = part->{vv}[i2];', + f' part->{vv}[i2] = part->{vv}[i1];', + f' part->{vv}[i1] = temp;', + ' }'] + ) + src_exchange += '}\n' + + source = '\n\n'.join( + [src_typedef, src_adders, src_getters, + src_setters, src_scalers, src_exchange, + src_particles_to_local, src_local_to_particles, + src_angles] ) + source += """ + #include "xtrack/particles/local_particle_custom_api.h" + """ + return source + + +class Particles(xo.HybridClass): + _cname = 'ParticlesData' + + size_vars = size_vars + scalar_vars = scalar_vars + part_energy_vars = part_energy_vars + per_particle_vars = per_particle_vars + _xofields = { **{nn: tt for tt, nn in size_vars + scalar_vars}, **{nn: tt[:] for tt, nn in per_particle_vars}, @@ -95,9 +424,7 @@ class Particles(xo.HybridClass): if (not vv[1].startswith('_') and vv[1] != 't_sim')] + ['t_sim'] _extra_c_sources = [ - Path(__file__).parent.joinpath('rng_src', 'base_rng.h'), - Path(__file__).parent.joinpath('rng_src', 'particles_rng.h'), - '\n /*placeholder_for_local_particle_src*/ \n' + gen_local_particle_api() ] _rename = { @@ -1471,551 +1798,6 @@ def add_to_energy(self, delta_energy): def set_particle(self, index, set_scalar_vars=False, **kwargs): raise NotImplementedError('This functionality has been removed') - @classmethod - def gen_local_particle_api(cls, mode='no_local_copy'): - if mode != 'no_local_copy': - raise NotImplementedError - - src_lines = [''] - for name, mass in mass__dict__.items(): - if name.endswith('_MASS_EV'): - src_lines.append(f'#define {name} {mass}') - - src_lines.append('typedef struct {') - - for tt, vv in cls.size_vars + cls.scalar_vars: - src_lines.append(' ' + tt._c_type + ' ' + vv + ';') - - for tt, vv in cls.per_particle_vars: - src_lines.append(' /*gpuglmem*/ ' + tt._c_type + '* ' + vv + ';') - - src_lines.append(' int64_t ipart;') - src_lines.append(' int64_t endpart;') - src_lines.append(' uint64_t track_flags;') - src_lines.append(' /*gpuglmem*/ int8_t* io_buffer;') - src_lines.append('} LocalParticle;') - src_typedef = '\n'.join(src_lines) - - # Get 
io buffer - src_lines = [] - src_lines.append(''' - /*gpufun*/ - /*gpuglmem*/ int8_t* LocalParticle_get_io_buffer(LocalParticle* part){ - return part->io_buffer; - } - - ''') - - # Get track flag - src_lines.append(''' - /*gpufun*/ - uint64_t LocalParticle_check_track_flag(LocalParticle* part, uint8_t index){ - return (part->track_flags >> index) & 1; - } - ''') - - # Particles_to_LocalParticle - src_lines.append(''' - /*gpufun*/ - void Particles_to_LocalParticle(ParticlesData source, - LocalParticle* dest, - int64_t id, - int64_t eid){''') - for _, vv in cls.size_vars + cls.scalar_vars: - src_lines.append( - f' dest->{vv} = ParticlesData_get_' + vv + '(source);') - - for _, vv in cls.per_particle_vars: - src_lines.append( - f' dest->{vv} = ParticlesData_getp1_' + vv + '(source, 0);') - - src_lines.append(' dest->ipart = id;') - src_lines.append(' dest->endpart = eid;') - src_lines.append('}') - src_particles_to_local = '\n'.join(src_lines) - - # LocalParticle_to_Particles - src_lines = [] - src_lines.append(''' - /*gpufun*/ - void LocalParticle_to_Particles(LocalParticle* source, - ParticlesData dest, - int64_t id, - int64_t set_scalar){''') - src_lines.append('if (set_scalar){') - for _, vv in cls.size_vars + cls.scalar_vars: - src_lines.append( - ' ParticlesData_set_' + vv + '(dest,' - f' LocalParticle_get_{vv}(source));') - src_lines.append('}') - - for _, vv in cls.per_particle_vars: - src_lines.append( - ' ParticlesData_set_' + vv + '(dest, id, ' - f' LocalParticle_get_{vv}(source));') - src_lines.append('}') - src_local_to_particles = '\n'.join(src_lines) - - # Adders - src_lines = [] - for tt, vv in cls.per_particle_vars: - src_lines.append(''' - /*gpufun*/ - void LocalParticle_add_to_''' + vv + f'(LocalParticle* part, {tt._c_type} value)' - + '{') - src_lines.append(f'#ifndef FREEZE_VAR_{vv}') - src_lines.append(f' part->{vv}[part->ipart] += value;') - src_lines.append('#endif') - src_lines.append('}\n') - src_adders = '\n'.join(src_lines) - - # Scalers - src_lines = [] - for tt, vv in cls.per_particle_vars: - src_lines.append(''' - /*gpufun*/ - void LocalParticle_scale_''' + vv + f'(LocalParticle* part, {tt._c_type} value)' - + '{') - src_lines.append(f'#ifndef FREEZE_VAR_{vv}') - src_lines.append(f' part->{vv}[part->ipart] *= value;') - src_lines.append('#endif') - src_lines.append('}\n') - src_scalers = '\n'.join(src_lines) - - # Setters - src_lines = [] - for tt, vv in cls.per_particle_vars: - src_lines.append(''' - /*gpufun*/ - void LocalParticle_set_''' + vv + f'(LocalParticle* part, {tt._c_type} value)' - + '{') - src_lines.append(f'#ifndef FREEZE_VAR_{vv}') - src_lines.append(f' part->{vv}[part->ipart] = value;') - src_lines.append('#endif') - src_lines.append('}') - src_setters = '\n'.join(src_lines) - - # Getters - src_lines = [] - - for tt, vv in cls.size_vars + cls.scalar_vars: - src_lines.append('/*gpufun*/') - src_lines.append(f'{tt._c_type} LocalParticle_get_' + vv - + '(LocalParticle* part)' - + '{') - src_lines.append(f' return part->{vv};') - src_lines.append('}') - - for tt, vv in cls.per_particle_vars: - src_lines.append('/*gpufun*/') - src_lines.append(f'{tt._c_type} LocalParticle_get_' + vv - + '(LocalParticle* part)' - + '{') - src_lines.append(f' return part->{vv}[part->ipart];') - src_lines.append('}') - - src_getters = '\n'.join(src_lines) - - # Angles - src_angles_lines = [] - for exact in ['', 'exact_']: - # xp (py as transverse) and vice versa - for xx, yy in [['x', 'y'], ['y', 'x']]: - # Getter - src_angles_lines.append('/*gpufun*/') - 
src_angles_lines.append(f'double LocalParticle_get_{exact}{xx}p(LocalParticle* part){{') - src_angles_lines.append(f' double const p{xx} = LocalParticle_get_p{xx}(part);') - if exact == 'exact_': - src_angles_lines.append(f' double const p{yy} = LocalParticle_get_p{yy}(part);') - src_angles_lines.append(' double const one_plus_delta = 1. + LocalParticle_get_delta(part);') - src_angles_lines.append( - ' double const rpp = 1./sqrt(one_plus_delta*one_plus_delta - px*px - py*py);') - else: - src_angles_lines.append(' double const rpp = LocalParticle_get_rpp(part);') - src_angles_lines.append(' // INFO: this is not the angle, but sin(angle)') - src_angles_lines.append(f' return p{xx}*rpp;') - src_angles_lines.append('}') - src_angles_lines.append('') - - for xx, yy in [['x', 'y'], ['y', 'x']]: - # Setter - src_angles_lines.append('/*gpufun*/') - src_angles_lines.append(f'void LocalParticle_set_{exact}{xx}p(LocalParticle* part, double {xx}p){{') - src_angles_lines.append(f'#ifndef FREEZE_VAR_p{xx}') - src_angles_lines.append(' double rpp = LocalParticle_get_rpp(part);') - if exact == 'exact_': - src_angles_lines.append( - f' // Careful! If {yy}p also changes, use LocalParticle_set_{exact}xp_yp!') - src_angles_lines.append(f' double const {yy}p = LocalParticle_get_{exact}{yy}p(part);') - src_angles_lines.append(' rpp *= sqrt(1 + xp*xp + yp*yp);') - src_angles_lines.append(f' // INFO: {xx}p is not the angle, but sin(angle)') - src_angles_lines.append(f' LocalParticle_set_p{xx}(part, {xx}p/rpp);') - src_angles_lines.append('#endif') - src_angles_lines.append('}') - src_angles_lines.append('') - - for xx, yy in [['x', 'y'], ['y', 'x']]: - # Adder - src_angles_lines.append('/*gpufun*/') - src_angles_lines.append(f'void LocalParticle_add_to_{exact}{xx}p(LocalParticle* part, double {xx}p){{') - src_angles_lines.append(f'#ifndef FREEZE_VAR_p{xx}') - src_angles_lines.append(f' LocalParticle_set_{exact}{xx}p(part, ' - + f'LocalParticle_get_{exact}{xx}p(part) + {xx}p);') - src_angles_lines.append('#endif') - src_angles_lines.append('}') - src_angles_lines.append('') - # Scaler - src_angles_lines.append('/*gpufun*/') - src_angles_lines.append(f'void LocalParticle_scale_{exact}{xx}p(LocalParticle* part, double value){{') - src_angles_lines.append(f'#ifndef FREEZE_VAR_p{xx}') - src_angles_lines.append(f' LocalParticle_set_{exact}{xx}p(part, ' - + f'LocalParticle_get_{exact}{xx}p(part) * value);') - src_angles_lines.append('#endif') - src_angles_lines.append('}') - src_angles_lines.append('') - # Double setter, adder, scaler - src_angles_lines.append('/*gpufun*/') - src_angles_lines.append(f'void LocalParticle_set_{exact}xp_yp(LocalParticle* part, double xp, double yp){{') - src_angles_lines.append(' double rpp = LocalParticle_get_rpp(part);') - if exact == 'exact_': - src_angles_lines.append(' rpp *= sqrt(1 + xp*xp + yp*yp);') - for xx in ['x', 'y']: - src_angles_lines.append(f'#ifndef FREEZE_VAR_p{xx}') - src_angles_lines.append(f' LocalParticle_set_p{xx}(part, {xx}p/rpp);') - src_angles_lines.append('#endif') - src_angles_lines.append('}') - src_angles_lines.append('') - src_angles_lines.append('/*gpufun*/') - src_angles_lines.append( - f'void LocalParticle_add_to_{exact}xp_yp(LocalParticle* part, double xp, double yp){{') - src_angles_lines.append(f' LocalParticle_set_{exact}xp_yp(part, ' - + f'LocalParticle_get_{exact}xp(part) + xp, ' - + f'LocalParticle_get_{exact}yp(part) + yp);') - src_angles_lines.append('}') - src_angles_lines.append('') - src_angles_lines.append('/*gpufun*/') - 
src_angles_lines.append( - f'void LocalParticle_scale_{exact}xp_yp(LocalParticle* part, double value_x, double value_y){{') - src_angles_lines.append(f' LocalParticle_set_{exact}xp_yp(part, ' - + f'LocalParticle_get_{exact}xp(part) * value_x, ' - + f'LocalParticle_get_{exact}yp(part) * value_y);') - src_angles_lines.append('}') - src_angles = '\n'.join(src_angles_lines) - - # Particle exchangers - src_exchange = ''' - /*gpufun*/ - void LocalParticle_exchange(LocalParticle* part, int64_t i1, int64_t i2){ - ''' - for tt, vv in cls.per_particle_vars: - src_exchange += '\n'.join([ - '\n {', - f' {tt._c_type} temp = part->{vv}[i2];', - f' part->{vv}[i2] = part->{vv}[i1];', - f' part->{vv}[i1] = temp;', - ' }']) - src_exchange += '}\n' - - custom_source = ''' - /*gpufun*/ - double LocalParticle_get_energy0(LocalParticle* part){ - - double const p0c = LocalParticle_get_p0c(part); - double const m0 = LocalParticle_get_mass0(part); - - return sqrt( p0c * p0c + m0 * m0 ); - } - - /*gpufun*/ - void LocalParticle_update_ptau(LocalParticle* part, double new_ptau_value){ - - double const beta0 = LocalParticle_get_beta0(part); - - double const ptau = new_ptau_value; - - double const irpp = sqrt(ptau*ptau + 2*ptau/beta0 +1); - - double const new_rpp = 1./irpp; - LocalParticle_set_delta(part, irpp - 1.); - - double const new_rvv = irpp/(1 + beta0*ptau); - LocalParticle_set_rvv(part, new_rvv); - LocalParticle_set_ptau(part, ptau); - - LocalParticle_set_rpp(part, new_rpp ); - } - - /*gpufun*/ - void LocalParticle_update_delta(LocalParticle* part, double new_delta_value){ - double const beta0 = LocalParticle_get_beta0(part); - double const delta_beta0 = new_delta_value * beta0; - double const ptau_beta0 = sqrt( delta_beta0 * delta_beta0 + - 2. * delta_beta0 * beta0 + 1. ) - 1.; - - double const one_plus_delta = 1. + new_delta_value; - double const rvv = ( one_plus_delta ) / ( 1. + ptau_beta0 ); - double const rpp = 1. 
/ one_plus_delta; - double const ptau = ptau_beta0 / beta0; - - LocalParticle_set_delta(part, new_delta_value); - - LocalParticle_set_rvv(part, rvv ); - LocalParticle_set_rpp(part, rpp ); - LocalParticle_set_ptau(part, ptau ); - - } - - /*gpufun*/ - double LocalParticle_get_pzeta(LocalParticle* part){ - - double const ptau = LocalParticle_get_ptau(part); - double const beta0 = LocalParticle_get_beta0(part); - - return ptau/beta0; - - } - - /*gpufun*/ - void LocalParticle_update_pzeta(LocalParticle* part, double new_pzeta_value){ - - double const beta0 = LocalParticle_get_beta0(part); - LocalParticle_update_ptau(part, beta0*new_pzeta_value); - - } - - /*gpufun*/ - void increment_at_element(LocalParticle* part0, int64_t const increment){ - - //start_per_particle_block (part0->part) - LocalParticle_add_to_at_element(part, increment); - //end_per_particle_block - - } - - /*gpufun*/ - void increment_at_turn(LocalParticle* part0, int flag_reset_s){ - - //start_per_particle_block (part0->part) - LocalParticle_add_to_at_turn(part, 1); - LocalParticle_set_at_element(part, 0); - if (flag_reset_s>0){ - LocalParticle_set_s(part, 0.); - } - //end_per_particle_block - } - - /*gpufun*/ - void increment_at_turn_backtrack(LocalParticle* part0, int flag_reset_s, - double const line_length, - int64_t const num_elements){ - - //start_per_particle_block (part0->part) - LocalParticle_add_to_at_turn(part, -1); - LocalParticle_set_at_element(part, num_elements); - if (flag_reset_s>0){ - LocalParticle_set_s(part, line_length); - } - //end_per_particle_block - } - - // check_is_active has different implementation on CPU and GPU - - #define CPU_SERIAL_IMPLEM //only_for_context cpu_serial - #define CPU_OMP_IMPLEM //only_for_context cpu_openmp - - #ifdef CPU_SERIAL_IMPLEM - - /*gpufun*/ - int64_t check_is_active(LocalParticle* part) { - int64_t ipart=0; - while (ipart < part->_num_active_particles){ - #ifdef XSUITE_RESTORE_LOSS - ipart++; - #else - if (part->state[ipart]<1){ - LocalParticle_exchange( - part, ipart, part->_num_active_particles-1); - part->_num_active_particles--; - part->_num_lost_particles++; - } - else{ - ipart++; - } - #endif - } - - if (part->_num_active_particles==0){ - return 0;//All particles lost - } else { - return 1; //Some stable particles are still present - } - } - - #else // not CPU_SERIAL_IMPLEM - #ifdef CPU_OMP_IMPLEM - - /*gpufun*/ - int64_t check_is_active(LocalParticle* part) { - #ifndef SKIP_SWAPS - int64_t ipart = part->ipart; - int64_t endpart = part->endpart; - - int64_t left = ipart; - int64_t right = endpart - 1; - int64_t swap_made = 0; - int64_t has_alive = 0; - - if (left == right) return part->state[left] > 0; - - while (left < right) { - if (part->state[left] > 0) { - left++; - has_alive = 1; - } - else if (part->state[right] <= 0) right--; - else { - LocalParticle_exchange(part, left, right); - left++; - right--; - swap_made = 1; - } - } - - return swap_made || has_alive; - #else - return 1; - #endif - } - - /*gpufun*/ - void count_reorganized_particles(LocalParticle* part) { - int64_t num_active = 0; - int64_t num_lost = 0; - - for (int64_t i = part->ipart; i < part->endpart; i++) { - if (part->state[i] <= -999999999) break; - else if (part->state[i] > 0) num_active++; - else num_lost++; - } - - part->_num_active_particles = num_active; - part->_num_lost_particles = num_lost; - } - - #else // not CPU_SERIAL_IMPLEM and not CPU_OMP_IMPLEM - - /*gpufun*/ - int64_t check_is_active(LocalParticle* part) { - return LocalParticle_get_state(part)>0; - }; - - #endif // 
CPU_OMP_IMPLEM - #endif // CPU_SERIAL_IMPLEM - - #undef CPU_SERIAL_IMPLEM //only_for_context cpu_serial - #undef CPU_OMP_IMPLEM //only_for_context cpu_openmp - - - ''' - - source = '\n\n'.join([src_typedef, src_adders, src_getters, - src_setters, src_scalers, src_exchange, - src_particles_to_local, src_local_to_particles, - src_angles, custom_source]) - - source += """ - #ifdef XTRACK_GLOBAL_XY_LIMIT - - /*gpufun*/ - void global_aperture_check(LocalParticle* part0) { - if (LocalParticle_check_track_flag( - part0, XS_FLAG_IGNORE_GLOBAL_APERTURE)){ - return; - } - - //start_per_particle_block (part0->part) - double const x = LocalParticle_get_x(part); - double const y = LocalParticle_get_y(part); - - int64_t const is_alive = (int64_t)( - (x >= -XTRACK_GLOBAL_XY_LIMIT) && - (x <= XTRACK_GLOBAL_XY_LIMIT) && - (y >= -XTRACK_GLOBAL_XY_LIMIT) && - (y <= XTRACK_GLOBAL_XY_LIMIT) ); - - // I assume that if I am in the function is because - if (!is_alive){ - LocalParticle_set_state(part, -1); - } - //end_per_particle_block - } - - #endif - - /*gpufun*/ - void LocalParticle_add_to_energy(LocalParticle* part, double delta_energy, int pz_only ){ - double ptau = LocalParticle_get_ptau(part); - double const p0c = LocalParticle_get_p0c(part); - double const charge_ratio = LocalParticle_get_charge_ratio(part); - double const chi = LocalParticle_get_chi(part); - double const mass_ratio = charge_ratio / chi; - - ptau += delta_energy/p0c / mass_ratio; - - double const old_rpp = LocalParticle_get_rpp(part); - - LocalParticle_update_ptau(part, ptau); - - if (!pz_only) { - double const new_rpp = LocalParticle_get_rpp(part); - double const f = old_rpp / new_rpp; - LocalParticle_scale_px(part, f); - LocalParticle_scale_py(part, f); - } - } - - - /*gpufun*/ - void LocalParticle_update_p0c(LocalParticle* part, double new_p0c_value){ - - double const mass0 = LocalParticle_get_mass0(part); - double const old_p0c = LocalParticle_get_p0c(part); - double const old_delta = LocalParticle_get_delta(part); - double const old_beta0 = LocalParticle_get_beta0(part); - - double const ppc = old_p0c * old_delta + old_p0c; - double const new_delta = (ppc - new_p0c_value)/new_p0c_value; - - double const new_energy0 = sqrt(new_p0c_value*new_p0c_value + mass0 * mass0); - double const new_beta0 = new_p0c_value / new_energy0; - double const new_gamma0 = new_energy0 / mass0; - - LocalParticle_set_p0c(part, new_p0c_value); - LocalParticle_set_gamma0(part, new_gamma0); - LocalParticle_set_beta0(part, new_beta0); - - LocalParticle_update_delta(part, new_delta); - - LocalParticle_scale_px(part, old_p0c/new_p0c_value); - LocalParticle_scale_py(part, old_p0c/new_p0c_value); - - LocalParticle_scale_zeta(part, new_beta0/old_beta0); - - } - - /*gpufun*/ - void LocalParticle_kill_particle(LocalParticle* part, int64_t kill_state) { - LocalParticle_set_x(part, 1e30); - LocalParticle_set_px(part, 1e30); - LocalParticle_set_y(part, 1e30); - LocalParticle_set_py(part, 1e30); - LocalParticle_set_zeta(part, 1e30); - LocalParticle_update_delta(part, -1); // zero energy - LocalParticle_set_state(part, kill_state); - } - """ - return source - @classmethod def part_energy_varnames(cls): return [vv for tt, vv in cls.part_energy_vars] diff --git a/xtrack/tracker.py b/xtrack/tracker.py index 27128d045..ba38bd178 100644 --- a/xtrack/tracker.py +++ b/xtrack/tracker.py @@ -47,7 +47,6 @@ def __init__( track_kernel=None, particles_monitor_class=None, extra_headers=(), - local_particle_src=None, _prebuilding_kernels=False, ): @@ -62,16 +61,12 @@ def __init__( raise 
ValueError("`enable_pipeline_hold` is not implemented in " "non-collective mode") - if local_particle_src is None: - local_particle_src = xt.Particles.gen_local_particle_api() - if not particles_monitor_class: particles_monitor_class = self._get_default_monitor_class() self.line = line self.particles_monitor_class = particles_monitor_class self.extra_headers = extra_headers - self.local_particle_src = local_particle_src self._enable_pipeline_hold = enable_pipeline_hold self.use_prebuilt_kernels = use_prebuilt_kernels self.track_flags = TrackFlags() @@ -684,9 +679,7 @@ def _build_kernel( kernel_descriptions=kernels, extra_headers=self._config_to_headers() + headers, extra_classes=kernel_element_classes + extra_classes, - apply_to_source=[ - partial(_handle_per_particle_blocks, - local_particle_src=self.local_particle_src)], + apply_to_source=[_handle_per_particle_blocks], specialize=True, compile=compile, save_source_as=f'{module_name}.c' if module_name else None, @@ -1128,8 +1121,6 @@ def _track_no_collective( self.track_flags.XS_FLAG_BACKTRACK = True return self._track_no_collective(**kwargs) - self.local_particle_src = particles.gen_local_particle_api() - if freeze_longitudinal: kwargs = locals().copy() kwargs.pop('self') diff --git a/xtrack/twiss.py b/xtrack/twiss.py index 87c00c68e..1cd68663e 100644 --- a/xtrack/twiss.py +++ b/xtrack/twiss.py @@ -2840,7 +2840,6 @@ def _build_auxiliary_tracker_with_extra_markers(tracker, at_s, marker_prefix, io_buffer=tracker.io_buffer, line=auxline, particles_monitor_class=None, - local_particle_src=tracker.local_particle_src ) auxtracker.line.config = tracker.line.config.copy() auxtracker.line._extra_config = tracker.line._extra_config.copy()
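The monitor hunk above replaces the old `//only_for_context cuda opencl` / `//only_for_context cpu_serial cpu_openmp` comment annotations with ordinary preprocessor guards, and the new local_particle_custom_api.h selects its check_is_active implementation the same way. The following is a minimal sketch of that dispatch pattern, not part of the diff; it assumes the build defines XO_CONTEXT_CPU on CPU targets plus exactly one of XO_CONTEXT_CPU_SERIAL or XO_CONTEXT_CPU_OPENMP, while GPU builds define neither and fall through to the #else branch.

    #include <stdint.h>

    /* Illustrative helper only: reports which implementation flavour the
       preprocessor guards would select for the current build. */
    static int64_t context_flavour_example(void)
    {
    #if defined(XO_CONTEXT_CPU_SERIAL)
        return 0;   /* single-threaded CPU: lost particles compacted in place */
    #elif defined(XO_CONTEXT_CPU_OPENMP)
        return 1;   /* OpenMP CPU: each thread works on its [ipart, endpart) slice */
    #else
        return 2;   /* GPU contexts: one thread per particle, only the state is checked */
    #endif
    }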
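The new multisetter.h writes each kernel with the VECTORIZE_OVER / END_VECTORIZE pair and GPUGLMEM-qualified pointers instead of the old `//vectorize_over` comment markers. A hypothetical extra kernel written in the same style is sketched below; the name add_values_at_offsets_float64 is illustrative only and is not registered anywhere in the package.

    /* Hypothetical kernel following the multisetter.h pattern: it reads the
       double stored at each byte offset, adds the corresponding input value
       and writes the result back. */
    GPUKERN
    void add_values_at_offsets_float64(
        MultiSetterData data,
        GPUGLMEM int8_t* buffer,
        GPUGLMEM double* input){

        int64_t num_offsets = MultiSetterData_len_offsets(data);

        VECTORIZE_OVER(ii, num_offsets);
            int64_t offs = MultiSetterData_get_offsets(data, ii);

            double current = *((GPUGLMEM double*)(buffer + offs));
            *((GPUGLMEM double*)(buffer + offs)) = current + input[ii];
        END_VECTORIZE;
    }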
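The adders, scalers and setters emitted by gen_local_particle_api() are each wrapped in an #ifndef FREEZE_VAR_<name> guard, so individual per-particle fields can be frozen at compile time. For one concrete field the generator output reduces to the code below (expanded here only to make the templates above concrete, using x as the example); compiling with -DFREEZE_VAR_x turns the writers into no-ops while the getter keeps working.

    GPUFUN
    void LocalParticle_set_x(LocalParticle* part, double value)
    {
    #ifndef FREEZE_VAR_x
        part->x[part->ipart] = value;
    #endif
    }

    GPUFUN
    void LocalParticle_add_to_x(LocalParticle* part, double value)
    {
    #ifndef FREEZE_VAR_x
        part->x[part->ipart] += value;
    #endif
    }

    GPUFUN
    double LocalParticle_get_x(LocalParticle* part)
    {
        return part->x[part->ipart];
    }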