diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 000000000..be006de9a
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,13 @@
+# Dependabot configuration: check weekly for updates to the GitHub Actions we use.
+# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot
+# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#package-ecosystem
+version: 2
+updates:
+  - package-ecosystem: github-actions
+    directory: /
+    groups:
+      github-actions:
+        patterns:
+          - "*"  # Match every action so all updates land in one grouped pull request
+    schedule:
+      interval: weekly
diff --git a/.github/workflows/conda-setup.yml b/.github/workflows/conda-setup.yml
index 4c878cbc9..3fbba8c4c 100644
--- a/.github/workflows/conda-setup.yml
+++ b/.github/workflows/conda-setup.yml
@@ -1,6 +1,6 @@
-name: Conda setup
+name: github CI
 
-on: [push, pull_request]
+on: [pull_request, push]
 
 jobs:
   test_conda:
@@ -9,9 +9,14 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        #os: ["ubuntu-latest", "macos-latest", "windows-latest"]
-        os: ["ubuntu-latest", "windows-latest"]
-        python-version: ["3.9", "3.11", "3.13"]
+        os: ["ubuntu-latest", "macos-latest", "windows-latest"]
+
+        # Our meson.build expects numpy-config to be installed, but it
+        # is not available for Python < 3.9.
+
+        # 2025/06/19: Python 3.14 causes an error with conda-incubator/setup-miniconda@v3
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+
     steps:
       - uses: actions/checkout@v4
       - uses: conda-incubator/setup-miniconda@v3
@@ -25,21 +30,32 @@ jobs:
         run: |
           conda env create --name anuga_env --file environments/environment_${{matrix.python-version}}.yml
 
-      - name: Install mingw compilers on Windows
+      - name: Install gcc compilers on Windows
         if: runner.os == 'Windows'
         shell: bash -el {0}
         run: |
-          conda install -c conda-forge -n anuga_env libpython m2w64-toolchain
+           conda install -c conda-forge -n anuga_env gcc_win-64 gxx_win-64
+           # As of 2025/06/19, a strange combination of mpi4py, the new compilers and pytest
+           # causes a segmentation fault on Windows, so we uninstall mpi4py.
+           # This is a temporary workaround until the issue is resolved.
+           conda uninstall -n anuga_env mpi4py
 
-      - name: Install package
+      - name: Install clang with openmp compiler on macOS
+        if: runner.os == 'macOS'
+        shell: bash -el {0}
+        run: |
+           conda install -c conda-forge -n anuga_env cxx-compiler llvm-openmp
+           
+      - name: Install anuga package
         shell: bash -el {0}
         run: |
           conda activate anuga_env
-          pip install --no-build-isolation .
+          pip install --no-build-isolation -v .
 
       - name: Test package
         shell: bash -el {0}
         run: |
           conda activate anuga_env
           cd ..
-          pytest -p no:faulthandler -q --pyargs anuga
\ No newline at end of file
+          export OMP_NUM_THREADS=1
+          pytest -p no:faulthandler -rs --pyargs anuga
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 000000000..f1964e63a
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,40 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python package
+
+on:
+  push:
+    branches: [ "develop-meson" ]
+  release:
+    types: [ published ]  # release events are filtered by activity type; a branches filter is not supported here
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9", "3.10", "3.11"]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    # - name: Lint with flake8
+    #   run: |
+    #     # stop the build if there are Python syntax errors or undefined names
+    #     flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+    #     # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+    #     flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest
diff --git a/.gitignore b/.gitignore
index 27b231ae6..ff8a5b5e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,8 @@
 [#]*#
 *~
 *$
+*.csv
+*.pstat
 *.bak
 .idea/*
 *.kdev4
@@ -15,6 +17,7 @@
 .settings/
 .*.sw[nop]
 .sw[nop]
+*.pstat
 *.c
 *.cpp
 *.tmp
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 01a8e0dd7..010a7a6eb 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -15,6 +15,11 @@ build:
 sphinx:
   configuration: docs/source/conf.py
 
+# Optionally build the docs in additional formats (PDF enabled, ePub disabled)
+formats:
+  - pdf
+#  - epub
+
 # We recommend specifying your dependencies to enable reproducible builds:
 # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
 python:
diff --git a/README.rst b/README.rst
index 0a21fdafd..8c9d57744 100644
--- a/README.rst
+++ b/README.rst
@@ -1,13 +1,19 @@
 
-.. image:: https://badges.gitter.im/gitterHQ/gitter.png
+.. |badge0| image:: https://badges.gitter.im/gitterHQ/gitter.png
     :target: https://app.gitter.im/#/room/#anuga-community:gitter.im
     :alt: Community Chat
 
-.. image:: https://app.travis-ci.com/anuga-community/anuga_core.svg?branch=develop_hackathon
+..
+    |badge1| image:: https://img.shields.io/travis/com/anuga-community/anuga_core/develop_hackathon.svg
     :target: https://app.travis-ci.com/anuga-community/anuga_core
-    :alt: travis ci status
+    :alt: Travis CI Status
+
+.. |badge1| image:: https://github.com/anuga-community/anuga_core/actions/workflows/conda-setup.yml/badge.svg
+    :target: https://github.com/anuga-community/anuga_core/actions/workflows/conda-setup.yml
+    :alt: GitHub Actions Status
    
-.. image:: https://ci.appveyor.com/api/projects/status/x5airjv7eq2u805w/branch/develop_hackathon?svg=true
+.. 
+    |badge2| image:: https://ci.appveyor.com/api/projects/status/x5airjv7eq2u805w/branch/main?svg=true
     :target: https://ci.appveyor.com/project/stoiver/anuga-core-nwgr0
     :alt: appveyor status
 
@@ -31,7 +37,7 @@
     :target: https://anuga.readthedocs.io/en/latest/?badge=latest
     :alt: Documentation Status
 
-|badge0| |badge1| |badge2| |badge3| |badge3| |badge4| |badge5| |badge6| |badge7|
+|badge0| |badge1| |badge3| |badge4| |badge5| |badge6| |badge7|
 
 
 
@@ -103,7 +109,7 @@ ANUGA documentation is available via "read the docs" at
     https://anuga.readthedocs.io 
 
 Also you can obtain help via the old
-`user_manual <https://github.com/anuga-community/anuga_core/raw/main/doc/anuga_user_manual.pdf>`_
+`user_manual <https://dx.doi.org/10.13140/RG.2.2.17267.81446>`_
 
 Also helpful information is available online at
 
diff --git a/anuga/abstract_2d_finite_volumes/generic_domain.py b/anuga/abstract_2d_finite_volumes/generic_domain.py
index 30e414817..d0858e364 100644
--- a/anuga/abstract_2d_finite_volumes/generic_domain.py
+++ b/anuga/abstract_2d_finite_volumes/generic_domain.py
@@ -241,13 +241,10 @@ def __init__(self,
 
         #-------------------------------
         # Set multiprocessor mode 
-        # 0. orig (original with edge optim)
-        # 1. simd (used for multiprocessor)
-        # 2. openmp (in development)
-        # 3. openacc (in development)
-        # 4. cuda (in development)
+        # 1. openmp (in development)
+        # 2. cuda (in development)
         #-------------------------------    
-        self.set_multiprocessor_mode(0)
+        self.set_multiprocessor_mode(1)
 
         self.processor = processor
         self.numproc = numproc
@@ -749,18 +746,23 @@ def set_multiprocessor_mode(self, multiprocessor_mode= 0):
         """
         Set multiprocessor mode 
         
-        0. original
-        1. simd (used for multiprocessor)
-        2. openmp (in development)
-        3. openacc (in development)
-        4. cuda (in development)
+        1. openmp (in development)
+        2. cuda (in development)
         """
 
-        if multiprocessor_mode in [0,1,2,3,4]:
+        if multiprocessor_mode in [1,2]:
             self.multiprocessor_mode = multiprocessor_mode
         else:
             raise Exception('multiprocessor mode {multiprocessor_mode} not supported')
 
+    def get_multiprocessor_mode(self):
+        """
+        Get multiprocessor mode 
+        
+        1. openmp (in development)
+        2. cuda (in development)
+        """
+        return self.multiprocessor_mode 
             
     def set_using_centroid_averaging(self, flag=True):
         """Set flag to use centroid averaging in output
@@ -2164,10 +2166,7 @@ def evolve_one_rk3_step(self, yieldstep, finaltime):
         # self.saxpy_conserved_quantities(2.0/3.0, 1.0/3.0)
 
         # So do this instead!
-        self.saxpy_conserved_quantities(2.0, 1.0)
-        for name in self.conserved_quantities:
-            Q = self.quantities[name]
-            Q.centroid_values[:] = Q.centroid_values / 3.0
+        self.saxpy_conserved_quantities(2.0, 1.0, 3.0)
 
         # Update special conditions
         # self.update_special_conditions()
@@ -2191,12 +2190,15 @@ def backup_conserved_quantities(self):
             Q = self.quantities[name]
             Q.backup_centroid_values()
 
-    def saxpy_conserved_quantities(self, a, b):
+    def saxpy_conserved_quantities(self, a, b, c=None):
 
-        # Backup conserved_quantities centroid values
+        # saxpy centroid values with their backup values; if c is given, divide the result by c
         for name in self.conserved_quantities:
             Q = self.quantities[name]
             Q.saxpy_centroid_values(a, b)
+            if c is not None:
+                Q.centroid_values[:] = Q.centroid_values / c
+
 
     def conserved_values_to_evolved_values(self, q_cons, q_evol):
         """Needs to be overridden by Domain subclass
@@ -2517,42 +2519,6 @@ def centroid_norm(self, quantity, normfunc):
 
         return normfunc(self.quantities[quantity].centroid_values)
 
-    def apply_protection_against_isolated_degenerate_timesteps(self):
-
-        # FIXME (Steve): This should be in shallow_water as it assumes x and y
-        # momentum
-        if self.protect_against_isolated_degenerate_timesteps is False:
-            return
-
-        # FIXME (Ole): Make this configurable
-        if num.max(self.max_speed) < 10.0:
-            return
-
-        # Setup 10 bins for speed histogram
-        from anuga.utilities.numerical_tools import histogram, create_bins
-
-        bins = create_bins(self.max_speed, 10)
-        hist = histogram(self.max_speed, bins)
-
-        # Look for characteristic signature
-        if len(hist) > 1 and hist[-1] > 0 and \
-           hist[4] == hist[5] == hist[6] == hist[7] == hist[8] == 0:
-            # Danger of isolated degenerate triangles
-
-            # Find triangles in last bin
-            # FIXME - speed up using numeric package
-            d = 0
-            for i in range(self.number_of_triangles):
-                if self.max_speed[i] > bins[-1]:
-                    msg = 'Time=%f: Ignoring isolated high ' % self.get_time()
-                    msg += 'speed triangle '
-                    msg += '#%d of %d with max speed = %f' \
-                        % (i, self.number_of_triangles, self.max_speed[i])
-
-                    self.get_quantity('xmomentum').set_values(0.0, indices=[i])
-                    self.get_quantity('ymomentum').set_values(0.0, indices=[i])
-                    self.max_speed[i] = 0.0
-                    d += 1
 
 
 if __name__ == "__main__":
diff --git a/anuga/abstract_2d_finite_volumes/meson.build b/anuga/abstract_2d_finite_volumes/meson.build
index 34f77d812..5fdbcc869 100644
--- a/anuga/abstract_2d_finite_volumes/meson.build
+++ b/anuga/abstract_2d_finite_volumes/meson.build
@@ -2,22 +2,6 @@
 inc_dir = include_directories('../utilities', incdir_numpy)
 
 
-openmp = dependency('openmp', required: false)
-if openmp.found()
-  if host_machine.system() == 'windows'
-    # On Windows, the mingw compiler does not support OpenMP ATOMIC operations
-    openmp_deps = dependencies
-  else
-    openmp_deps = dependencies + [openmp]
-  endif
-
-else
-  openmp_deps = dependencies
-endif
-
-# FIXME SR: Need to setup dependencies for openacc code
-openacc_deps = dependencies
-
 # Compile the Cython-generated C code and additional C code
 py3.extension_module('mesh_factory_ext',
   sources: ['mesh_factory_ext.pyx'],
@@ -52,35 +36,22 @@ py3.extension_module('pmesh2domain_ext',
   install: true,
 )
 
-py3.extension_module('quantity_ext',
-  sources: ['quantity_ext.pyx'],
-  include_directories: inc_dir,
-  dependencies: dependencies,
-  subdir: 'anuga/abstract_2d_finite_volumes',
-  install: true,
-)
-
 
 py3.extension_module('quantity_openmp_ext',
   sources: ['quantity_openmp_ext.pyx'],
+  c_args : openmp_c_args,
   include_directories: inc_dir,
   dependencies: openmp_deps,
   subdir: 'anuga/abstract_2d_finite_volumes',
   install: true,
 )
 
-py3.extension_module('quantity_openacc_ext',
-  sources: ['quantity_openacc_ext.pyx'],
-  include_directories: inc_dir,
-  dependencies: openacc_deps,
-  subdir: 'anuga/abstract_2d_finite_volumes',
-  install: true,
-)
 
 py3.extension_module('quantity_cuda_ext',
   sources: ['quantity_cuda_ext.pyx'],
+  c_args : openmp_c_args,
   include_directories: inc_dir,
-  dependencies: dependencies,
+  dependencies: openmp_deps,
   subdir: 'anuga/abstract_2d_finite_volumes',
   install: true,
 )
diff --git a/anuga/abstract_2d_finite_volumes/neighbour_mesh.py b/anuga/abstract_2d_finite_volumes/neighbour_mesh.py
index 57535e047..b0fbb0801 100644
--- a/anuga/abstract_2d_finite_volumes/neighbour_mesh.py
+++ b/anuga/abstract_2d_finite_volumes/neighbour_mesh.py
@@ -1005,6 +1005,7 @@ def statistics(self, nbins=10):
         hist = histogram(areas, bins)
 
         str =  '------------------------------------------------\n'
+        str += ' Jorge openmp version \n'
         str += 'Mesh statistics:\n'
         str += '  Number of triangles = %d\n' %len(self)
         str += '  Extent [m]:\n'
diff --git a/anuga/abstract_2d_finite_volumes/pmesh2domain.c b/anuga/abstract_2d_finite_volumes/pmesh2domain.c
index 45c79e8ba..b2aa0ed07 100644
--- a/anuga/abstract_2d_finite_volumes/pmesh2domain.c
+++ b/anuga/abstract_2d_finite_volumes/pmesh2domain.c
@@ -1,11 +1,13 @@
 #include <stdio.h>   /* gets */
 #include <stdlib.h>  /* atoi, malloc */
 #include <string.h>  /* strcpy */
+#include <inttypes.h> /* PRId64 */
 #include <math.h>
 
 //Shared code snippets
 
 #include "uthash.h"     /* in utilities */
+#include "anuga_typedefs.h" /* in utilities */
 
 //==============================================================================
 // hashtable code from uthash. Look at copyright info in "uthash.h in the
@@ -13,20 +15,20 @@
 //==============================================================================
 
 typedef struct {
-    int64_t i;
-    int64_t j;
+    anuga_int i;
+    anuga_int j;
 } segment_key_t;
 
 typedef struct {
     segment_key_t key; /* key of form i , j */
-    int64_t vol_id; /* id of vol containing this segement */
-    int64_t edge_id; /* edge_id of segement in this vol */
+    anuga_int vol_id; /* id of vol containing this segment */
+    anuga_int edge_id; /* edge_id of segment in this vol */
     UT_hash_handle hh; /* makes this structure hashable */
 } segment_t;
 
 segment_t *segment_table = NULL;
 
-void add_segment(segment_key_t key, int64_t vol_id, int64_t edge_id) {
+void add_segment(segment_key_t key, anuga_int vol_id, anuga_int edge_id) {
     segment_t *s;
 
     s = (segment_t*) malloc(sizeof (segment_t));
@@ -63,16 +65,16 @@ void print_segments(void) {
     segment_t *s;
 
     for (s = segment_table; s != NULL; s = (segment_t*) (s->hh.next)) {
-        printf("segment key i %ld j %ld vol_id %ld  edge_id %ld\n",
+        printf("segment key i %" PRId64 " j %" PRId64 " vol_id %" PRId64 "  edge_id %" PRId64 "\n",
                 s->key.i, s->key.j, s->vol_id, s->edge_id);
     }
 }
 
-int64_t vol_id_sort(segment_t *a, segment_t *b) {
+anuga_int vol_id_sort(segment_t *a, segment_t *b) {
     return (a->vol_id - b->vol_id);
 }
 
-int64_t key_sort(segment_t *a, segment_t *b) {
+anuga_int key_sort(segment_t *a, segment_t *b) {
     return (a->key.i - b->key.i);
 }
 
diff --git a/anuga/abstract_2d_finite_volumes/quantity.c b/anuga/abstract_2d_finite_volumes/quantity.c
index 0e9873206..4f87ab265 100644
--- a/anuga/abstract_2d_finite_volumes/quantity.c
+++ b/anuga/abstract_2d_finite_volumes/quantity.c
@@ -15,21 +15,20 @@
 //Shared code snippets
 #include "util_ext.h"
 
-typedef int64_t keyint;
 
 //-------------------------------------------
 // Low level routines (called from wrappers)
 //------------------------------------------
 
-int64_t _compute_gradients(keyint N,
+anuga_int _compute_gradients(anuga_int N,
 			double* centroids,
 			double* centroid_values,
-			int64_t* number_of_boundaries,
-			int64_t* surrogate_neighbours,
+			anuga_int* number_of_boundaries,
+			anuga_int* surrogate_neighbours,
 			double* a,
 			double* b){
 
-  keyint i, k, k0, k1, k2, index3;
+  anuga_int i, k, k0, k1, k2, index3;
   double x0, x1, x2, y0, y1, y2, q0, q1, q2; //, det;
 
 
@@ -93,19 +92,18 @@ int64_t _compute_gradients(keyint N,
 }
 
 
-int64_t _compute_local_gradients(keyint N,
+anuga_int _compute_local_gradients(anuga_int N,
 			       double* vertex_coordinates,
 			       double* vertex_values,
 			       double* a,
 			       double* b) {
 
-  keyint k, k2, k3, k6;
+  anuga_int k, k3, k6;
   double x0, y0, x1, y1, x2, y2, v0, v1, v2;
 
   for (k=0; k<N; k++) {
     k6 = 6*k;
     k3 = 3*k;
-    k2 = 2*k;
 
     // vertex coordinates
     // x0, y0, x1, y1, x2, y2 = X[k,:]
@@ -128,7 +126,7 @@ int64_t _compute_local_gradients(keyint N,
     return 0;
 }
 
-int64_t _extrapolate_from_gradient(keyint N,
+anuga_int _extrapolate_from_gradient(anuga_int N,
 			       double* centroids,
 			       double* centroid_values,
 			       double* vertex_coordinates,
@@ -137,7 +135,7 @@ int64_t _extrapolate_from_gradient(keyint N,
 			       double* a,
 			       double* b) {
 
-  keyint k, k2, k3, k6;
+  anuga_int k, k2, k3, k6;
   double x, y, x0, y0, x1, y1, x2, y2;
 
   for (k=0; k<N; k++){
@@ -172,9 +170,9 @@ int64_t _extrapolate_from_gradient(keyint N,
 }
 
 
-int64_t _extrapolate_and_limit_from_gradient(keyint N,double beta,
+anuga_int _extrapolate_and_limit_from_gradient(anuga_int N,double beta,
 					 double* centroids,
-					 int64_t*   neighbours,
+					 anuga_int*   neighbours,
 					 double* centroid_values,
 					 double* vertex_coordinates,
 					 double* vertex_values,
@@ -183,9 +181,9 @@ int64_t _extrapolate_and_limit_from_gradient(keyint N,double beta,
 					 double* x_gradient,
 					 double* y_gradient) {
 
-  keyint i, k, k2, k3, k6;
+  anuga_int i, k, k2, k3, k6;
   double x, y, x0, y0, x1, y1, x2, y2;
-  keyint n;
+  anuga_int n;
   double qmin, qmax, qc;
   double qn[3];
   double dq, dqa[3], r;
@@ -296,24 +294,22 @@ int64_t _extrapolate_and_limit_from_gradient(keyint N,double beta,
 
 
 
-int64_t _limit_vertices_by_all_neighbours(keyint N, double beta,
+anuga_int _limit_vertices_by_all_neighbours(anuga_int N, double beta,
 				      double* centroid_values,
 				      double* vertex_values,
 				      double* edge_values,
-				      int64_t*   neighbours,
+				      anuga_int*   neighbours,
 				      double* x_gradient,
 				      double* y_gradient) {
 
 
-  keyint i, k, k2, k3, k6;
-  keyint n;
+  anuga_int i, k, k3;
+  anuga_int n;
   double qmin, qmax, qn, qc;
   double dq, dqa[3], phi, r;
 
   for (k=0; k<N; k++){
-    k6 = 6*k;
     k3 = 3*k;
-    k2 = 2*k;
 
     qc = centroid_values[k];
     qmin = qc;
@@ -363,23 +359,21 @@ int64_t _limit_vertices_by_all_neighbours(keyint N, double beta,
 
 
 
-int64_t _limit_edges_by_all_neighbours(keyint N, double beta,
+anuga_int _limit_edges_by_all_neighbours(anuga_int N, double beta,
 				   double* centroid_values,
 				   double* vertex_values,
 				   double* edge_values,
-				   int64_t*   neighbours,
+				   anuga_int*   neighbours,
 				   double* x_gradient,
 				   double* y_gradient) {
 
-  keyint i, k, k2, k3, k6;
-  keyint n;
-  double qmin, qmax, qn, qc, sign;
+  anuga_int i, k, k3;
+  anuga_int n;
+  double qmin, qmax, qn, qc;
   double dq, dqa[3], phi, r;
 
   for (k=0; k<N; k++){
-    k6 = 6*k;
     k3 = 3*k;
-    k2 = 2*k;
 
     qc = centroid_values[k];
     qmin = qc;
@@ -395,12 +389,6 @@ int64_t _limit_edges_by_all_neighbours(keyint N, double beta,
       }
     }
 
-    sign = 0.0;
-    if (qmin > 0.0) {
-      sign = 1.0;
-    } else if (qmax < 0) {
-      sign = -1.0;
-    }
 
     phi = 1.0;
     for (i=0; i<3; i++) {
@@ -449,21 +437,19 @@ int64_t _limit_edges_by_all_neighbours(keyint N, double beta,
 }
 
 
-int64_t _limit_edges_by_neighbour(keyint N, double beta,
+anuga_int _limit_edges_by_neighbour(anuga_int N, double beta,
 		     double* centroid_values,
 		     double* vertex_values,
 		     double* edge_values,
-		     int64_t*   neighbours) {
+		     anuga_int*   neighbours) {
 
-	keyint i, k, k2, k3, k6;
-	keyint n;
+	anuga_int i, k, k3;
+	anuga_int n;
 	double qmin, qmax, qn, qc;
 	double dq, dqa[3], phi, r;
 
 	for (k=0; k<N; k++){
-		k6 = 6*k;
 		k3 = 3*k;
-		k2 = 2*k;
 
 		qc = centroid_values[k];
 		phi = 1.0;
@@ -504,23 +490,21 @@ int64_t _limit_edges_by_neighbour(keyint N, double beta,
 }
 
 
-int64_t _limit_gradient_by_neighbour(keyint N, double beta,
+anuga_int _limit_gradient_by_neighbour(anuga_int N, double beta,
 		     double* centroid_values,
 		     double* vertex_values,
 		     double* edge_values,
 		     double* x_gradient,
 		     double* y_gradient,
-		     int64_t*   neighbours) {
+		     anuga_int*   neighbours) {
 
-	keyint i, k, k2, k3, k6;
-	keyint n;
+	anuga_int i, k, k3;
+	anuga_int n;
 	double qmin, qmax, qn, qc;
 	double dq, dqa[3], phi, r;
 
 	for (k=0; k<N; k++){
-		k6 = 6*k;
 		k3 = 3*k;
-		k2 = 2*k;
 
 		qc = centroid_values[k];
 		phi = 1.0;
@@ -560,21 +544,19 @@ int64_t _limit_gradient_by_neighbour(keyint N, double beta,
 	return 0;
 }
 
-int64_t _bound_vertices_below_by_constant(keyint N, double bound,
+anuga_int _bound_vertices_below_by_constant(anuga_int N, double bound,
 		     double* centroid_values,
 		     double* vertex_values,
 		     double* edge_values,
 		     double* x_gradient,
 		     double* y_gradient) {
 
-	keyint i, k, k2, k3, k6;
+	anuga_int i, k, k3;
 	double qmin, qc;
 	double dq, dqa[3], phi, r;
 
 	for (k=0; k<N; k++){
-		k6 = 6*k;
 		k3 = 3*k;
-		k2 = 2*k;
 
 		qc = centroid_values[k];
 		qmin = bound;
@@ -611,7 +593,7 @@ int64_t _bound_vertices_below_by_constant(keyint N, double bound,
 	return 0;
 }
 
-int64_t _bound_vertices_below_by_quantity(keyint N,
+anuga_int _bound_vertices_below_by_quantity(anuga_int N,
 				      double* bound_vertex_values,
 				      double* centroid_values,
 				      double* vertex_values,
@@ -619,14 +601,12 @@ int64_t _bound_vertices_below_by_quantity(keyint N,
 				      double* x_gradient,
 				      double* y_gradient) {
 
-	keyint i, k, k2, k3, k6;
+	anuga_int i, k, k3;
 	double qmin, qc;
 	double dq, dqa[3], phi, r;
 
 	for (k=0; k<N; k++){
-		k6 = 6*k;
 		k3 = 3*k;
-		k2 = 2*k;
 
 		qc = centroid_values[k];
 
@@ -662,12 +642,12 @@ int64_t _bound_vertices_below_by_quantity(keyint N,
 	return 0;
 }
 
-int64_t _interpolate(keyint N,
+anuga_int _interpolate(anuga_int N,
 		 double* vertex_values,
 		 double* edge_values,
                  double* centroid_values) {
 
-	keyint k, k3;
+	anuga_int k, k3;
 	double q0, q1, q2;
 
 
@@ -687,11 +667,11 @@ int64_t _interpolate(keyint N,
 	return 0;
 }
 
-int64_t _interpolate_from_vertices_to_edges(keyint N,
+anuga_int _interpolate_from_vertices_to_edges(anuga_int N,
 					double* vertex_values,
 					double* edge_values) {
 
-	keyint k, k3;
+	anuga_int k, k3;
 	double q0, q1, q2;
 
 
@@ -710,11 +690,11 @@ int64_t _interpolate_from_vertices_to_edges(keyint N,
 }
 
 
-int64_t _interpolate_from_edges_to_vertices(keyint N,
+anuga_int _interpolate_from_edges_to_vertices(anuga_int N,
 					double* vertex_values,
 					double* edge_values) {
 
-	keyint k, k3;
+	anuga_int k, k3;
 	double e0, e1, e2;
 
 
@@ -732,13 +712,13 @@ int64_t _interpolate_from_edges_to_vertices(keyint N,
 	return 0;
 }
 
-int64_t _backup_centroid_values(keyint N,
+anuga_int _backup_centroid_values(anuga_int N,
 			    double* centroid_values,
 			    double* centroid_backup_values) {
     // Backup centroid values
 
 
-    keyint k;
+    anuga_int k;
 
     for (k=0; k<N; k++) {
 	centroid_backup_values[k] = centroid_values[k];
@@ -749,7 +729,7 @@ int64_t _backup_centroid_values(keyint N,
 }
 
 
-int64_t _saxpy_centroid_values(keyint N,
+anuga_int _saxpy_centroid_values(anuga_int N,
 			   double a,
 			   double b,
 			   double* centroid_values,
@@ -757,7 +737,7 @@ int64_t _saxpy_centroid_values(keyint N,
     // Saxby centroid values
 
 
-    keyint k;
+    anuga_int k;
 
 
     for (k=0; k<N; k++) {
@@ -769,7 +749,7 @@ int64_t _saxpy_centroid_values(keyint N,
 }
 
 
-int64_t _update(keyint N,
+anuga_int _update(anuga_int N,
 	    double timestep,
 	    double* centroid_values,
 	    double* explicit_update,
@@ -778,7 +758,7 @@ int64_t _update(keyint N,
 	// explicit_update and semi_implicit_update as well as given timestep
 
 
-	keyint k;
+	anuga_int k;
 	double denominator, x;
 
 
@@ -820,16 +800,16 @@ int64_t _update(keyint N,
 }
 
 
-int64_t _average_vertex_values(keyint N,
-			   int64_t* vertex_value_indices,
-			   int64_t* number_of_triangles_per_node,
+anuga_int _average_vertex_values(anuga_int N,
+			   anuga_int* vertex_value_indices,
+			   anuga_int* number_of_triangles_per_node,
 			   double* vertex_values,
 			   double* A) {
   // Average vertex values to obtain one value per node
 
-  keyint i, index;
-  keyint k = 0; //Track triangles touching each node
-  keyint current_node = 0;
+  anuga_int i, index;
+  anuga_int k = 0; //Track triangles touching each node
+  anuga_int current_node = 0;
   double total = 0.0;
 
   for (i=0; i<N; i++) {
@@ -869,17 +849,17 @@ int64_t _average_vertex_values(keyint N,
   return 0;
 }
 
-int64_t _average_centroid_values(keyint N,
-			   int64_t* vertex_value_indices,
-			   int64_t* number_of_triangles_per_node,
+anuga_int _average_centroid_values(anuga_int N,
+			   anuga_int* vertex_value_indices,
+			   anuga_int* number_of_triangles_per_node,
 			   double* centroid_values,
 			   double* A) {
   // Average centroid values to obtain one value per node
 
-  keyint i, index;
-  keyint volume_id;
-  keyint k = 0; //Track triangles touching each node
-  keyint current_node = 0;
+  anuga_int i, index;
+  anuga_int volume_id;
+  anuga_int k = 0; //Track triangles touching each node
+  anuga_int current_node = 0;
   double total = 0.0;
 
   for (i=0; i<N; i++) {
@@ -919,15 +899,15 @@ int64_t _average_centroid_values(keyint N,
 // from a list of vertices and values at those vertices. Called in
 // quantity.py by _set_vertex_values.
 // Naming is a little confusing - but sticking with convention.
-int64_t _set_vertex_values_c(keyint num_verts,
-                        int64_t * vertices,
-                        int64_t * node_index,
-                        int64_t * number_of_triangles_per_node,
-                        int64_t * vertex_value_indices,
+anuga_int _set_vertex_values_c(anuga_int num_verts,
+                        anuga_int * vertices,
+                        anuga_int * node_index,
+                        anuga_int * number_of_triangles_per_node,
+                        anuga_int * vertex_value_indices,
                         double * vertex_values,
                         double * A
                         ){
-  keyint i,j,num_triangles,u_vert_id,vert_v_index;
+  anuga_int i,j,num_triangles,u_vert_id,vert_v_index;
 
   for(i=0;i<num_verts;i++){
 
@@ -946,16 +926,16 @@ int64_t _set_vertex_values_c(keyint num_verts,
 
 }
 
-int64_t _min_and_max_centroid_values(keyint N,
+anuga_int _min_and_max_centroid_values(anuga_int N,
                                  double * qc,
                                  double * qv,
-                                 int64_t * neighbours,
+                                 anuga_int * neighbours,
                                  double * qmin,
                                  double * qmax){
   
   // Find min and max of this and neighbour's centroid values
 
-  keyint k, i, n, k3;
+  anuga_int k, i, n, k3;
   double qn;
 
   for (k=0; k<N; k++) {
diff --git a/anuga/abstract_2d_finite_volumes/quantity.py b/anuga/abstract_2d_finite_volumes/quantity.py
index 588373583..d453e37d9 100644
--- a/anuga/abstract_2d_finite_volumes/quantity.py
+++ b/anuga/abstract_2d_finite_volumes/quantity.py
@@ -686,20 +686,20 @@ def interpolate(self):
         """Compute interpolated values at edges and centroid
         Pre-condition: vertex_values have been set
         """
-        from .quantity_ext import interpolate
+        from .quantity_openmp_ext import interpolate
         interpolate(self)
 
 
     def interpolate_from_vertices_to_edges(self):
         # Call correct module function (either from this module or C-extension)
 
-        from .quantity_ext import interpolate_from_vertices_to_edges
+        from .quantity_openmp_ext import interpolate_from_vertices_to_edges
         interpolate_from_vertices_to_edges(self)
 
     def interpolate_from_edges_to_vertices(self):
         # Call correct module function (either from this module or C-extension)
 
-        from .quantity_ext import interpolate_from_edges_to_vertices
+        from .quantity_openmp_ext import interpolate_from_edges_to_vertices
         interpolate_from_edges_to_vertices(self)
 
     #---------------------------------------------
@@ -2300,9 +2300,9 @@ def update(self, timestep):
             from .quantity_openmp_ext import update
         if self.domain.multiprocessor_mode == 4:
             # FIXME SR: Change this when gpu version is available
-            from .quantity_ext import update
+            from .quantity_openmp_ext import update
         else:
-            from .quantity_ext import update
+            from .quantity_openmp_ext import update
         
         return update(self, timestep)
 
@@ -2391,7 +2391,7 @@ def __init__(self, domain, vertex_values=None):
 ######
 # Prepare the C extensions.
 ######
-from .quantity_ext import \
+from .quantity_openmp_ext import \
          average_vertex_values,\
          average_centroid_values,\
          backup_centroid_values,\
diff --git a/anuga/abstract_2d_finite_volumes/quantity_cuda.c b/anuga/abstract_2d_finite_volumes/quantity_cuda.c
index 9acb067bd..47db43918 100644
--- a/anuga/abstract_2d_finite_volumes/quantity_cuda.c
+++ b/anuga/abstract_2d_finite_volumes/quantity_cuda.c
@@ -16,21 +16,20 @@
 //Shared code snippets
 #include "util_ext.h"
 
-typedef int64_t keyint;
 
 //-------------------------------------------
 // Low level routines (called from wrappers)
 //------------------------------------------
 
-int64_t _compute_gradients(keyint N,
+anuga_int _compute_gradients(anuga_int N,
 			double* centroids,
 			double* centroid_values,
-			int64_t* number_of_boundaries,
-			int64_t* surrogate_neighbours,
+			anuga_int* number_of_boundaries,
+			anuga_int* surrogate_neighbours,
 			double* a,
 			double* b){
 
-  keyint i, k, k0, k1, k2, index3;
+  anuga_int i, k, k0, k1, k2, index3;
   double x0, x1, x2, y0, y1, y2, q0, q1, q2; //, det;
 
 
@@ -94,13 +93,13 @@ int64_t _compute_gradients(keyint N,
 }
 
 
-int64_t _compute_local_gradients(keyint N,
+anuga_int _compute_local_gradients(anuga_int N,
 			       double* vertex_coordinates,
 			       double* vertex_values,
 			       double* a,
 			       double* b) {
 
-  keyint k, k2, k3, k6;
+  anuga_int k, k2, k3, k6;
   double x0, y0, x1, y1, x2, y2, v0, v1, v2;
 
   for (k=0; k<N; k++) {
@@ -129,7 +128,7 @@ int64_t _compute_local_gradients(keyint N,
     return 0;
 }
 
-int64_t _extrapolate_from_gradient(keyint N,
+anuga_int _extrapolate_from_gradient(anuga_int N,
 			       double* centroids,
 			       double* centroid_values,
 			       double* vertex_coordinates,
@@ -138,7 +137,7 @@ int64_t _extrapolate_from_gradient(keyint N,
 			       double* a,
 			       double* b) {
 
-  keyint k, k2, k3, k6;
+  anuga_int k, k2, k3, k6;
   double x, y, x0, y0, x1, y1, x2, y2;
 
   for (k=0; k<N; k++){
@@ -173,9 +172,9 @@ int64_t _extrapolate_from_gradient(keyint N,
 }
 
 
-int64_t _extrapolate_and_limit_from_gradient(keyint N,double beta,
+anuga_int _extrapolate_and_limit_from_gradient(anuga_int N,double beta,
 					 double* centroids,
-					 int64_t*   neighbours,
+					 anuga_int*   neighbours,
 					 double* centroid_values,
 					 double* vertex_coordinates,
 					 double* vertex_values,
@@ -184,9 +183,9 @@ int64_t _extrapolate_and_limit_from_gradient(keyint N,double beta,
 					 double* x_gradient,
 					 double* y_gradient) {
 
-  keyint i, k, k2, k3, k6;
+  anuga_int i, k, k2, k3, k6;
   double x, y, x0, y0, x1, y1, x2, y2;
-  keyint n;
+  anuga_int n;
   double qmin, qmax, qc;
   double qn[3];
   double dq, dqa[3], r;
@@ -297,17 +296,17 @@ int64_t _extrapolate_and_limit_from_gradient(keyint N,double beta,
 
 
 
-int64_t _limit_vertices_by_all_neighbours(keyint N, double beta,
+anuga_int _limit_vertices_by_all_neighbours(anuga_int N, double beta,
 				      double* centroid_values,
 				      double* vertex_values,
 				      double* edge_values,
-				      int64_t*   neighbours,
+				      anuga_int*   neighbours,
 				      double* x_gradient,
 				      double* y_gradient) {
 
 
-  keyint i, k, k2, k3, k6;
-  keyint n;
+  anuga_int i, k, k2, k3, k6;
+  anuga_int n;
   double qmin, qmax, qn, qc;
   double dq, dqa[3], phi, r;
 
@@ -364,16 +363,16 @@ int64_t _limit_vertices_by_all_neighbours(keyint N, double beta,
 
 
 
-int64_t _limit_edges_by_all_neighbours(keyint N, double beta,
+anuga_int _limit_edges_by_all_neighbours(anuga_int N, double beta,
 				   double* centroid_values,
 				   double* vertex_values,
 				   double* edge_values,
-				   int64_t*   neighbours,
+				   anuga_int*   neighbours,
 				   double* x_gradient,
 				   double* y_gradient) {
 
-  keyint i, k, k2, k3, k6;
-  keyint n;
+  anuga_int i, k, k2, k3, k6;
+  anuga_int n;
   double qmin, qmax, qn, qc, sign;
   double dq, dqa[3], phi, r;
 
@@ -450,14 +449,14 @@ int64_t _limit_edges_by_all_neighbours(keyint N, double beta,
 }
 
 
-int64_t _limit_edges_by_neighbour(keyint N, double beta,
+anuga_int _limit_edges_by_neighbour(anuga_int N, double beta,
 		     double* centroid_values,
 		     double* vertex_values,
 		     double* edge_values,
-		     int64_t*   neighbours) {
+		     anuga_int*   neighbours) {
 
-	keyint i, k, k2, k3, k6;
-	keyint n;
+	anuga_int i, k, k2, k3, k6;
+	anuga_int n;
 	double qmin, qmax, qn, qc;
 	double dq, dqa[3], phi, r;
 
@@ -505,16 +504,16 @@ int64_t _limit_edges_by_neighbour(keyint N, double beta,
 }
 
 
-int64_t _limit_gradient_by_neighbour(keyint N, double beta,
+anuga_int _limit_gradient_by_neighbour(anuga_int N, double beta,
 		     double* centroid_values,
 		     double* vertex_values,
 		     double* edge_values,
 		     double* x_gradient,
 		     double* y_gradient,
-		     int64_t*   neighbours) {
+		     anuga_int*   neighbours) {
 
-	keyint i, k, k2, k3, k6;
-	keyint n;
+	anuga_int i, k, k2, k3, k6;
+	anuga_int n;
 	double qmin, qmax, qn, qc;
 	double dq, dqa[3], phi, r;
 
@@ -561,14 +560,14 @@ int64_t _limit_gradient_by_neighbour(keyint N, double beta,
 	return 0;
 }
 
-int64_t _bound_vertices_below_by_constant(keyint N, double bound,
+anuga_int _bound_vertices_below_by_constant(anuga_int N, double bound,
 		     double* centroid_values,
 		     double* vertex_values,
 		     double* edge_values,
 		     double* x_gradient,
 		     double* y_gradient) {
 
-	keyint i, k, k2, k3, k6;
+	anuga_int i, k, k2, k3, k6;
 	double qmin, qc;
 	double dq, dqa[3], phi, r;
 
@@ -612,7 +611,7 @@ int64_t _bound_vertices_below_by_constant(keyint N, double bound,
 	return 0;
 }
 
-int64_t _bound_vertices_below_by_quantity(keyint N,
+anuga_int _bound_vertices_below_by_quantity(anuga_int N,
 				      double* bound_vertex_values,
 				      double* centroid_values,
 				      double* vertex_values,
@@ -620,7 +619,7 @@ int64_t _bound_vertices_below_by_quantity(keyint N,
 				      double* x_gradient,
 				      double* y_gradient) {
 
-	keyint i, k, k2, k3, k6;
+	anuga_int i, k, k2, k3, k6;
 	double qmin, qc;
 	double dq, dqa[3], phi, r;
 
@@ -663,12 +662,12 @@ int64_t _bound_vertices_below_by_quantity(keyint N,
 	return 0;
 }
 
-int64_t _interpolate(keyint N,
+anuga_int _interpolate(anuga_int N,
 		 double* vertex_values,
 		 double* edge_values,
                  double* centroid_values) {
 
-	keyint k, k3;
+	anuga_int k, k3;
 	double q0, q1, q2;
 
 
@@ -688,11 +687,11 @@ int64_t _interpolate(keyint N,
 	return 0;
 }
 
-int64_t _interpolate_from_vertices_to_edges(keyint N,
+anuga_int _interpolate_from_vertices_to_edges(anuga_int N,
 					double* vertex_values,
 					double* edge_values) {
 
-	keyint k, k3;
+	anuga_int k, k3;
 	double q0, q1, q2;
 
 
@@ -711,11 +710,11 @@ int64_t _interpolate_from_vertices_to_edges(keyint N,
 }
 
 
-int64_t _interpolate_from_edges_to_vertices(keyint N,
+anuga_int _interpolate_from_edges_to_vertices(anuga_int N,
 					double* vertex_values,
 					double* edge_values) {
 
-	keyint k, k3;
+	anuga_int k, k3;
 	double e0, e1, e2;
 
 
@@ -733,13 +732,13 @@ int64_t _interpolate_from_edges_to_vertices(keyint N,
 	return 0;
 }
 
-int64_t _backup_centroid_values(keyint N,
+anuga_int _backup_centroid_values(anuga_int N,
 			    double* centroid_values,
 			    double* centroid_backup_values) {
     // Backup centroid values
 
 
-    keyint k;
+    anuga_int k;
 
     for (k=0; k<N; k++) {
 	centroid_backup_values[k] = centroid_values[k];
@@ -750,7 +749,7 @@ int64_t _backup_centroid_values(keyint N,
 }
 
 
-int64_t _saxpy_centroid_values(keyint N,
+anuga_int _saxpy_centroid_values(anuga_int N,
 			   double a,
 			   double b,
 			   double* centroid_values,
@@ -758,7 +757,7 @@ int64_t _saxpy_centroid_values(keyint N,
     // Saxby centroid values
 
 
-    keyint k;
+    anuga_int k;
 
 
     for (k=0; k<N; k++) {
@@ -770,7 +769,7 @@ int64_t _saxpy_centroid_values(keyint N,
 }
 
 
-int64_t _update(keyint N,
+anuga_int _update(anuga_int N,
 	    double timestep,
 	    double* centroid_values,
 	    double* explicit_update,
@@ -779,7 +778,7 @@ int64_t _update(keyint N,
 	// explicit_update and semi_implicit_update as well as given timestep
 
 
-	keyint k;
+	anuga_int k;
 	double denominator, x;
 
 
@@ -802,7 +801,7 @@ int64_t _update(keyint N,
 	// }
 
 
-	// int64_t err_return = 0;
+	// anuga_int err_return = 0;
 
 	// // Semi implicit updates
 	// #pragma omp parallel for private(k, denominator) reduction(min:err_return)
@@ -830,7 +829,7 @@ int64_t _update(keyint N,
 
 	// return 0;
 
-	int64_t err_return = 0;
+	anuga_int err_return = 0;
 
 	// Divide semi_implicit update by conserved quantity
 	#pragma omp parallel for private(k, x)
@@ -867,16 +866,16 @@ int64_t _update(keyint N,
 }
 
 
-int64_t _average_vertex_values(keyint N,
-			   int64_t* vertex_value_indices,
-			   int64_t* number_of_triangles_per_node,
+anuga_int _average_vertex_values(anuga_int N,
+			   anuga_int* vertex_value_indices,
+			   anuga_int* number_of_triangles_per_node,
 			   double* vertex_values,
 			   double* A) {
   // Average vertex values to obtain one value per node
 
-  keyint i, index;
-  keyint k = 0; //Track triangles touching each node
-  keyint current_node = 0;
+  anuga_int i, index;
+  anuga_int k = 0; //Track triangles touching each node
+  anuga_int current_node = 0;
   double total = 0.0;
 
   for (i=0; i<N; i++) {
@@ -916,17 +915,17 @@ int64_t _average_vertex_values(keyint N,
   return 0;
 }
 
-int64_t _average_centroid_values(keyint N,
-			   int64_t* vertex_value_indices,
-			   int64_t* number_of_triangles_per_node,
+anuga_int _average_centroid_values(anuga_int N,
+			   anuga_int* vertex_value_indices,
+			   anuga_int* number_of_triangles_per_node,
 			   double* centroid_values,
 			   double* A) {
   // Average centroid values to obtain one value per node
 
-  keyint i, index;
-  keyint volume_id;
-  keyint k = 0; //Track triangles touching each node
-  keyint current_node = 0;
+  anuga_int i, index;
+  anuga_int volume_id;
+  anuga_int k = 0; //Track triangles touching each node
+  anuga_int current_node = 0;
   double total = 0.0;
 
   for (i=0; i<N; i++) {
@@ -966,15 +965,15 @@ int64_t _average_centroid_values(keyint N,
 // from a list of vertices and values at those vertices. Called in
 // quantity.py by _set_vertex_values.
 // Naming is a little confusing - but sticking with convention.
-int64_t _set_vertex_values_c(keyint num_verts,
-                        int64_t * vertices,
-                        int64_t * node_index,
-                        int64_t * number_of_triangles_per_node,
-                        int64_t * vertex_value_indices,
+anuga_int _set_vertex_values_c(anuga_int num_verts,
+                        anuga_int * vertices,
+                        anuga_int * node_index,
+                        anuga_int * number_of_triangles_per_node,
+                        anuga_int * vertex_value_indices,
                         double * vertex_values,
                         double * A
                         ){
-  keyint i,j,num_triangles,u_vert_id,vert_v_index;
+  anuga_int i,j,num_triangles,u_vert_id,vert_v_index;
 
   for(i=0;i<num_verts;i++){
 
@@ -993,16 +992,16 @@ int64_t _set_vertex_values_c(keyint num_verts,
 
 }
 
-int64_t _min_and_max_centroid_values(keyint N,
+anuga_int _min_and_max_centroid_values(anuga_int N,
                                  double * qc,
                                  double * qv,
-                                 int64_t * neighbours,
+                                 anuga_int * neighbours,
                                  double * qmin,
                                  double * qmax){
   
   // Find min and max of this and neighbour's centroid values
 
-  keyint k, i, n, k3;
+  anuga_int k, i, n, k3;
   double qn;
 
   for (k=0; k<N; k++) {
diff --git a/anuga/abstract_2d_finite_volumes/quantity_openacc.c b/anuga/abstract_2d_finite_volumes/quantity_openacc.c
deleted file mode 100644
index 8b7f94ef6..000000000
--- a/anuga/abstract_2d_finite_volumes/quantity_openacc.c
+++ /dev/null
@@ -1,1033 +0,0 @@
-// Python - C extension for quantity module.
-//
-// To compile (Python2.3):
-//  gcc -c util_ext.c -I/usr/include/python2.3 -o util_ext.o -Wall -O
-//  gcc -shared util_ext.o  -o util_ext.so
-//
-// See the module quantity.py
-//
-//
-// Ole Nielsen, GA 2004
-
-#include "math.h"
-#include "omp.h"
-#include <stdint.h>
-
-//Shared code snippets
-#include "util_ext.h"
-
-typedef int64_t keyint;
-
-//-------------------------------------------
-// Low level routines (called from wrappers)
-//------------------------------------------
-
-int64_t _compute_gradients(keyint N,
-			double* centroids,
-			double* centroid_values,
-			int64_t* number_of_boundaries,
-			int64_t* surrogate_neighbours,
-			double* a,
-			double* b){
-
-  keyint i, k, k0, k1, k2, index3;
-  double x0, x1, x2, y0, y1, y2, q0, q1, q2; //, det;
-
-
-  for (k=0; k<N; k++) {
-    index3 = 3*k;
-
-    if (number_of_boundaries[k] < 2) {
-      // Two or three true neighbours
-
-      // Get indices of neighbours (or self when used as surrogate)
-      // k0, k1, k2 = surrogate_neighbours[k,:]
-
-      k0 = surrogate_neighbours[index3 + 0];
-      k1 = surrogate_neighbours[index3 + 1];
-      k2 = surrogate_neighbours[index3 + 2];
-
-
-      if (k0 == k1 || k1 == k2) return -1;
-
-      // Get data
-      q0 = centroid_values[k0];
-      q1 = centroid_values[k1];
-      q2 = centroid_values[k2];
-
-      x0 = centroids[k0*2]; y0 = centroids[k0*2+1];
-      x1 = centroids[k1*2]; y1 = centroids[k1*2+1];
-      x2 = centroids[k2*2]; y2 = centroids[k2*2+1];
-
-      // Gradient
-      _gradient(x0, y0, x1, y1, x2, y2, q0, q1, q2, &a[k], &b[k]);
-
-    } else if (number_of_boundaries[k] == 2) {
-      // One true neighbour
-
-      // Get index of the one neighbour
-      i=0; k0 = k;
-      while (i<3 && k0==k) {
-	k0 = surrogate_neighbours[index3 + i];
-	i++;
-      }
-      if (k0 == k) return -1;
-
-      k1 = k; //self
-
-      // Get data
-      q0 = centroid_values[k0];
-      q1 = centroid_values[k1];
-
-      x0 = centroids[k0*2]; y0 = centroids[k0*2+1];
-      x1 = centroids[k1*2]; y1 = centroids[k1*2+1];
-
-      // Two point gradient
-      _gradient2(x0, y0, x1, y1, q0, q1, &a[k], &b[k]);
-
-    }
-    //    else:
-    //        #No true neighbours -
-    //        #Fall back to first order scheme
-  }
-  return 0;
-}
-
-
-int64_t _compute_local_gradients(keyint N,
-			       double* vertex_coordinates,
-			       double* vertex_values,
-			       double* a,
-			       double* b) {
-
-  keyint k, k2, k3, k6;
-  double x0, y0, x1, y1, x2, y2, v0, v1, v2;
-
-  for (k=0; k<N; k++) {
-    k6 = 6*k;
-    k3 = 3*k;
-    k2 = 2*k;
-
-    // vertex coordinates
-    // x0, y0, x1, y1, x2, y2 = X[k,:]
-    x0 = vertex_coordinates[k6 + 0];
-    y0 = vertex_coordinates[k6 + 1];
-    x1 = vertex_coordinates[k6 + 2];
-    y1 = vertex_coordinates[k6 + 3];
-    x2 = vertex_coordinates[k6 + 4];
-    y2 = vertex_coordinates[k6 + 5];
-
-    v0 = vertex_values[k3+0];
-    v1 = vertex_values[k3+1];
-    v2 = vertex_values[k3+2];
-
-    // Gradient
-    _gradient(x0, y0, x1, y1, x2, y2, v0, v1, v2, &a[k], &b[k]);
-
-
-    }
-    return 0;
-}
-
-int64_t _extrapolate_from_gradient(keyint N,
-			       double* centroids,
-			       double* centroid_values,
-			       double* vertex_coordinates,
-			       double* vertex_values,
-			       double* edge_values,
-			       double* a,
-			       double* b) {
-
-  keyint k, k2, k3, k6;
-  double x, y, x0, y0, x1, y1, x2, y2;
-
-  for (k=0; k<N; k++){
-    k6 = 6*k;
-    k3 = 3*k;
-    k2 = 2*k;
-
-    // Centroid coordinates
-    x = centroids[k2]; y = centroids[k2+1];
-
-    // vertex coordinates
-    // x0, y0, x1, y1, x2, y2 = X[k,:]
-    x0 = vertex_coordinates[k6 + 0];
-    y0 = vertex_coordinates[k6 + 1];
-    x1 = vertex_coordinates[k6 + 2];
-    y1 = vertex_coordinates[k6 + 3];
-    x2 = vertex_coordinates[k6 + 4];
-    y2 = vertex_coordinates[k6 + 5];
-
-    // Extrapolate to Vertices
-    vertex_values[k3+0] = centroid_values[k] + a[k]*(x0-x) + b[k]*(y0-y);
-    vertex_values[k3+1] = centroid_values[k] + a[k]*(x1-x) + b[k]*(y1-y);
-    vertex_values[k3+2] = centroid_values[k] + a[k]*(x2-x) + b[k]*(y2-y);
-
-    // Extrapolate to Edges (midpoints)
-    edge_values[k3+0] = 0.5*(vertex_values[k3 + 1]+vertex_values[k3 + 2]);
-    edge_values[k3+1] = 0.5*(vertex_values[k3 + 2]+vertex_values[k3 + 0]);
-    edge_values[k3+2] = 0.5*(vertex_values[k3 + 0]+vertex_values[k3 + 1]);
-
-  }
-  return 0;
-}
-
-
-int64_t _extrapolate_and_limit_from_gradient(keyint N,double beta,
-					 double* centroids,
-					 int64_t*   neighbours,
-					 double* centroid_values,
-					 double* vertex_coordinates,
-					 double* vertex_values,
-					 double* edge_values,
-					 double* phi,
-					 double* x_gradient,
-					 double* y_gradient) {
-
-  keyint i, k, k2, k3, k6;
-  double x, y, x0, y0, x1, y1, x2, y2;
-  keyint n;
-  double qmin, qmax, qc;
-  double qn[3];
-  double dq, dqa[3], r;
-
-  for (k=0; k<N; k++){
-    k6 = 6*k;
-    k3 = 3*k;
-    k2 = 2*k;
-
-    // Centroid coordinates
-    x = centroids[k2+0];
-    y = centroids[k2+1];
-
-    // vertex coordinates
-    // x0, y0, x1, y1, x2, y2 = X[k,:]
-    x0 = vertex_coordinates[k6 + 0];
-    y0 = vertex_coordinates[k6 + 1];
-    x1 = vertex_coordinates[k6 + 2];
-    y1 = vertex_coordinates[k6 + 3];
-    x2 = vertex_coordinates[k6 + 4];
-    y2 = vertex_coordinates[k6 + 5];
-
-    // Extrapolate to Vertices
-    vertex_values[k3+0] = centroid_values[k] + x_gradient[k]*(x0-x) + y_gradient[k]*(y0-y);
-    vertex_values[k3+1] = centroid_values[k] + x_gradient[k]*(x1-x) + y_gradient[k]*(y1-y);
-    vertex_values[k3+2] = centroid_values[k] + x_gradient[k]*(x2-x) + y_gradient[k]*(y2-y);
-
-    // Extrapolate to Edges (midpoints)
-    edge_values[k3+0] = 0.5*(vertex_values[k3 + 1]+vertex_values[k3 + 2]);
-    edge_values[k3+1] = 0.5*(vertex_values[k3 + 2]+vertex_values[k3 + 0]);
-    edge_values[k3+2] = 0.5*(vertex_values[k3 + 0]+vertex_values[k3 + 1]);
-  }
-
-
-
-  for (k=0; k<N; k++){
-    k6 = 6*k;
-    k3 = 3*k;
-    k2 = 2*k;
-
-
-    qc = centroid_values[k];
-
-    qmin = qc;
-    qmax = qc;
-
-    for (i=0; i<3; i++) {
-      n = neighbours[k3+i];
-      if (n < 0) {
-	qn[i] = qc;
-      } else {
-	qn[i] = centroid_values[n];
-      }
-
-      qmin = fmin(qmin, qn[i]);
-      qmax = fmax(qmax, qn[i]);
-    }
-
-    //qtmin = fmin(fmin(fmin(qn[0],qn[1]),qn[2]),qc);
-    //qtmax = fmax(fmax(fmax(qn[0],qn[1]),qn[2]),qc);
-
-    /* 		for (i=0; i<3; i++) { */
-    /* 		    n = neighbours[k3+i]; */
-    /* 		    if (n < 0) { */
-    /* 			qn[i] = qc; */
-    /* 			qmin[i] = qtmin; */
-    /* 			qmax[i] = qtmax; */
-    /* 		    }  */
-    /* 		} */
-
-    phi[k] = 1.0;
-
-    for (i=0; i<3; i++) {
-      dq = edge_values[k3+i] - qc;      //Delta between edge and centroid values
-      dqa[i] = dq;                      //Save dq for use in updating vertex values
-
-      r = 1.0;
-
-      if (dq > 0.0) r = (qmax - qc)/dq;
-      if (dq < 0.0) r = (qmin - qc)/dq;
-
-      phi[k] = fmin( fmin(r*beta, 1.0), phi[k]);
-
-    }
-
-
-
-    //Update gradient, edge and vertex values using phi limiter
-    x_gradient[k] = x_gradient[k]*phi[k];
-    y_gradient[k] = y_gradient[k]*phi[k];
-
-    edge_values[k3+0] = qc + phi[k]*dqa[0];
-    edge_values[k3+1] = qc + phi[k]*dqa[1];
-    edge_values[k3+2] = qc + phi[k]*dqa[2];
-
-
-    vertex_values[k3+0] = edge_values[k3+1] + edge_values[k3+2] - edge_values[k3+0];
-    vertex_values[k3+1] = edge_values[k3+2] + edge_values[k3+0] - edge_values[k3+1];
-    vertex_values[k3+2] = edge_values[k3+0] + edge_values[k3+1] - edge_values[k3+2];
-
-
-  }
-
-  return 0;
-
-}
-
-
-
-
-int64_t _limit_vertices_by_all_neighbours(keyint N, double beta,
-				      double* centroid_values,
-				      double* vertex_values,
-				      double* edge_values,
-				      int64_t*   neighbours,
-				      double* x_gradient,
-				      double* y_gradient) {
-
-
-  keyint i, k, k2, k3, k6;
-  keyint n;
-  double qmin, qmax, qn, qc;
-  double dq, dqa[3], phi, r;
-
-  for (k=0; k<N; k++){
-    k6 = 6*k;
-    k3 = 3*k;
-    k2 = 2*k;
-
-    qc = centroid_values[k];
-    qmin = qc;
-    qmax = qc;
-
-    for (i=0; i<3; i++) {
-      n = neighbours[k3+i];
-      if (n >= 0) {
-	qn = centroid_values[n]; //Neighbour's centroid value
-
-	qmin = fmin(qmin, qn);
-	qmax = fmax(qmax, qn);
-      }
-    }
-
-    phi = 1.0;
-    for (i=0; i<3; i++) {
-      r = 1.0;
-
-      dq = vertex_values[k3+i] - qc;    //Delta between vertex and centroid values
-      dqa[i] = dq;                      //Save dq for use in updating vertex values
-
-      if (dq > 0.0) r = (qmax - qc)/dq;
-      if (dq < 0.0) r = (qmin - qc)/dq;
-
-
-      phi = fmin( fmin(r*beta, 1.0), phi);
-    }
-
-    //Update gradient, vertex and edge values using phi limiter
-    x_gradient[k] = x_gradient[k]*phi;
-    y_gradient[k] = y_gradient[k]*phi;
-
-    vertex_values[k3+0] = qc + phi*dqa[0];
-    vertex_values[k3+1] = qc + phi*dqa[1];
-    vertex_values[k3+2] = qc + phi*dqa[2];
-
-    edge_values[k3+0] = 0.5*(vertex_values[k3+1] + vertex_values[k3+2]);
-    edge_values[k3+1] = 0.5*(vertex_values[k3+2] + vertex_values[k3+0]);
-    edge_values[k3+2] = 0.5*(vertex_values[k3+0] + vertex_values[k3+1]);
-
-  }
-
-  return 0;
-}
-
-
-
-
-int64_t _limit_edges_by_all_neighbours(keyint N, double beta,
-				   double* centroid_values,
-				   double* vertex_values,
-				   double* edge_values,
-				   int64_t*   neighbours,
-				   double* x_gradient,
-				   double* y_gradient) {
-
-  keyint i, k, k2, k3, k6;
-  keyint n;
-  double qmin, qmax, qn, qc, sign;
-  double dq, dqa[3], phi, r;
-
-  for (k=0; k<N; k++){
-    k6 = 6*k;
-    k3 = 3*k;
-    k2 = 2*k;
-
-    qc = centroid_values[k];
-    qmin = qc;
-    qmax = qc;
-
-    for (i=0; i<3; i++) {
-      n = neighbours[k3+i];
-      if (n >= 0) {
-	qn = centroid_values[n]; //Neighbour's centroid value
-
-	qmin = fmin(qmin, qn);
-	qmax = fmax(qmax, qn);
-      }
-    }
-
-    sign = 0.0;
-    if (qmin > 0.0) {
-      sign = 1.0;
-    } else if (qmax < 0) {
-      sign = -1.0;
-    }
-
-    phi = 1.0;
-    for (i=0; i<3; i++) {
-      dq = edge_values[k3+i] - qc;      //Delta between edge and centroid values
-      dqa[i] = dq;                      //Save dq for use in updating vertex values
-
-
-      // Just limit non boundary edges so that we can reconstruct a linear function
-      // FIXME Problem with stability on edges
-      //if (neighbours[k3+i] >= 0) {
-	r = 1.0;
-
-	if (dq > 0.0) r = (qmax - qc)/dq;
-	if (dq < 0.0) r = (qmin - qc)/dq;
-
-	phi = fmin( fmin(r*beta, 1.0), phi);
-	//	}
-
-      //
-      /* if (neighbours[k3+i] < 0) { */
-      /* 	r = 1.0; */
-
-      /* 	if (dq > 0.0 && (sign == -1.0 || sign == 0.0 )) r = (0.0 - qc)/dq; */
-      /* 	if (dq < 0.0 && (sign ==  1.0 || sign == 0.0 )) r = (0.0 - qc)/dq; */
-
-      /* 	phi = fmin( fmin(r*beta, 1.0), phi); */
-      /* 	} */
-
-    }
-
-    //Update gradient, vertex and edge values using phi limiter
-    x_gradient[k] = x_gradient[k]*phi;
-    y_gradient[k] = y_gradient[k]*phi;
-
-    edge_values[k3+0] = qc + phi*dqa[0];
-    edge_values[k3+1] = qc + phi*dqa[1];
-    edge_values[k3+2] = qc + phi*dqa[2];
-
-    vertex_values[k3+0] = edge_values[k3+1] + edge_values[k3+2] - edge_values[k3+0];
-    vertex_values[k3+1] = edge_values[k3+2] + edge_values[k3+0] - edge_values[k3+1];
-    vertex_values[k3+2] = edge_values[k3+0] + edge_values[k3+1] - edge_values[k3+2];
-
-  }
-
-  return 0;
-}
-
-
-int64_t _limit_edges_by_neighbour(keyint N, double beta,
-		     double* centroid_values,
-		     double* vertex_values,
-		     double* edge_values,
-		     int64_t*   neighbours) {
-
-	keyint i, k, k2, k3, k6;
-	keyint n;
-	double qmin, qmax, qn, qc;
-	double dq, dqa[3], phi, r;
-
-	for (k=0; k<N; k++){
-		k6 = 6*k;
-		k3 = 3*k;
-		k2 = 2*k;
-
-		qc = centroid_values[k];
-		phi = 1.0;
-
-		for (i=0; i<3; i++) {
-		    dq = edge_values[k3+i] - qc;     //Delta between edge and centroid values
-		    dqa[i] = dq;                      //Save dqa for use in updating vertex values
-
-		    n = neighbours[k3+i];
-		    qn = qc;
-		    if (n >= 0)  qn = centroid_values[n]; //Neighbour's centroid value
-
-		    qmin = fmin(qc, qn);
-		    qmax = fmax(qc, qn);
-
-		    r = 1.0;
-
-		    if (dq > 0.0) r = (qmax - qc)/dq;
-		    if (dq < 0.0) r = (qmin - qc)/dq;
-
-		    phi = fmin( fmin(r*beta, 1.0), phi);
-
-		}
-
-
-		//Update edge and vertex values using phi limiter
-		edge_values[k3+0] = qc + phi*dqa[0];
-		edge_values[k3+1] = qc + phi*dqa[1];
-		edge_values[k3+2] = qc + phi*dqa[2];
-
-		vertex_values[k3+0] = edge_values[k3+1] + edge_values[k3+2] - edge_values[k3+0];
-		vertex_values[k3+1] = edge_values[k3+2] + edge_values[k3+0] - edge_values[k3+1];
-		vertex_values[k3+2] = edge_values[k3+0] + edge_values[k3+1] - edge_values[k3+2];
-
-	}
-
-	return 0;
-}
-
-
-int64_t _limit_gradient_by_neighbour(keyint N, double beta,
-		     double* centroid_values,
-		     double* vertex_values,
-		     double* edge_values,
-		     double* x_gradient,
-		     double* y_gradient,
-		     int64_t*   neighbours) {
-
-	keyint i, k, k2, k3, k6;
-	keyint n;
-	double qmin, qmax, qn, qc;
-	double dq, dqa[3], phi, r;
-
-	for (k=0; k<N; k++){
-		k6 = 6*k;
-		k3 = 3*k;
-		k2 = 2*k;
-
-		qc = centroid_values[k];
-		phi = 1.0;
-
-		for (i=0; i<3; i++) {
-		    dq = edge_values[k3+i] - qc;     //Delta between edge and centroid values
-		    dqa[i] = dq;                      //Save dq for use in updating vertex values
-
-		    n = neighbours[k3+i];
-		    if (n >= 0) {
-			qn = centroid_values[n]; //Neighbour's centroid value
-
-			qmin = fmin(qc, qn);
-			qmax = fmax(qc, qn);
-
-			r = 1.0;
-
-			if (dq > 0.0) r = (qmax - qc)/dq;
-			if (dq < 0.0) r = (qmin - qc)/dq;
-
-			phi = fmin( fmin(r*beta, 1.0), phi);
-		    }
-		}
-
-
-		//Update edge and vertex values using phi limiter
-		edge_values[k3+0] = qc + phi*dqa[0];
-		edge_values[k3+1] = qc + phi*dqa[1];
-		edge_values[k3+2] = qc + phi*dqa[2];
-
-		vertex_values[k3+0] = edge_values[k3+1] + edge_values[k3+2] - edge_values[k3+0];
-		vertex_values[k3+1] = edge_values[k3+2] + edge_values[k3+0] - edge_values[k3+1];
-		vertex_values[k3+2] = edge_values[k3+0] + edge_values[k3+1] - edge_values[k3+2];
-
-	}
-
-	return 0;
-}
-
-int64_t _bound_vertices_below_by_constant(keyint N, double bound,
-		     double* centroid_values,
-		     double* vertex_values,
-		     double* edge_values,
-		     double* x_gradient,
-		     double* y_gradient) {
-
-	keyint i, k, k2, k3, k6;
-	double qmin, qc;
-	double dq, dqa[3], phi, r;
-
-	for (k=0; k<N; k++){
-		k6 = 6*k;
-		k3 = 3*k;
-		k2 = 2*k;
-
-		qc = centroid_values[k];
-		qmin = bound;
-
-
-		phi = 1.0;
-		for (i=0; i<3; i++) {
-		    r = 1.0;
-
-		    dq = vertex_values[k3+i] - qc;    //Delta between vertex and centroid values
-		    dqa[i] = dq;                      //Save dq for use in updating vertex values
-
-		    if (dq < 0.0) r = (qmin - qc)/dq;
-
-
-		    phi = fmin( fmin(r, 1.0), phi);
-		}
-
-
-		//Update gradient, vertex and edge values using phi limiter
-		x_gradient[k] = x_gradient[k]*phi;
-		y_gradient[k] = y_gradient[k]*phi;
-
-		vertex_values[k3+0] = qc + phi*dqa[0];
-		vertex_values[k3+1] = qc + phi*dqa[1];
-		vertex_values[k3+2] = qc + phi*dqa[2];
-
-		edge_values[k3+0] = 0.5*(vertex_values[k3+1] + vertex_values[k3+2]);
-		edge_values[k3+1] = 0.5*(vertex_values[k3+2] + vertex_values[k3+0]);
-		edge_values[k3+2] = 0.5*(vertex_values[k3+0] + vertex_values[k3+1]);
-
-	}
-
-	return 0;
-}
-
-int64_t _bound_vertices_below_by_quantity(keyint N,
-				      double* bound_vertex_values,
-				      double* centroid_values,
-				      double* vertex_values,
-				      double* edge_values,
-				      double* x_gradient,
-				      double* y_gradient) {
-
-	keyint i, k, k2, k3, k6;
-	double qmin, qc;
-	double dq, dqa[3], phi, r;
-
-	for (k=0; k<N; k++){
-		k6 = 6*k;
-		k3 = 3*k;
-		k2 = 2*k;
-
-		qc = centroid_values[k];
-
-		phi = 1.0;
-		for (i=0; i<3; i++) {
-		    r = 1.0;
-
-		    dq = vertex_values[k3+i] - qc;    //Delta between vertex and centroid values
-		    dqa[i] = dq;                      //Save dq for use in updating vertex values
-
-		    qmin = bound_vertex_values[k3+i];
-		    if (dq < 0.0) r = (qmin - qc)/dq;
-
-
-		    phi = fmin( fmin(r, 1.0), phi);
-		}
-
-
-		//Update gradient, vertex and edge values using phi limiter
-		x_gradient[k] = x_gradient[k]*phi;
-		y_gradient[k] = y_gradient[k]*phi;
-
-		vertex_values[k3+0] = qc + phi*dqa[0];
-		vertex_values[k3+1] = qc + phi*dqa[1];
-		vertex_values[k3+2] = qc + phi*dqa[2];
-
-		edge_values[k3+0] = 0.5*(vertex_values[k3+1] + vertex_values[k3+2]);
-		edge_values[k3+1] = 0.5*(vertex_values[k3+2] + vertex_values[k3+0]);
-		edge_values[k3+2] = 0.5*(vertex_values[k3+0] + vertex_values[k3+1]);
-
-	}
-
-	return 0;
-}
-
-int64_t _interpolate(keyint N,
-		 double* vertex_values,
-		 double* edge_values,
-                 double* centroid_values) {
-
-	keyint k, k3;
-	double q0, q1, q2;
-
-
-	for (k=0; k<N; k++) {
-		k3 = 3*k;
-
-		q0 = vertex_values[k3 + 0];
-		q1 = vertex_values[k3 + 1];
-		q2 = vertex_values[k3 + 2];
-
-                centroid_values[k] = (q0+q1+q2)/3.0;
-
-		edge_values[k3 + 0] = 0.5*(q1+q2);
-		edge_values[k3 + 1] = 0.5*(q0+q2);
-		edge_values[k3 + 2] = 0.5*(q0+q1);
-	}
-	return 0;
-}
-
-int64_t _interpolate_from_vertices_to_edges(keyint N,
-					double* vertex_values,
-					double* edge_values) {
-
-	keyint k, k3;
-	double q0, q1, q2;
-
-
-	for (k=0; k<N; k++) {
-		k3 = 3*k;
-
-		q0 = vertex_values[k3 + 0];
-		q1 = vertex_values[k3 + 1];
-		q2 = vertex_values[k3 + 2];
-
-		edge_values[k3 + 0] = 0.5*(q1+q2);
-		edge_values[k3 + 1] = 0.5*(q0+q2);
-		edge_values[k3 + 2] = 0.5*(q0+q1);
-	}
-	return 0;
-}
-
-
-int64_t _interpolate_from_edges_to_vertices(keyint N,
-					double* vertex_values,
-					double* edge_values) {
-
-	keyint k, k3;
-	double e0, e1, e2;
-
-
-	for (k=0; k<N; k++) {
-		k3 = 3*k;
-
-		e0 = edge_values[k3 + 0];
-		e1 = edge_values[k3 + 1];
-		e2 = edge_values[k3 + 2];
-
-		vertex_values[k3 + 0] = e1 + e2 - e0;
-		vertex_values[k3 + 1] = e2 + e0 - e1;
-		vertex_values[k3 + 2] = e0 + e1 - e2;
-	}
-	return 0;
-}
-
-int64_t _backup_centroid_values(keyint N,
-			    double* centroid_values,
-			    double* centroid_backup_values) {
-    // Backup centroid values
-
-
-    keyint k;
-
-    for (k=0; k<N; k++) {
-	centroid_backup_values[k] = centroid_values[k];
-    }
-
-
-    return 0;
-}
-
-
-int64_t _saxpy_centroid_values(keyint N,
-			   double a,
-			   double b,
-			   double* centroid_values,
-			   double* centroid_backup_values) {
-    // Saxby centroid values
-
-
-    keyint k;
-
-
-    for (k=0; k<N; k++) {
-	centroid_values[k] = a*centroid_values[k] + b*centroid_backup_values[k];
-    }
-
-
-    return 0;
-}
-
-
-int64_t _update(keyint N,
-	    double timestep,
-	    double* centroid_values,
-	    double* explicit_update,
-	    double* semi_implicit_update) {
-	// Update centroid values based on values stored in
-	// explicit_update and semi_implicit_update as well as given timestep
-
-
-	keyint k;
-	double denominator, x;
-
-
-	// // Divide semi_implicit update by conserved quantity
-	// #pragma omp parallel for private(k, x)
-	// for (k=0; k<N; k++) {
-	// 	x = centroid_values[k];
-	// 	if (x == 0.0) {
-	// 		semi_implicit_update[k] = 0.0;
-	// 	} else {
-	// 		semi_implicit_update[k] /= x;
-	// 	}
-	// }
-
-
-	// // Explicit updates
-	// #pragma omp parallel for private(k)
-	// for (k=0; k<N; k++) {
-	// 	centroid_values[k] += timestep*explicit_update[k];
-	// }
-
-
-	// int64_t err_return = 0;
-
-	// // Semi implicit updates
-	// #pragma omp parallel for private(k, denominator) reduction(fmin:err_return)
-	// for (k=0; k<N; k++) {
-	// 	denominator = 1.0 - timestep*semi_implicit_update[k];
-	// 	if (denominator <= 0.0) {
-	// 		err_return = -1;
-	// 	} else {
-	// 		//Update conserved_quantities from semi implicit updates
-	// 		centroid_values[k] /= denominator;
-	// 	}
-	// }
-
-	// if (err_return == -1)
-	// {
-	// 	return -1;
-	// }
-
-	// // Reset semi_implicit_update here ready for next time step
-	// #pragma omp parallel for private(k)
-	// for (k = 0; k < N; k++)
-	// {
-	// 	semi_implicit_update[k] = 0.0;
-	// }
-
-	// return 0;
-
-	int64_t err_return = 0;
-
-	// Divide semi_implicit update by conserved quantity
-	#pragma omp parallel for private(k, x)
-	for (k=0; k<N; k++) {
-
-		x = centroid_values[k];
-		if (x == 0.0) {
-			semi_implicit_update[k] = 0.0;
-		} else {
-			semi_implicit_update[k] /= x;
-		}
-
-		centroid_values[k] += timestep*explicit_update[k];
-
-		// Semi implicit updates
-		denominator = 1.0 - timestep*semi_implicit_update[k];
-		if (denominator <= 0.0) {
-			err_return = -1;
-		} else {
-			//Update conserved_quantities from semi implicit updates
-			centroid_values[k] /= denominator;
-		}
-		
-		// Reset semi_implicit_update here ready for next time step
-		semi_implicit_update[k] = 0.0;
-	}
-
-	if (err_return == -1)
-	{
-		return -1;
-	}
-
-	return 0;
-}
-
-
-int64_t _average_vertex_values(keyint N,
-			   int64_t* vertex_value_indices,
-			   int64_t* number_of_triangles_per_node,
-			   double* vertex_values,
-			   double* A) {
-  // Average vertex values to obtain one value per node
-
-  keyint i, index;
-  keyint k = 0; //Track triangles touching each node
-  keyint current_node = 0;
-  double total = 0.0;
-
-  for (i=0; i<N; i++) {
-
-    // if (current_node == N) {
-    //   printf("Current node exceeding number of nodes (%d)", N);
-    //   return 1;
-    // }
-
-		if (number_of_triangles_per_node[current_node] == 0) {
-			  // Jump over orphaned node
-				total = 0.0;
-				k = 0;
-				current_node += 1;
-			}
-		else {
-	    index = vertex_value_indices[i];
-	    k += 1;
-
-	    // volume_id = index / 3
-	    // vertex_id = index % 3
-	    // total += self.vertex_values[volume_id, vertex_id]
-	    total += vertex_values[index];
-
-	    // printf("current_node=%d, index=%d, k=%d, total=%f\n", current_node, index, k, total);
-	    if (number_of_triangles_per_node[current_node] == k) {
-	      A[current_node] = total/k;
-
-	      // Move on to next node
-	      total = 0.0;
-	      k = 0;
-	      current_node += 1;
-	    }
-		}
-  }
-
-  return 0;
-}
-
-int64_t _average_centroid_values(keyint N,
-			   int64_t* vertex_value_indices,
-			   int64_t* number_of_triangles_per_node,
-			   double* centroid_values,
-			   double* A) {
-  // Average centroid values to obtain one value per node
-
-  keyint i, index;
-  keyint volume_id;
-  keyint k = 0; //Track triangles touching each node
-  keyint current_node = 0;
-  double total = 0.0;
-
-  for (i=0; i<N; i++) {
-
-		if (number_of_triangles_per_node[current_node] == 0) {
-			  // Jump over orphaned node
-				total = 0.0;
-				k = 0;
-				current_node += 1;
-			}
-		else {
-	    index = vertex_value_indices[i];
-	    k += 1;
-
-	    volume_id = index / 3;
-	    // vertex_id = index % 3;
-	    // total += self.vertex_values[volume_id, vertex_id];
-	    total += centroid_values[volume_id];
-
-	    // printf("current_node=%d, index=%d, k=%d, total=%f\n", current_node, index, k, total);
-	    if (number_of_triangles_per_node[current_node] == k) {
-	      A[current_node] = total/k;
-
-	      // Move on to next node
-	      total = 0.0;
-	      k = 0;
-	      current_node += 1;
-			}
-    }
-  }
-
-  return 0;
-}
-
-// Note Padarn 27/11/12:
-// This function is used to set all the node values of a quantity
-// from a list of vertices and values at those vertices. Called in
-// quantity.py by _set_vertex_values.
-// Naming is a little confusing - but sticking with convention.
-int64_t _set_vertex_values_c(keyint num_verts,
-                        int64_t * vertices,
-                        int64_t * node_index,
-                        int64_t * number_of_triangles_per_node,
-                        int64_t * vertex_value_indices,
-                        double * vertex_values,
-                        double * A
-                        ){
-  keyint i,j,num_triangles,u_vert_id,vert_v_index;
-
-  for(i=0;i<num_verts;i++){
-
-    u_vert_id=vertices[i];
-    num_triangles = number_of_triangles_per_node[u_vert_id];
-
-    for(j=0;j<num_triangles;j++){
-
-      vert_v_index = vertex_value_indices[node_index[u_vert_id]+j];
-      vertex_values[vert_v_index]=A[i];
-    }
-
-  }
-
-  return 0;
-
-}
-
-int64_t _min_and_max_centroid_values(keyint N,
-                                 double * qc,
-                                 double * qv,
-                                 int64_t * neighbours,
-                                 double * qmin,
-                                 double * qmax){
-  
-  // Find fmin and fmax of this and neighbour's centroid values
-
-  keyint k, i, n, k3;
-  double qn;
-
-  for (k=0; k<N; k++) {
-    k3=k*3;
-
-    qmin[k] = qc[k];
-    qmax[k] = qmin[k];
-
-    for (i=0; i<3; i++) {
-      n = neighbours[k3+i];
-      if (n >= 0) {
-        qn = qc[n]; //Neighbour's centroid value
-
-        qmin[k] = fmin(qmin[k], qn);
-        qmax[k] = fmax(qmax[k], qn);
-      }
-      //qmin[k] = fmax(qmin[k],0.5*((double*) qc -> data)[k]);
-      //qmax[k] = fmin(qmax[k],2.0*((double*) qc -> data)[k]);
-    }
-  }
-
-  return 0;
-
-
-}
-
-
-
diff --git a/anuga/abstract_2d_finite_volumes/quantity_openacc_ext.pyx b/anuga/abstract_2d_finite_volumes/quantity_openacc_ext.pyx
deleted file mode 100644
index 936d27c51..000000000
--- a/anuga/abstract_2d_finite_volumes/quantity_openacc_ext.pyx
+++ /dev/null
@@ -1,717 +0,0 @@
-#cython: wraparound=False, boundscheck=False, cdivision=True, profile=False, nonecheck=False, overflowcheck=False, cdivision_warnings=False, unraisable_tracebacks=False
-import cython
-from libc.stdint cimport int64_t
-
-# import both numpy and the Cython declarations for numpy
-import numpy as np
-cimport numpy as np
-
-ctypedef int64_t keyint
-
-# declare the interface to the C code
-cdef extern from "quantity_openmp.c":
-  int64_t _compute_gradients(keyint N, double* centroids, double* centroid_values, int64_t* number_of_boundaries, int64_t* surrogate_neighbours, double* a, double* b)
-  int64_t _compute_local_gradients(keyint N, double* vertex_coordinates, double* vertex_values, double* a, double* b)
-  int64_t _extrapolate_from_gradient(keyint N, double* centroids, double* centroid_values, double* vertex_coordinates, double* vertex_values, double* edge_values, double* a, double* b)
-  int64_t _extrapolate_and_limit_from_gradient(keyint N, double beta, double* centroids, int64_t* neighbours, double* centroid_values, double* vertex_coordinates, double* vertex_values, double* edge_values, double* phi, double* x_gradient, double* y_gradient) 
-  int64_t _limit_vertices_by_all_neighbours(keyint N, double beta, double* centroid_values, double* vertex_values, double* edge_values, int64_t* neighbours, double* x_gradient, double* y_gradient)
-  int64_t _limit_edges_by_all_neighbours(keyint N, double beta, double* centroid_values, double* vertex_values, double* edge_values, int64_t* neighbours, double* x_gradient, double* y_gradient)
-  int64_t _limit_edges_by_neighbour(keyint N, double beta, double* centroid_values, double* vertex_values, double* edge_values, int64_t* neighbours)
-  int64_t _limit_gradient_by_neighbour(keyint N, double beta, double* centroid_values, double* vertex_values, double* edge_values, double* x_gradient, double* y_gradient, int64_t* neighbours)
-  int64_t _bound_vertices_below_by_constant(keyint N, double bound, double* centroid_values, double* vertex_values, double* edge_values, double* x_gradient, double* y_gradient)
-  int64_t _bound_vertices_below_by_quantity(keyint N, double* bound_vertex_values, double* centroid_values, double* vertex_values, double* edge_values, double* x_gradient, double* y_gradient)
-  int64_t _interpolate(keyint N, double* vertex_values, double* edge_values, double* centroid_values)
-  int64_t _interpolate_from_vertices_to_edges(keyint N, double* vertex_values, double* edge_values)
-  int64_t _interpolate_from_edges_to_vertices(keyint N, double* vertex_values, double* edge_values)
-  int64_t _backup_centroid_values(keyint N, double* centroid_values, double* centroid_backup_values)
-  int64_t _saxpy_centroid_values(keyint N, double a, double b, double* centroid_values, double* centroid_backup_values)
-  int64_t _update(keyint N, double timestep, double* centroid_values, double* explicit_update, double* semi_implicit_update)
-  int64_t _average_vertex_values(keyint N, int64_t* vertex_value_indices, int64_t* number_of_triangles_per_node, double* vertex_values, double* A)
-  int64_t _average_centroid_values(keyint N, int64_t* vertex_value_indices, int64_t* number_of_triangles_per_node, double* centroid_values, double* A)
-  int64_t _set_vertex_values_c(keyint num_verts, int64_t* vertices, int64_t* node_index, int64_t* number_of_triangles_per_node, int64_t* vertex_value_indices, double* vertex_values, double* A)
-  int64_t _min_and_max_centroid_values(keyint N, double* qc, double* qv, int64_t* neighbours, double* qmin, double* qmax)
-
-cdef extern from "util_ext.h":
-  void _limit_old(int64_t N, double beta, double* qc, double* qv, double* qmin, double* qmax)
-
-
-def update(object quantity, double timestep):
-  """Update centroid values based on values stored in
-    explicit_update and semi_implicit_update as well as given timestep
-
-    Function implementing forcing terms must take on argument
-    which is the domain and they must update either explicit
-    or implicit updates, e,g,:
-
-    def gravity(domain):
-        ....
-        domain.quantities['xmomentum'].explicit_update = ...
-        domain.quantities['ymomentum'].explicit_update = ...
-
-
-
-    Explicit terms must have the form
-
-        G(q, t)
-
-    and explicit scheme is
-
-       q^{(n+1}) = q^{(n)} + delta_t G(q^{n}, n delta_t)
-
-
-    Semi implicit forcing terms are assumed to have the form
-
-       G(q, t) = H(q, t) q
-
-    and the semi implicit scheme will then be
-
-      q^{(n+1}) = q^{(n)} + delta_t H(q^{n}, n delta_t) q^{(n+1})"""
-
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-  cdef np.ndarray[double, ndim=1, mode="c"] explicit_update
-  cdef np.ndarray[double, ndim=1, mode="c"] semi_implicit_update
-
-  cdef keyint N
-  cdef int64_t err
-
-  centroid_values = quantity.centroid_values
-  explicit_update = quantity.explicit_update
-  semi_implicit_update = quantity.semi_implicit_update
-
-  N = centroid_values.shape[0]
-
-  err = _update(N, timestep, &centroid_values[0], &explicit_update[0], &semi_implicit_update[0])
-
-  assert err == 0, "update: division by zero in semi implicit update - call Stephen :)"
-
-
-def backup_centroid_values(object quantity):
-
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_backup_values
-
-  cdef keyint N
-  cdef int64_t err
-
-  centroid_values = quantity.centroid_values
-  centroid_backup_values = quantity.centroid_backup_values
-
-  N = centroid_values.shape[0]
-
-  err = _backup_centroid_values(N, &centroid_values[0], &centroid_backup_values[0])
-
-def saxpy_centroid_values(object quantity, double a, double b):
-
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_backup_values
-
-  cdef keyint N
-  cdef int64_t err
-
-  centroid_values = quantity.centroid_values
-  centroid_backup_values = quantity.centroid_backup_values
-
-  N = centroid_values.shape[0]
-
-  err = _saxpy_centroid_values(N, a, b, &centroid_values[0], &centroid_backup_values[0])
-
-
-def set_vertex_values_c(object quantity, np.ndarray[int64_t, ndim=1, mode="c"] vertices not None, np.ndarray[double, ndim=1, mode="c"] A not None):
-
-  cdef object domain
-  cdef object mesh
-
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[int64_t, ndim=1, mode="c"] node_index
-  cdef np.ndarray[int64_t, ndim=1, mode="c"] number_of_triangles_per_node
-  cdef np.ndarray[int64_t, ndim=1, mode="c"] vertex_value_indices
-
-  cdef keyint N
-  cdef int64_t err
-  cdef keyint num_verts
-
-  domain = quantity.domain
-  mesh = domain.mesh
-
-  vertex_values = quantity.vertex_values
-  node_index = mesh.node_index
-  number_of_triangles_per_node = mesh.number_of_triangles_per_node
-  vertex_value_indices = mesh.vertex_value_indices
-
-  num_verts = vertices.shape[0]
-
-  err = _set_vertex_values_c(num_verts, &vertices[0], &node_index[0], &number_of_triangles_per_node[0], &vertex_value_indices[0], &vertex_values[0,0], &A[0])
-
-
-def interpolate(object quantity):
-  
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=2, mode="c"] edge_values
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-
-  cdef keyint N
-  cdef int64_t err
-
-  vertex_values = quantity.vertex_values
-  edge_values = quantity.edge_values
-  centroid_values = quantity.centroid_values
-
-  N = vertex_values.shape[0]
-
-  err = _interpolate(N, &vertex_values[0,0], &edge_values[0,0], &centroid_values[0])
-
-  assert err == 0, "Interpolate: could not be computed"
-
-def interpolate_from_vertices_to_edges(object quantity):
-
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=2, mode="c"] edge_values
-
-  cdef keyint N
-  cdef int64_t err
-
-  vertex_values = quantity.vertex_values
-  edge_values = quantity.edge_values
-
-  N = vertex_values.shape[0]
-
-  err = _interpolate_from_vertices_to_edges(N, &vertex_values[0,0], &edge_values[0,0])
-
-  assert err == 0, "Interpolate: could not be computed"
-
-def interpolate_from_edges_to_vertices(object quantity):
-
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=2, mode="c"] edge_values
-
-  cdef keyint N
-  cdef int64_t err
-
-  vertex_values = quantity.vertex_values
-  edge_values = quantity.edge_values
-
-  N = vertex_values.shape[0]
-
-  err = _interpolate_from_edges_to_vertices(N, &vertex_values[0,0], &edge_values[0,0])
-
-  assert err == 0, "Interpolate: could not be computed"
-
-def average_vertex_values(np.ndarray[int64_t, ndim=1, mode="c"] vertex_value_indices not None, np.ndarray[int64_t, ndim=1, mode="c"] number_of_triangles_per_node not None, np.ndarray[double, ndim=2, mode="c"] vertex_values not None, np.ndarray[double, ndim=1, mode="c"] A not None):
-
-  cdef keyint N
-  cdef int64_t err
-
-  N = vertex_value_indices.shape[0]
-
-  err = _average_vertex_values(N, &vertex_value_indices[0], &number_of_triangles_per_node[0], &vertex_values[0,0], &A[0])
-
-  assert err == 0, "average_vertex_values: could not be computed"
-
-def average_centroid_values(np.ndarray[int64_t, ndim=1, mode="c"] vertex_value_indices not None, np.ndarray[int64_t, ndim=1, mode="c"] number_of_triangles_per_node not None, np.ndarray[double, ndim=1, mode="c"] centroid_values not None, np.ndarray[double, ndim=1, mode="c"] A not None):
-
-  cdef keyint N
-  cdef int64_t err
-
-  N = vertex_value_indices.shape[0]
-
-  err = _average_centroid_values(N, &vertex_value_indices[0], &number_of_triangles_per_node[0], &centroid_values[0], &A[0])
-
-  assert err == 0, "average_centroid_values: could not be computed"
-
-def extrapolate_from_gradient(object quantity):
-
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=2, mode="c"] centroids
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_coordinates
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=2, mode="c"] edge_values
-  cdef np.ndarray[int64_t, ndim=1, mode="c"] number_of_boundaries
-  cdef np.ndarray[int64_t, ndim=2, mode="c"] surrogate_neighbours
-  cdef np.ndarray[double, ndim=1, mode="c"] x_gradient
-  cdef np.ndarray[double, ndim=1, mode="c"] y_gradient
-
-  cdef keyint N
-  cdef int64_t err
-
-  domain = quantity.domain
-
-  centroids = domain.centroid_coordinates
-  centroid_values = quantity.centroid_values
-  surrogate_neighbours = domain.surrogate_neighbours
-  number_of_boundaries = domain.number_of_boundaries
-  vertex_coordinates = domain.vertex_coordinates
-  vertex_values = quantity.vertex_values
-  edge_values = quantity.edge_values
-  x_gradient = quantity.x_gradient
-  y_gradient = quantity.y_gradient
-
-  N = centroid_values.shape[0]
-
-  err = _extrapolate_from_gradient(N,\
-							&centroids[0,0],\
-							&centroid_values[0],\
-							&vertex_coordinates[0,0],\
-							&vertex_values[0,0],\
-							&edge_values[0,0],\
-							&x_gradient[0],\
-							&y_gradient[0])
-
-  assert err == 0, "Internal function _extrapolate failed"
-
-def compute_local_gradients(object quantity):
-
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_coordinates
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=1, mode="c"] x_gradient
-  cdef np.ndarray[double, ndim=1, mode="c"] y_gradient
-
-  cdef keyint N
-  cdef int64_t err
-
-  domain = quantity.domain
-
-  vertex_coordinates = domain.vertex_coordinates
-  vertex_values = quantity.vertex_values
-  x_gradient = quantity.x_gradient
-  y_gradient = quantity.y_gradient
-
-  N = vertex_values.shape[0]
-
-  err = _compute_local_gradients(N, &vertex_coordinates[0,0], &vertex_values[0,0], &x_gradient[0], &y_gradient[0])
-
-  assert err == 0, "Internal function _compute_local_gradient failed"
-
-def extrapolate_second_order_and_limit_by_edge(object quantity):
-
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=2, mode="c"] domain_centroids
-  cdef np.ndarray[double, ndim=2, mode="c"] domain_vertex_coordinates
-  cdef np.ndarray[int64_t, ndim=1, mode="c"] domain_number_of_boundaries
-  cdef np.ndarray[int64_t, ndim=2, mode="c"] domain_surrogate_neighbours
-  cdef np.ndarray[int64_t, ndim=2, mode="c"] domain_neighbours
-
-  cdef np.ndarray[double, ndim=1, mode="c"] quantity_centroid_values
-  cdef np.ndarray[double, ndim=2, mode="c"] quantity_vertex_values
-  cdef np.ndarray[double, ndim=2, mode="c"] quantity_edge_values
-  cdef np.ndarray[double, ndim=1, mode="c"] quantity_phi
-  cdef np.ndarray[double, ndim=1, mode="c"] quantity_x_gradient
-  cdef np.ndarray[double, ndim=1, mode="c"] quantity_y_gradient
-
-  cdef keyint ntri
-  cdef double beta
-  cdef int64_t err
-
-  domain = quantity.object
-
-  domain_centroids = domain.centroid_coordinates
-  domain_surrogate_neighbours = domain.surrogate_neighbours
-  domain_number_of_boundaries = domain.number_of_boundaries
-  domain_vertex_coordinates = domain.vertex_coordinates
-  domain_neighbours = domain.neighbours
-
-  quantity_centroid_values = quantity.centroid_values
-  quantity_vertex_values = quantity.vertex_values
-  quantity_edge_values = quantity.edge_values
-  quantity_phi = quantity.phi
-  quantity_x_gradient = quantity.x_gradient
-  quantity_y_gradient = quantity.y_gradient
-
-  beta = quantity.beta
-
-  ntri = quantity_centroid_values.shape[0]
-
-  err = _compute_gradients(ntri,\
-						&domain_centroids[0,0],\
-						&quantity_centroid_values[0],\
-						&domain_number_of_boundaries[0],\
-						&domain_surrogate_neighbours[0,0],\
-						&quantity_x_gradient[0],\
-						&quantity_y_gradient[0])
-
-  assert err == 0, "Internal function _compute_gradient failed"
-
-  err = _extrapolate_from_gradient(ntri,\
-						&domain_centroids[0,0],\
-						&quantity_centroid_values[0],\
-						&domain_vertex_coordinates[0,0],\
-						&quantity_vertex_values[0,0],\
-						&quantity_edge_values[0,0],\
-						&quantity_x_gradient[0],\
-						&quantity_y_gradient[0])
-
-  assert err == 0, "Internal function _extrapolate_from_gradient failed"
-
-  err = _limit_edges_by_all_neighbours(ntri, beta,\
-						&quantity_centroid_values[0],\
-						&quantity_vertex_values[0,0],\
-						&quantity_edge_values[0,0],\
-						&domain_neighbours[0,0],\
-						&quantity_x_gradient[0],\
-						&quantity_y_gradient[0])
-
-  assert err == 0, "Internal function _limit_edges_by_all_neighbours failed"
-
-def extrapolate_second_order_and_limit_by_vertex(object quantity):
-
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=2, mode="c"] domain_centroids
-  cdef np.ndarray[double, ndim=2, mode="c"] domain_vertex_coordinates
-  cdef np.ndarray[int64_t, ndim=1, mode="c"] domain_number_of_boundaries
-  cdef np.ndarray[int64_t, ndim=2, mode="c"] domain_surrogate_neighbours
-  cdef np.ndarray[int64_t, ndim=2, mode="c"] domain_neighbours
-
-  cdef np.ndarray[double, ndim=1, mode="c"] quantity_centroid_values
-  cdef np.ndarray[double, ndim=2, mode="c"] quantity_vertex_values
-  cdef np.ndarray[double, ndim=2, mode="c"] quantity_edge_values
-  cdef np.ndarray[double, ndim=1, mode="c"] quantity_phi
-  cdef np.ndarray[double, ndim=1, mode="c"] quantity_x_gradient
-  cdef np.ndarray[double, ndim=1, mode="c"] quantity_y_gradient
-
-  cdef keyint ntri
-  cdef double beta
-  cdef int64_t err
-
-  domain = quantity.object
-
-  domain_centroids = domain.centroid_coordinates
-  domain_surrogate_neighbours = domain.surrogate_neighbours
-  domain_number_of_boundaries = domain.number_of_boundaries
-  domain_vertex_coordinates = domain.vertex_coordinates
-  domain_neighbours = domain.neighbours
-
-  quantity_centroid_values = quantity.centroid_values
-  quantity_vertex_values = quantity.vertex_values
-  quantity_edge_values = quantity.edge_values
-  quantity_phi = quantity.phi
-  quantity_x_gradient = quantity.x_gradient
-  quantity_y_gradient = quantity.y_gradient
-
-  beta = quantity.beta
-
-  ntri = quantity_centroid_values.shape[0]
-
-  err = _compute_gradients(ntri,\
-						&domain_centroids[0,0],\
-						&quantity_centroid_values[0],\
-						&domain_number_of_boundaries[0],\
-						&domain_surrogate_neighbours[0,0],\
-						&quantity_x_gradient[0],\
-						&quantity_y_gradient[0])
-
-  assert err == 0, "Internal function _compute_gradient failed"
-
-  err = _extrapolate_from_gradient(ntri,\
-						&domain_centroids[0,0],\
-						&quantity_centroid_values[0],\
-						&domain_vertex_coordinates[0,0],\
-						&quantity_vertex_values[0,0],\
-						&quantity_edge_values[0,0],\
-						&quantity_x_gradient[0],\
-						&quantity_y_gradient[0])
-
-  assert err == 0, "Internal function _extrapolate_from_gradient failed"
-
-  err = _limit_vertices_by_all_neighbours(ntri, beta,\
-						&quantity_centroid_values[0],\
-						&quantity_vertex_values[0,0],\
-						&quantity_edge_values[0,0],\
-						&domain_neighbours[0,0],\
-						&quantity_x_gradient[0],\
-						&quantity_y_gradient[0])
-
-  assert err == 0, "Internal function _limit_edges_by_all_neighbours failed"
-
-def compute_gradients(object quantity):
-
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=2, mode="c"] centroids
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_coordinates
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=2, mode="c"] edge_values
-  cdef np.ndarray[int64_t, ndim=1, mode="c"] number_of_boundaries
-  cdef np.ndarray[int64_t, ndim=2, mode="c"] surrogate_neighbours
-  cdef np.ndarray[double, ndim=1, mode="c"] x_gradient
-  cdef np.ndarray[double, ndim=1, mode="c"] y_gradient
-
-  cdef keyint N
-  cdef int64_t err
-
-  domain = quantity.domain
-
-  centroids = domain.centroid_coordinates
-  centroid_values = quantity.centroid_values
-  surrogate_neighbours = domain.surrogate_neighbours
-  number_of_boundaries = domain.number_of_boundaries
-  vertex_coordinates = domain.vertex_coordinates
-  vertex_values = quantity.vertex_values
-  edge_values = quantity.edge_values
-  x_gradient = quantity.x_gradient
-  y_gradient = quantity.y_gradient
-
-  N = centroid_values.shape[0]
-
-  err = _compute_gradients(N,\
-						&centroids[0,0],\
-						&centroid_values[0],\
-						&number_of_boundaries[0],\
-						&surrogate_neighbours[0,0],\
-						&x_gradient[0],\
-						&y_gradient[0])
-
-  assert err == 0, "Gradient could not be computed"
-
-def limit_old(object quantity):
-  
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=1, mode="c"] qc
-  cdef np.ndarray[double, ndim=2, mode="c"] qv
-  cdef np.ndarray[int64_t, ndim=2, mode="c"] neighbours
-
-  cdef keyint N
-  cdef double beta_w
-  cdef int64_t err
-
-  domain = quantity.domain
-
-  neighbours = domain.neighbours
-
-  beta_w = domain.beta_w
-
-  qc = quantity.centroid_values
-  qv = quantity.vertex_values
-
-  N = qc.shape[0]
-
-  cdef np.ndarray[double, ndim=1, mode="c"] qmin = np.empty(N, dtype=np.float64)
-  cdef np.ndarray[double, ndim=1, mode="c"] qmax = np.empty(N, dtype=np.float64)
-
-  err = _min_and_max_centroid_values(N, &qc[0], &qv[0,0], &neighbours[0,0], &qmin[0], &qmax[0])
-
-  assert err == 0, "Internal function _min_and_max_centroid_values failed"
-
-  _limit_old(N, beta_w, &qc[0], &qv[0,0], &qmin[0], &qmax[0])
-
-def limit_vertices_by_all_neighbours(object quantity):
-  
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-  cdef np.ndarray[double, ndim=2, mode="c"] edge_values
-  cdef np.ndarray[int64_t, ndim=2, mode="c"] neighbours
-  cdef np.ndarray[double, ndim=1, mode="c"] x_gradient
-  cdef np.ndarray[double, ndim=1, mode="c"] y_gradient
-
-  cdef double beta_w
-  cdef keyint N
-  cdef int64_t err
-
-  domain = quantity.domain
-
-  beta_w = domain.beta_w
-
-  neighbours = domain.neighbours
-  centroid_values = quantity.centroid_values
-  vertex_values = quantity.vertex_values
-  edge_values = quantity.edge_values
-  x_gradient = quantity.x_gradient
-  y_gradient = quantity.y_gradient
-  beta_w = domain.beta_w
-
-  N = centroid_values.shape[0]
-
-  err = _limit_vertices_by_all_neighbours(N, beta_w,\
-											&centroid_values[0],\
-											&vertex_values[0,0],\
-											&edge_values[0,0],\
-											&neighbours[0,0],\
-											&x_gradient[0],\
-											&y_gradient[0])
-
-  assert err == 0, "Internal function _limit_by_vertex failed"
-
-def limit_edges_by_all_neighbours(object quantity):
-
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-  cdef np.ndarray[double, ndim=2, mode="c"] edge_values
-  cdef np.ndarray[int64_t, ndim=2, mode="c"] neighbours
-  cdef np.ndarray[double, ndim=1, mode="c"] x_gradient
-  cdef np.ndarray[double, ndim=1, mode="c"] y_gradient
-
-  cdef double beta_w
-  cdef keyint N
-  cdef int64_t err
-
-  domain = quantity.domain
-
-  beta_w = domain.beta_w
-
-  neighbours = domain.neighbours
-  centroid_values = quantity.centroid_values
-  vertex_values = quantity.vertex_values
-  edge_values = quantity.edge_values
-  x_gradient = quantity.x_gradient
-  y_gradient = quantity.y_gradient
-  beta_w = domain.beta_w
-
-  N = centroid_values.shape[0]
-
-  err = _limit_edges_by_all_neighbours(N, beta_w,\
-											&centroid_values[0],\
-											&vertex_values[0,0],\
-											&edge_values[0,0],\
-											&neighbours[0,0],\
-											&x_gradient[0],\
-											&y_gradient[0])
-
-  assert err == 0, "Internal function _limit_by_edges failed"
-
-def bound_vertices_below_by_constant(object quantity, double bound):
-  
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-  cdef np.ndarray[double, ndim=2, mode="c"] edge_values
-  cdef np.ndarray[double, ndim=1, mode="c"] x_gradient
-  cdef np.ndarray[double, ndim=1, mode="c"] y_gradient
-
-  cdef keyint N
-  cdef int64_t err
-
-  domain = quantity.domain
-
-  centroid_values = quantity.centroid_values
-  vertex_values = quantity.vertex_values
-  edge_values = quantity.edge_values
-  x_gradient = quantity.x_gradient
-  y_gradient = quantity.y_gradient
-
-  N = centroid_values.shape[0]
-
-  err = _bound_vertices_below_by_constant(N, bound,\
-										&centroid_values[0],\
-										&vertex_values[0,0],\
-										&edge_values[0,0],\
-										&x_gradient[0],\
-										&y_gradient[0])
-
-  assert err == 0, "Internal function _bound_vertices_below_by_constant failed"
-
-def bound_vertices_below_by_quantity(object quantity, object bounding_quantity):
-  
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-  cdef np.ndarray[double, ndim=2, mode="c"] edge_values
-  cdef np.ndarray[double, ndim=1, mode="c"] x_gradient
-  cdef np.ndarray[double, ndim=1, mode="c"] y_gradient
-  cdef np.ndarray[double, ndim=2, mode="c"] bound_vertex_values
-
-  cdef keyint N
-  cdef int64_t err
-
-  domain = quantity.domain
-
-  centroid_values = quantity.centroid_values
-  vertex_values = quantity.vertex_values
-  edge_values = quantity.edge_values
-  x_gradient = quantity.x_gradient
-  y_gradient = quantity.y_gradient
-  bound_vertex_values = bounding_quantity.vertex_values
-
-  N = centroid_values.shape[0]
-
-  err = _bound_vertices_below_by_quantity(N,\
-  										&bound_vertex_values[0,0],\
-										&centroid_values[0],\
-										&vertex_values[0,0],\
-										&edge_values[0,0],\
-										&x_gradient[0],\
-										&y_gradient[0])
-
-  assert err == 0, "Internal function _bound_vertices_below_by_quantity failed"
-
-def limit_edges_by_neighbour(object quantity):
-  
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-  cdef np.ndarray[double, ndim=2, mode="c"] edge_values
-  cdef np.ndarray[int64_t, ndim=2, mode="c"] neighbours
-
-  cdef double beta_w
-  cdef keyint N
-  cdef int64_t err
-
-  domain = quantity.domain
-
-  beta_w = domain.beta_w
-
-  neighbours = domain.neighbours
-  centroid_values = quantity.centroid_values
-  vertex_values = quantity.vertex_values
-  edge_values = quantity.edge_values
-
-  N = centroid_values.shape[0]
-
-  err = _limit_edges_by_neighbour(N, beta_w,\
-								&centroid_values[0],\
-								&vertex_values[0,0],\
-								&edge_values[0,0],\
-								&neighbours[0,0])
-
-  assert err == 0, "Internal function _limit_edges_by_neighbour failed"
-
-def limit_gradient_by_neighbour(object quantity):
-  
-  cdef object domain
-
-  cdef np.ndarray[double, ndim=2, mode="c"] vertex_values
-  cdef np.ndarray[double, ndim=1, mode="c"] centroid_values
-  cdef np.ndarray[double, ndim=2, mode="c"] edge_values
-  cdef np.ndarray[double, ndim=1, mode="c"] x_gradient
-  cdef np.ndarray[double, ndim=1, mode="c"] y_gradient
-  cdef np.ndarray[int64_t, ndim=2, mode="c"] neighbours
-
-  cdef double beta_w
-  cdef keyint N
-  cdef int64_t err
-
-  domain = quantity.domain
-
-  beta_w = domain.beta_w
-
-  neighbours = domain.neighbours
-  centroid_values = quantity.centroid_values
-  vertex_values = quantity.vertex_values
-  edge_values = quantity.edge_values
-  x_gradient = quantity.x_gradient
-  y_gradient = quantity.y_gradient
-
-  N = centroid_values.shape[0]
-
-  err = _limit_gradient_by_neighbour(N, beta_w,\
-								&centroid_values[0],\
-								&vertex_values[0,0],\
-								&edge_values[0,0],\
-								&x_gradient[0],\
-								&y_gradient[0],\
-								&neighbours[0,0])
-
-  assert err == 0, "Internal function _limit_gradient_by_neighbour failed"
-
diff --git a/anuga/abstract_2d_finite_volumes/quantity_openmp.c b/anuga/abstract_2d_finite_volumes/quantity_openmp.c
index 20c27623c..02b42f64d 100644
--- a/anuga/abstract_2d_finite_volumes/quantity_openmp.c
+++ b/anuga/abstract_2d_finite_volumes/quantity_openmp.c
@@ -14,6 +14,8 @@
 
 #if defined(__APPLE__)
 // clang doesn't have openmp
+// FIXME SR: Need to determine if openmp has been enabled
+// FIXME SR: Maybe need to try to compile a test program
 #else
 #include "omp.h"
 #endif
@@ -21,21 +23,20 @@
 //Shared code snippets
 #include "util_ext.h"
 
-typedef int64_t keyint;
 
 //-------------------------------------------
 // Low level routines (called from wrappers)
 //------------------------------------------
 
-int64_t _compute_gradients(keyint N,
+anuga_int _compute_gradients(anuga_int N,
 			double* centroids,
 			double* centroid_values,
-			int64_t* number_of_boundaries,
-			int64_t* surrogate_neighbours,
+			anuga_int* number_of_boundaries,
+			anuga_int* surrogate_neighbours,
 			double* a,
 			double* b){
 
-  keyint i, k, k0, k1, k2, index3;
+  anuga_int i, k, k0, k1, k2, index3;
   double x0, x1, x2, y0, y1, y2, q0, q1, q2; //, det;
 
 
@@ -99,19 +100,19 @@ int64_t _compute_gradients(keyint N,
 }
 
 
-int64_t _compute_local_gradients(keyint N,
+anuga_int _compute_local_gradients(anuga_int N,
 			       double* vertex_coordinates,
 			       double* vertex_values,
 			       double* a,
 			       double* b) {
 
-  keyint k, k2, k3, k6;
+  anuga_int k, k3, k6;
   double x0, y0, x1, y1, x2, y2, v0, v1, v2;
 
   for (k=0; k<N; k++) {
     k6 = 6*k;
     k3 = 3*k;
-    k2 = 2*k;
+    //k2 = 2*k;
 
     // vertex coordinates
     // x0, y0, x1, y1, x2, y2 = X[k,:]
@@ -134,7 +135,7 @@ int64_t _compute_local_gradients(keyint N,
     return 0;
 }
 
-int64_t _extrapolate_from_gradient(keyint N,
+anuga_int _extrapolate_from_gradient(anuga_int N,
 			       double* centroids,
 			       double* centroid_values,
 			       double* vertex_coordinates,
@@ -143,7 +144,7 @@ int64_t _extrapolate_from_gradient(keyint N,
 			       double* a,
 			       double* b) {
 
-  keyint k, k2, k3, k6;
+  anuga_int k, k2, k3, k6;
   double x, y, x0, y0, x1, y1, x2, y2;
 
   for (k=0; k<N; k++){
@@ -178,9 +179,9 @@ int64_t _extrapolate_from_gradient(keyint N,
 }
 
 
-int64_t _extrapolate_and_limit_from_gradient(keyint N,double beta,
+anuga_int _extrapolate_and_limit_from_gradient(anuga_int N,double beta,
 					 double* centroids,
-					 int64_t*   neighbours,
+					 anuga_int*   neighbours,
 					 double* centroid_values,
 					 double* vertex_coordinates,
 					 double* vertex_values,
@@ -189,9 +190,9 @@ int64_t _extrapolate_and_limit_from_gradient(keyint N,double beta,
 					 double* x_gradient,
 					 double* y_gradient) {
 
-  keyint i, k, k2, k3, k6;
+  anuga_int i, k, k2, k3, k6;
   double x, y, x0, y0, x1, y1, x2, y2;
-  keyint n;
+  anuga_int n;
   double qmin, qmax, qc;
   double qn[3];
   double dq, dqa[3], r;
@@ -302,24 +303,24 @@ int64_t _extrapolate_and_limit_from_gradient(keyint N,double beta,
 
 
 
-int64_t _limit_vertices_by_all_neighbours(keyint N, double beta,
+anuga_int _limit_vertices_by_all_neighbours(anuga_int N, double beta,
 				      double* centroid_values,
 				      double* vertex_values,
 				      double* edge_values,
-				      int64_t*   neighbours,
+				      anuga_int*   neighbours,
 				      double* x_gradient,
 				      double* y_gradient) {
 
 
-  keyint i, k, k2, k3, k6;
-  keyint n;
+  anuga_int i, k, k3;
+  anuga_int n;
   double qmin, qmax, qn, qc;
   double dq, dqa[3], phi, r;
 
   for (k=0; k<N; k++){
-    k6 = 6*k;
+    //k6 = 6*k;
     k3 = 3*k;
-    k2 = 2*k;
+    //k2 = 2*k;
 
     qc = centroid_values[k];
     qmin = qc;
@@ -369,23 +370,23 @@ int64_t _limit_vertices_by_all_neighbours(keyint N, double beta,
 
 
 
-int64_t _limit_edges_by_all_neighbours(keyint N, double beta,
+anuga_int _limit_edges_by_all_neighbours(anuga_int N, double beta,
 				   double* centroid_values,
 				   double* vertex_values,
 				   double* edge_values,
-				   int64_t*   neighbours,
+				   anuga_int*   neighbours,
 				   double* x_gradient,
 				   double* y_gradient) {
 
-  keyint i, k, k2, k3, k6;
-  keyint n;
-  double qmin, qmax, qn, qc, sign;
+  anuga_int i, k, k3;
+  anuga_int n;
+  double qmin, qmax, qn, qc;
   double dq, dqa[3], phi, r;
 
   for (k=0; k<N; k++){
-    k6 = 6*k;
+    //k6 = 6*k;
     k3 = 3*k;
-    k2 = 2*k;
+    //k2 = 2*k;
 
     qc = centroid_values[k];
     qmin = qc;
@@ -401,12 +402,6 @@ int64_t _limit_edges_by_all_neighbours(keyint N, double beta,
       }
     }
 
-    sign = 0.0;
-    if (qmin > 0.0) {
-      sign = 1.0;
-    } else if (qmax < 0) {
-      sign = -1.0;
-    }
 
     phi = 1.0;
     for (i=0; i<3; i++) {
@@ -416,24 +411,13 @@ int64_t _limit_edges_by_all_neighbours(keyint N, double beta,
 
       // Just limit non boundary edges so that we can reconstruct a linear function
       // FIXME Problem with stability on edges
-      //if (neighbours[k3+i] >= 0) {
 	r = 1.0;
 
 	if (dq > 0.0) r = (qmax - qc)/dq;
 	if (dq < 0.0) r = (qmin - qc)/dq;
 
 	phi = fmin( fmin(r*beta, 1.0), phi);
-	//	}
-
-      //
-      /* if (neighbours[k3+i] < 0) { */
-      /* 	r = 1.0; */
-
-      /* 	if (dq > 0.0 && (sign == -1.0 || sign == 0.0 )) r = (0.0 - qc)/dq; */
-      /* 	if (dq < 0.0 && (sign ==  1.0 || sign == 0.0 )) r = (0.0 - qc)/dq; */
 
-      /* 	phi = fmin( fmin(r*beta, 1.0), phi); */
-      /* 	} */
 
     }
 
@@ -455,21 +439,19 @@ int64_t _limit_edges_by_all_neighbours(keyint N, double beta,
 }
 
 
-int64_t _limit_edges_by_neighbour(keyint N, double beta,
+anuga_int _limit_edges_by_neighbour(anuga_int N, double beta,
 		     double* centroid_values,
 		     double* vertex_values,
 		     double* edge_values,
-		     int64_t*   neighbours) {
+		     anuga_int*   neighbours) {
 
-	keyint i, k, k2, k3, k6;
-	keyint n;
+	anuga_int i, k, k3;
+	anuga_int n;
 	double qmin, qmax, qn, qc;
 	double dq, dqa[3], phi, r;
 
 	for (k=0; k<N; k++){
-		k6 = 6*k;
 		k3 = 3*k;
-		k2 = 2*k;
 
 		qc = centroid_values[k];
 		phi = 1.0;
@@ -510,23 +492,21 @@ int64_t _limit_edges_by_neighbour(keyint N, double beta,
 }
 
 
-int64_t _limit_gradient_by_neighbour(keyint N, double beta,
+anuga_int _limit_gradient_by_neighbour(anuga_int N, double beta,
 		     double* centroid_values,
 		     double* vertex_values,
 		     double* edge_values,
 		     double* x_gradient,
 		     double* y_gradient,
-		     int64_t*   neighbours) {
+		     anuga_int*   neighbours) {
 
-	keyint i, k, k2, k3, k6;
-	keyint n;
+	anuga_int i, k, k3;
+	anuga_int n;
 	double qmin, qmax, qn, qc;
 	double dq, dqa[3], phi, r;
 
 	for (k=0; k<N; k++){
-		k6 = 6*k;
 		k3 = 3*k;
-		k2 = 2*k;
 
 		qc = centroid_values[k];
 		phi = 1.0;
@@ -566,21 +546,19 @@ int64_t _limit_gradient_by_neighbour(keyint N, double beta,
 	return 0;
 }
 
-int64_t _bound_vertices_below_by_constant(keyint N, double bound,
+anuga_int _bound_vertices_below_by_constant(anuga_int N, double bound,
 		     double* centroid_values,
 		     double* vertex_values,
 		     double* edge_values,
 		     double* x_gradient,
 		     double* y_gradient) {
 
-	keyint i, k, k2, k3, k6;
+	anuga_int i, k, k3;
 	double qmin, qc;
 	double dq, dqa[3], phi, r;
 
 	for (k=0; k<N; k++){
-		k6 = 6*k;
 		k3 = 3*k;
-		k2 = 2*k;
 
 		qc = centroid_values[k];
 		qmin = bound;
@@ -617,7 +595,7 @@ int64_t _bound_vertices_below_by_constant(keyint N, double bound,
 	return 0;
 }
 
-int64_t _bound_vertices_below_by_quantity(keyint N,
+anuga_int _bound_vertices_below_by_quantity(anuga_int N,
 				      double* bound_vertex_values,
 				      double* centroid_values,
 				      double* vertex_values,
@@ -625,14 +603,14 @@ int64_t _bound_vertices_below_by_quantity(keyint N,
 				      double* x_gradient,
 				      double* y_gradient) {
 
-	keyint i, k, k2, k3, k6;
+	anuga_int i, k, k3;
 	double qmin, qc;
 	double dq, dqa[3], phi, r;
 
 	for (k=0; k<N; k++){
-		k6 = 6*k;
+		//k6 = 6*k;
 		k3 = 3*k;
-		k2 = 2*k;
+		//k2 = 2*k;
 
 		qc = centroid_values[k];
 
@@ -668,12 +646,12 @@ int64_t _bound_vertices_below_by_quantity(keyint N,
 	return 0;
 }
 
-int64_t _interpolate(keyint N,
+anuga_int _interpolate(anuga_int N,
 		 double* vertex_values,
 		 double* edge_values,
                  double* centroid_values) {
 
-	keyint k, k3;
+	anuga_int k, k3;
 	double q0, q1, q2;
 
 
@@ -693,11 +671,11 @@ int64_t _interpolate(keyint N,
 	return 0;
 }
 
-int64_t _interpolate_from_vertices_to_edges(keyint N,
+anuga_int _interpolate_from_vertices_to_edges(anuga_int N,
 					double* vertex_values,
 					double* edge_values) {
 
-	keyint k, k3;
+	anuga_int k, k3;
 	double q0, q1, q2;
 
 
@@ -716,11 +694,11 @@ int64_t _interpolate_from_vertices_to_edges(keyint N,
 }
 
 
-int64_t _interpolate_from_edges_to_vertices(keyint N,
+anuga_int _interpolate_from_edges_to_vertices(anuga_int N,
 					double* vertex_values,
 					double* edge_values) {
 
-	keyint k, k3;
+	anuga_int k, k3;
 	double e0, e1, e2;
 
 
@@ -738,13 +716,13 @@ int64_t _interpolate_from_edges_to_vertices(keyint N,
 	return 0;
 }
 
-int64_t _backup_centroid_values(keyint N,
+anuga_int _backup_centroid_values(anuga_int N,
 			    double* centroid_values,
 			    double* centroid_backup_values) {
     // Backup centroid values
 
 
-    keyint k;
+    anuga_int k;
 
     for (k=0; k<N; k++) {
 	centroid_backup_values[k] = centroid_values[k];
@@ -755,7 +733,7 @@ int64_t _backup_centroid_values(keyint N,
 }
 
 
-int64_t _saxpy_centroid_values(keyint N,
+anuga_int _saxpy_centroid_values(anuga_int N,
 			   double a,
 			   double b,
 			   double* centroid_values,
@@ -763,7 +741,7 @@ int64_t _saxpy_centroid_values(keyint N,
     // Saxby centroid values
 
 
-    keyint k;
+    anuga_int k;
 
 
     for (k=0; k<N; k++) {
@@ -775,7 +753,7 @@ int64_t _saxpy_centroid_values(keyint N,
 }
 
 
-int64_t _update(keyint N,
+anuga_int _update(anuga_int N,
 	    double timestep,
 	    double* centroid_values,
 	    double* explicit_update,
@@ -784,7 +762,7 @@ int64_t _update(keyint N,
 	// explicit_update and semi_implicit_update as well as given timestep
 
 
-	keyint k;
+	anuga_int k;
 	double denominator, x;
 
 
@@ -807,7 +785,7 @@ int64_t _update(keyint N,
 	// }
 
 
-	// int64_t err_return = 0;
+	// anuga_int err_return = 0;
 
 	// // Semi implicit updates
 	// #pragma omp parallel for private(k, denominator) reduction(fmin:err_return)
@@ -835,7 +813,7 @@ int64_t _update(keyint N,
 
 	// return 0;
 
-	int64_t err_return = 0;
+	anuga_int err_return = 0;
 
 	// Divide semi_implicit update by conserved quantity
 	#pragma omp parallel for private(k, x) reduction(min:err_return)
@@ -874,16 +852,16 @@ int64_t _update(keyint N,
 }
 
 
-int64_t _average_vertex_values(keyint N,
-			   int64_t* vertex_value_indices,
-			   int64_t* number_of_triangles_per_node,
+anuga_int _average_vertex_values(anuga_int N,
+			   anuga_int* vertex_value_indices,
+			   anuga_int* number_of_triangles_per_node,
 			   double* vertex_values,
 			   double* A) {
   // Average vertex values to obtain one value per node
 
-  keyint i, index;
-  keyint k = 0; //Track triangles touching each node
-  keyint current_node = 0;
+  anuga_int i, index;
+  anuga_int k = 0; //Track triangles touching each node
+  anuga_int current_node = 0;
   double total = 0.0;
 
   for (i=0; i<N; i++) {
@@ -923,17 +901,17 @@ int64_t _average_vertex_values(keyint N,
   return 0;
 }
 
-int64_t _average_centroid_values(keyint N,
-			   int64_t* vertex_value_indices,
-			   int64_t* number_of_triangles_per_node,
+anuga_int _average_centroid_values(anuga_int N,
+			   anuga_int* vertex_value_indices,
+			   anuga_int* number_of_triangles_per_node,
 			   double* centroid_values,
 			   double* A) {
   // Average centroid values to obtain one value per node
 
-  keyint i, index;
-  keyint volume_id;
-  keyint k = 0; //Track triangles touching each node
-  keyint current_node = 0;
+  anuga_int i, index;
+  anuga_int volume_id;
+  anuga_int k = 0; //Track triangles touching each node
+  anuga_int current_node = 0;
   double total = 0.0;
 
   for (i=0; i<N; i++) {
@@ -973,15 +951,15 @@ int64_t _average_centroid_values(keyint N,
 // from a list of vertices and values at those vertices. Called in
 // quantity.py by _set_vertex_values.
 // Naming is a little confusing - but sticking with convention.
-int64_t _set_vertex_values_c(keyint num_verts,
-                        int64_t * vertices,
-                        int64_t * node_index,
-                        int64_t * number_of_triangles_per_node,
-                        int64_t * vertex_value_indices,
+anuga_int _set_vertex_values_c(anuga_int num_verts,
+                        anuga_int * vertices,
+                        anuga_int * node_index,
+                        anuga_int * number_of_triangles_per_node,
+                        anuga_int * vertex_value_indices,
                         double * vertex_values,
                         double * A
                         ){
-  keyint i,j,num_triangles,u_vert_id,vert_v_index;
+  anuga_int i,j,num_triangles,u_vert_id,vert_v_index;
 
   for(i=0;i<num_verts;i++){
 
@@ -1000,16 +978,16 @@ int64_t _set_vertex_values_c(keyint num_verts,
 
 }
 
-int64_t _min_and_max_centroid_values(keyint N,
+anuga_int _min_and_max_centroid_values(anuga_int N,
                                  double * qc,
                                  double * qv,
-                                 int64_t * neighbours,
+                                 anuga_int * neighbours,
                                  double * qmin,
                                  double * qmax){
   
   // Find fmin and fmax of this and neighbour's centroid values
 
-  keyint k, i, n, k3;
+  anuga_int k, i, n, k3;
   double qn;
 
   for (k=0; k<N; k++) {
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_ermapper.py b/anuga/abstract_2d_finite_volumes/tests/test_ermapper.py
index 087c16ea6..ffe77e8e9 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_ermapper.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_ermapper.py
@@ -184,6 +184,6 @@ def test_write_non_default_header(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_ERMapper,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_ERMapper)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_gauge.py b/anuga/abstract_2d_finite_volumes/tests/test_gauge.py
index 6a8dd8423..f636dafc1 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_gauge.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_gauge.py
@@ -586,7 +586,6 @@ def test_sww2csv_multiple_files(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Gauge, 'test')
-#    runner = unittest.TextTestRunner(verbosity=2)
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Gauge)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_general_mesh.py b/anuga/abstract_2d_finite_volumes/tests/test_general_mesh.py
index 59ce92eaf..7713c2a5b 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_general_mesh.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_general_mesh.py
@@ -513,8 +513,7 @@ def test_assert_index_in_nodes(self):
 ################################################################################
 
 if __name__ == "__main__":
-    #suite = unittest.makeSuite(Test_General_Mesh, 'test')
-    suite = unittest.makeSuite(Test_General_Mesh, 'test')     
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_General_Mesh)
     runner = unittest.TextTestRunner()
     runner.run(suite)
 
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_generic_boundary_conditions.py b/anuga/abstract_2d_finite_volumes/tests/test_generic_boundary_conditions.py
index 15e82d587..59576451c 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_generic_boundary_conditions.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_generic_boundary_conditions.py
@@ -437,6 +437,6 @@ def test_fileboundary_exception(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Generic_Boundary_Conditions, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Generic_Boundary_Conditions)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_generic_domain.py b/anuga/abstract_2d_finite_volumes/tests/test_generic_domain.py
index f94fd6e63..0d4c80d0f 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_generic_domain.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_generic_domain.py
@@ -961,6 +961,6 @@ def xylocation(x,y):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Domain,'test_')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Domain)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_ghost.py b/anuga/abstract_2d_finite_volumes/tests/test_ghost.py
index a898f4705..44d50d515 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_ghost.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_ghost.py
@@ -47,6 +47,6 @@ def test_simple(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Domain,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Domain)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_neighbour_mesh.py b/anuga/abstract_2d_finite_volumes/tests/test_neighbour_mesh.py
index 4a430c33b..3069ab8ac 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_neighbour_mesh.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_neighbour_mesh.py
@@ -1909,6 +1909,6 @@ def test_get_intersecting_segments7(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Mesh, 'test_')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Mesh)
     runner = unittest.TextTestRunner()#verbosity=2)
     runner.run(suite)
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_pmesh2domain.py b/anuga/abstract_2d_finite_volumes/tests/test_pmesh2domain.py
index 260cbc452..81561db21 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_pmesh2domain.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_pmesh2domain.py
@@ -214,6 +214,6 @@ def test_pmesh2Domain_instance(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_pmesh2domain, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_pmesh2domain)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_quantity.py b/anuga/abstract_2d_finite_volumes/tests/test_quantity.py
index 6f210a6c7..d5dc5ff96 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_quantity.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_quantity.py
@@ -24,6 +24,14 @@
 import numpy as num
 import pprint
 
+try:
+    import osgeo
+except ImportError:
+    pass
+
+import pytest
+import sys
+
 def zone_letter_to_hemisphere(zone_letter):
     hemisphere = 'undefined'
     if zone_letter.lower() in 'cdefghjklm':
@@ -2286,6 +2294,8 @@ def test_set_values_from_ll_grid_file_with_indices_nan(self):
         except:
             pass
 
+    @pytest.mark.skipif('osgeo' not in sys.modules,
+                    reason="requires the gdal module")
     def test_set_values_from_ll_tif_file_north(self):
 
         # Mesh in zone 56 (relative coords) southern hemisphere
@@ -2395,6 +2405,8 @@ def test_set_values_from_ll_tif_file_north(self):
         import os
         os.remove(tif_file)
 
+    @pytest.mark.skipif('osgeo' not in sys.modules,
+                    reason="requires the gdal module")
     def test_set_values_from_ll_tif_file_north_indices(self):
 
         from pprint import pprint
@@ -2500,6 +2512,8 @@ def test_set_values_from_ll_tif_file_north_indices(self):
         import os
         os.remove(tif_file)
 
+    @pytest.mark.skipif('osgeo' not in sys.modules,
+                    reason="requires the gdal module")
     def test_set_values_from_ll_tif_file_south(self):
 
         # Mesh in zone 56 (relative coords) southern hemisphere
@@ -2597,6 +2611,8 @@ def test_set_values_from_ll_tif_file_south(self):
         import os
         os.remove(tif_file)
 
+    @pytest.mark.skipif('osgeo' not in sys.modules,
+                    reason="requires the gdal module")
     def test_set_values_from_utm_tif_file(self):
 
         # Mesh in zone 56 (relative coords) southern hemisphere
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_region.py b/anuga/abstract_2d_finite_volumes/tests/test_region.py
index 2171dab28..8b4070998 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_region.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_region.py
@@ -94,6 +94,6 @@ def test_region_polygon_expanded(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_region, 'test')    
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_region)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_tag_region.py b/anuga/abstract_2d_finite_volumes/tests/test_tag_region.py
index b0315e92a..91a30a8ae 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_tag_region.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_tag_region.py
@@ -257,6 +257,6 @@ def test_unique_vertices_average_loc_unique_vert_de0(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_tag_region, 'test')    
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_tag_region)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/abstract_2d_finite_volumes/tests/test_util.py b/anuga/abstract_2d_finite_volumes/tests/test_util.py
index a31548c2e..ce238c8ab 100644
--- a/anuga/abstract_2d_finite_volumes/tests/test_util.py
+++ b/anuga/abstract_2d_finite_volumes/tests/test_util.py
@@ -1614,7 +1614,6 @@ def test_calc_bearings_zero_vector(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Util, 'test')
-#    runner = unittest.TextTestRunner(verbosity=2)
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Util)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/advection/advection.c b/anuga/advection/advection.c
index 25dee7a80..d50b932a8 100644
--- a/anuga/advection/advection.c
+++ b/anuga/advection/advection.c
@@ -13,6 +13,7 @@
 #include "stdio.h"
 #include "stdlib.h"
 #include "stdint.h"
+#include "anuga_typedefs.h"
 
 //-------------------------------------------
 // Low level routines (called from wrappers)
@@ -22,18 +23,18 @@ double _compute_fluxes(
 		    double* quantity_update,
 		    double* quantity_edge,
 		    double* quantity_bdry,
-            int64_t*   domain_neighbours,
-		    int64_t*   domain_neighbour_edges,
+            anuga_int*   domain_neighbours,
+		    anuga_int*   domain_neighbour_edges,
 		    double* domain_normals,
             double* domain_areas,
 		    double* domain_radii,
 		    double* domain_edgelengths,
-		    int64_t*   domain_tri_full_flag,
+		    anuga_int*   domain_tri_full_flag,
 		    double* domain_velocity,
             double  huge_timestep,
             double  max_timestep,
-		    int64_t ntri,
-		    int64_t nbdry){
+		    anuga_int ntri,
+		    anuga_int nbdry){
 
  
         //Local Variables
@@ -45,9 +46,9 @@ double _compute_fluxes(
         double max_speed;
         double optimal_timestep;
         double timestep;
-        int64_t k_i, n_m, k_i_j;
-        int64_t k, i, j, n, m;
-        int64_t k3;
+        anuga_int k_i, n_m, k_i_j;
+        anuga_int k, i, j, n, m;
+        anuga_int k3;
 
         // Loop through triangles
 
diff --git a/anuga/advection/advection.py b/anuga/advection/advection.py
index 252a143cb..9ccffbb5c 100644
--- a/anuga/advection/advection.py
+++ b/anuga/advection/advection.py
@@ -285,3 +285,39 @@ def compute_fluxes_python(self):
 
         self.timestep = timestep
 
+    def apply_protection_against_isolated_degenerate_timesteps(self):
+        """Zero the momentum of isolated triangles with outlying max speed.
+
+        Returns early when the protection flag is False or when the largest
+        value in self.max_speed is below 10.0.
+        """
+        if self.protect_against_isolated_degenerate_timesteps is False:
+            return
+
+        # FIXME (Ole): Make this configurable
+        if num.max(self.max_speed) < 10.0:
+            return
+
+        # Setup 10 bins for speed histogram
+        from anuga.utilities.numerical_tools import histogram, create_bins
+
+        bins = create_bins(self.max_speed, 10)
+        hist = histogram(self.max_speed, bins)
+
+        # Look for characteristic signature: last bin populated, bins 4-8
+        # empty. Require 9+ entries so the fixed indices cannot overrun.
+        if len(hist) > 8 and hist[-1] > 0 and \
+           hist[4] == hist[5] == hist[6] == hist[7] == hist[8] == 0:
+            # Danger of isolated degenerate triangles
+
+            # Find triangles in last bin
+            # FIXME - speed up using numeric package
+            for i in range(self.number_of_triangles):
+                if self.max_speed[i] > bins[-1]:
+                    # NOTE(review): a diagnostic message used to be built
+                    # here but was never logged; add logging if wanted.
+                    self.get_quantity('xmomentum').set_values(0.0, indices=[i])
+                    self.get_quantity('ymomentum').set_values(0.0, indices=[i])
+                    self.max_speed[i] = 0.0
+
+
diff --git a/anuga/advection/setup.py b/anuga/advection/setup.py
deleted file mode 100644
index 079996f92..000000000
--- a/anuga/advection/setup.py
+++ /dev/null
@@ -1,33 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-from Cython.Build import cythonize
-import Cython.Compiler.Options
-Cython.Compiler.Options.annotate = True
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('advection', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    #util_dir = os.path.abspath(join(os.path.dirname(__file__),'..','utilities'))
-    util_dir = join('..','utilities')
-            
-    config.add_extension('advection_ext',
-                         sources=['advection_ext.pyx'],
-                         include_dirs=[util_dir])
-
-    config.ext_modules = cythonize(config.ext_modules,annotate=True)
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/advection/tests/test_advection.py b/anuga/advection/tests/test_advection.py
index 1353b294e..1be2b9c28 100644
--- a/anuga/advection/tests/test_advection.py
+++ b/anuga/advection/tests/test_advection.py
@@ -179,6 +179,6 @@ def test_advection_example(self):
 
 #-------------------------------------------------------------
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Advection, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Advection)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/alpha_shape/setup.py b/anuga/alpha_shape/setup.py
deleted file mode 100644
index bf4d52b91..000000000
--- a/anuga/alpha_shape/setup.py
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('alpha_shape', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/alpha_shape/tests/test_alpha_shape.py b/anuga/alpha_shape/tests/test_alpha_shape.py
index 52b383c20..80c71de07 100644
--- a/anuga/alpha_shape/tests/test_alpha_shape.py
+++ b/anuga/alpha_shape/tests/test_alpha_shape.py
@@ -397,6 +397,6 @@ def test_small_islands(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(TestCase,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestCase)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/caching/setup.py b/anuga/caching/setup.py
deleted file mode 100644
index 1615f0b56..000000000
--- a/anuga/caching/setup.py
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('caching', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/caching/tests/test_caching.py b/anuga/caching/tests/test_caching.py
index d214854f9..8334feb96 100644
--- a/anuga/caching/tests/test_caching.py
+++ b/anuga/caching/tests/test_caching.py
@@ -996,6 +996,6 @@ def test_objects_are_created(self):
 
 #-------------------------------------------------------------
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Caching, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Caching)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/coordinate_transforms/setup.py b/anuga/coordinate_transforms/setup.py
deleted file mode 100644
index ea5a340d9..000000000
--- a/anuga/coordinate_transforms/setup.py
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('coordinate_transforms', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/coordinate_transforms/tests/test_geo_reference.py b/anuga/coordinate_transforms/tests/test_geo_reference.py
index 0695f1d76..42371815a 100644
--- a/anuga/coordinate_transforms/tests/test_geo_reference.py
+++ b/anuga/coordinate_transforms/tests/test_geo_reference.py
@@ -769,7 +769,7 @@ def test_georef_types_coerceable(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(geo_referenceTestCase, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(geo_referenceTestCase)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
     
diff --git a/anuga/coordinate_transforms/tests/test_lat_long_UTM_conversion.py b/anuga/coordinate_transforms/tests/test_lat_long_UTM_conversion.py
index cba62769b..5c180583f 100644
--- a/anuga/coordinate_transforms/tests/test_lat_long_UTM_conversion.py
+++ b/anuga/coordinate_transforms/tests/test_lat_long_UTM_conversion.py
@@ -121,6 +121,6 @@ def test_UTM_5(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    mysuite = unittest.makeSuite(TestCase,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestCase)
     runner = unittest.TextTestRunner()
-    runner.run(mysuite)
+    runner.run(suite)
diff --git a/anuga/coordinate_transforms/tests/test_point.py b/anuga/coordinate_transforms/tests/test_point.py
index 190e62dbd..ef8d9f5ee 100644
--- a/anuga/coordinate_transforms/tests/test_point.py
+++ b/anuga/coordinate_transforms/tests/test_point.py
@@ -119,7 +119,7 @@ def testRSISE2Kobenhavn(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    mysuite = unittest.makeSuite(TestCase,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestCase)
     runner = unittest.TextTestRunner()
-    runner.run(mysuite)
+    runner.run(suite)
 
diff --git a/anuga/coordinate_transforms/tests/test_redfearn.py b/anuga/coordinate_transforms/tests/test_redfearn.py
index d3f6ee54e..1fb4df9fa 100644
--- a/anuga/coordinate_transforms/tests/test_redfearn.py
+++ b/anuga/coordinate_transforms/tests/test_redfearn.py
@@ -537,6 +537,6 @@ def test_convert_latlon_to_UTM5(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    mysuite = unittest.makeSuite(TestCase,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestCase)
     runner = unittest.TextTestRunner()
-    runner.run(mysuite)
+    runner.run(suite)
diff --git a/anuga/culvert_flows/setup.py b/anuga/culvert_flows/setup.py
deleted file mode 100644
index e628e511a..000000000
--- a/anuga/culvert_flows/setup.py
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('culvert_flows', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/culvert_flows/tests/test_culvert_class.py b/anuga/culvert_flows/tests/test_culvert_class.py
index 04a9309c0..2ae7505e3 100644
--- a/anuga/culvert_flows/tests/test_culvert_class.py
+++ b/anuga/culvert_flows/tests/test_culvert_class.py
@@ -813,7 +813,7 @@ def topography(x, y):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Culvert, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Culvert)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
         
diff --git a/anuga/culvert_flows/tests/test_culvert_polygons.py b/anuga/culvert_flows/tests/test_culvert_polygons.py
index df066c162..0ed377631 100644
--- a/anuga/culvert_flows/tests/test_culvert_polygons.py
+++ b/anuga/culvert_flows/tests/test_culvert_polygons.py
@@ -78,7 +78,7 @@ def test_2(self):
                
 #-------------------------------------------------------------
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_poly, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_poly)
     runner = unittest.TextTestRunner()
     runner.run(suite)
         
diff --git a/anuga/culvert_flows/tests/test_culvert_routines.py b/anuga/culvert_flows/tests/test_culvert_routines.py
index 7b7e8f54d..960aee40d 100644
--- a/anuga/culvert_flows/tests/test_culvert_routines.py
+++ b/anuga/culvert_flows/tests/test_culvert_routines.py
@@ -579,7 +579,7 @@ def Xtest_boyd_10(self):
                
 #-------------------------------------------------------------
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_culvert_routines, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_culvert_routines)
     runner = unittest.TextTestRunner()
     runner.run(suite)
 
diff --git a/anuga/culvert_flows/tests/test_culvert_routines_box_10pct.py b/anuga/culvert_flows/tests/test_culvert_routines_box_10pct.py
index f53ed6ee4..be501c86a 100644
--- a/anuga/culvert_flows/tests/test_culvert_routines_box_10pct.py
+++ b/anuga/culvert_flows/tests/test_culvert_routines_box_10pct.py
@@ -336,6 +336,6 @@ def test_boyd_6(self):
 # =========================================================================
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_culvert_routines_box_10pct, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_culvert_routines_box_10pct)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/culvert_flows/tests/test_culvert_routines_box_1pct.py b/anuga/culvert_flows/tests/test_culvert_routines_box_1pct.py
index cc4863b4e..98a448e8e 100644
--- a/anuga/culvert_flows/tests/test_culvert_routines_box_1pct.py
+++ b/anuga/culvert_flows/tests/test_culvert_routines_box_1pct.py
@@ -336,6 +336,6 @@ def test_boyd_6(self):
 
 # =========================================================================
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_culvert_routines_box_1pct, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_culvert_routines_box_1pct)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/culvert_flows/tests/test_culvert_routines_pipe_10pct.py b/anuga/culvert_flows/tests/test_culvert_routines_pipe_10pct.py
index b77d2e514..3fbf2691d 100644
--- a/anuga/culvert_flows/tests/test_culvert_routines_pipe_10pct.py
+++ b/anuga/culvert_flows/tests/test_culvert_routines_pipe_10pct.py
@@ -332,6 +332,6 @@ def test_boyd_6(self):
 # =========================================================================
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_culvert_routines_pipe_10pct, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_culvert_routines_pipe_10pct)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/culvert_flows/tests/test_culvert_routines_pipe_1pct.py b/anuga/culvert_flows/tests/test_culvert_routines_pipe_1pct.py
index 82d75cd33..8e227d4bc 100644
--- a/anuga/culvert_flows/tests/test_culvert_routines_pipe_1pct.py
+++ b/anuga/culvert_flows/tests/test_culvert_routines_pipe_1pct.py
@@ -333,6 +333,6 @@ def test_boyd_6(self):
 # =========================================================================
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_culvert_routines_pipe_1pct, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_culvert_routines_pipe_1pct)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/culvert_flows/tests/test_new_culvert_class.py b/anuga/culvert_flows/tests/test_new_culvert_class.py
index c2c551527..13ec909d8 100644
--- a/anuga/culvert_flows/tests/test_new_culvert_class.py
+++ b/anuga/culvert_flows/tests/test_new_culvert_class.py
@@ -818,7 +818,7 @@ def topography(x, y):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Culvert, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Culvert)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
         
diff --git a/anuga/damage_modelling/setup.py b/anuga/damage_modelling/setup.py
deleted file mode 100644
index 3b42180bd..000000000
--- a/anuga/damage_modelling/setup.py
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('damage_modelling', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/damage_modelling/tests/test_exposure.py b/anuga/damage_modelling/tests/test_exposure.py
index fac7f9539..c0dbf515d 100644
--- a/anuga/damage_modelling/tests/test_exposure.py
+++ b/anuga/damage_modelling/tests/test_exposure.py
@@ -333,6 +333,6 @@ def test_exposure_csv_loading_x_y2(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Exposure,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Exposure)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/damage_modelling/tests/test_inundation_damage.py b/anuga/damage_modelling/tests/test_inundation_damage.py
index 9eebb9208..0d32f1dc3 100644
--- a/anuga/damage_modelling/tests/test_inundation_damage.py
+++ b/anuga/damage_modelling/tests/test_inundation_damage.py
@@ -641,7 +641,7 @@ def test_calc_max_depth_and_momentum(self):
         sys.stdout = fid
     else:
         pass
-    suite = unittest.makeSuite(Test_inundation_damage,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_inundation_damage)
     runner = unittest.TextTestRunner()
     runner.run(suite)
 
diff --git a/anuga/file/setup.py b/anuga/file/setup.py
deleted file mode 100644
index e755052b9..000000000
--- a/anuga/file/setup.py
+++ /dev/null
@@ -1,33 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-from Cython.Build import cythonize
-import Cython.Compiler.Options
-Cython.Compiler.Options.annotate = True
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('file', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    #util_dir = os.path.abspath(join(os.path.dirname(__file__),'..','utilities'))
-    util_dir = join('..','utilities')
-    
-    config.add_extension('urs_ext',
-                         sources=['urs_ext.pyx'],
-                         include_dirs=[util_dir])
-    
-    config.ext_modules = cythonize(config.ext_modules, annotate=True)
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/file/tests/test_csv.py b/anuga/file/tests/test_csv.py
index 20db3f073..d6c7bf50b 100644
--- a/anuga/file/tests/test_csv.py
+++ b/anuga/file/tests/test_csv.py
@@ -406,6 +406,6 @@ def test_csv2building_polygons(self):
 #################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_csv, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_csv)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/file/tests/test_mux.py b/anuga/file/tests/test_mux.py
index 67a30dda5..17dc4f9c3 100644
--- a/anuga/file/tests/test_mux.py
+++ b/anuga/file/tests/test_mux.py
@@ -1540,7 +1540,7 @@ def test_Urs_points(self):
 ################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Mux,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Mux)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
         
diff --git a/anuga/file/tests/test_read_sww.py b/anuga/file/tests/test_read_sww.py
index 20a4c1809..931834a83 100644
--- a/anuga/file/tests/test_read_sww.py
+++ b/anuga/file/tests/test_read_sww.py
@@ -345,7 +345,7 @@ def test_read_sww_with_centroids(self):
         
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_read_sww, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_read_sww)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
     
diff --git a/anuga/file/tests/test_sww.py b/anuga/file/tests/test_sww.py
index 09e4f5987..265517f7f 100644
--- a/anuga/file/tests/test_sww.py
+++ b/anuga/file/tests/test_sww.py
@@ -902,6 +902,6 @@ def test_triangulation_2_geo_refs(self):
 #################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_sww, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_sww)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/file/tests/test_ungenerate.py b/anuga/file/tests/test_ungenerate.py
index 665d396c1..6077cd528 100644
--- a/anuga/file/tests/test_ungenerate.py
+++ b/anuga/file/tests/test_ungenerate.py
@@ -277,7 +277,7 @@ def test_import_ungenerate_file_different_region_tags(self):
 ################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(ungenerateTestCase,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(ungenerateTestCase)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
     
diff --git a/anuga/file/tests/test_urs.py b/anuga/file/tests/test_urs.py
index 299178ef2..c9dce34d2 100644
--- a/anuga/file/tests/test_urs.py
+++ b/anuga/file/tests/test_urs.py
@@ -213,6 +213,6 @@ def in_development_URS_points_needed_poly2(self):
 ################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Urs,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Urs)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
diff --git a/anuga/file/urs.c b/anuga/file/urs.c
index 0b87b44b3..f266840a7 100644
--- a/anuga/file/urs.c
+++ b/anuga/file/urs.c
@@ -11,6 +11,7 @@ gcc -shared urs_ext.o  -o urs_ext.so
 #include <float.h>
 #include <time.h>
 #include <stdint.h>
+#include "anuga_typedefs.h"
 
 #define MAX_FILE_NAME_LENGTH 128
 #define NODATA 99.0
@@ -24,7 +25,7 @@ static int32_t *fros=NULL;  // First recorded output step
 static int32_t *lros=NULL;  // Last recorded output step 
 static struct tgsrwg* mytgs0=NULL;
 
-static int64_t numDataMax=0;
+static anuga_int numDataMax=0;
 
 
 /*The MUX file format 
@@ -41,7 +42,7 @@ void fillDataArray(int32_t ista, int32_t total_number_of_stations, int32_t nt, i
                    int32_t *istop_p, float *muxData)
 {
     int32_t it, last_it, jsta;
-    int64_t offset=0;
+    anuga_int offset=0;
 
 
     last_it = -1;
@@ -144,23 +145,25 @@ char isdata(float x)
 }
 
 
-int64_t getNumData(const int32_t *fros, const int32_t *lros, const int32_t total_number_of_stations)
+anuga_int getNumData(const int32_t *fros, const int32_t *lros, const int32_t total_number_of_stations)
 /* calculates the number of data in the data block of a mux file */
 /* based on the first and last recorded output steps for each gauge */ 
 {
     int32_t ista, last_output_step;
-    int64_t numData = 0;
+    anuga_int numData = 0;
 
     last_output_step = 0;   
-    for(ista = 0; ista < total_number_of_stations; ista++)
+    for(ista = 0; ista < total_number_of_stations; ista++){
+
         if(*(fros + ista) != -1)
         {
             numData += *(lros + ista) - *(fros + ista) + 1;
             last_output_step = (last_output_step < *(lros+ista) ? 
                 *(lros+ista):last_output_step);
         }   
-        numData += last_output_step*total_number_of_stations; /* these are the t records */
-        return numData;
+    }
+    numData += last_output_step*total_number_of_stations; /* these are the t records */
+    return numData;
 }
 
 /////////////////////////////////////////////////////////////////////////
@@ -170,7 +173,7 @@ int32_t _read_mux2_headers(int32_t numSrc,
                        int32_t* total_number_of_stations,
                        int32_t* number_of_time_steps,
                        double* delta_t,
-                       //int64_t* numDataMax,
+                       //anuga_int* numDataMax,
                        int32_t verbose)
 {
     FILE *fp;
@@ -178,7 +181,7 @@ int32_t _read_mux2_headers(int32_t numSrc,
     struct tgsrwg *mytgs=0;
     char *muxFileName;                                                                  
     char susMuxFileName;
-    int64_t numData;
+    anuga_int numData;
     size_t elements_read; // fread return value
     int32_t block_size;
 
@@ -358,7 +361,7 @@ float** _read_mux2(int32_t numSrc,
                    float *weights, 
                    double *params, 
                    int32_t *number_of_stations,
-                   int64_t *permutation,
+                   anuga_int *permutation,
                    int32_t verbose)
 {
     FILE *fp;
@@ -367,15 +370,15 @@ float** _read_mux2(int32_t numSrc,
     int32_t istart=-1, istop=-1;
     int32_t number_of_selected_stations;
     float *muxData=NULL; // Suppress warning
-    int64_t numData;
-    int64_t *perm = NULL;
-    int64_t *permutation_temp = NULL;
+    anuga_int numData;
+    anuga_int *perm = NULL;
+    anuga_int *permutation_temp = NULL;
 
     int32_t len_sts_data, error_code;
     float **sts_data;
     float *temp_sts_data;
 
-    int64_t offset;
+    anuga_int offset;
 
     int32_t number_of_time_steps, N;
     double delta_t;
@@ -413,7 +416,7 @@ float** _read_mux2(int32_t numSrc,
         *number_of_stations = total_number_of_stations;     
 
         // Create the Identity permutation vector
-        permutation_temp = (int64_t *) malloc(number_of_selected_stations*sizeof(int64_t));
+        permutation_temp = (anuga_int *) malloc(number_of_selected_stations*sizeof(anuga_int));
         if (permutation_temp == NULL)
         {
             printf("ERROR: Memory for permutation_temp could not be allocated.\n");
@@ -422,7 +425,7 @@ float** _read_mux2(int32_t numSrc,
         
         for (i = 0; i < number_of_selected_stations; i++)
         {
-            permutation_temp[i] = (int64_t) i;  
+            permutation_temp[i] = (anuga_int) i;  
         }
 
         perm = permutation_temp;
@@ -489,7 +492,6 @@ float** _read_mux2(int32_t numSrc,
             fprintf(stderr, "cannot open file %s\n", muxFileName);
             free(muxData);
             free(temp_sts_data);
-            free(muxData);
 
             return NULL;                    
         }
@@ -498,8 +500,8 @@ float** _read_mux2(int32_t numSrc,
             printf("Reading mux file %s\n", muxFileName);
         }
 
-        offset = (int64_t) sizeof(int32_t) + total_number_of_stations*(sizeof(struct tgsrwg) + 2*sizeof(int32_t));
-        //printf("\n offset %i ", (int64_t int32_t)offset);
+        offset = (anuga_int) sizeof(int32_t) + total_number_of_stations*(sizeof(struct tgsrwg) + 2*sizeof(int32_t));
+        //printf("\n offset %lld ", (long long)offset);
         fseek(fp, offset, 0);
 
         numData = getNumData(fros_per_source, 
@@ -520,7 +522,6 @@ float** _read_mux2(int32_t numSrc,
             fclose(fp);
             free(muxData);
             free(temp_sts_data);
-            free(muxData);
 
             return NULL;
         }	
diff --git a/anuga/file_conversion/calc_grid_values.c b/anuga/file_conversion/calc_grid_values.c
index 87e1fc143..d1ff66995 100644
--- a/anuga/file_conversion/calc_grid_values.c
+++ b/anuga/file_conversion/calc_grid_values.c
@@ -1,6 +1,7 @@
 #include <stdio.h>
 #include <math.h>
 #include <stdint.h>
+#include "anuga_typedefs.h"
 
 #define MIN(a, b) (((a)<=(b))?(a):(b))
 #define MAX(a, b) (((a)>(b))?(a):(b))
@@ -51,8 +52,8 @@ void get_tri_extent(double *vertices, PTR_EXTENT out)
 }
 
 void get_tri_vertices( double *x, double *y,\
-			int64_t *volumes, \
-			int64_t tri_id, \
+			anuga_int *volumes, \
+			anuga_int tri_id, \
 			double *out, \
 			double *v1,  \
 			double *v2,  \
@@ -80,7 +81,7 @@ void get_tri_vertices( double *x, double *y,\
 	}
 }
 
-void get_tri_norms( double *norms, int64_t tri_id, 
+void get_tri_norms( double *norms, anuga_int tri_id, 
 		       double *n1, double *n2, double *n3)
 {
 	n1[0] = norms[tri_id*6];
@@ -91,9 +92,9 @@ void get_tri_norms( double *norms, int64_t tri_id,
 	n3[1] = norms[tri_id*6+5];
 }
 
-void init_norms( double *x, double *y, double *norms, int64_t *volumes, int64_t num_tri  )
+void init_norms( double *x, double *y, double *norms, anuga_int *volumes, anuga_int num_tri  )
 {
-	int64_t i;
+	anuga_int i;
 	double x1, x2, x3, y1, y2, y3;
 	double xn1, yn1, xn2, yn2, xn3, yn3;
 	double l1, l2, l3;
@@ -139,12 +140,12 @@ void init_norms( double *x, double *y, double *norms, int64_t *volumes, int64_t
 }
 
 // remove nodes that are not in any triangles
-void remove_lone_verts( double **verts, int64_t *volumes )
+void remove_lone_verts( double **verts, anuga_int *volumes )
 {
 	
 }
 
-int64_t _point_on_line(double x, double y,
+anuga_int _point_on_line(double x, double y,
 		   double x0, double y0,
 		   double x1, double y1,
 		   double rtol,
@@ -153,7 +154,7 @@ int64_t _point_on_line(double x, double y,
 
   double a0, a1, a_normal0, a_normal1, b0, b1, len_a, len_b;
   double nominator, denominator;
-  int64_t is_parallel;
+  anuga_int is_parallel;
 
   a0 = x - x0;
   a1 = y - y0;
@@ -198,9 +199,9 @@ int64_t _point_on_line(double x, double y,
   }
 }
 
-int64_t _is_inside_triangle(double *point,
+anuga_int _is_inside_triangle(double *point,
 			double *triangle,
-			int64_t closed,
+			anuga_int closed,
 			double rtol,
 			double atol) 
 {			 
@@ -209,7 +210,7 @@ int64_t _is_inside_triangle(double *point,
   double denom, alpha, beta;
   
   double x, y; // Point coordinates
-  int64_t i, j, res;
+  anuga_int i, j, res;
 
   x = point[0];
   y = point[1];
@@ -280,21 +281,21 @@ int64_t _is_inside_triangle(double *point,
 }
 
 void _calc_grid_values( double *x, double *y, double *norms,
-				 int64_t num_vert,
-				 int64_t *volumes, 
-				 int64_t num_tri, 
+				 anuga_int num_vert,
+				 anuga_int *volumes, 
+				 anuga_int num_tri, 
 				 double cell_size,
-				 int64_t nrow,
-				 int64_t ncol,
+				 anuga_int nrow,
+				 anuga_int ncol,
 				 double *vertex_val,
 				 double *grid_val )
 {
-	int64_t i, j, k;
-	int64_t x_min, x_max, y_min, y_max, point_index;
+	anuga_int i, j, k;
+	anuga_int x_min, x_max, y_min, y_max, point_index;
 	double x_dist, y_dist, x_base, y_base;
 	double sigma0, sigma1, sigma2;
-	double fraction;
-	double intpart;
+	//double fraction;
+	double intpart = 0.0;
 	double triangle[6], point[2];
 	double v1[2], v2[2], v3[2];
 	double n1[2], n2[2], n3[2];
@@ -309,19 +310,6 @@ void _calc_grid_values( double *x, double *y, double *norms,
 	y_base = 0.0;
 
 
-/*
-        printf("%d\n",num_tri);
-        for ( i=0; i< num_tri; i++){
-            printf("volumes\n");
-            printf("%ld %ld %ld \n",volumes[3*i],volumes[3*i+1],volumes[3*i+2]);
-        }
-
-        printf("%d\n",num_vert);
-        for ( i=0; i< num_vert; i++){
-            printf("vertices\n");
-            printf("%g %g \n",x[i],y[i]);
-        }
-*/
 
 	for ( i = 0; i < num_tri; i++ ) {
 
@@ -329,34 +317,22 @@ void _calc_grid_values( double *x, double *y, double *norms,
 		get_tri_norms( norms, i, n1, n2, n3 );
 		get_tri_extent( triangle, extent );
 
-/*
-                printf("tri %g %g  %g %g %g %g\n",
-                   triangle[0],triangle[1],triangle[2],triangle[3], triangle[4],triangle[5]);
-                printf("v1 %g %g\n", v1[0], v1[1]);
-                printf("v2 %g %g\n", v2[0], v2[1]);
-                printf("v3 %g %g\n", v3[0], v3[1]);
-
 
-                printf("e.xmin %g \n", extent->x_min);
-                printf("e.xmax %g \n", extent->x_max);
-                printf("e.ymin %g \n", extent->y_min);
-                printf("e.ymax %g \n", extent->y_max);
-*/
 
-		fraction = modf( (extent->x_min - x_base)/x_dist, &intpart );
-		x_min = (int64_t) intpart;
+		(void) modf( (extent->x_min - x_base)/x_dist, &intpart );
+		x_min = (anuga_int) intpart;
 		x_min = (x_min < 0) ? 0 : x_min; 
 
-		fraction = modf( ABS(extent->x_max - x_base)/x_dist, &intpart );
-		x_max = (int64_t) intpart;
+		(void) modf( ABS(extent->x_max - x_base)/x_dist, &intpart );
+		x_max = (anuga_int) intpart;
 		x_max = (x_max > (ncol-1)) ? (ncol-1) : x_max;
 
-		fraction = modf( (extent->y_min - y_base)/y_dist, &intpart );
-		y_min = (int64_t) intpart;
+		(void) modf( (extent->y_min - y_base)/y_dist, &intpart );
+		y_min = (anuga_int) intpart;
 		y_min = (y_min < 0 ) ? 0 : y_min;
 
-		fraction = modf( ABS(extent->y_max - y_base)/y_dist, &intpart );
-		y_max = (int64_t) intpart;
+		(void) modf( ABS(extent->y_max - y_base)/y_dist, &intpart );
+		y_max = (anuga_int) intpart;
 		y_max = (y_max > (nrow-1)) ? (nrow-1) : y_max;
 		
 		if ( x_max >= 0 && y_max >= 0 ) {
diff --git a/anuga/file_conversion/file_conversion.py b/anuga/file_conversion/file_conversion.py
index df175a302..6b0721bb8 100644
--- a/anuga/file_conversion/file_conversion.py
+++ b/anuga/file_conversion/file_conversion.py
@@ -23,6 +23,7 @@
                             netcdf_float
 
 from anuga.anuga_exceptions import *
+from anuga.utilities.file_utils import create_filename
 
 
 #shallow water imports
@@ -31,6 +32,8 @@
 from anuga.shallow_water.shallow_water_domain import Domain
 
 
+
+
 def sww2obj(filename, size):
     """ Convert netcdf based data output to obj
 
@@ -39,8 +42,10 @@ def sww2obj(filename, size):
         size The number of lines to write.
     """
 
+    # FIXME SR: shouldn't size be read from the incoming file?
+
     if filename[-4:] != '.sww':
-        raise IOError('Output file %s should be of type .sww.' % sww_file)
+        raise IOError('Output file %s should be of type .sww.' % filename)
 
     basefilename = filename[:-4]
 
diff --git a/anuga/file_conversion/setup.py b/anuga/file_conversion/setup.py
deleted file mode 100644
index 5bd8c53bf..000000000
--- a/anuga/file_conversion/setup.py
+++ /dev/null
@@ -1,33 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-from Cython.Build import cythonize
-import Cython.Compiler.Options
-Cython.Compiler.Options.annotate = True
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('file_conversion', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    #util_dir = os.path.abspath(join(os.path.dirname(__file__),'..','utilities'))
-    util_dir = join('..','utilities')
-    
-    config.add_extension('calc_grid_values_ext',
-                         sources=['calc_grid_values_ext.pyx'],
-                         include_dirs=[util_dir])
-
-    config.ext_modules = cythonize(config.ext_modules, annotate=True)
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/file_conversion/tests/test_2pts.py b/anuga/file_conversion/tests/test_2pts.py
index 5f537d9c9..d665369fd 100644
--- a/anuga/file_conversion/tests/test_2pts.py
+++ b/anuga/file_conversion/tests/test_2pts.py
@@ -242,6 +242,6 @@ def test_sww2pts_centroids_de0(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_2Pts, 'test_')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_2Pts)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)    
diff --git a/anuga/file_conversion/tests/test_csv2sts.py b/anuga/file_conversion/tests/test_csv2sts.py
index a17c9e774..8ac6186e8 100644
--- a/anuga/file_conversion/tests/test_csv2sts.py
+++ b/anuga/file_conversion/tests/test_csv2sts.py
@@ -118,6 +118,6 @@ def _check_generated_sts(self):
             os.remove(sts_out)           
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_csv2sts,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_csv2sts)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/file_conversion/tests/test_dem2array.py b/anuga/file_conversion/tests/test_dem2array.py
index cab6f209c..1617d8fbd 100644
--- a/anuga/file_conversion/tests/test_dem2array.py
+++ b/anuga/file_conversion/tests/test_dem2array.py
@@ -115,6 +115,6 @@ def test_dem2array(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_dem2array,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_dem2array)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/file_conversion/tests/test_dem2dem.py b/anuga/file_conversion/tests/test_dem2dem.py
index 244071993..7a22b87c4 100644
--- a/anuga/file_conversion/tests/test_dem2dem.py
+++ b/anuga/file_conversion/tests/test_dem2dem.py
@@ -214,7 +214,7 @@ def test_decimate_dem_NODATA(self):
 #################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Dem2Dem, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Dem2Dem)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
         
diff --git a/anuga/file_conversion/tests/test_dem2pts.py b/anuga/file_conversion/tests/test_dem2pts.py
old mode 100755
new mode 100644
index 2dfa298e9..4f921ce7c
--- a/anuga/file_conversion/tests/test_dem2pts.py
+++ b/anuga/file_conversion/tests/test_dem2pts.py
@@ -407,6 +407,6 @@ def test_dem2pts_bounding_box_removeNullvalues_v3(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Dem2Pts,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Dem2Pts)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/file_conversion/tests/test_file_conversion.py b/anuga/file_conversion/tests/test_file_conversion.py
index 48afcc9fd..f120ea2fe 100644
--- a/anuga/file_conversion/tests/test_file_conversion.py
+++ b/anuga/file_conversion/tests/test_file_conversion.py
@@ -1108,6 +1108,6 @@ def test_grd2array_dem2array(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_File_Conversion,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_File_Conversion)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/file_conversion/tests/test_grd2array.py b/anuga/file_conversion/tests/test_grd2array.py
index a301553df..ce2b4c21c 100644
--- a/anuga/file_conversion/tests/test_grd2array.py
+++ b/anuga/file_conversion/tests/test_grd2array.py
@@ -265,7 +265,7 @@ def test_grd2array_2(self):
 #################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_grd2array, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_grd2array)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
         
diff --git a/anuga/file_conversion/tests/test_llasc2pts.py b/anuga/file_conversion/tests/test_llasc2pts.py
old mode 100755
new mode 100644
index c826b75c3..e30b4b482
--- a/anuga/file_conversion/tests/test_llasc2pts.py
+++ b/anuga/file_conversion/tests/test_llasc2pts.py
@@ -126,6 +126,6 @@ def test_llasc2pts_bounding_box_v2(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_LLAsc2Pts,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_LLAsc2Pts)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/file_conversion/tests/test_sww2dem.py b/anuga/file_conversion/tests/test_sww2dem.py
index 069424409..025a1dda7 100644
--- a/anuga/file_conversion/tests/test_sww2dem.py
+++ b/anuga/file_conversion/tests/test_sww2dem.py
@@ -2748,8 +2748,6 @@ def test_sww2dem_verbose_True(self):
 #################################################################################
 
 if __name__ == "__main__":
-    # suite = unittest.makeSuite(Test_Shallow_Water, 'test_rainfall_forcing_with_evolve')
-
-    suite = unittest.makeSuite(Test_Sww2Dem, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Sww2Dem)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/file_conversion/tests/test_tif2.py b/anuga/file_conversion/tests/test_tif2.py
index de044ac4d..eeac629d8 100644
--- a/anuga/file_conversion/tests/test_tif2.py
+++ b/anuga/file_conversion/tests/test_tif2.py
@@ -584,6 +584,8 @@ def test_tif_lat_lon_too_small(self):
             #Expected ValueError
             raise Exception()       
 
+    @pytest.mark.skipif('osgeo' not in sys.modules,
+                    reason="requires the gdal module")
     def test_tif2point_values_utm(self):
 
         import os
diff --git a/anuga/file_conversion/tests/test_urs2sts.py b/anuga/file_conversion/tests/test_urs2sts.py
index dce6cd945..48d9f285b 100644
--- a/anuga/file_conversion/tests/test_urs2sts.py
+++ b/anuga/file_conversion/tests/test_urs2sts.py
@@ -2124,6 +2124,6 @@ def test_file_boundary_sts_time_limit(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Urs2Sts,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Urs2Sts)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/file_conversion/tests/test_urs2sww.py b/anuga/file_conversion/tests/test_urs2sww.py
index 860be50ff..dba512cc7 100644
--- a/anuga/file_conversion/tests/test_urs2sww.py
+++ b/anuga/file_conversion/tests/test_urs2sww.py
@@ -625,6 +625,6 @@ def test_urs_ungridded2sww (self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Dem2Pts,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Dem2Pts)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/fit_interpolate/fitsmooth.c b/anuga/fit_interpolate/fitsmooth.c
index 49d9da483..b9ddd3dcb 100644
--- a/anuga/fit_interpolate/fitsmooth.c
+++ b/anuga/fit_interpolate/fitsmooth.c
@@ -4,11 +4,12 @@
 #include <stdio.h>   /* gets */
 #include <stdlib.h>  /* atoi, malloc */
 #include <string.h>  /* strcpy */
-#include <stdint.h>  /* uint64_t */
+#include <stdint.h>  /* fixed-width integer types (int64_t, uint64_t) */
 #include "math.h"
 
 #include "sparse_dok.h" /* in utilities */
 #include "quad_tree.h"  /* in utilities */
+#include "anuga_typedefs.h" /* in utilities */
 
 #if defined(__APPLE__)
    // clang doesn't have openmp
@@ -25,23 +26,23 @@
 
 // Builds the matrix D used to smooth the interpolation 
 // of a variables from scattered data points to a mesh. See fit.py for more details.s
-int64_t _build_smoothing_matrix(int64_t n,
-                      int64_t* triangles,
+anuga_int _build_smoothing_matrix(anuga_int n,
+                      anuga_int* triangles,
         		      double* areas,
                       double* vertex_coordinates,
-                      int64_t* strides,
+                      anuga_int* strides,
                       sparse_dok * smoothing_mat)
 		      {
 
 
-    int64_t k;
-    int64_t k3,k6;
-    int64_t err = 0;
+    anuga_int k;
+    anuga_int k3,k6;
+    anuga_int err = 0;
     edge_key_t key;
 
     double det,area,x0,x1,x2,y0,y1,y2;
     double a0,b0,a1,b1,a2,b2,e01,e12,e20;
-    int64_t v0,v1,v2;
+    anuga_int v0,v1,v2;
     double smoothing_val;
 
     
@@ -140,13 +141,13 @@ int64_t _build_smoothing_matrix(int64_t n,
 
 // Builds a quad tree out of a list of triangles for quick 
 // searching. 
-quad_tree * _build_quad_tree(int64_t n,
-                      int64_t* triangles,
+quad_tree * _build_quad_tree(anuga_int n,
+                      anuga_int* triangles,
                       double* vertex_coordinates,
                       double* extents)               
 {   
     
-    int64_t k,k6;
+    anuga_int k,k6;
     double x0,y0,x1,y1,x2,y2;
 
     // set up quad tree and allocate memory
@@ -176,19 +177,19 @@ quad_tree * _build_quad_tree(int64_t n,
 // and residual. Uses a quad_tree for fast access to the triangles of the mesh.
 // This function takes a list of point coordinates, and associated point values
 // (for any number of attributes).
-int64_t _build_matrix_AtA_Atz_points(int64_t N, 
-    int64_t * triangles,
+anuga_int _build_matrix_AtA_Atz_points(anuga_int N, 
+    anuga_int * triangles,
     double * point_coordinates, 
     double * point_values,
-    int64_t zdims,
-    int64_t npts,
+    anuga_int zdims,
+    anuga_int npts,
     sparse_dok * AtA,
     double ** Atz,
     quad_tree * quadtree)
               {
 
-    int64_t k;
-    int64_t i,w;
+    anuga_int k;
+    anuga_int i,w;
     
     for(w=0;w<zdims;w++){
         for(i=0;i<N;i++){
@@ -211,7 +212,7 @@ int64_t _build_matrix_AtA_Atz_points(int64_t N,
 
         if(T!=NULL){
             double * sigma = calculate_sigma(T,x,y);
-            int64_t js[3];
+            anuga_int js[3];
             for(i=0;i<3;i++){
                 js[i]=triangles[3*(T->index)+i];
             }
@@ -246,11 +247,11 @@ int64_t _build_matrix_AtA_Atz_points(int64_t N,
 void _combine_partial_AtA_Atz(sparse_dok * dok_AtA1,sparse_dok * dok_AtA2,
                              double* Atz1,
                              double* Atz2,
-                             int64_t n, int64_t zdim){
+                             anuga_int n, anuga_int zdim){
 
     add_sparse_dok(dok_AtA1,1,dok_AtA2,1);
 
-    int64_t i;
+    anuga_int i;
     for(i=0;i<n*zdim;i++){
         Atz1[i]+=Atz2[i];
     }
diff --git a/anuga/fit_interpolate/meson.build b/anuga/fit_interpolate/meson.build
index 4fbbb9626..22c7ec76c 100644
--- a/anuga/fit_interpolate/meson.build
+++ b/anuga/fit_interpolate/meson.build
@@ -1,6 +1,7 @@
 
 inc_dir = include_directories('../utilities', incdir_numpy)
 
+
 util_srcs = ['../utilities/quad_tree.c',
                  '../utilities/sparse_dok.c',
                  '../utilities/sparse_csr.c']
@@ -9,7 +10,8 @@ util_srcs = ['../utilities/quad_tree.c',
 py3.extension_module('fitsmooth_ext',
   sources: ['fitsmooth_ext.pyx'] + util_srcs,
   include_directories: inc_dir,
-  dependencies: dependencies,
+  c_args : openmp_c_args,
+  dependencies: openmp_deps,
   subdir: 'anuga/fit_interpolate',
   install: true,
 )
@@ -31,4 +33,4 @@ py3.install_sources(
   subdir: 'anuga/fit_interpolate'
 )
 
-subdir('tests')
\ No newline at end of file
+subdir('tests')
diff --git a/anuga/fit_interpolate/setup.py b/anuga/fit_interpolate/setup.py
deleted file mode 100644
index eb2304b2f..000000000
--- a/anuga/fit_interpolate/setup.py
+++ /dev/null
@@ -1,47 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-from Cython.Build import cythonize
-import Cython.Compiler.Options
-Cython.Compiler.Options.annotate = True
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('fit_interpolate', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-
-    #util_dir = os.path.abspath(join(os.path.dirname(__file__),'..','utilities'))
-    
-    util_dir = join('..','utilities')
-    
-    util_srcs = [join(util_dir,'quad_tree.c'),
-                 join(util_dir,'sparse_dok.c'),
-                 join(util_dir,'sparse_csr.c')]
-    
-    if sys.platform == 'darwin':
-        extra_args = None
-    else:
-        extra_args = ['-fopenmp']
-
-    config.add_extension('fitsmooth',
-                         sources=['fitsmooth_ext.pyx']+util_srcs,
-                         include_dirs=[util_dir],
-                         extra_compile_args=extra_args,
-                         extra_link_args=extra_args)
-
-    config.ext_modules = cythonize(config.ext_modules, annotate=True)
-
-
-    return config
-
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/fit_interpolate/tests/test_fit.py b/anuga/fit_interpolate/tests/test_fit.py
index 3db7e7dae..a3faacdb3 100644
--- a/anuga/fit_interpolate/tests/test_fit.py
+++ b/anuga/fit_interpolate/tests/test_fit.py
@@ -1147,6 +1147,6 @@ def test_fit_to_mesh_file_errorsIII(self):
 
 #-------------------------------------------------------------
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Fit,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Fit)
     runner = unittest.TextTestRunner() #verbosity=1)
     runner.run(suite)
diff --git a/anuga/fit_interpolate/tests/test_interpolate.py b/anuga/fit_interpolate/tests/test_interpolate.py
index 58a1a9f90..bbdab0551 100644
--- a/anuga/fit_interpolate/tests/test_interpolate.py
+++ b/anuga/fit_interpolate/tests/test_interpolate.py
@@ -1959,7 +1959,7 @@ def test_interpolate_one_point_many_triangles(self):
 ################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Interpolate,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Interpolate)
     runner = unittest.TextTestRunner() #verbosity=1)
     runner.run(suite)
 
diff --git a/anuga/fit_interpolate/tests/test_interpolate2d.py b/anuga/fit_interpolate/tests/test_interpolate2d.py
index dc92d1ba0..03c46c9bc 100644
--- a/anuga/fit_interpolate/tests/test_interpolate2d.py
+++ b/anuga/fit_interpolate/tests/test_interpolate2d.py
@@ -464,6 +464,6 @@ def test_linear_interpolation_outside_domain(self):
 
 
 if __name__ == '__main__':
-    suite = unittest.makeSuite(Test_interpolate, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_interpolate)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/fit_interpolate/tests/test_search_functions.py b/anuga/fit_interpolate/tests/test_search_functions.py
index c44c194b2..b3ff65ac6 100644
--- a/anuga/fit_interpolate/tests/test_search_functions.py
+++ b/anuga/fit_interpolate/tests/test_search_functions.py
@@ -231,7 +231,7 @@ def expanding_search(self):
 
 #-------------------------------------------------------------
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_search_functions, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_search_functions)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
     
diff --git a/anuga/geometry/meson.build b/anuga/geometry/meson.build
index 75e0d8c1e..39f40ab59 100644
--- a/anuga/geometry/meson.build
+++ b/anuga/geometry/meson.build
@@ -4,8 +4,9 @@ inc_dir = include_directories('../utilities', incdir_numpy)
 # Compile the Cython-generated C code and additional C code
 py3.extension_module('polygon_ext',
   sources: ['polygon_ext.pyx'],
+  c_args : openmp_c_args,
   include_directories: inc_dir,
-  dependencies: dependencies,
+  dependencies: openmp_deps,
   subdir: 'anuga/geometry',
   install: true,
 )
@@ -23,4 +24,4 @@ py3.install_sources(
   subdir: 'anuga/geometry'
 )
 
-subdir('tests')
\ No newline at end of file
+subdir('tests')
diff --git a/anuga/geometry/polygon.c b/anuga/geometry/polygon.c
index 83916b56e..0b7f637da 100644
--- a/anuga/geometry/polygon.c
+++ b/anuga/geometry/polygon.c
@@ -9,41 +9,41 @@
 //
 // Ole Nielsen, GA 2004
 //
-// NOTE: We use int64_t* instead of int64_t* for numeric arrays as this will work both 
+// NOTE: We use the anuga_int type for integer arrays so that the same code will work both
 //       for 64 as well as 32 bit systems
 
 #include "math.h"
 #include "stdint.h"
-
+#include "stdio.h"
+#include "anuga_typedefs.h"
 #define YES 1
 #define NO 0
 
+/* Euclidean length of the vector (x, y); static so a non-inlined call still links. */
+static inline double dist(const double x, const double y)
+{
 
-double dist(double x,
-	    double y) {
-  
-  return sqrt(x*x + y*y);
+  return sqrt(x * x + y * y);
 }
 
-
-int64_t __point_on_line(double x, double y,
-		    double x0, double y0,
-		    double x1, double y1,
-		    double rtol,
-		    double atol) {
+anuga_int __point_on_line(const double x, const double y,
+                        const double x0, const double y0,
+                        const double x1, const double y1,
+                        const double rtol,
+                        const double atol)
+{
   /*Determine whether a point is on a line segment
 
     Input: x, y, x0, x0, x1, y1: where
         point is given by x, y
-	line is given by (x0, y0) and (x1, y1)
+  line is given by (x0, y0) and (x1, y1)
 
   */
 
   double a0, a1, a_normal0, a_normal1, b0, b1, len_a, len_b;
-  double a_dot_b, len_ba;
+  double a_dot_b;
   double nominator, denominator;
-  int64_t is_parallel;
-
+  anuga_int is_parallel;
 
   a0 = x - x0;
   a1 = y - y0;
@@ -54,47 +54,53 @@ int64_t __point_on_line(double x, double y,
   b0 = x1 - x0;
   b1 = y1 - y0;
 
-  nominator = fabs(a_normal0*b0 + a_normal1*b1);
-  denominator = b0*b0 + b1*b1;
-  
+  nominator = fabs(a_normal0 * b0 + a_normal1 * b1);
+  denominator = b0 * b0 + b1 * b1;
+
   // Determine if line is parallel to point vector up to a tolerance
   is_parallel = 0;
-  if (denominator == 0.0) {
+  if (denominator == 0.0)
+  {
     // Use absolute tolerance
-    if (nominator <= atol) {
+    if (nominator <= atol)
+    {
       is_parallel = 1;
     }
-  } else {
+  }
+  else
+  {
     // Denominator is positive - use relative tolerance
-    if (nominator/denominator <= rtol) {
+    if (nominator / denominator <= rtol)
+    {
       is_parallel = 1;
-    }    
+    }
   }
-    
-  if (is_parallel) {
+
+  if (is_parallel)
+  {
     // Point is somewhere on the infinite extension of the line
     // subject to specified absolute tolerance
 
-    len_a = dist(a0, a1); //sqrt(a0*a0 + a1*a1);
-    len_b = dist(b0, b1); //sqrt(b0*b0 + b1*b1);
-
-    a_dot_b = a0*b0 + a1*b1;
-    //len_ba  = len_b - len_a;
+    len_a = dist(a0, a1); // sqrt(a0*a0 + a1*a1);
+    len_b = dist(b0, b1); // sqrt(b0*b0 + b1*b1);
 
-    //printf("a0*b0 + a1*b1 = %20.10e \n",a_dot_b);
-    //printf("len_b - len_a = %20.10e \n",len_ba);
+    a_dot_b = a0 * b0 + a1 * b1;
 
-    if (a_dot_b >= -1.0e-308 && -1.0e-15 <= len_b - len_a) {
+    if (a_dot_b >= -1.0e-308 && -1.0e-15 <= len_b - len_a)
+    {
       return 1;
-    } else {
+    }
+    else
+    {
       return 0;
     }
-  } else {
+  }
+  else
+  {
     return 0;
   }
 }
 
-
 //  public domain function by Darel Rex Finley, 2006
 //  http://www.alienryderflex.com/intersect/
 //
@@ -105,66 +111,70 @@ int64_t __point_on_line(double x, double y,
 //  Returns NO if there is no determinable intersection point, in which case X,Y will
 //  be unmodified.
 
-int64_t __line_segment_intersection(
-        double Ax, double Ay,
-        double Bx, double By,
-        double Cx, double Cy,
-        double Dx, double Dy,
-        double *X, double *Y) {
+anuga_int __line_segment_intersection(
+    double Ax, double Ay,
+    double Bx, double By,
+    double Cx, double Cy,
+    double Dx, double Dy,
+    double *X, double *Y)
+{
 
-    double distAB, theCos, theSin, newX, ABpos;
+  double distAB, theCos, theSin, newX, ABpos;
 
-    //  Fail if either line segment is zero-length.
-    if ( (Ax == Bx && Ay == By) || (Cx == Dx && Cy == Dy) ) return NO ;
+  //  Fail if either line segment is zero-length.
+  if ((Ax == Bx && Ay == By) || (Cx == Dx && Cy == Dy))
+    return NO;
 
-    //  Fail if the segments share an end-point.
-    if ( (Ax == Cx && Ay == Cy) || (Bx == Cx && By == Cy)
-            || (Ax == Dx && Ay == Dy) || (Bx == Dx && By == Dy) ) {
-        return NO;
-    }
+  //  Fail if the segments share an end-point.
+  if ((Ax == Cx && Ay == Cy) || (Bx == Cx && By == Cy) || (Ax == Dx && Ay == Dy) || (Bx == Dx && By == Dy))
+  {
+    return NO;
+  }
 
-    //  (1) Translate the system so that point A is on the origin.
-    Bx -= Ax;
-    By -= Ay;
-    Cx -= Ax;
-    Cy -= Ay;
-    Dx -= Ax;
-    Dy -= Ay;
-
-    //  Discover the length of segment A-B.
-    distAB = sqrt(Bx * Bx + By * By);
-
-    //  (2) Rotate the system so that point B is on the positive X axis.
-    theCos = Bx / distAB;
-    theSin = By / distAB;
-    newX = Cx * theCos + Cy*theSin;
-    Cy = Cy * theCos - Cx*theSin;
-    Cx = newX;
-    newX = Dx * theCos + Dy*theSin;
-    Dy = Dy * theCos - Dx*theSin;
-    Dx = newX;
-
-    //  Fail if segment C-D doesn't cross line A-B.
-    if ( (Cy < 0. && Dy < 0.) || (Cy >= 0. && Dy >= 0.) ) return NO;
-
-    //  (3) Discover the position of the intersection point along line A-B.
-    ABpos = Dx + (Cx - Dx) * Dy / (Dy - Cy);
-
-    //  Fail if segment C-D crosses line A-B outside of segment A-B.
-    if (ABpos < 0. || ABpos > distAB) return NO;
-
-    //  (4) Apply the discovered position to line A-B in the original coordinate system.
-    *X = Ax + ABpos*theCos;
-    *Y = Ay + ABpos*theSin;
-
-    //  Success.
-    return YES;
+  //  (1) Translate the system so that point A is on the origin.
+  Bx -= Ax;
+  By -= Ay;
+  Cx -= Ax;
+  Cy -= Ay;
+  Dx -= Ax;
+  Dy -= Ay;
+
+  //  Discover the length of segment A-B.
+  distAB = sqrt(Bx * Bx + By * By);
+
+  //  (2) Rotate the system so that point B is on the positive X axis.
+  theCos = Bx / distAB;
+  theSin = By / distAB;
+  newX = Cx * theCos + Cy * theSin;
+  Cy = Cy * theCos - Cx * theSin;
+  Cx = newX;
+  newX = Dx * theCos + Dy * theSin;
+  Dy = Dy * theCos - Dx * theSin;
+  Dx = newX;
+
+  //  Fail if segment C-D doesn't cross line A-B.
+  if ((Cy < 0. && Dy < 0.) || (Cy >= 0. && Dy >= 0.))
+    return NO;
+
+  //  (3) Discover the position of the intersection point along line A-B.
+  ABpos = Dx + (Cx - Dx) * Dy / (Dy - Cy);
+
+  //  Fail if segment C-D crosses line A-B outside of segment A-B.
+  if (ABpos < 0. || ABpos > distAB)
+    return NO;
+
+  //  (4) Apply the discovered position to line A-B in the original coordinate system.
+  *X = Ax + ABpos * theCos;
+  *Y = Ay + ABpos * theSin;
+
+  //  Success.
+  return YES;
 }
 
 /*
 WORK IN PROGRESS TO OPTIMISE INTERSECTION
-int64_t __intersection(double x0, double y0,
-		   double x1, double y1) {
+anuga_int __intersection(double x0, double y0,
+       double x1, double y1) {
 
 
     x0 = line0[0,0]; y0 = line0[0,1]
@@ -176,7 +186,7 @@ int64_t __intersection(double x0, double y0,
     denom = (y3-y2)*(x1-x0) - (x3-x2)*(y1-y0)
     u0 = (x3-x2)*(y0-y2) - (y3-y2)*(x0-x2)
     u1 = (x2-x0)*(y1-y0) - (y2-y0)*(x1-x0)
-        
+
     if allclose(denom, 0.0):
         # Lines are parallel - check if they coincide on a shared a segment
 
@@ -186,18 +196,18 @@ int64_t __intersection(double x0, double y0,
 
             line0_starts_on_line1 = line0_ends_on_line1 =\
             line1_starts_on_line0 = line1_ends_on_line0 = False
-                
+
             if point_on_line([x0, y0], line1):
                 line0_starts_on_line1 = True
 
             if point_on_line([x1, y1], line1):
                 line0_ends_on_line1 = True
- 
+
             if point_on_line([x2, y2], line0):
                 line1_starts_on_line0 = True
 
             if point_on_line([x3, y3], line0):
-                line1_ends_on_line0 = True                               
+                line1_ends_on_line0 = True
 
             if not(line0_starts_on_line1 or line0_ends_on_line1\
                or line1_starts_on_line0 or line1_ends_on_line0):
@@ -208,12 +218,12 @@ int64_t __intersection(double x0, double y0,
             # One line fully included in the other. Use direction of included line
             if line0_starts_on_line1 and line0_ends_on_line1:
                 # Shared segment is line0 fully included in line1
-                segment = array([[x0, y0], [x1, y1]])                
+                segment = array([[x0, y0], [x1, y1]])
 
             if line1_starts_on_line0 and line1_ends_on_line0:
                 # Shared segment is line1 fully included in line0
                 segment = array([[x2, y2], [x3, y3]])
-            
+
 
             # Overlap with lines are oriented the same way
             if line0_starts_on_line1 and line1_ends_on_line0:
@@ -222,7 +232,7 @@ int64_t __intersection(double x0, double y0,
 
             if line1_starts_on_line0 and line0_ends_on_line1:
                 # Shared segment from line1 start to line 0 end
-                segment = array([[x2, y2], [x1, y1]])                                
+                segment = array([[x2, y2], [x1, y1]])
 
 
             # Overlap in opposite directions - use direction of line0
@@ -232,28 +242,28 @@ int64_t __intersection(double x0, double y0,
 
             if line0_ends_on_line1 and line1_ends_on_line0:
                 # Shared segment from line0 start to line 1 end
-                segment = array([[x3, y3], [x1, y1]])                
+                segment = array([[x3, y3], [x1, y1]])
+
 
-                
             return 2, segment
         else:
             # Lines are parallel but they do not coincide
-            return 4, None #FIXME (Ole): Add distance here instead of None 
-            
+            return 4, None #FIXME (Ole): Add distance here instead of None
+
     else:
         # Lines are not parallel or coinciding
         u0 = u0/denom
-        u1 = u1/denom        
+        u1 = u1/denom
 
         x = x0 + u0*(x1-x0)
         y = y0 + u0*(y1-y0)
 
         # Sanity check - can be removed to speed up if needed
         assert allclose(x, x2 + u1*(x3-x2))
-        assert allclose(y, y2 + u1*(y3-y2))        
+        assert allclose(y, y2 + u1*(y3-y2))
 
         # Check if point found lies within given line segments
-        if 0.0 <= u0 <= 1.0 and 0.0 <= u1 <= 1.0: 
+        if 0.0 <= u0 <= 1.0 and 0.0 <= u1 <= 1.0:
             # We have intersection
 
             return 1, array([x, y])
@@ -262,467 +272,582 @@ int64_t __intersection(double x0, double y0,
             return 0, None
 
 
-} 
+}
 */
 
+anuga_int __interpolate_polyline(anuga_int number_of_nodes,
+                               anuga_int number_of_points,
+                               double *data,
+                               double *polyline_nodes,
+                               anuga_int *gauge_neighbour_id,
+                               double *interpolation_points,
+                               double *interpolated_values,
+                               double rtol,
+                               double atol)
+{
 
+  anuga_int j, i, neighbour_id;
+  double x0, y0, x1, y1, x, y;
+  double segment_len, segment_delta, slope, alpha;
 
-int64_t __interpolate_polyline(int64_t number_of_nodes,
-		int64_t number_of_points,
-		double* data,
-		double* polyline_nodes,
-		int64_t* gauge_neighbour_id,
-		double* interpolation_points,
-		double* interpolated_values,
-		double rtol,
-		double atol) {
+  for (j = 0; j < number_of_nodes; j++)
+  {
 
-	int64_t j, i, neighbour_id;
-	double x0, y0, x1, y1, x, y;
-	double segment_len, segment_delta, slope, alpha;
+    neighbour_id = gauge_neighbour_id[j];
 
-	for (j=0; j<number_of_nodes; j++) {
+    // FIXME(Ole): I am convinced that gauge_neighbour_id can be discarded, but need to check with John J.
+    // Keep it for now (17 Jan 2009)
+    // When gone, we can simply interpolate between neighbouring nodes, i.e. neighbour_id = j+1.
+    // and the test below becomes something like: if j < number_of_nodes...
 
-		neighbour_id = gauge_neighbour_id[j];
+    if (neighbour_id >= 0)
+    {
+      x0 = polyline_nodes[2 * j];
+      y0 = polyline_nodes[2 * j + 1];
 
-		// FIXME(Ole): I am convinced that gauge_neighbour_id can be discarded, but need to check with John J.
-		// Keep it for now (17 Jan 2009)
-		// When gone, we can simply interpolate between neighbouring nodes, i.e. neighbour_id = j+1.
-		// and the test below becomes something like: if j < number_of_nodes...
+      x1 = polyline_nodes[2 * neighbour_id];
+      y1 = polyline_nodes[2 * neighbour_id + 1];
 
-		if (neighbour_id >= 0) {
-			x0 = polyline_nodes[2*j];
-			y0 = polyline_nodes[2*j+1];
+      segment_len = dist(x1 - x0, y1 - y0);
+      segment_delta = data[neighbour_id] - data[j];
+      slope = segment_delta / segment_len;
 
-			x1 = polyline_nodes[2*neighbour_id];
-			y1 = polyline_nodes[2*neighbour_id+1];
+      for (i = 0; i < number_of_points; i++)
+      {
+        x = interpolation_points[2 * i];
+        y = interpolation_points[2 * i + 1];
 
+        if (__point_on_line(x, y, x0, y0, x1, y1, rtol, atol))
+        {
+          alpha = dist(x - x0, y - y0);
+          interpolated_values[i] = slope * alpha + data[j];
+        }
+      }
+    }
+  }
 
-			segment_len = dist(x1-x0, y1-y0);
-			segment_delta = data[neighbour_id] - data[j];
-			slope = segment_delta/segment_len;
+  return 0;
+}
 
-			for (i=0; i<number_of_points; i++) {
-				x = interpolation_points[2*i];
-				y = interpolation_points[2*i+1];
+anuga_int __triangle_polygon_overlap(double *polygon,
+                                   double *triangle,
+                                   anuga_int polygon_number_of_vertices)
+{
+  anuga_int i, ii, j, jj, A, B;
+  double p0_x, p0_y, p1_x, p1_y, pp_x, pp_y;
+  double t0_x, t0_y, t1_x, t1_y, tp_x, tp_y;
+  double u_x, u_y, v_x, v_y, w_x, w_y;
+  double u_dot_tp, v_dot_tp, v_dot_pp, w_dot_pp;
+  double a, b;
 
-				if (__point_on_line(x, y, x0, y0, x1, y1, rtol, atol)) {
-					alpha = dist(x-x0, y-y0);
-					interpolated_values[i] = slope*alpha + data[j];
-				}
-			}
-		}
-	}
+  p0_x = polygon[0];
+  p0_y = polygon[1];
 
-	return 0;
-}			       			       
+  A = 0;
+  B = 0;
 
+  for (i = 1; i < polygon_number_of_vertices + 1; i++)
+  {
+    ii = i % polygon_number_of_vertices;
 
-int64_t __triangle_polygon_overlap(double* polygon,
-                               double* triangle,
-                               int64_t polygon_number_of_vertices)
-{
-    int64_t i, ii, j, jj, A, B;
-    double p0_x, p0_y, p1_x, p1_y, pp_x, pp_y;
-    double t0_x, t0_y, t1_x, t1_y, tp_x, tp_y;
-    double u_x, u_y, v_x, v_y, w_x, w_y;
-    double u_dot_tp, v_dot_tp, v_dot_pp, w_dot_pp;
-    double a, b;
-    
-    p0_x = polygon[0];
-    p0_y = polygon[1];
-    
-    A = 0;
-    B = 0;
-    
-    for (i = 1; i < polygon_number_of_vertices + 1; i++)
+    p1_x = polygon[2 * ii];
+    p1_y = polygon[2 * ii + 1];
+
+    pp_x = -(p1_y - p0_y);
+    pp_y = p1_x - p0_x;
+
+    t0_x = triangle[0];
+    t0_y = triangle[1];
+
+    for (j = 1; j < 4; j++)
     {
-        ii = i%polygon_number_of_vertices;
-        
-        p1_x = polygon[2*ii];
-        p1_y = polygon[2*ii + 1];
-        
-        pp_x = -(p1_y - p0_y);
-        pp_y = p1_x - p0_x;
-  
-        t0_x = triangle[0];
-        t0_y = triangle[1];
-  
-        for (j = 1; j < 4; j++)
+      jj = j % 3;
+
+      t1_x = triangle[2 * jj];
+      t1_y = triangle[2 * jj + 1];
+
+      tp_x = -(t1_y - t0_y); // perpendicular to triangle vector
+      tp_y = t1_x - t0_x;    // y-component of the same triangle-edge normal
+
+      u_x = p1_x - p0_x;
+      u_y = p1_y - p0_y;
+      v_x = t0_x - p0_x;
+      v_y = t0_y - p0_y;
+      w_x = t1_x - t0_x;
+      w_y = t1_y - t0_y;
+
+      u_dot_tp = (u_x * tp_x) + (u_y * tp_y);
+
+      if (u_dot_tp != 0.0f) // Vectors are not parallel
+      {
+        v_dot_tp = (v_x * tp_x) + (v_y * tp_y);
+        v_dot_pp = (v_x * pp_x) + (v_y * pp_y);
+        w_dot_pp = (w_x * pp_x) + (w_y * pp_y);
+
+        a = v_dot_tp / u_dot_tp;
+        b = -v_dot_pp / w_dot_pp;
+
+        if (a >= 0.0f && a <= 1.0f && b >= 0.0f && b <= 1.0f)
         {
-            jj = j%3;
-                      
-            t1_x = triangle[2*jj];
-            t1_y = triangle[2*jj + 1];
-            
-            tp_x = -(t1_y - t0_y); //perpendicular to triangle vector
-            tp_y = t1_x - t0_x; //perpendicular to polygon vector
-        
-            u_x = p1_x - p0_x;
-            u_y = p1_y - p0_y;
-            v_x = t0_x - p0_x;
-            v_y = t0_y - p0_y;
-            w_x = t1_x - t0_x;
-            w_y = t1_y - t0_y;
-
-            u_dot_tp = (u_x*tp_x) + (u_y*tp_y);
-            
-            if (u_dot_tp != 0.0f) //Vectors are not parallel
-            {
-                v_dot_tp = (v_x*tp_x) + (v_y*tp_y);
-                v_dot_pp = (v_x*pp_x) + (v_y*pp_y);
-                w_dot_pp = (w_x*pp_x) + (w_y*pp_y);
-                
-                a = v_dot_tp/u_dot_tp;
-                b = -v_dot_pp/w_dot_pp;
-                             
-                if (a >= 0.0f && a <= 1.0f && b >=0.0f && b <=1.0f)
-                {
-                    return 1; //overlap
-                }
-                
-                if (b >= 0.0f && b <= 1.0f && a > 1.0f)
-                {
-                    A++; 
-                }
-                
-                if (a >= 0.0f && a <= 1.0f && b > 1.0f)
-                {
-                    B++; 
-                }
-                
-                if (A == 4 || B == 3)
-                {
-                    return 1; //overlap
-                }
-            }
-            
-            t0_x = t1_x;
-            t0_y = t1_y;
+          return 1; // overlap
         }
-        
-        p0_x = p1_x;
-        p0_y = p1_y;
-    }
 
-    return 0; //no overlap
-}
-                 
+        if (b >= 0.0f && b <= 1.0f && a > 1.0f)
+        {
+          A++;
+        }
 
-int64_t __polygon_overlap(double* polygon,
-                      double* triangles,
-                      int64_t* indices,
-                      int64_t M, //number of triangles
-                      int64_t polygon_number_of_vertices)
-{
-    double* triangle;
-    int64_t i, inside_index, outside_index;
-    
-    inside_index = 0;    // Keep track of triangles that overlap
-    outside_index = M - 1; // Keep track of triangles that don't overlap (starting from end)
-    
-    for (i = 0; i < M; i++)
-    {
-        triangle = triangles + 6*i;
-        
-        if (__triangle_polygon_overlap(polygon, 
-                                      triangle, 
-                                      polygon_number_of_vertices))
+        if (a >= 0.0f && a <= 1.0f && b > 1.0f)
         {
-            indices[inside_index] = i;
-            inside_index++;
+          B++;
         }
-        else
+
+        if (A == 4 || B == 3)
         {
-            indices[outside_index] = i;
-            outside_index -= 1;            
+          return 1; // overlap
         }
+      }
+
+      t0_x = t1_x;
+      t0_y = t1_y;
     }
-    
-    return inside_index;
-}              
 
+    p0_x = p1_x;
+    p0_y = p1_y;
+  }
+
+  return 0; // no overlap
+}
 
-int64_t __triangle_line_intersect(double* line,
-                              double* triangle)
+anuga_int __polygon_overlap(double *polygon,
+                          double *triangles,
+                          anuga_int *indices,
+                          anuga_int M, // number of triangles
+                          anuga_int polygon_number_of_vertices)
 {
-    int64_t j, jj, A, B;
-    double p0_x, p0_y, p1_x, p1_y, pp_x, pp_y;
-    double t0_x, t0_y, t1_x, t1_y, tp_x, tp_y;
-    double u_x, u_y, v_x, v_y, w_x, w_y;
-    double u_dot_tp, v_dot_tp, v_dot_pp, w_dot_pp;
-    double a, b;
-    
-    p0_x = line[0];
-    p0_y = line[1];
-    p1_x = line[2];
-    p1_y = line[3];
-    
-    pp_x = -(p1_y - p0_y);
-    pp_y = p1_x - p0_x;
-    
-    A = 0;
-    B = 0;
-    
-    t0_x = triangle[0];
-    t0_y = triangle[1];
+  double *triangle;
+  anuga_int i, inside_index, outside_index;
 
-    for (j = 1; j < 4; j++)
+  inside_index = 0;      // Keep track of triangles that overlap
+  outside_index = M - 1; // Keep track of triangles that don't overlap (starting from end)
+
+  for (i = 0; i < M; i++)
+  {
+    triangle = triangles + 6 * i;
+
+    if (__triangle_polygon_overlap(polygon,
+                                   triangle,
+                                   polygon_number_of_vertices))
     {
-        jj = j%3;
-                  
-        t1_x = triangle[2*jj];
-        t1_y = triangle[2*jj + 1];
-        
-        tp_x = -(t1_y - t0_y); //perpendicular to triangle vector
-        tp_y = t1_x - t0_x; 
-    
-        u_x = p1_x - p0_x;
-        u_y = p1_y - p0_y;
-        v_x = t0_x - p0_x;
-        v_y = t0_y - p0_y;
-        w_x = t1_x - t0_x;
-        w_y = t1_y - t0_y;
-
-        u_dot_tp = (u_x*tp_x) + (u_y*tp_y);
-        
-        if (u_dot_tp != 0.0f) //If vectors are not parallel, continue
-        {
-            v_dot_tp = (v_x*tp_x) + (v_y*tp_y);
-            v_dot_pp = (v_x*pp_x) + (v_y*pp_y);
-            w_dot_pp = (w_x*pp_x) + (w_y*pp_y);
-            
-            a = v_dot_tp/u_dot_tp;
-            b = -v_dot_pp/w_dot_pp;
-                         
-            if (a >= 0.0f && a <= 1.0f && b >=0.0f && b <=1.0f)
-            {
-                return 1; //intersect
-            }
-            
-            if (a > 1.0f && b >= 0.0f && b <= 1.0f)
-            {
-                A++; 
-            }
-            
-            if (a < 0.0f && b >= 0.0f && b <= 1.0f)
-            {
-                B++; 
-            }
-        }
-        
-        t0_x = t1_x;
-        t0_y = t1_y;
+      indices[inside_index] = i;
+      inside_index++;
     }
-    
-    if (A >= 1 && B >= 1)
+    else
     {
-        return 1; //line sits completely inside a triangle
+      indices[outside_index] = i;
+      outside_index -= 1;
     }
-    
-    return 0; //no intersection
+  }
+
+  return inside_index;
 }
-                 
 
-int64_t __line_intersect(double* line,
-                     double* triangles,
-                     int64_t* indices,
-                     int64_t M) //number of triangles
+anuga_int __triangle_line_intersect(double *line, // returns 1 if the segment crosses an edge of, or lies inside, the triangle; else 0
+                                  double *triangle) // read below as three (x, y) vertex pairs: triangle[0..5]
 {
-    double* triangle;
-    int64_t i, inside_index, outside_index;
-    
-    inside_index = 0;    // Keep track of triangles that intersect
-    outside_index = M - 1; // Keep track of triangles that don't intersect (starting from end)
-    
-    for (i = 0; i < M; i++)
+  anuga_int j, jj, A, B; // A / B count edges that the segment's line hits beyond p1 / before p0
+  double p0_x, p0_y, p1_x, p1_y, pp_x, pp_y; // segment endpoints p0, p1 and a perpendicular pp to the segment
+  double t0_x, t0_y, t1_x, t1_y, tp_x, tp_y; // current edge endpoints t0, t1 and a perpendicular tp to the edge
+  double u_x, u_y, v_x, v_y, w_x, w_y; // u = p1 - p0, v = t0 - p0, w = t1 - t0
+  double u_dot_tp, v_dot_tp, v_dot_pp, w_dot_pp;
+  double a, b; // a: position along the segment, b: position along the edge (0..1 means within it)
+
+  p0_x = line[0]; // line is read as [p0_x, p0_y, p1_x, p1_y]
+  p0_y = line[1];
+  p1_x = line[2];
+  p1_y = line[3];
+
+  pp_x = -(p1_y - p0_y); // perpendicular to the segment direction
+  pp_y = p1_x - p0_x;
+
+  A = 0;
+  B = 0;
+
+  t0_x = triangle[0]; // first edge starts at vertex 0
+  t0_y = triangle[1];
+
+  for (j = 1; j < 4; j++) // visit the edges 0->1, 1->2, 2->0
+  {
+    jj = j % 3; // end vertex of this edge; wraps back to vertex 0 on the last pass
+
+    t1_x = triangle[2 * jj];
+    t1_y = triangle[2 * jj + 1];
+
+    tp_x = -(t1_y - t0_y); // perpendicular to triangle vector
+    tp_y = t1_x - t0_x;
+
+    u_x = p1_x - p0_x; // u does not depend on the edge; it is recomputed on every pass
+    u_y = p1_y - p0_y;
+    v_x = t0_x - p0_x;
+    v_y = t0_y - p0_y;
+    w_x = t1_x - t0_x;
+    w_y = t1_y - t0_y;
+
+    u_dot_tp = (u_x * tp_x) + (u_y * tp_y);
+
+    if (u_dot_tp != 0.0f) // If vectors are not parallel, continue
     {
-        triangle = triangles + 6*i;
-        
-        if (__triangle_line_intersect(line, 
-                                      triangle))
-        {
-            indices[inside_index] = i;
-            inside_index++;
-        }
-        else
-        {
-            indices[outside_index] = i;
-            outside_index -= 1;            
-        }
+      v_dot_tp = (v_x * tp_x) + (v_y * tp_y);
+      v_dot_pp = (v_x * pp_x) + (v_y * pp_y);
+      w_dot_pp = (w_x * pp_x) + (w_y * pp_y);
+
+      a = v_dot_tp / u_dot_tp; // p0 + a*u lies on the line through this edge
+      b = -v_dot_pp / w_dot_pp; // t0 + b*w lies on the line through the segment; w_dot_pp is algebraically -u_dot_tp
+
+      if (a >= 0.0f && a <= 1.0f && b >= 0.0f && b <= 1.0f)
+      {
+        return 1; // intersect
+      }
+
+      if (a > 1.0f && b >= 0.0f && b <= 1.0f) // this edge is hit beyond p1
+      {
+        A++;
+      }
+
+      if (a < 0.0f && b >= 0.0f && b <= 1.0f) // this edge is hit before p0
+      {
+        B++;
+      }
     }
-    
-    return inside_index;
-}              
 
+    t0_x = t1_x; // this edge's end vertex is the next edge's start vertex
+    t0_y = t1_y;
+  }
+
+  if (A >= 1 && B >= 1) // edges are hit on both sides of the segment
+  {
+    return 1; // line sits completely inside a triangle
+  }
+
+  return 0; // no intersection
+}
+
+anuga_int __line_intersect(double *line, // returns the number of triangles for which __triangle_line_intersect is non-zero
+                         double *triangles, // M triangles stored back to back, 6 doubles each
+                         anuga_int *indices, // written at positions 0..M-1: hits from the front, misses from the back
+                         anuga_int M) // number of triangles
+{
+  double *triangle;
+  anuga_int i, inside_index, outside_index;
 
+  inside_index = 0;      // Keep track of triangles that intersect
+  outside_index = M - 1; // Keep track of triangles that don't intersect (starting from end)
+
+  for (i = 0; i < M; i++)
+  {
+    triangle = triangles + 6 * i; // start of triangle i
+
+    if (__triangle_line_intersect(line,
+                                  triangle))
+    {
+      indices[inside_index] = i;
+      inside_index++;
+    }
+    else
+    {
+      indices[outside_index] = i;
+      outside_index -= 1; // misses are filled downwards, so they are stored in reverse order of i
+    }
+  }
+
+  return inside_index; // count of hits; indices[0 .. count-1] hold the hit triangles
+}
+
+anuga_int __is_inside_triangle(double *point, // returns 1 if point is inside the triangle, else 0; point is read as [x, y]
+                             double *triangle, // read as [x0, y0, x1, y1, x2, y2]
+                             anuga_int closed, // non-zero: a point on an edge also counts as inside
+                             double rtol, // forwarded unchanged to __point_on_line
+                             double atol) // forwarded unchanged to __point_on_line
+{
 
-int64_t __is_inside_triangle(double* point,
-			 double* triangle,
-			 int64_t closed,
-			 double rtol,
-			 double atol) {
-			 
   double vx, vy, v0x, v0y, v1x, v1y;
   double a00, a10, a01, a11, b0, b1;
   double denom, alpha, beta;
-  
+
   double x, y; // Point coordinates
-  int64_t i, j, res;
+  anuga_int i, j, res;
 
   x = point[0];
   y = point[1];
-  
+
   // Quickly reject points that are clearly outside
-  if ((x < triangle[0]) && 
-      (x < triangle[2]) && 
-      (x < triangle[4])) return 0;       
-      
-  if ((x > triangle[0]) && 
-      (x > triangle[2]) && 
-      (x > triangle[4])) return 0;             
-  
-  if ((y < triangle[1]) && 
-      (y < triangle[3]) && 
-      (y < triangle[5])) return 0;       
-      
-  if ((y > triangle[1]) && 
-      (y > triangle[3]) && 
-      (y > triangle[5])) return 0;             
-  
-  
-  // v0 = C-A 
-  v0x = triangle[4]-triangle[0]; 
-  v0y = triangle[5]-triangle[1];
-  
-  // v1 = B-A   
-  v1x = triangle[2]-triangle[0]; 
-  v1y = triangle[3]-triangle[1];
+  if ((x < triangle[0]) &&
+      (x < triangle[2]) &&
+      (x < triangle[4]))
+    return 0; // x is smaller than every vertex x
+
+  if ((x > triangle[0]) &&
+      (x > triangle[2]) &&
+      (x > triangle[4]))
+    return 0; // x is larger than every vertex x
+
+  if ((y < triangle[1]) &&
+      (y < triangle[3]) &&
+      (y < triangle[5]))
+    return 0; // y is smaller than every vertex y
+
+  if ((y > triangle[1]) &&
+      (y > triangle[3]) &&
+      (y > triangle[5]))
+    return 0; // y is larger than every vertex y
+
+  // v0 = C-A
+  v0x = triangle[4] - triangle[0];
+  v0y = triangle[5] - triangle[1];
+
+  // v1 = B-A
+  v1x = triangle[2] - triangle[0];
+  v1y = triangle[3] - triangle[1];
 
   // First check if point lies wholly inside triangle
-  a00 = v0x*v0x + v0y*v0y; // innerproduct(v0, v0)
-  a01 = v0x*v1x + v0y*v1y; // innerproduct(v0, v1)
-  a10 = a01;               // innerproduct(v1, v0)
-  a11 = v1x*v1x + v1y*v1y; // innerproduct(v1, v1)
-    
-  denom = a11*a00 - a01*a10;
-
-  if (fabs(denom) > 0.0) {
-    // v = point-A  
-    vx = x - triangle[0]; 
-    vy = y - triangle[1];     
-    
-    b0 = v0x*vx + v0y*vy; // innerproduct(v0, v)        
-    b1 = v1x*vx + v1y*vy; // innerproduct(v1, v)            
-    
-    alpha = (b0*a11 - b1*a01)/denom;
-    beta = (b1*a00 - b0*a10)/denom;        
-    
-    if ((alpha > 0.0) && (beta > 0.0) && (alpha+beta < 1.0)) return 1;
+  a00 = v0x * v0x + v0y * v0y; // innerproduct(v0, v0)
+  a01 = v0x * v1x + v0y * v1y; // innerproduct(v0, v1)
+  a10 = a01;                   // innerproduct(v1, v0)
+  a11 = v1x * v1x + v1y * v1y; // innerproduct(v1, v1)
+
+  denom = a11 * a00 - a01 * a10; // determinant of the 2x2 system [a00 a01; a10 a11]; zero when v0 and v1 are collinear
+
+  if (fabs(denom) > 0.0) // the interior test is skipped when denom is exactly zero
+  {
+    // v = point-A
+    vx = x - triangle[0];
+    vy = y - triangle[1];
+
+    b0 = v0x * vx + v0y * vy; // innerproduct(v0, v)
+    b1 = v1x * vx + v1y * vy; // innerproduct(v1, v)
+
+    alpha = (b0 * a11 - b1 * a01) / denom; // alpha, beta solve [a00 a01; a10 a11] * [alpha, beta] = [b0, b1]
+    beta = (b1 * a00 - b0 * a10) / denom;
+
+    if ((alpha > 0.0) && (beta > 0.0) && (alpha + beta < 1.0)) // strict comparisons: only the open interior passes here
+      return 1;
   }
 
-  if (closed) {
+  if (closed)
+  {
     // Check if point lies on one of the edges
-        
-    for (i=0; i<3; i++) {
-      j = (i+1) % 3; // Circular index into triangle array
+
+    for (i = 0; i < 3; i++)
+    {
+      j = (i + 1) % 3; // Circular index into triangle array
       res = __point_on_line(x, y,
-                            triangle[2*i], triangle[2*i+1], 
-                            triangle[2*j], triangle[2*j+1], 			    
-			    rtol, atol);
-      if (res) return 1;
+                            triangle[2 * i], triangle[2 * i + 1],
+                            triangle[2 * j], triangle[2 * j + 1],
+                            rtol, atol);
+      if (res)
+        return 1; // on the edge from vertex i to vertex j
     }
   }
-                
-  // Default return if point is outside triangle			 
-  return 0;			 			 
-}			  			       			       
 
+  // Default return if point is outside triangle
+  return 0;
+}
 
-int64_t __separate_points_by_polygon(int64_t M,     // Number of points
-				 int64_t N,     // Number of polygon vertices
-				 double* points,
-				 double* polygon,
-				 int64_t* indices,  // M-Array for storage indices
-				 int64_t closed,
-				 int64_t verbose) {
+anuga_int __separate_points_by_polygon(const anuga_int M, // Number of points
+                                     const anuga_int N, // Number of polygon vertices
+                                     double *points, // read as (x, y) pairs: points[2k], points[2k + 1]
+                                     double *polygon, // read as (x, y) pairs: polygon[2i], polygon[2i + 1]
+                                     anuga_int *indices, // M-Array for storage indices
+                                     const anuga_int closed, // 1: a point on a polygon edge counts as inside
+                                     const anuga_int verbose) // NOTE(review): not read here - the progress printf calls are commented out
+{
 
-  double minpx, maxpx, minpy, maxpy, x, y, px_i, py_i, px_j, py_j, rtol=0.0, atol=0.0;
-  int64_t i, j, k, outside_index, inside_index, inside;
+  double minpx, maxpx, minpy, maxpy, rtol = 0.0, atol = 0.0; // polygon bounding box; both tolerances are fixed at 0 here
+  anuga_int outside_index, inside_index; // fill positions in indices: from the back (outside) and the front (inside)
 
   // Find min and max of poly used for optimisation when points
   // are far away from polygon
-  
-  // FIXME(Ole): Pass in rtol and atol from Python
-
-  minpx = polygon[0]; maxpx = minpx;
-  minpy = polygon[1]; maxpy = minpy;
 
-  for (i=0; i<N; i++) {
-    px_i = polygon[2*i];
-    py_i = polygon[2*i + 1];
+  // FIXME(Ole): Pass in rtol and atol from Python
 
-    if (px_i < minpx) minpx = px_i;
-    if (px_i > maxpx) maxpx = px_i;
-    if (py_i < minpy) minpy = py_i;
-    if (py_i > maxpy) maxpy = py_i;
+  minpx = polygon[0]; // seed the bounding box with the first vertex
+  maxpx = minpx;
+  minpy = polygon[1];
+  maxpy = minpy;
+#pragma omp parallel for reduction(min : minpx, minpy) reduction(max : maxpx, maxpy)
+  for (int i = 0; i < N; i++) // NOTE(review): int index against anuga_int bound N - confirm the widths agree
+  {
+    double px_i = polygon[2 * i];
+    double py_i = polygon[2 * i + 1];
+
+    if (px_i < minpx)
+      minpx = px_i;
+    if (px_i > maxpx)
+      maxpx = px_i;
+    if (py_i < minpy)
+      minpy = py_i;
+    if (py_i > maxpy)
+      maxpy = py_i;
   }
 
   // Begin main loop (for each point)
-  inside_index = 0;    // Keep track of points inside
-  outside_index = M-1; // Keep track of points outside (starting from end)   
-  if (verbose){
-     printf("Separating %ld points\n", M);
-  }  
-  for (k=0; k<M; k++) {
-    if (verbose){
-      if (k %((M+10)/10)==0) printf("Doing %ld of %ld\n", k, M);
-    }
-    
-    x = points[2*k];
-    y = points[2*k + 1];
-
-    inside = 0;
+  inside_index = 0;      // Keep track of points inside
+  outside_index = M - 1; // Keep track of points outside (starting from end)
+  // if (verbose){
+  //    printf("Separating %ld points\n", M);
+  // }
+  // TODO, JLGV: Use OpenMP to parallelise this loop
+  for (int k = 0; k < M; k++) // NOTE(review): int index against anuga_int bound M - confirm the widths agree
+  {
+    // if (verbose){
+    //   if (k %((M+10)/10)==0) printf("Doing %ld of %ld\n", k, M);
+    // }
+
+    double x = points[2 * k];
+    double y = points[2 * k + 1];
+
+    int inside = 0; // flipped at each counted edge crossing; 1 at the end means inside
 
     // Optimisation
-    if ((x > maxpx) || (x < minpx) || (y > maxpy) || (y < minpy)) {
+    if ((x > maxpx) || (x < minpx) || (y > maxpy) || (y < minpy)) // outside the bounding box: inside stays 0
+    {
       // Nothing
-    } else {   
+    }
+    else
+    {
       // Check polygon
-      for (i=0; i<N; i++) {
-        j = (i+1)%N;
+      for (int i = 0; i < N; i++)
+      {
+        int j = (i + 1) % N; // next vertex, wrapping to 0 after the last one
 
-        px_i = polygon[2*i];
-        py_i = polygon[2*i+1];
-        px_j = polygon[2*j];
-        py_j = polygon[2*j+1];
+        double px_i = polygon[2 * i];
+        double py_i = polygon[2 * i + 1];
+        double px_j = polygon[2 * j];
+        double py_j = polygon[2 * j + 1];
 
         // Check for case where point is contained in line segment
-        if (__point_on_line(x, y, px_i, py_i, px_j, py_j, rtol, atol)) {
-	  if (closed == 1) {
-	    inside = 1;
-	  } else {
-	    inside = 0;
-	  }
-	  break;
-        } else {
-          //Check if truly inside polygon
-	  if ( ((py_i < y) && (py_j >= y)) ||
-	       ((py_j < y) && (py_i >= y)) ) {
-	    if (px_i + (y-py_i)/(py_j-py_i)*(px_j-px_i) < x)
-	      inside = 1-inside;
-	  }
+        if (__point_on_line(x, y, px_i, py_i, px_j, py_j, rtol, atol))
+        {
+          if (closed == 1)
+          {
+            inside = 1;
+          }
+          else
+          {
+            inside = 0;
+          }
+          break; // boundary point: the result is decided, stop scanning edges
+        }
+        else
+        {
+          // Check if truly inside polygon
+          if (((py_i < y) && (py_j >= y)) || // the edge crosses the horizontal line through the point
+              ((py_j < y) && (py_i >= y)))
+          {
+            if (px_i + (y - py_i) / (py_j - py_i) * (px_j - px_i) < x) // the crossing is at a smaller x than the point
+              inside = 1 - inside;
+          }
         }
       }
-    } 
-    if (inside == 1) {
+    }
+    if (inside == 1)
+    {
       indices[inside_index] = k;
       inside_index += 1;
-    } else {
+    }
+    else
+    {
       indices[outside_index] = k;
-      outside_index -= 1;    
+      outside_index -= 1; // outside points are filled downwards from position M - 1
     }
   } // End k
 
   return inside_index;
 }
+// anuga_int __separate_points_by_polygon(anuga_int M,     // Number of points
+// 				 anuga_int N,     // Number of polygon vertices
+// 				 double* points,
+// 				 double* polygon,
+// 				 anuga_int* indices,  // M-Array for storage indices
+// 				 anuga_int closed,
+// 				 anuga_int verbose) {
+
+//   double minpx, maxpx, minpy, maxpy, x, y, px_i, py_i, px_j, py_j, rtol=0.0, atol=0.0;
+//   anuga_int i, j, k, outside_index, inside_index, inside;
+
+//   // Find min and max of poly used for optimisation when points
+//   // are far away from polygon
+
+//   // FIXME(Ole): Pass in rtol and atol from Python
+
+//   minpx = polygon[0]; maxpx = minpx;
+//   minpy = polygon[1]; maxpy = minpy;
+
+//   for (i=0; i<N; i++) {
+//     px_i = polygon[2*i];
+//     py_i = polygon[2*i + 1];
+
+//     if (px_i < minpx) minpx = px_i;
+//     if (px_i > maxpx) maxpx = px_i;
+//     if (py_i < minpy) minpy = py_i;
+//     if (py_i > maxpy) maxpy = py_i;
+//   }
+
+//   // Begin main loop (for each point)
+//   inside_index = 0;    // Keep track of points inside
+//   outside_index = M-1; // Keep track of points outside (starting from end)
+//   if (verbose){
+//      printf("Separating %ld points\n", M);
+//   }
+//   for (k=0; k<M; k++) {
+//     if (verbose){
+//       if (k %((M+10)/10)==0) printf("Doing %ld of %ld\n", k, M);
+//     }
+
+//     x = points[2*k];
+//     y = points[2*k + 1];
+
+//     inside = 0;
+
+//     // Optimisation
+//     if ((x > maxpx) || (x < minpx) || (y > maxpy) || (y < minpy)) {
+//       // Nothing
+//     } else {
+//       // Check polygon
+//       for (i=0; i<N; i++) {
+//         j = (i+1)%N;
+
+//         px_i = polygon[2*i];
+//         py_i = polygon[2*i+1];
+//         px_j = polygon[2*j];
+//         py_j = polygon[2*j+1];
+
+//         // Check for case where point is contained in line segment
+//         if (__point_on_line(x, y, px_i, py_i, px_j, py_j, rtol, atol)) {
+// 	  if (closed == 1) {
+// 	    inside = 1;
+// 	  } else {
+// 	    inside = 0;
+// 	  }
+// 	  break;
+//         } else {
+//           //Check if truly inside polygon
+// 	  if ( ((py_i < y) && (py_j >= y)) ||
+// 	       ((py_j < y) && (py_i >= y)) ) {
+// 	    if (px_i + (y-py_i)/(py_j-py_i)*(px_j-px_i) < x)
+// 	      inside = 1-inside;
+// 	  }
+//         }
+//       }
+//     }
+//     if (inside == 1) {
+//       indices[inside_index] = k;
+//       inside_index += 1;
+//     } else {
+//       indices[outside_index] = k;
+//       outside_index -= 1;
+//     }
+//   } // End k
+
+//   return inside_index;
+// }
diff --git a/anuga/geometry/setup.py b/anuga/geometry/setup.py
deleted file mode 100644
index a2ff799e4..000000000
--- a/anuga/geometry/setup.py
+++ /dev/null
@@ -1,33 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-from Cython.Build import cythonize
-import Cython.Compiler.Options
-Cython.Compiler.Options.annotate = True
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('geometry', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    #util_dir = os.path.abspath(join(os.path.dirname(__file__),'..','utilities'))
-    util_dir = join('..','utilities')
-    
-    config.add_extension('polygon_ext',
-                         sources=['polygon_ext.pyx'],
-                         include_dirs=[util_dir])
-    
-    config.ext_modules = cythonize(config.ext_modules,annotate=True)
-    
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/geometry/tests/test_geometry.py b/anuga/geometry/tests/test_geometry.py
index 751236475..20c6e6586 100644
--- a/anuga/geometry/tests/test_geometry.py
+++ b/anuga/geometry/tests/test_geometry.py
@@ -105,6 +105,6 @@ def test_get_siblings(self):
 ################################################################################
 
 if __name__ == "__main__":
-    mysuite = unittest.makeSuite(Test_Geometry, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Geometry)
     runner = unittest.TextTestRunner()
-    runner.run(mysuite)
+    runner.run(suite)
diff --git a/anuga/geometry/tests/test_polygon.py b/anuga/geometry/tests/test_polygon.py
index 740475309..65347e9a2 100644
--- a/anuga/geometry/tests/test_polygon.py
+++ b/anuga/geometry/tests/test_polygon.py
@@ -1982,7 +1982,6 @@ def test_is_polygon_complex2(self):
 
 
 if __name__ == "__main__":
-    # _intersection_bug_20081110_TR_BL')
-    suite = unittest.makeSuite(Test_Polygon, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Polygon)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/geospatial_data/setup.py b/anuga/geospatial_data/setup.py
deleted file mode 100644
index 352196b82..000000000
--- a/anuga/geospatial_data/setup.py
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('geospatial_data', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/geospatial_data/tests/test_geospatial_data.py b/anuga/geospatial_data/tests/test_geospatial_data.py
index 8c6a137bd..5e8423ec9 100644
--- a/anuga/geospatial_data/tests/test_geospatial_data.py
+++ b/anuga/geospatial_data/tests/test_geospatial_data.py
@@ -1857,7 +1857,7 @@ def test_find_optimal_smoothing_parameter2(self):
 ################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Geospatial_data, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Geospatial_data)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
 
diff --git a/anuga/lib/setup.py b/anuga/lib/setup.py
deleted file mode 100644
index d65f186b8..000000000
--- a/anuga/lib/setup.py
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('lib', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/load_mesh/setup.py b/anuga/load_mesh/setup.py
deleted file mode 100644
index 62227492b..000000000
--- a/anuga/load_mesh/setup.py
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('load_mesh', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/load_mesh/tests/test_loadASCII.py b/anuga/load_mesh/tests/test_loadASCII.py
index 76e937791..dbd91283a 100644
--- a/anuga/load_mesh/tests/test_loadASCII.py
+++ b/anuga/load_mesh/tests/test_loadASCII.py
@@ -551,6 +551,6 @@ def throws_error_2_screen_import_mesh_bad(self):
 ################################################################################
 
 if __name__ == '__main__':
-    suite = unittest.makeSuite(loadASCIITestCase,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(loadASCIITestCase)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
diff --git a/anuga/mesh_engine/setup.py b/anuga/mesh_engine/setup.py
deleted file mode 100644
index 11c5da4d8..000000000
--- a/anuga/mesh_engine/setup.py
+++ /dev/null
@@ -1,27 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-from Cython.Build import cythonize
-import Cython.Compiler.Options
-Cython.Compiler.Options.annotate = True
-
-def configuration(parent_package='',top_path=None):
-
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-
-    config = Configuration('mesh_engine', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    #util_dir = os.path.abspath(join(os.path.dirname(__file__),'..','utilities'))
-    util_dir = join('..','utilities') 
-
-    return config
-
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/mesh_engine/tests/test_generate_mesh.py b/anuga/mesh_engine/tests/test_generate_mesh.py
index aace09483..4a43961cf 100644
--- a/anuga/mesh_engine/tests/test_generate_mesh.py
+++ b/anuga/mesh_engine/tests/test_generate_mesh.py
@@ -462,6 +462,6 @@ def test_pointattlist(self):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(triangTestCase, 'test_')
+    suite = unittest.TestLoader().loadTestsFromTestCase(triangTestCase)
     runner = unittest.TextTestRunner(verbosity=2)
     runner.run(suite)
diff --git a/anuga/meson.build b/anuga/meson.build
index 07e3b0347..709e6e5eb 100644
--- a/anuga/meson.build
+++ b/anuga/meson.build
@@ -7,7 +7,6 @@ python_sources = [
     'extras.py',
     '__init__.py',
     '__metadata__.py',
-    'old_setup.py',
     'revision.py',
 ]
 
diff --git a/anuga/old_setup.py b/anuga/old_setup.py
deleted file mode 100644
index b3c3e9e5b..000000000
--- a/anuga/old_setup.py
+++ /dev/null
@@ -1,50 +0,0 @@
-
-
-import sys
-
-
-def configuration(parent_package='',top_path=None):
-
-    from numpy.distutils.misc_util import Configuration
-
-    config = Configuration('anuga',parent_package,top_path)
-    config.add_subpackage('abstract_2d_finite_volumes')
-    config.add_subpackage('advection')
-    config.add_subpackage('alpha_shape')
-    config.add_subpackage('caching')
-    config.add_subpackage('coordinate_transforms')
-    config.add_subpackage('culvert_flows')
-    config.add_subpackage('damage_modelling')
-    config.add_subpackage('file')
-    config.add_subpackage('file_conversion')
-    config.add_subpackage('fit_interpolate')
-    config.add_subpackage('geometry')
-    config.add_subpackage('geospatial_data')
-    config.add_subpackage('lib')
-    config.add_subpackage('load_mesh')
-    config.add_subpackage('mesh_engine')
-    config.add_subpackage('operators')
-    config.add_subpackage('parallel')
-    config.add_subpackage('pmesh')
-    config.add_subpackage('simulation')
-    config.add_subpackage('shallow_water')
-    config.add_subpackage('structures')
-    config.add_subpackage('tsunami_source')
-    config.add_subpackage('utilities')
-    config.add_subpackage('validation_utilities')
-
-
-    try:
-        import vtk
-        config.add_subpackage('visualiser')
-    except:
-        pass
-
-    config.make_config_py()
-
-    return config
-
-if __name__ == '__main__':
-    print('This is the wrong setup.py file to run')
-    #from numpy.distutils.core import setup
-    #setup(**configuration(top_path='').todict())
diff --git a/anuga/operators/kinematic_viscosity_operator.c b/anuga/operators/kinematic_viscosity_operator.c
index 60ea5559c..0d798d6a0 100644
--- a/anuga/operators/kinematic_viscosity_operator.c
+++ b/anuga/operators/kinematic_viscosity_operator.c
@@ -1,23 +1,23 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdint.h>
-
+#include "anuga_typedefs.h"
 //Rough quicksort implementation (for build_operator_matrix)
 // taken from http://cprogramminglanguage.net/quicksort-algorithm-c-source-code.aspx
 
-void swap(int64_t *x, int64_t *y) {
-    int64_t temp;
+void swap(anuga_int *x, anuga_int *y) { // exchange the two values pointed to by x and y
+    anuga_int temp; // holds *x while it is overwritten
     temp = *x;
     *x = *y;
     *y = temp;
 }
 
-int64_t choose_pivot(int64_t i, int64_t j) {
+anuga_int choose_pivot(anuga_int i, anuga_int j) { // pivot position: the midpoint of i and j, using integer division
     return ((i + j) / 2);
 }
 
-void quicksort(int64_t list[], int64_t m, int64_t n) {
-    int64_t key, i, j, k;
+void quicksort(anuga_int list[], anuga_int m, anuga_int n) {
+    anuga_int key, i, j, k;
     if (m < n) {
         k = choose_pivot(m, n);
         swap(&list[m], &list[k]);
@@ -40,17 +40,16 @@ void quicksort(int64_t list[], int64_t m, int64_t n) {
     }
 }
 
-int64_t _build_geo_structure(int64_t n,
-        int64_t tot_len,
+anuga_int _build_geo_structure(anuga_int n,
+        anuga_int tot_len,
         double *centroids,
-        int64_t *neighbours,
+        anuga_int *neighbours,
         double *edgelengths,
         double *edge_midpoints,
-        int64_t *geo_indices,
+        anuga_int *geo_indices,
         double *geo_values) {
-    int64_t i, edge, edge_counted, j, m;
+    anuga_int i, edge, j, m;
     double dist, this_x, this_y, other_x, other_y, edge_length;
-    edge_counted = 0;
     for (i = 0; i < n; i++) {
         //The centroid coordinates of triangle i
         this_x = centroids[2 * i];
@@ -65,7 +64,6 @@ int64_t _build_geo_structure(int64_t n,
             if (j < 0) {
                 m = -j - 1;
                 geo_indices[3 * i + edge] = n + m;
-                edge_counted++;
 
                 other_x = edge_midpoints[2 * (3 * i + edge)];
                 other_y = edge_midpoints[2 * (3 * i + edge) + 1];
@@ -85,16 +83,16 @@ int64_t _build_geo_structure(int64_t n,
     return 0;
 }
 
-int64_t _build_elliptic_matrix_not_symmetric(int64_t n,
-        int64_t tot_len,
-        int64_t *geo_indices,
+anuga_int _build_elliptic_matrix_not_symmetric(anuga_int n,
+        anuga_int tot_len,
+        anuga_int *geo_indices,
         double *geo_values,
         double *cell_data,
         double *bdry_data,
         double *data,
-        int64_t *colind) {
-    int64_t i, k, edge, j[4], sorted_j[4], this_index;
-    double h_j, v[3], v_i; //v[k] = value of the interaction of edge k in a given triangle, v_i = (i,i) entry
+        anuga_int *colind) {
+    anuga_int i, k, edge, j[4], sorted_j[4], this_index;
+    double v[3], v_i; //v[k] = value of the interaction of edge k in a given triangle, v_i = (i,i) entry
     for (i = 0; i < n; i++) {
         v_i = 0.0;
         j[3] = i;
@@ -138,15 +136,15 @@ int64_t _build_elliptic_matrix_not_symmetric(int64_t n,
     return 0;
 }
 
-int64_t _build_elliptic_matrix(int64_t n,
-        int64_t tot_len,
-        int64_t *geo_indices,
+anuga_int _build_elliptic_matrix(anuga_int n,
+        anuga_int tot_len,
+        anuga_int *geo_indices,
         double *geo_values,
         double *cell_data,
         double *bdry_data,
         double *data,
-        int64_t *colind) {
-    int64_t i, k, edge, j[4], sorted_j[4], this_index;
+        anuga_int *colind) {
+    anuga_int i, k, edge, j[4], sorted_j[4], this_index;
     double h_j, v[3], v_i; //v[k] = value of the interaction of edge k in a given triangle, v_i = (i,i) entry
     for (i = 0; i < n; i++) {
         v_i = 0.0;
@@ -191,16 +189,16 @@ int64_t _build_elliptic_matrix(int64_t n,
     return 0;
 }
 
-int64_t _update_elliptic_matrix_not_symmetric(int64_t n,
-        int64_t tot_len,
-        int64_t *geo_indices,
+anuga_int _update_elliptic_matrix_not_symmetric(anuga_int n,
+        anuga_int tot_len,
+        anuga_int *geo_indices,
         double *geo_values,
         double *cell_data,
         double *bdry_data,
         double *data,
-        int64_t *colind) {
-    int64_t i, k, edge, j[4], sorted_j[4], this_index;
-    double h_j, v[3], v_i; //v[k] = value of the interaction of edge k in a given triangle, v_i = (i,i) entry
+        anuga_int *colind) {
+    anuga_int i, k, edge, j[4], sorted_j[4], this_index;
+    double  v[3], v_i; //v[k] = value of the interaction of edge k in a given triangle, v_i = (i,i) entry
     for (i = 0; i < n; i++) {
         v_i = 0.0;
         j[3] = i;
@@ -208,11 +206,6 @@ int64_t _update_elliptic_matrix_not_symmetric(int64_t n,
         //Get the values of each interaction, and the column index at which they occur
         for (edge = 0; edge < 3; edge++) {
             j[edge] = geo_indices[3 * i + edge];
-            if (j[edge] < n) { //interior
-                h_j = cell_data[j[edge]];
-            } else { //boundary
-                h_j = bdry_data[j[edge] - n];
-            }
             v[edge] = -cell_data[i] * geo_values[3 * i + edge]; //the negative of the individual interaction
             v_i += cell_data[i] * geo_values[3 * i + edge]; //sum the three interactions
         }
@@ -245,15 +238,15 @@ int64_t _update_elliptic_matrix_not_symmetric(int64_t n,
     return 0;
 }
 
-int64_t _update_elliptic_matrix(int64_t n,
-        int64_t tot_len,
-        int64_t *geo_indices,
+anuga_int _update_elliptic_matrix(anuga_int n,
+        anuga_int tot_len,
+        anuga_int *geo_indices,
         double *geo_values,
         double *cell_data,
         double *bdry_data,
         double *data,
-        int64_t *colind) {
-    int64_t i, k, edge, j[4], sorted_j[4], this_index;
+        anuga_int *colind) {
+    anuga_int i, k, edge, j[4], sorted_j[4], this_index;
     double h_j, v[3], v_i; //v[k] = value of the interaction of edge k in a given triangle, v_i = (i,i) entry
     for (i = 0; i < n; i++) {
         v_i = 0.0;
diff --git a/anuga/operators/mannings_operator.c b/anuga/operators/mannings_operator.c
index c4340bccf..9967b1775 100644
--- a/anuga/operators/mannings_operator.c
+++ b/anuga/operators/mannings_operator.c
@@ -15,17 +15,17 @@
 
 #include "math.h"
 #include "util_ext.h"
+#include "anuga_constants.h"
 #include <stdio.h>
 #include <stdint.h>
 
-const double pi = 3.14159265358979;
 
-void _manning_friction_flat(double g, double eps, int64_t N,
+void _manning_friction_flat(double g, double eps, anuga_int N,
                double* w, double* zv,
                double* uh, double* vh,
                double* eta, double* xmom, double* ymom) {
 
-  int64_t k, k3;
+  anuga_int k, k3;
   double S, h, z, z0, z1, z2;
 
   for (k=0; k<N; k++) {
@@ -53,12 +53,12 @@ void _manning_friction_flat(double g, double eps, int64_t N,
 }
 
 
-void _manning_friction_sloped(double g, double eps, int64_t N,
+void _manning_friction_sloped(double g, double eps, anuga_int N,
                double* x, double* w, double* zv,
                double* uh, double* vh,
                double* eta, double* xmom_update, double* ymom_update) {
 
-  int64_t k, k3, k6;
+  anuga_int k, k3, k6;
   double S, h, z, z0, z1, z2, zs, zx, zy;
   double x0,y0,x1,y1,x2,y2;
 
@@ -102,12 +102,12 @@ void _manning_friction_sloped(double g, double eps, int64_t N,
 
 
 
-void _chezy_friction(double g, double eps, int64_t N,
+void _chezy_friction(double g, double eps, anuga_int N,
                double* x, double* w, double* zv,
                double* uh, double* vh,
                double* chezy, double* xmom_update, double* ymom_update) {
 
-  int64_t k, k3, k6;
+  anuga_int k, k3, k6;
   double S, h, z, z0, z1, z2, zs, zx, zy;
   double x0,y0,x1,y1,x2,y2;
 
diff --git a/anuga/operators/setup.py b/anuga/operators/setup.py
deleted file mode 100644
index 9c8aad8a8..000000000
--- a/anuga/operators/setup.py
+++ /dev/null
@@ -1,38 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-from Cython.Build import cythonize
-import Cython.Compiler.Options
-Cython.Compiler.Options.annotate = True
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('operators', parent_package, top_path)
-
-    config.add_data_dir('tests')
-
-    #util_dir = os.path.abspath(join(os.path.dirname(__file__),'..','utilities'))
-    util_dir = join('..','utilities')
-    
-    config.add_extension('mannings_operator_ext',
-                         sources=['mannings_operator_ext.pyx'],
-                         include_dirs=[util_dir])
-
-    config.add_extension('kinematic_viscosity_operator_ext',
-                         sources=['kinematic_viscosity_operator_ext.pyx'],
-                         include_dirs=[util_dir])
-
-    config.ext_modules = cythonize(config.ext_modules, annotate=True)
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
- 
diff --git a/anuga/operators/tests/test_base_operator.py b/anuga/operators/tests/test_base_operator.py
index c30e84e77..53d2ff539 100644
--- a/anuga/operators/tests/test_base_operator.py
+++ b/anuga/operators/tests/test_base_operator.py
@@ -46,6 +46,6 @@ def test_create_operator(self):
 ################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Operator, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Operator)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/operators/tests/test_boundary_flux_integral_operator.py b/anuga/operators/tests/test_boundary_flux_integral_operator.py
index 81dbf136d..d6e57d195 100644
--- a/anuga/operators/tests/test_boundary_flux_integral_operator.py
+++ b/anuga/operators/tests/test_boundary_flux_integral_operator.py
@@ -130,7 +130,7 @@ def test_boundary_flux_operator_DE2(self):
          
         
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_boundary_flux_integral_operator, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_boundary_flux_integral_operator)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
 
diff --git a/anuga/operators/tests/test_erosion_operators.py b/anuga/operators/tests/test_erosion_operators.py
index f55af49ba..e3cec6573 100644
--- a/anuga/operators/tests/test_erosion_operators.py
+++ b/anuga/operators/tests/test_erosion_operators.py
@@ -100,6 +100,6 @@ def test_erosion_operator_simple_de0(self):
 
             
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_erosion_operators, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_erosion_operators)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/operators/tests/test_friction_operators.py b/anuga/operators/tests/test_friction_operators.py
index 49db53be5..e1057d96a 100644
--- a/anuga/operators/tests/test_friction_operators.py
+++ b/anuga/operators/tests/test_friction_operators.py
@@ -216,6 +216,6 @@ def test_friction_string(self):
             Set_depth_friction_operator(domain, friction="invalid_string", region=region)
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_set_friction_operators, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_set_friction_operators)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/operators/tests/test_kinematic_viscosity_operator.py b/anuga/operators/tests/test_kinematic_viscosity_operator.py
index ed133c88c..a183d4b04 100644
--- a/anuga/operators/tests/test_kinematic_viscosity_operator.py
+++ b/anuga/operators/tests/test_kinematic_viscosity_operator.py
@@ -1391,6 +1391,6 @@ def test_kinematic_operator_string(self):
 ################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_kinematic_viscosity, 'test_') #test_')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_kinematic_viscosity)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/operators/tests/test_rate_operators.py b/anuga/operators/tests/test_rate_operators.py
index 4a8766d29..e97715f20 100644
--- a/anuga/operators/tests/test_rate_operators.py
+++ b/anuga/operators/tests/test_rate_operators.py
@@ -1770,6 +1770,6 @@ def main_spatial_rate(x,y,t):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_rate_operators, 'test_')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_rate_operators)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/operators/tests/test_set_elevation_operator.py b/anuga/operators/tests/test_set_elevation_operator.py
index a40e1a182..e7dcfc7d1 100644
--- a/anuga/operators/tests/test_set_elevation_operator.py
+++ b/anuga/operators/tests/test_set_elevation_operator.py
@@ -728,6 +728,6 @@ def elev(t):
 
             
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_set_elevation_operator, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_set_elevation_operator)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/operators/tests/test_set_quantity.py b/anuga/operators/tests/test_set_quantity.py
index 28eefa600..962f6df6c 100644
--- a/anuga/operators/tests/test_set_quantity.py
+++ b/anuga/operators/tests/test_set_quantity.py
@@ -506,6 +506,6 @@ def stage(x,y, t):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_set_quantity, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_set_quantity)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/operators/tests/test_set_stage_operator.py b/anuga/operators/tests/test_set_stage_operator.py
index a68c6e301..ac39f4bb2 100644
--- a/anuga/operators/tests/test_set_stage_operator.py
+++ b/anuga/operators/tests/test_set_stage_operator.py
@@ -486,6 +486,6 @@ def stage(x,y, t):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_set_stage_operators, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_set_stage_operators)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/operators/tests/test_set_w_uh_vh_operators.py b/anuga/operators/tests/test_set_w_uh_vh_operators.py
index 0f9336067..29cb399d0 100644
--- a/anuga/operators/tests/test_set_w_uh_vh_operators.py
+++ b/anuga/operators/tests/test_set_w_uh_vh_operators.py
@@ -206,6 +206,6 @@ def test_set_w_uh_vh_operator_time(self):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_set_w_uh_vh_operators, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_set_w_uh_vh_operators)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/parallel/setup.py b/anuga/parallel/setup.py
deleted file mode 100644
index 25a8b18c9..000000000
--- a/anuga/parallel/setup.py
+++ /dev/null
@@ -1,20 +0,0 @@
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    
-    config = Configuration('parallel', parent_package, top_path)
-
-    config.add_data_dir('tests')
-    config.add_data_dir('data')
-    
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
-
-
-
diff --git a/anuga/parallel/tests/test_distribute_mesh.py b/anuga/parallel/tests/test_distribute_mesh.py
index c2d801ca0..f6e6f73ec 100644
--- a/anuga/parallel/tests/test_distribute_mesh.py
+++ b/anuga/parallel/tests/test_distribute_mesh.py
@@ -2894,6 +2894,6 @@ def test_build_extract_submesh_3(self):
 # -------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Distribute_Mesh, "test")
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Distribute_Mesh)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/parallel/tests/test_parallel_boyd_box_operator.py b/anuga/parallel/tests/test_parallel_boyd_box_operator.py
index 6d04da40d..f576449ac 100644
--- a/anuga/parallel/tests/test_parallel_boyd_box_operator.py
+++ b/anuga/parallel/tests/test_parallel_boyd_box_operator.py
@@ -324,7 +324,7 @@ def assert_(condition, msg="Assertion Failed"):
 if __name__=="__main__":
     if numprocs == 1:
         runner = unittest.TextTestRunner()
-        suite = unittest.makeSuite(Test_parallel_boyd_box_operator, 'test')
+        suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_boyd_box_operator)
         #print "Running for numproc = 1"
         runner.run(suite)
     else:
diff --git a/anuga/parallel/tests/test_parallel_dist_settings.py b/anuga/parallel/tests/test_parallel_dist_settings.py
index afa577ae6..6436e53c9 100644
--- a/anuga/parallel/tests/test_parallel_dist_settings.py
+++ b/anuga/parallel/tests/test_parallel_dist_settings.py
@@ -172,7 +172,7 @@ def assert_(condition, msg="Assertion Failed"):
     if numprocs == 1: 
         if verbose: print('SEQUENTIAL START')
         runner = unittest.TextTestRunner()
-        suite = unittest.makeSuite(Test_parallel_sw_flow, 'test')
+        suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_sw_flow)
         runner.run(suite)
     else:
         #------------------------------------------
diff --git a/anuga/parallel/tests/test_parallel_distribute_domain.py b/anuga/parallel/tests/test_parallel_distribute_domain.py
index db32f0199..d8ff62945 100644
--- a/anuga/parallel/tests/test_parallel_distribute_domain.py
+++ b/anuga/parallel/tests/test_parallel_distribute_domain.py
@@ -103,6 +103,6 @@ def test_that_sequential_and_parallel_outputs_are_identical(self):
            
 if __name__ == "__main__":
     runner = unittest.TextTestRunner()
-    suite = unittest.makeSuite(Test_parallel_distribute_domain, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_distribute_domain)
     runner.run(suite)
 
diff --git a/anuga/parallel/tests/test_parallel_file_boundary.py b/anuga/parallel/tests/test_parallel_file_boundary.py
index 6531aa74f..9d285ad43 100644
--- a/anuga/parallel/tests/test_parallel_file_boundary.py
+++ b/anuga/parallel/tests/test_parallel_file_boundary.py
@@ -491,8 +491,7 @@ def assert_(condition, msg="Assertion Failed"):
     #verbose=False
     if myid ==0 and verbose: 
         print('PARALLEL START')
-    suite = unittest.makeSuite(Test_urs2sts_parallel,'parallel_test')
-    #suite = unittest.makeSuite(Test_urs2sts_parallel,'sequential_test')
+    suite = unittest.TestSuite(Test_urs2sts_parallel(name) for name in dir(Test_urs2sts_parallel) if name.startswith('parallel_test'))  # keep the 'parallel_test' method prefix the old makeSuite call selected
     runner = unittest.TextTestRunner()
     runner.run(suite)
 
diff --git a/anuga/parallel/tests/test_parallel_frac_op.py b/anuga/parallel/tests/test_parallel_frac_op.py
index 55ca24c9d..25f11ebe5 100644
--- a/anuga/parallel/tests/test_parallel_frac_op.py
+++ b/anuga/parallel/tests/test_parallel_frac_op.py
@@ -301,7 +301,7 @@ def assert_(condition, msg="Assertion Failed"):
 
     if numprocs == 1:
         runner = unittest.TextTestRunner()
-        suite = unittest.makeSuite(Test_parallel_frac_op, 'test')
+        suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_frac_op)
         #print "Running for numproc = 1"
         runner.run(suite)
     else:
diff --git a/anuga/parallel/tests/test_parallel_inlet_operator.py b/anuga/parallel/tests/test_parallel_inlet_operator.py
index bcf9025ae..0667175d1 100644
--- a/anuga/parallel/tests/test_parallel_inlet_operator.py
+++ b/anuga/parallel/tests/test_parallel_inlet_operator.py
@@ -288,7 +288,7 @@ def assert_(condition, msg="Assertion Failed"):
 if __name__=="__main__":
     if numprocs == 1:
         runner = unittest.TextTestRunner()
-        suite = unittest.makeSuite(Test_parallel_frac_op, 'test')
+        suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_frac_op)
         #print "Running for numproc = 1"
         runner.run(suite)
     else:
diff --git a/anuga/parallel/tests/test_parallel_inlet_operator_with_region.py b/anuga/parallel/tests/test_parallel_inlet_operator_with_region.py
index 0ba5b7bf2..20367b534 100644
--- a/anuga/parallel/tests/test_parallel_inlet_operator_with_region.py
+++ b/anuga/parallel/tests/test_parallel_inlet_operator_with_region.py
@@ -285,7 +285,7 @@ def assert_(condition, msg="Assertion Failed"):
 if __name__=="__main__":
     if numprocs == 1:
         runner = unittest.TextTestRunner()
-        suite = unittest.makeSuite(Test_parallel_frac_op, 'test')
+        suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_frac_op)
         #print "Running for numproc = 1"
         runner.run(suite)
     else:
diff --git a/anuga/parallel/tests/test_parallel_riverwall.py b/anuga/parallel/tests/test_parallel_riverwall.py
index 0792f1de1..f407cf0a9 100644
--- a/anuga/parallel/tests/test_parallel_riverwall.py
+++ b/anuga/parallel/tests/test_parallel_riverwall.py
@@ -88,5 +88,5 @@ def test_that_sequential_and_parallel_outputs_are_identical(self):
 
 if __name__ == "__main__":
     runner = unittest.TextTestRunner()
-    suite = unittest.makeSuite(Test_parallel_riverwall, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_riverwall)
     runner.run(suite)
diff --git a/anuga/parallel/tests/test_parallel_shallow_domain.py b/anuga/parallel/tests/test_parallel_shallow_domain.py
index 701d1a883..13a58b7f8 100644
--- a/anuga/parallel/tests/test_parallel_shallow_domain.py
+++ b/anuga/parallel/tests/test_parallel_shallow_domain.py
@@ -88,5 +88,5 @@ def test_that_sequential_and_parallel_outputs_are_identical(self):
 
 if __name__ == "__main__":
     runner = unittest.TextTestRunner()
-    suite = unittest.makeSuite(Test_parallel_shallow_domain, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_shallow_domain)
     runner.run(suite)
diff --git a/anuga/parallel/tests/test_parallel_sw_flow.py b/anuga/parallel/tests/test_parallel_sw_flow.py
index 5e9a484d8..e5ab98b24 100644
--- a/anuga/parallel/tests/test_parallel_sw_flow.py
+++ b/anuga/parallel/tests/test_parallel_sw_flow.py
@@ -86,5 +86,5 @@ def test_that_sequential_and_parallel_outputs_are_identical(self):
 
 if __name__ == "__main__":
     runner = unittest.TextTestRunner()
-    suite = unittest.makeSuite(Test_parallel_sw_flow, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_sw_flow)
     runner.run(suite)
diff --git a/anuga/parallel/tests/test_parallel_sw_flow_de0.py b/anuga/parallel/tests/test_parallel_sw_flow_de0.py
index c5e4c1da4..15626ea80 100644
--- a/anuga/parallel/tests/test_parallel_sw_flow_de0.py
+++ b/anuga/parallel/tests/test_parallel_sw_flow_de0.py
@@ -201,7 +201,7 @@ def assert_(condition, msg="Assertion Failed"):
 if __name__=="__main__":
     if numprocs == 1: 
         runner = unittest.TextTestRunner()
-        suite = unittest.makeSuite(Test_parallel_sw_flow, 'test')
+        suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_sw_flow)
         runner.run(suite)
     else:
 
diff --git a/anuga/parallel/tests/test_parallel_sw_flow_low_froude_0.py b/anuga/parallel/tests/test_parallel_sw_flow_low_froude_0.py
index ea1040702..a75bc41c0 100644
--- a/anuga/parallel/tests/test_parallel_sw_flow_low_froude_0.py
+++ b/anuga/parallel/tests/test_parallel_sw_flow_low_froude_0.py
@@ -208,7 +208,7 @@ def assert_(condition, msg="Assertion Failed"):
 if __name__=="__main__":
     if numprocs == 1:
         runner = unittest.TextTestRunner()
-        suite = unittest.makeSuite(Test_parallel_sw_flow, 'test')
+        suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_sw_flow)
         runner.run(suite)
     else:
 
diff --git a/anuga/parallel/tests/test_parallel_sw_flow_low_froude_1.py b/anuga/parallel/tests/test_parallel_sw_flow_low_froude_1.py
index 5b00106fb..f27f0bacf 100644
--- a/anuga/parallel/tests/test_parallel_sw_flow_low_froude_1.py
+++ b/anuga/parallel/tests/test_parallel_sw_flow_low_froude_1.py
@@ -204,7 +204,7 @@ def assert_(condition, msg="Assertion Failed"):
 if __name__=="__main__":
     if numprocs == 1:
         runner = unittest.TextTestRunner()
-        suite = unittest.makeSuite(Test_parallel_sw_flow, 'test')
+        suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_sw_flow)
         runner.run(suite)
     else:
 
diff --git a/anuga/parallel/tests/test_sequential_dist_sw_flow.py b/anuga/parallel/tests/test_sequential_dist_sw_flow.py
index 2621214d0..de49cc4d4 100644
--- a/anuga/parallel/tests/test_sequential_dist_sw_flow.py
+++ b/anuga/parallel/tests/test_sequential_dist_sw_flow.py
@@ -316,7 +316,7 @@ def assert_(condition, msg="Assertion Failed"):
 if __name__=="__main__":
     if numprocs == 1: 
         runner = unittest.TextTestRunner()
-        suite = unittest.makeSuite(Test_parallel_sw_flow, 'test')
+        suite = unittest.TestLoader().loadTestsFromTestCase(Test_parallel_sw_flow)
         runner.run(suite)
     else:
 
diff --git a/anuga/pmesh/meson.build b/anuga/pmesh/meson.build
index 9e7814309..1bdb7b901 100644
--- a/anuga/pmesh/meson.build
+++ b/anuga/pmesh/meson.build
@@ -8,7 +8,6 @@ python_sources = [
   'mesh.py',
   'mesh_quadtree.py',
   'ProgressBar.py',
-  'setup.py',
   'timing.py',
   'toolbarbutton.py',
   'ungen_example.py',
@@ -21,4 +20,4 @@ py3.install_sources(
 )
 
 subdir('tests')
-subdir('icons')
\ No newline at end of file
+subdir('icons')
diff --git a/anuga/pmesh/setup.py b/anuga/pmesh/setup.py
deleted file mode 100644
index 35624b5ca..000000000
--- a/anuga/pmesh/setup.py
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('pmesh', parent_package, top_path)
-
-    config.add_data_dir('tests')
-    config.add_data_dir('icons')    
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/pmesh/tests/test_mesh.py b/anuga/pmesh/tests/test_mesh.py
index 73aa192c0..a48770240 100644
--- a/anuga/pmesh/tests/test_mesh.py
+++ b/anuga/pmesh/tests/test_mesh.py
@@ -2155,7 +2155,7 @@ def list_comp(A,B):
 ################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(meshTestCase, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(meshTestCase)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
     
diff --git a/anuga/pmesh/tests/test_mesh_interface.py b/anuga/pmesh/tests/test_mesh_interface.py
index 4583f109f..a44adf628 100644
--- a/anuga/pmesh/tests/test_mesh_interface.py
+++ b/anuga/pmesh/tests/test_mesh_interface.py
@@ -996,7 +996,7 @@ def test_create_mesh_from_regions_check_segs(self):
 ################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(TestCase, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestCase)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
 
diff --git a/anuga/pmesh/tests/test_meshquad.py b/anuga/pmesh/tests/test_meshquad.py
index 6baf7a0b6..a1435b88d 100644
--- a/anuga/pmesh/tests/test_meshquad.py
+++ b/anuga/pmesh/tests/test_meshquad.py
@@ -185,6 +185,6 @@ def NOtest_num_visits(self):
 ################################################################################
 
 if __name__ == "__main__":
-    mysuite = unittest.makeSuite(Test_Quad,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Quad)
     runner = unittest.TextTestRunner()
-    runner.run(mysuite)
+    runner.run(suite)
diff --git a/anuga/rain/TODO.md b/anuga/rain/TODO.md
new file mode 100644
index 000000000..ae4767ff6
--- /dev/null
+++ b/anuga/rain/TODO.md
@@ -0,0 +1 @@
+TODO (Jorge): unit tests for the rain package are missing and still need to be written.
diff --git a/anuga/revision.py b/anuga/revision.py
index 882d4413e..66fcc413c 100644
--- a/anuga/revision.py
+++ b/anuga/revision.py
@@ -4,6 +4,6 @@
 The file is automatically generated and should not be modified manually.
 """
 
-__git_sha__ = "b533cb128b798f8bdb4b854e0ab132aaff985e36"
-__git_committed_datetime__ = "2023-06-01 17:25:15+10:00"
+__git_sha__ = "b8e9e28b864cd6989ced54d1bf2a735cbd046d30"
+__git_committed_datetime__ = "2025-06-26 20:19:13+10:00"
 __version__ = "3.2.0dev"
diff --git a/anuga/shallow_water/boundaries.py b/anuga/shallow_water/boundaries.py
index 6e987b230..17d531dde 100644
--- a/anuga/shallow_water/boundaries.py
+++ b/anuga/shallow_water/boundaries.py
@@ -26,7 +26,7 @@
 from anuga.fit_interpolate.interpolate import Modeltime_too_early
 from anuga.config import g as gravity
      
-from anuga.shallow_water.sw_domain_orig_ext import rotate
+from anuga.shallow_water.sw_domain_openmp_ext import rotate, evaluate_reflective_segment
 
 try:
     from numba import jit
@@ -79,6 +79,7 @@ def __init__(self, domain=None):
 
         self.conserved_quantities = np.zeros(3, float)
 
+
     def __repr__(self):
         return 'Reflective_boundary'
 
@@ -168,70 +169,10 @@ def evaluate_segment(self, domain, segment_edges):
         Xvel.boundary_values[ids] = n1*r1 - n2*r2
         Yvel.boundary_values[ids] = n2*r1 + n1*r2
 
-    def evaluate_segment(self, domain, segment_edges):
-        """Apply BC on the boundary edges defined by segment_edges
-
-        :param domain: Apply BC on this domain
-        :param segment_edges: List of boundary cells on which to apply BC
-
-        """
-
-        if segment_edges is None:
-            return
-        if domain is None:
-            return
-
-
-        ids = segment_edges
-        vol_ids  = domain.boundary_cells[ids]
-        edge_ids = domain.boundary_edges[ids]
-
-        Stage = domain.quantities['stage']
-        Elev  = domain.quantities['elevation']
-        Height= domain.quantities['height']
-        Xmom  = domain.quantities['xmomentum']
-        Ymom  = domain.quantities['ymomentum']
-        Xvel  = domain.quantities['xvelocity']
-        Yvel  = domain.quantities['yvelocity']
-
-        Normals = domain.normals
-
-        #print vol_ids
-        #print edge_ids
-        #Normals.reshape((4,3,2))
-        #print Normals.shape
-        #print Normals[vol_ids, 2*edge_ids]
-        #print Normals[vol_ids, 2*edge_ids+1]
-        
-        n1  = Normals[vol_ids,2*edge_ids]
-        n2  = Normals[vol_ids,2*edge_ids+1]
-
-        # Transfer these quantities to the boundary array
-        Stage.boundary_values[ids]  = Stage.edge_values[vol_ids,edge_ids]
-        Elev.boundary_values[ids]   = Elev.edge_values[vol_ids,edge_ids]
-        Height.boundary_values[ids] = Height.edge_values[vol_ids,edge_ids]
-
-        # Rotate and negate Momemtum
-        q1 = Xmom.edge_values[vol_ids,edge_ids]
-        q2 = Ymom.edge_values[vol_ids,edge_ids]
-
-        r1 = -q1*n1 - q2*n2
-        r2 = -q1*n2 + q2*n1
-
-        Xmom.boundary_values[ids] = n1*r1 - n2*r2
-        Ymom.boundary_values[ids] = n2*r1 + n1*r2
-
-        # Rotate and negate Velocity
-        q1 = Xvel.edge_values[vol_ids,edge_ids]
-        q2 = Yvel.edge_values[vol_ids,edge_ids]
-
-        r1 = q1*n1 + q2*n2
-        r2 = q1*n2 - q2*n1
-
-        Xvel.boundary_values[ids] = n1*r1 - n2*r2
-        Yvel.boundary_values[ids] = n2*r1 + n1*r2
 
 
+    # TODO JLGV, reflective boundary condition needs openmp version
+    # this one first
     def evaluate_segment(self, domain, segment_edges):
         """Apply BC on the boundary edges defined by segment_edges
 
@@ -239,7 +180,6 @@ def evaluate_segment(self, domain, segment_edges):
         :param segment_edges: List of boundary cells on which to apply BC
 
         """
-
         if segment_edges is None:
             return
         if domain is None:
@@ -248,45 +188,9 @@ def evaluate_segment(self, domain, segment_edges):
         ids = segment_edges
         vol_ids  = domain.boundary_cells[ids]
         edge_ids = domain.boundary_edges[ids]
+        ids_array = np.array(ids, dtype=np.int64)
 
-        Stage = domain.quantities['stage']
-        Elev  = domain.quantities['elevation']
-        Height= domain.quantities['height']
-        Xmom  = domain.quantities['xmomentum']
-        Ymom  = domain.quantities['ymomentum']
-        Xvel  = domain.quantities['xvelocity']
-        Yvel  = domain.quantities['yvelocity']
-
-        Normals = domain.normals
-        
-        n1  = Normals[vol_ids,2*edge_ids]
-        n2  = Normals[vol_ids,2*edge_ids+1]
-
-        # Transfer these quantities to the boundary array
-        Stage.boundary_values[ids]  = Stage.edge_values[vol_ids,edge_ids]
-        Elev.boundary_values[ids]   = Elev.edge_values[vol_ids,edge_ids]
-        Height.boundary_values[ids] = Height.edge_values[vol_ids,edge_ids]
-
-        # Rotate and negate Momemtum
-        q1 = Xmom.edge_values[vol_ids,edge_ids]
-        q2 = Ymom.edge_values[vol_ids,edge_ids]
-
-        r1 = -q1*n1 - q2*n2
-        r2 = -q1*n2 + q2*n1
-
-        Xmom.boundary_values[ids] = n1*r1 - n2*r2
-        Ymom.boundary_values[ids] = n2*r1 + n1*r2
-
-        # Rotate and negate Velocity
-        q1 = Xvel.edge_values[vol_ids,edge_ids]
-        q2 = Yvel.edge_values[vol_ids,edge_ids]
-
-        r1 = q1*n1 + q2*n2
-        r2 = q1*n2 - q2*n1
-
-        Xvel.boundary_values[ids] = n1*r1 - n2*r2
-        Yvel.boundary_values[ids] = n2*r1 + n1*r2
-
+        evaluate_reflective_segment(domain, ids_array, vol_ids, edge_ids)
 
 
 
@@ -473,7 +377,7 @@ def evaluate(self, vol_id, edge_id):
 
         return q
 
-
+    # TODO JLGV, needs openmp version
     def evaluate_segment(self, domain, segment_edges): 
         """Apply BC on the boundary edges defined by segment_edges
 
diff --git a/anuga/shallow_water/cuda_anuga.cu b/anuga/shallow_water/cuda_anuga.cu
index e203a5717..7d63a1be6 100755
--- a/anuga/shallow_water/cuda_anuga.cu
+++ b/anuga/shallow_water/cuda_anuga.cu
@@ -1621,7 +1621,7 @@ __global__ void _cuda_update_sw(int64_t number_of_elements,
     if (k < number_of_elements)
     {
       double x = centroid_values[k];
-      int64_t err_return = 0;
+
       if (x == 0.0) {
         semi_implicit_update[k] = 0.0;
       } else {
@@ -1632,9 +1632,7 @@ __global__ void _cuda_update_sw(int64_t number_of_elements,
 
       // Semi implicit updates
       double denominator = 1.0 - timestep*semi_implicit_update[k];
-      if (denominator <= 0.0) {
-        err_return = -1;
-      } else {
+      if (denominator > 0.0) {
         //Update conserved_quantities from semi implicit updates
         centroid_values[k] /= denominator;
       }
@@ -1642,11 +1640,6 @@ __global__ void _cuda_update_sw(int64_t number_of_elements,
       // Reset semi_implicit_update here ready for next time step
       semi_implicit_update[k] = 0.0;
 
-      if (err_return == -1)
-        {
-          // Handle error h
-        }
-
     }
   }
 
@@ -1661,7 +1654,7 @@ __global__ void _cuda_update_sw(int64_t number_of_elements,
                                               double *bed_centroid_values,
                                               double *xmom_centroid_values, 
                                               double *ymom_centroid_values, 
-                                              int64_t num_negative_cells)  // Is this the way to pass back and is it int64_t or int64_t
+                                              int64_t num_negative_cells)  // Is this the way to pass back and is it int or long
   {
     int64_t k = blockIdx.x * blockDim.x + threadIdx.x;
     num_negative_cells = 0;
@@ -1682,10 +1675,18 @@ __global__ void _cuda_update_sw(int64_t number_of_elements,
 
 
   // Protect against the water elevation falling below the triangle bed
-  __global__ void _cuda_protect_against_infinitesimal_and_negative_heights(double domain_minimum_allowed_height, int64_t number_of_elements, double* stage_centroid_values, double* bed_centroid_values, double* xmom_centroid_values, double* areas, double* stage_vertex_values) {
+  __global__ void _cuda_protect_against_infinitesimal_and_negative_heights(double domain_minimum_allowed_height,
+     int64_t number_of_elements, 
+     double* stage_centroid_values, 
+     double* bed_centroid_values, 
+     double* xmom_centroid_values, 
+     double* areas, 
+     double* stage_vertex_values)
+  {
     int64_t k3, K;
     double hc, bmin;
     double mass_error = 0.;
+    
   // This acts like minimum_allowed height, but scales with the vertical
   // distance between the bed_centroid_value and the max bed_edge_value of
   // every triangle.
@@ -1729,7 +1730,7 @@ __global__ void _cuda_update_sw(int64_t number_of_elements,
   }
 
   // COMPUTE FORCING TERMS
-  __global__ void cft_manning_friction_flat(double g, double eps, int64_t N,
+__global__ void cft_manning_friction_flat(double g, double eps, int64_t N,
         double* w, double* zv,
         double* uh, double* vh,
         double* eta, double* xmom, double* ymom) {
@@ -1752,10 +1753,7 @@ __global__ void _cuda_update_sw(int64_t number_of_elements,
             h = w[k] - z;
             if (h >= eps) {
                 S = -g * eta[k] * eta[k] * sqrt((uh[k] * uh[k] + vh[k] * vh[k]));
-                S /= pow(h, seven_thirds); //Expensive (on Ole's home computer)
-                //S /= exp((7.0/3.0)*log(h));      //seems to save about 15% over manning_friction
-                //S /= h*h*(1 + h/3.0 - h*h/9.0); //FIXME: Could use a Taylor expansion
-
+                S /= pow(h, seven_thirds); 
 
                 //Update momentum
                 xmom[k] += S * uh[k];
diff --git a/anuga/shallow_water/friction.py b/anuga/shallow_water/friction.py
index 839cde6d3..23c5d0c5c 100644
--- a/anuga/shallow_water/friction.py
+++ b/anuga/shallow_water/friction.py
@@ -19,132 +19,156 @@
 # --------------------------------------------------------------------------
 
 
-def manning_friction_implicit(domain):
+def manning_friction_semi_implicit(domain):
     
-    if domain.multiprocessor_mode in [0,1,2,3]:
-        manning_friction_implicit_cpu(domain)
-    elif domain.multiprocessor_mode == 4:
-        manning_friction_implicit_gpu(domain)
-
-
-
-
+    if domain.multiprocessor_mode == 1:
+        if domain.use_sloped_mannings:
+            # OpenMP version for sloped mannings
+            from .sw_domain_openmp_ext import manning_friction_sloped_semi_implicit_edge_based
+            manning_friction_sloped_semi_implicit_edge_based(domain)
+            #from .sw_domain_openmp_ext import manning_friction_sloped_semi_implicit
+            #manning_friction_sloped_semi_implicit(domain)
+        else:
+            # OpenMP version for flat mannings
+            from .sw_domain_openmp_ext import manning_friction_flat_semi_implicit
+            manning_friction_flat_semi_implicit(domain)
 
-def manning_friction_implicit_cpu(domain):
-    """Apply (Manning) friction to water momentum
-    Wrapper for c version.
-    FIXME SR: Thi whole module should be replaced with a call to the C code
-    in sw_domain_orig_ext.py
-    """
 
-    if domain.multiprocessor_mode == 2:
-        from .sw_domain_openmp_ext import manning_friction_flat
-        from .sw_domain_openmp_ext import manning_friction_sloped
+    elif domain.multiprocessor_mode == 2:
+        # GPU version not implemented yet, use openmp version
+        if domain.use_sloped_mannings:
+            # OpenMP version for sloped mannings
+            from .sw_domain_openmp_ext import manning_friction_sloped_semi_implicit_edge_based
+            manning_friction_sloped_semi_implicit_edge_based(domain)
+        else:
+            # OpenMP version for flat mannings
+            from .sw_domain_openmp_ext import manning_friction_flat_semi_implicit
+            manning_friction_flat_semi_implicit(domain)
     else:
-        from .sw_domain_orig_ext import manning_friction_flat
-        from .sw_domain_orig_ext import manning_friction_sloped
-
+        raise ValueError(f"""
+manning_friction_semi_implicit:
+multiprocessor_mode {domain.multiprocessor_mode} not supported
+""")
+
+# # Old code
+# def manning_friction_semi_implicit_cpu(domain):
+#     """Apply (Manning) friction to water momentum
+#     """
+    
+#     from .sw_domain_openmp_ext import manning_friction_flat
+#     from .sw_domain_openmp_ext import manning_friction_sloped
+#     from .sw_domain_openmp_ext import manning_friction_sloped_edge_based
     
-    xmom = domain.quantities['xmomentum']
-    ymom = domain.quantities['ymomentum']
+#     xmom = domain.quantities['xmomentum']
+#     ymom = domain.quantities['ymomentum']
 
-    # really only need this if using sloped mannings
-    x = domain.get_vertex_coordinates()
+#     # really only need this if using sloped mannings
+#     x = domain.get_vertex_coordinates()
 
-    w = domain.quantities['stage'].centroid_values
-    z = domain.quantities['elevation'].centroid_values
-    zv = domain.quantities['elevation'].vertex_values
+#     w = domain.quantities['stage'].centroid_values
+#     z = domain.quantities['elevation'].centroid_values
+#     zv = domain.quantities['elevation'].vertex_values
 
-    uh = xmom.centroid_values
-    vh = ymom.centroid_values
-    eta = domain.quantities['friction'].centroid_values
+#     uh = xmom.centroid_values
+#     vh = ymom.centroid_values
+#     eta = domain.quantities['friction'].centroid_values
 
-    xmom_update = xmom.semi_implicit_update
-    ymom_update = ymom.semi_implicit_update
+#     xmom_update = xmom.semi_implicit_update
+#     ymom_update = ymom.semi_implicit_update
 
-    eps = domain.minimum_allowed_height
-    g = domain.g
+#     eps = domain.minimum_allowed_height
+#     g = domain.g
 
-    if domain.use_sloped_mannings:
-        manning_friction_sloped(g, eps, x, w, uh, vh, zv, eta, xmom_update, \
-                                ymom_update)
-    else:
-        manning_friction_flat(g, eps, w, uh, vh, z, eta, xmom_update, \
-                                ymom_update)
+#     if domain.use_sloped_mannings:
+#         manning_friction_sloped(g, eps, x, w, uh, vh, zv, eta, xmom_update, \
+#                                 ymom_update)
+#     else:
+#         manning_friction_flat(g, eps, w, uh, vh, z, eta, xmom_update, \
+#                                 ymom_update)
 
 
+# def manning_friction_semi_implicit_openmp(domain):
+#     """Apply (Manning) friction to water momentum
+#     Wrapper for c version.
+#     """
 
+#     if domain.use_sloped_mannings:
+#         from .sw_domain_openmp_ext import manning_friction_sloped_semi_implicit
+#         manning_friction_sloped_semi_implicit(domain)
 
-def manning_friction_explicit(domain):
-    
-    if domain.multiprocessor_mode in [0,1,2,3]:
-        manning_friction_explicit_cpu(domain)
-    elif domain.multiprocessor_mode == 4:
-        manning_friction_explicit_gpu(domain)
+#     else:
+#         from .sw_domain_openmp_ext import manning_friction_flat_semi_implicit
+#         manning_friction_flat_semi_implicit(domain)
 
 
+# def manning_friction_explicit(domain):
+    
+#     if domain.multiprocessor_mode in [0,1,2,3]:
+#         manning_friction_explicit_cpu(domain)
+#     elif domain.multiprocessor_mode == 4:
+#         manning_friction_explicit_gpu(domain)
 
 
-def manning_friction_explicit_cpu(domain):
-    """Apply (Manning) friction to water momentum
-    Wrapper for c version
-    """
+# def manning_friction_explicit_cpu(domain):
+#     """Apply (Manning) friction to water momentum
+#     Wrapper for c version
+#     """
 
-    if domain.multiprocessor_mode == 2:
-        from .sw_domain_openmp_ext import manning_friction_flat
-        from .sw_domain_openmp_ext import manning_friction_sloped
-    else:
-        from .sw_domain_orig_ext import manning_friction_flat
-        from .sw_domain_orig_ext import manning_friction_sloped
+#     if domain.multiprocessor_mode == 2:
+#         from .sw_domain_openmp_ext import manning_friction_flat
+#         from .sw_domain_openmp_ext import manning_friction_sloped
+#     else:
+#         from .sw_domain_orig_ext import manning_friction_flat
+#         from .sw_domain_orig_ext import manning_friction_sloped
 
 
-    xmom = domain.quantities['xmomentum']
-    ymom = domain.quantities['ymomentum']
+#     xmom = domain.quantities['xmomentum']
+#     ymom = domain.quantities['ymomentum']
 
-    x = domain.get_vertex_coordinates()
+#     x = domain.get_vertex_coordinates()
 
-    w = domain.quantities['stage'].centroid_values
-    z = domain.quantities['elevation'].centroid_values
-    zv = domain.quantities['elevation'].vertex_values
+#     w = domain.quantities['stage'].centroid_values
+#     z = domain.quantities['elevation'].centroid_values
+#     zv = domain.quantities['elevation'].vertex_values
 
-    uh = xmom.centroid_values
-    vh = ymom.centroid_values
-    eta = domain.quantities['friction'].centroid_values
+#     uh = xmom.centroid_values
+#     vh = ymom.centroid_values
+#     eta = domain.quantities['friction'].centroid_values
 
-    xmom_update = xmom.explicit_update
-    ymom_update = ymom.explicit_update
+#     xmom_update = xmom.explicit_update
+#     ymom_update = ymom.explicit_update
 
-    eps = domain.minimum_allowed_height
+#     eps = domain.minimum_allowed_height
 
-    if domain.use_sloped_mannings:
-        manning_friction_sloped(domain.g, eps, x, w, uh, vh, zv, eta, xmom_update, \
-                            ymom_update)
-    else:
-        manning_friction_flat(domain.g, eps, w, uh, vh, z, eta, xmom_update, \
-                            ymom_update)
+#     if domain.use_sloped_mannings:
+#         manning_friction_sloped(domain.g, eps, x, w, uh, vh, zv, eta, xmom_update, \
+#                             ymom_update)
+#     else:
+#         manning_friction_flat(domain.g, eps, w, uh, vh, z, eta, xmom_update, \
+#                             ymom_update)
 
 
 
-#GPU version of manning_friction_implicit that'll call the kernal written in sw_domain_cuda
-def manning_friction_implicit_gpu(domain):
-    """Apply (Manning) friction to water momentum
-    Wrapper for c version
-    """
-    if domain.use_sloped_mannings:
-        domain.gpu_interface.compute_forcing_terms_manning_friction_sloped()
-    else:
-        domain.gpu_interface.compute_forcing_terms_manning_friction_flat()
+# #GPU version of manning_friction_implicit that'll call the kernel written in sw_domain_cuda
+# def manning_friction_implicit_gpu(domain):
+#     """Apply (Manning) friction to water momentum
+#     Wrapper for c version
+#     """
+#     if domain.use_sloped_mannings:
+#         domain.gpu_interface.compute_forcing_terms_manning_friction_sloped()
+#     else:
+#         domain.gpu_interface.compute_forcing_terms_manning_friction_flat()
 
 
-#GPU version of manning_friction_explicit that'll call the kernal written in sw_domain_cuda
-def manning_friction_explicit_gpu(domain):
-    """Apply (Manning) friction to water momentum
-    Wrapper for c version
-    """
-    if domain.use_sloped_mannings:
-        domain.gpu_interface.compute_forcing_terms_manning_friction_sloped()
-    else:
-        domain.gpu_interface.compute_forcing_terms_manning_friction_flat()
+# #GPU version of manning_friction_explicit that'll call the kernel written in sw_domain_cuda
+# def manning_friction_explicit_gpu(domain):
+#     """Apply (Manning) friction to water momentum
+#     Wrapper for c version
+#     """
+#     if domain.use_sloped_mannings:
+#         domain.gpu_interface.compute_forcing_terms_manning_friction_sloped()
+#     else:
+#         domain.gpu_interface.compute_forcing_terms_manning_friction_flat()
 
 
 # FIXME (Ole): This was implemented for use with one of the analytical solutions
diff --git a/anuga/shallow_water/meson.build b/anuga/shallow_water/meson.build
index 485dd01a4..d8adab74f 100644
--- a/anuga/shallow_water/meson.build
+++ b/anuga/shallow_water/meson.build
@@ -1,20 +1,6 @@
 
 inc_dir = include_directories('../utilities', incdir_numpy)
 
-openmp = dependency('openmp', required: false)
-if openmp.found()
-  if host_machine.system() == 'windows'
-    # On Windows, the mingw compiler does not support OpenMP ATOMIC operations
-    openmp_deps = dependencies
-  else
-    openmp_deps = dependencies + [openmp]
-  endif
-
-else
-  openmp_deps = dependencies
-endif
-
-# FIXME SR: Need to setup depenencies for openacc code
 
 
 # py3.extension_module('boundaries',
@@ -25,39 +11,16 @@ endif
 #   install: true,
 # )
 
-# Compile the Cython-generated C code and additional C code
-py3.extension_module('sw_domain_orig_ext',
-  sources: ['sw_domain_orig_ext.pyx'],
-  include_directories: inc_dir,
-  dependencies: dependencies,
-  subdir: 'anuga/shallow_water',
-  install: true,
-)
-
-py3.extension_module('sw_domain_simd_ext',
-  sources: ['sw_domain_simd_ext.pyx'],
-  include_directories: inc_dir,
-  dependencies: dependencies,
-  subdir: 'anuga/shallow_water',
-  install: true,
-)
 
 py3.extension_module('sw_domain_openmp_ext',
   sources: ['sw_domain_openmp_ext.pyx'],
-  c_args : ['-O3', '-march=native'],
+  c_args : openmp_c_args,
   include_directories: inc_dir,
   dependencies: openmp_deps,
   subdir: 'anuga/shallow_water',
   install: true,
 )
 
-py3.extension_module('sw_domain_openacc_ext',
-  sources: ['sw_domain_openacc_ext.pyx'],
-  include_directories: inc_dir,
-  dependencies: dependencies,
-  subdir: 'anuga/shallow_water',
-  install: true,
-)
 
 
 python_sources = [
diff --git a/anuga/shallow_water/setup.py b/anuga/shallow_water/setup.py
deleted file mode 100644
index ec77c340a..000000000
--- a/anuga/shallow_water/setup.py
+++ /dev/null
@@ -1,67 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-from Cython.Build import cythonize
-import Cython.Compiler.Options
-Cython.Compiler.Options.annotate = False
-
-#os.environ["CC"] = "nvc -O3 -acc=gpu -Minfo=accel -noswitcherror -lm -I$CUDA_HOME/include/ --device-debug --generate-line-info"
-#os.environ["CXX"] = "nvc++ -O3 -acc=gpu -Minfo=accel -noswitcherror -lm -I$CUDA_HOME/include/ --device-debug --generate-line-info -std=c++17"
-#os.environ["FC"] = "nvfortran -O3 -acc=gpu -Minfo=accel -noswitcherror -lm -I$CUDA_HOME/include/ --device-debug --generate-line-info"
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('shallow_water', parent_package, top_path)
-
-    config.add_data_dir('tests')
-    config.add_data_dir(join('tests', 'data'))
-
-    util_dir = join('..', 'utilities')
-
-    config.add_extension('sw_domain_orig_ext',
-                         sources=['sw_domain_orig_ext.pyx'],
-                         include_dirs=[util_dir])
-
-    config.add_extension('sw_domain_simd_ext',
-                         sources=['sw_domain_simd_ext.pyx'],
-                         include_dirs=[util_dir])
-      
-    
-    if sys.platform == 'win32':
-        # Looks like mingw (windows compiler) hasn't implemented openmp atomic
-        # so can't get openmp and openacc to work (suggest using linux)
-        pass
-    else:
-        config.add_extension('sw_domain_openmp_ext',
-                         sources=['sw_domain_openmp_ext.pyx'],
-                         include_dirs=[util_dir],
-                         extra_compile_args=['-fopenmp'],
-                         extra_link_args=['-fopenmp'])
-
-        config.add_extension('sw_domain_openacc_ext',
-                         sources=['sw_domain_openacc_ext.pyx'],
-                         include_dirs=[util_dir],
-                         extra_compile_args=None,
-                         extra_link_args=None)
-
-    #config.add_extension('sw_domain_cuda_ext',
-    #                     sources=['sw_domain_cuda_ext.pyx'],
-    #                     include_dirs=[util_dir],
-    #                     extra_compile_args=None,
-    #                     extra_link_args=None)
-
-
-
-    config.ext_modules = cythonize(config.ext_modules, annotate=False)
-
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/shallow_water/shallow_water_domain.py b/anuga/shallow_water/shallow_water_domain.py
index e06b2553e..d3bc90a56 100644
--- a/anuga/shallow_water/shallow_water_domain.py
+++ b/anuga/shallow_water/shallow_water_domain.py
@@ -308,15 +308,19 @@ def __init__(self,
         self.set_flow_algorithm()
 
         #-------------------------------
-        # Set multiprocessor mode
-        # 0. original with local timestep
-        # 1. simd code used by modes 2,3,4
-        # 2. Openmp
-        # 3. Openacc
-        # 4. Cuda
+        # Set default multiprocessor mode
+        # 1. Openmp
+        # 2. Cuda
         #-------------------------------
         self.gpu_interface = None
-        self.set_multiprocessor_mode(0)
+        self.set_multiprocessor_mode(1)  # Default to OpenMP
+
+        #-------------------------------
+        # If the environment variable OMP_NUM_THREADS is not set,
+        # then default to 1 thread. A value passed explicitly to
+        # set_omp_num_threads() overrides that default.
+        #-------------------------------
+        self.set_omp_num_threads()
 
         #-------------------------------
         # datetime and timezone
@@ -329,8 +333,8 @@ def __init__(self,
         # Gravity is now incorporated in
         # compute_fluxes routine
         #-------------------------------
-        from .friction import manning_friction_implicit
-        self.forcing_terms.append(manning_friction_implicit)
+        from .friction import manning_friction_semi_implicit
+        self.forcing_terms.append(manning_friction_semi_implicit)
 
 
         #-------------------------------
@@ -1307,22 +1311,22 @@ def get_flow_algorithm(self):
         return self.flow_algorithm
 
 
-    def set_gravity_method(self):
-        """Gravity method is determined by the compute_fluxes_method
-        This is now not used, as gravity is combine in the compute_fluxes method
-        """
+    # def set_gravity_method(self):
+    #     """Gravity method is determined by the compute_fluxes_method
+    #     This is now not used, as gravity is combined in the compute_fluxes method
+    #     """
 
-        if  self.get_compute_fluxes_method() == 'original':
-            self.forcing_terms[0] = gravity
+    #     if  self.get_compute_fluxes_method() == 'original':
+    #         self.forcing_terms[0] = gravity
 
-        elif self.get_compute_fluxes_method() == 'wb_1':
-            self.forcing_terms[0] = gravity_wb
+    #     elif self.get_compute_fluxes_method() == 'wb_1':
+    #         self.forcing_terms[0] = gravity_wb
 
-        elif self.get_compute_fluxes_method() == 'wb_2':
-            self.forcing_terms[0] = gravity
+    #     elif self.get_compute_fluxes_method() == 'wb_2':
+    #         self.forcing_terms[0] = gravity
 
-        else:
-            raise Exception('undefined compute_fluxes method')
+    #     else:
+    #         raise Exception('undefined compute_fluxes method')
 
     def set_extrapolate_velocity(self, flag=True):
         """ Extrapolation routine uses momentum by default,
@@ -1852,19 +1856,9 @@ def compute_fluxes(self):
 
         # nvtxRangePush("Compute Fluxes (Domain)")
         # Choose the correct extension module
-        if self.multiprocessor_mode == 0:
-            from .sw_domain_orig_ext import compute_fluxes_ext_central
-
-        elif self.multiprocessor_mode == 1:
-            from .sw_domain_simd_ext import compute_fluxes_ext_central
-
-        elif self.multiprocessor_mode == 2:
+        if self.multiprocessor_mode == 1:
             from .sw_domain_openmp_ext import compute_fluxes_ext_central
-
-        elif self.multiprocessor_mode == 3:
-            from .sw_domain_openacc_ext import compute_fluxes_ext_central
-
-        elif self.multiprocessor_mode == 4:
+        elif self.multiprocessor_mode == 2:
             # change over to cuda routines as developed
             # from .sw_domain_simd_ext import compute_fluxes_ext_central
             # FIXME SR: 2023_10_16 currently compute_fluxes and distribute together
@@ -1883,30 +1877,41 @@ def distribute_to_vertices_and_edges(self):
         """ extrapolate centroid values to vertices and edges"""
 
         # Do protection step
-        nvtxRangePush('protect_against_infinities')
+        nvtxRangePush('protect against negative heights')
         self.protect_against_infinitesimal_and_negative_heights()
         nvtxRangePop()
 
         # Do extrapolation step
         # nvtxRangePush('extrapolate')
         # Choose the correct extension module
-        if self.multiprocessor_mode == 0:
-            from .sw_domain_orig_ext import extrapolate_second_order_edge_sw
-            extrapolate_second_order_edge_sw(self)
+        if self.multiprocessor_mode == 1:
+            from .sw_domain_openmp_ext import extrapolate_second_order_edge_sw
+        elif self.multiprocessor_mode == 2:
+            # change over to cuda routines as developed
+            #from .sw_domain_simd_ext import extrapolate_second_order_edge_sw
+            extrapolate_second_order_edge_sw = self.gpu_interface.extrapolate_second_order_edge_sw_kernel
+        else:
+            raise Exception('Not implemented')
 
-        elif self.multiprocessor_mode == 1:
-            from .sw_domain_simd_ext import extrapolate_second_order_edge_sw
-            extrapolate_second_order_edge_sw(self)
+        nvtxRangePush('extrapolate_second_order_edge_sw')
+        extrapolate_second_order_edge_sw(self)
+        nvtxRangePop()
 
-        elif self.multiprocessor_mode == 2:
-            from .sw_domain_openmp_ext import extrapolate_second_order_edge_sw
-            extrapolate_second_order_edge_sw(self)
+    def distribute_to_edges(self):
+        """Extrapolate centroid values to edges."""
 
-        elif self.multiprocessor_mode == 3:
-            from .sw_domain_openacc_ext import extrapolate_second_order_edge_sw
-            extrapolate_second_order_edge_sw(self)
+        # Do protection step
+        nvtxRangePush('protect_against_infinities')
+        self.protect_against_infinitesimal_and_negative_heights()
+        nvtxRangePop()
 
-        elif self.multiprocessor_mode == 4:
+        # Do extrapolation step
+        # nvtxRangePush('extrapolate')
+        # Choose the correct extension module
+        if self.multiprocessor_mode == 1:
+            from .sw_domain_openmp_ext import extrapolate_second_order_edge_sw
+            extrapolate_second_order_edge_sw(self)
+        elif self.multiprocessor_mode == 2:
             # change over to cuda routines as developed
             #from .sw_domain_simd_ext import extrapolate_second_order_edge_sw
             extrapolate_second_order_edge_sw = self.gpu_interface.extrapolate_second_order_edge_sw_kernel
@@ -1914,8 +1919,29 @@ def distribute_to_vertices_and_edges(self):
         else:
             raise Exception('Not implemented')
 
-        # nvtxRangePop()
+        # nvtxRangePop()        
+
+    def distribute_edges_to_vertices(self):
+        """Distribute edge values to vertices.
+        
+        This is a wrapper for the C implementation of the distribution
+        from edges to vertices.
+        """
 
+        if self.multiprocessor_mode == 1:
+            # Using OpenMP extension
+            from .sw_domain_openmp_ext import distribute_edges_to_vertices as distribute_edges_to_vertices_ext
+        elif self.multiprocessor_mode == 2:
+            # Using CUDA extension
+            # FIXME SR: Not implemented yet so use OpenMP version
+            from .sw_domain_openmp_ext import distribute_edges_to_vertices as distribute_edges_to_vertices_ext
+            # distribute_edges_to_vertices_ext = self.gpu_interface.distribute_edges_to_vertices_kernel
+        else:
+            raise Exception('Not implemented')
+        
+        # nvtxRangePush('distribute_edges_to_vertices')
+        distribute_edges_to_vertices_ext(self)
+        # nvtxRangePop()
 
     def distribute_using_edge_limiter(self):
         """Distribution from centroids to edges specific to the SWW eqn.
@@ -2019,22 +2045,13 @@ def protect_against_infinitesimal_and_negative_heights(self):
 
         # nvtxRangePush('protect_new')
         # Choose the correct extension module
-        if self.multiprocessor_mode == 0:
-            from .sw_domain_orig_ext import protect_new
-
-        elif self.multiprocessor_mode == 1:
-            from .sw_domain_simd_ext import protect_new
-
+        if self.multiprocessor_mode == 1:
+            from .sw_domain_openmp_ext import protect_new
         elif self.multiprocessor_mode == 2:
-            from .sw_domain_openmp_ext import  protect_new
-
-        elif self.multiprocessor_mode == 3:
-            from .sw_domain_openacc_ext import  protect_new
-
-        elif self.multiprocessor_mode == 4:
             # change over to cuda routines as developed
             # # from .sw_domain_simd_ext import  protect_new
-            protect_new = self.gpu_interface.protect_against_infinitesimal_and_negative_heights_kernal
+            #from .sw_domain_openmp_ext import protect_new
+            protect_new = self.gpu_interface.protect_against_infinitesimal_and_negative_heights_kernel
         else:
             raise Exception('Not implemented')
 
@@ -2086,6 +2103,43 @@ def balance_deep_and_shallow(self):
 
         nvtxRangePop()
 
+
+    def apply_protection_against_isolated_degenerate_timesteps(self):
+
+        if self.protect_against_isolated_degenerate_timesteps is False:
+            return
+
+        # FIXME (Ole): Make this configurable
+        if num.max(self.max_speed) < 10.0:
+            return
+
+        # Setup 10 bins for speed histogram
+        from anuga.utilities.numerical_tools import histogram, create_bins
+
+        bins = create_bins(self.max_speed, 10)
+        hist = histogram(self.max_speed, bins)
+
+        # Look for characteristic signature
+        if len(hist) > 1 and hist[-1] > 0 and \
+           hist[4] == hist[5] == hist[6] == hist[7] == hist[8] == 0:
+            # Danger of isolated degenerate triangles
+
+            # Find triangles in last bin
+            # FIXME - speed up using numeric package
+            d = 0
+            for i in range(self.number_of_triangles):
+                if self.max_speed[i] > bins[-1]:
+                    msg = 'Time=%f: Ignoring isolated high ' % self.get_time()
+                    msg += 'speed triangle '
+                    msg += '#%d of %d with max speed = %f' \
+                        % (i, self.number_of_triangles, self.max_speed[i])
+
+                    self.get_quantity('xmomentum').set_values(0.0, indices=[i])
+                    self.get_quantity('ymomentum').set_values(0.0, indices=[i])
+                    self.max_speed[i] = 0.0
+                    d += 1
+
+
     def update_conserved_quantities(self):
         """Update vectors of conserved quantities using previously
         computed fluxes and specified forcing functions.
@@ -2107,58 +2161,29 @@ def update_conserved_quantities(self):
         #Stage.update(timestep)
         #Xmom.update(timestep)
         #Ymom.update(timestep)
+        
+        assert self.get_using_discontinuous_elevation()
 
-        if self.get_using_discontinuous_elevation():
-
-            # Choose the correct extension module
-            if self.multiprocessor_mode == 0:
-                Stage.update(timestep)
-                Xmom.update(timestep)
-                Ymom.update(timestep)
-                from .sw_domain_orig_ext import fix_negative_cells
-                num_negative_ids = fix_negative_cells(self)
-
-            elif self.multiprocessor_mode == 1:
-                Stage.update(timestep)
-                Xmom.update(timestep)
-                Ymom.update(timestep)
-                from .sw_domain_simd_ext import fix_negative_cells
-                num_negative_ids = fix_negative_cells(self)
-
-            elif self.multiprocessor_mode == 2:
-                Stage.update(timestep)
-                Xmom.update(timestep)
-                Ymom.update(timestep)                
-                from .sw_domain_openmp_ext import fix_negative_cells
-                num_negative_ids = fix_negative_cells(self)
-
-            elif self.multiprocessor_mode == 3:
-                Stage.update(timestep)
-                Xmom.update(timestep)
-                Ymom.update(timestep)
-                from .sw_domain_openacc_ext import fix_negative_cells
-                num_negative_ids = fix_negative_cells(self)
-                
-            elif self.multiprocessor_mode == 4:
-                
-                # nvtxRangePush('update_conserved_quantities_kernal')
-                     
-                 update_conserved_quantities_fix_negative_cells = self.gpu_interface.update_conserved_quantities_kernal
-                 num_negative_ids = update_conserved_quantities_fix_negative_cells(self)
-                # nvtxRangePop()
-                
-                # change over to cuda routines as developed
-                #from .sw_domain_simd_ext import fix_negative_cells
-                #num_negative_ids = fix_negative_cells(self)
-            else:
-                raise Exception('Not implemented')
+        # Update conserved quantities (discontinuous elevation asserted above)
+        if self.multiprocessor_mode == 1:
+            
+            from .sw_domain_openmp_ext import update_conserved_quantities
+            num_negative_ids = update_conserved_quantities(self, timestep)
+
+        elif self.multiprocessor_mode == 2:
+
+            update_conserved_quantities = self.gpu_interface.update_conserved_quantities_kernel
+            num_negative_ids = update_conserved_quantities(self, timestep)
+        
+        else:
+            raise Exception('Not implemented')
 
-            if num_negative_ids > 0:
-                # FIXME: This only warns the first time -- maybe we should warn whenever loss occurs?
-                import warnings
-                msg = 'Negative cells being set to zero depth, possible loss of conservation. \n' +\
-                      'Consider using domain.report_water_volume_statistics() to check the extent of the problem'
-                warnings.warn(msg)
+        if num_negative_ids > 0:
+            # FIXME: This only warns the first time -- maybe we should warn whenever loss occurs?
+            import warnings
+            msg = f'{num_negative_ids} negative cells being set to zero depth, possible loss of conservation. \n' +\
+            'Consider using domain.report_water_volume_statistics() to check the extent of the problem'
+            warnings.warn(msg)
 
         # nvtxRangePop()
 
@@ -2264,7 +2289,6 @@ def update_centroids_of_momentum_from_velocity(self):
         """
 
         # For shallow water we need to update height xvelocity and yvelocity
-
         #Shortcuts
         UH = self.quantities['xmomentum']
         VH = self.quantities['ymomentum']
@@ -2349,6 +2373,7 @@ def evolve(self,
         # and or visualisation.
         # This is done again in the initialisation of the Generic_Domain
         # evolve loop but we do it here to ensure the values are ok for storage.
+
         self.distribute_to_vertices_and_edges()
 
         if self.store is True and (self.get_relative_time() == 0.0 or self.evolved_called is False):
@@ -2449,6 +2474,286 @@ def sww_merge(self,  *args, **kwargs):
 
         pass
 
+
+    def evolve_one_euler_step(self, yieldstep, finaltime):
+        """Advance one explicit Euler timestep:
+        Q^{n+1} = E(h) Q^n
+
+        yieldstep and finaltime are only passed on to update_timestep.
+        Does not assume centroid values were extrapolated to vertices/edges.
+        """
+
+        # Each stage below is wrapped in an nvtx range named after the call it makes
+        nvtxRangePush('distribute_to_vertices_and_edges')
+
+        # From centroid values calculate edge and vertex values
+        self.distribute_to_vertices_and_edges()
+
+        # Close nvtx range
+        nvtxRangePop()
+
+        # Open nvtx range
+        nvtxRangePush('update_boundary')
+        # Apply boundary conditions
+        self.update_boundary()
+        # Close nvtx range
+        nvtxRangePop()
+
+        # Open nvtx range
+        nvtxRangePush('compute_fluxes')
+        # Compute fluxes across each element edge
+        self.compute_fluxes()
+        # Close nvtx range
+        nvtxRangePop()
+
+        # Open nvtx range
+        nvtxRangePush('compute_forcing_terms')
+        # Compute forcing terms
+        self.compute_forcing_terms()
+        # Close nvtx range
+        nvtxRangePop()
+
+        # Open nvtx range
+        nvtxRangePush('update_timestep')
+        # Update timestep to fit yieldstep and finaltime
+        self.update_timestep(yieldstep, finaltime)
+        # Close nvtx range
+        nvtxRangePop()
+
+        # Open nvtx range (pushed and popped even when the update below is skipped)
+        nvtxRangePush('compute_flux_update_frequency')
+        if self.max_flux_update_frequency != 1:
+            # Update flux_update_frequency using the new timestep
+            self.compute_flux_update_frequency()
+        # Close nvtx range
+        nvtxRangePop()
+
+        # Open nvtx range
+        nvtxRangePush('update_conserved_quantities')
+        # Update conserved quantities
+        self.update_conserved_quantities()
+        # Close nvtx range
+        nvtxRangePop()
+
+    def evolve_one_rk2_step(self, yieldstep, finaltime):
+        """Advance one 2nd order Runge-Kutta (RK2) timestep:
+        Q^{n+1} = 0.5 Q^n + 0.5 E(h)^2 Q^n
+
+        yieldstep and finaltime are only passed on to update_timestep.
+        Does not assume centroid values were extrapolated to vertices/edges.
+        """
+
+        # Save initial conserved quantities values (Q^n) for the final combination
+        self.backup_conserved_quantities()
+
+        #==========================================
+        # First euler step
+        #==========================================
+
+        # From centroid values calculate edge and vertex values
+        self.distribute_to_vertices_and_edges()
+
+        # Apply boundary conditions
+        self.update_boundary()
+
+        # Compute fluxes across each element edge
+        self.compute_fluxes()
+
+        # Compute forcing terms
+        self.compute_forcing_terms()
+
+        # Update timestep to fit yieldstep and finaltime
+        self.update_timestep(yieldstep, finaltime)
+
+        # Update centroid values of conserved quantities
+        self.update_conserved_quantities()
+
+        # Update special conditions
+        # self.update_special_conditions()
+
+        # Update time (it is not touched again in this method)
+        self.set_relative_time(self.get_relative_time() + self.timestep)
+
+        # Update ghosts (skipped when ghost_layer_width is 4 or more)
+        if self.ghost_layer_width < 4:
+            self.update_ghosts()
+
+        # Update vertex and edge values
+        self.distribute_to_vertices_and_edges()
+
+        # Update boundary values
+        self.update_boundary()
+
+        #=========================================
+        # Second Euler step using the same timestep
+        # calculated in the first step (update_timestep
+        # is not called again). Might lead to stability
+        # problems but we have not seen any example.
+        #=========================================
+
+        # Compute fluxes across each element edge
+        self.compute_fluxes()
+
+        # Compute forcing terms
+        self.compute_forcing_terms()
+
+        # Update conserved quantities
+        self.update_conserved_quantities()
+
+        #========================================
+        # Combine initial and final values
+        # of conserved quantities and cleanup
+        #========================================
+
+        # Combine steps: 0.5 * (twice-stepped values) + 0.5 * (backed-up Q^n)
+        self.saxpy_conserved_quantities(0.5, 0.5)
+
+
+    def evolve_one_rk3_step(self, yieldstep, finaltime):
+        """One 3rd order RK timestep
+        Q^(1) = 3/4 Q^n + 1/4 E(h)^2 Q^n  (at time t^n + h/2)
+        Q^{n+1} = 1/3 Q^n + 2/3 E(h) Q^(1) (at time t^{n+1})
+
+        yieldstep and finaltime are only passed on to update_timestep.
+        Does not assume centroid values were extrapolated to vertices/edges.
+        """
+
+        # Save initial conserved quantities values (Q^n) for the combinations below
+        self.backup_conserved_quantities()
+        # Remember the start time; the substep and final times are set relative to it
+        initial_time = self.get_relative_time()
+
+        ######
+        # First euler step
+        ######
+
+        # From centroid values calculate edge and vertex values
+        self.distribute_to_vertices_and_edges()
+
+        # Apply boundary conditions
+        self.update_boundary()
+
+        # Compute fluxes across each element edge
+        self.compute_fluxes()
+
+        # Compute forcing terms
+        self.compute_forcing_terms()
+
+        # Update timestep to fit yieldstep and finaltime
+        self.update_timestep(yieldstep, finaltime)
+
+        # Update conserved quantities
+        self.update_conserved_quantities()
+
+        # Update special conditions
+        # self.update_special_conditions()
+
+        # Update time (through the accessor, as the RK2 step does)
+        self.set_relative_time(self.get_relative_time() + self.timestep)
+
+        # Update ghosts
+        self.update_ghosts()
+
+        # Update vertex and edge values
+        self.distribute_to_vertices_and_edges()
+
+        # Update boundary values
+        self.update_boundary()
+
+        ######
+        # Second Euler step using the same timestep
+        # calculated in the first step. Might lead to
+        # stability problems but we have not seen any
+        # example.
+        ######
+
+        # Compute fluxes across each element edge
+        self.compute_fluxes()
+
+        # Compute forcing terms
+        self.compute_forcing_terms()
+
+        # Update conserved quantities
+        self.update_conserved_quantities()
+
+        ######
+        # Combine steps to obtain intermediate
+        # solution at time t^n + 0.5 h
+        ######
+
+        # Combine steps: 1/4 * (twice-stepped values) + 3/4 * (backed-up Q^n)
+        self.saxpy_conserved_quantities(0.25, 0.75)
+
+        # Update special conditions
+        # self.update_special_conditions()
+
+        # Set substep time
+        self.set_relative_time(initial_time + self.timestep * 0.5)
+
+        # Update ghosts
+        self.update_ghosts()
+
+        # Update vertex and edge values
+        self.distribute_to_vertices_and_edges()
+
+        # Update boundary values
+        self.update_boundary()
+
+        ######
+        # Third Euler step
+        ######
+
+        # Compute fluxes across each element edge
+        self.compute_fluxes()
+
+        # Compute forcing terms
+        self.compute_forcing_terms()
+
+        # Update conserved quantities
+        self.update_conserved_quantities()
+
+        #=======================================
+        # Combine final and initial values
+        # and cleanup
+        #=======================================
+
+        # self.saxpy_conserved_quantities(2.0/3.0, 1.0/3.0)
+        # This caused a roundoff error that created negative water heights
+
+        # So combine with weights 2 and 1 and divide by 3 in a single step instead
+        self.saxpy_conserved_quantities(2.0, 1.0, 3.0)
+
+
+        # Set new time
+        self.set_relative_time(initial_time + self.timestep)
+
+
+    def backup_conserved_quantities(self):
+        """Back up the centroid values of all conserved quantities."""
+        # Mode 1 delegates to the openmp extension; any other mode loops in Python
+        if self.multiprocessor_mode == 1:
+            from anuga.shallow_water.sw_domain_openmp_ext import backup_conserved_quantities
+            backup_conserved_quantities(self)
+        else:
+            for name in self.conserved_quantities:
+                Q = self.quantities[name]
+                Q.backup_centroid_values()
+
+    def saxpy_conserved_quantities(self, a, b, c=None):
+        """Combine current and backed-up centroid values of the conserved quantities."""
+        # Judging by the RK callers: Q <- (a*Q + b*Q_backup) / c -- confirm in Quantity.saxpy_centroid_values
+        if self.multiprocessor_mode == 1:
+            if c is None:
+                c = 1.0
+            from anuga.shallow_water.sw_domain_openmp_ext import saxpy_conserved_quantities
+            saxpy_conserved_quantities(self, a, b, c)
+        else:
+            for name in self.conserved_quantities:
+                Q = self.quantities[name]
+                Q.saxpy_centroid_values(a, b)
+                if c is not None:
+                    Q.centroid_values[:] = Q.centroid_values / c
+
     def timestepping_statistics(self,
                                 track_speeds=False,
                                 triangle_id=None,
@@ -2723,21 +3028,11 @@ def compute_flux_update_frequency(self):
 
         nvtxRangePush('compute_flux_update_frequency')
         # Choose the correct extension module
-        if self.multiprocessor_mode == 0:
-            from .sw_domain_orig_ext import compute_flux_update_frequency
-
-        elif self.multiprocessor_mode == 1:
-            from .sw_domain_simd_ext import compute_flux_update_frequency
-
-        elif self.multiprocessor_mode == 2:
+        if self.multiprocessor_mode == 1:
             from .sw_domain_openmp_ext import compute_flux_update_frequency
-
-        elif self.multiprocessor_mode == 3:
-            from .sw_domain_openacc_ext import compute_flux_update_frequency
-
-        elif self.multiprocessor_mode == 4:
+        elif self.multiprocessor_mode == 2:
             # change over to cuda routines as developed
-            from .sw_domain_simd_ext import compute_flux_update_frequency
+            from .sw_domain_openmp_ext import compute_flux_update_frequency
         else:
             raise Exception('Not implemented')
 
@@ -2853,31 +3148,67 @@ def get_inv_tri_map(self):
         return self.inv_tri_map
 
 # ==============================================================================
-# GPU interface
+# Multiprocessor Mode (1=openmp, 2=cuda (in development))
 # ==============================================================================
 
     def set_multiprocessor_mode(self, multiprocessor_mode= 0):
         """
         Set multiprocessor mode 
+         1. openmp (in development)
+         2. cupy (in development)
+        """
+
+        if multiprocessor_mode not in [1,2]:
+            raise ValueError('Invalid multiprocessor mode. Must be one of [1,2] (openmp, cupy)')
+
+        self.multiprocessor_mode = multiprocessor_mode
+
+        if self.multiprocessor_mode == 2:
+            self.set_gpu_interface()
+
+    def get_multiprocessor_mode(self):
+        """
+        Return the current multiprocessor mode (self.multiprocessor_mode)
         
-        0. original
-        1. simd (used for multiprocessor)
-        2. openmp (in development)
-        3. openacc (in development)
-        4. cuda (in development)
+        1. openmp (in development)
+        2. cupy (in development)
+        """
+        return self.multiprocessor_mode 
+
+    def set_omp_num_threads(self, omp_num_threads=None):
+        """
+        Set the number of OpenMP threads to use for parallel processing.
+        If omp_num_threads is None, fall back to the OMP_NUM_THREADS
+        environment variable, and to 1 if that is not set either.
+        Raises ValueError if the resulting value cannot be converted to int.
         """
 
-        if multiprocessor_mode in [0,1,2,3,4]:
-            self.multiprocessor_mode = multiprocessor_mode
+        import os
+        if omp_num_threads is None:
+            # Not given explicitly: fall back to the environment variable
+            omp_num_threads = os.environ.get('OMP_NUM_THREADS', None)
+            #print(f'Using OMP_NUM_THREADS from environment: {omp_num_threads}')
+
+
+        if omp_num_threads is None:
+            omp_num_threads = 1  # Default to 1 if not set
+
+        try:
+            omp_num_threads = int(omp_num_threads)
+        except (TypeError, ValueError):  # int() raises TypeError for non-numeric types
+            raise ValueError('OMP_NUM_THREADS must be an integer')
+
+        # Record the value on the domain and hand it to the openmp extension
+        self.omp_num_threads = omp_num_threads
+        from .sw_domain_openmp_ext import set_omp_num_threads
+        set_omp_num_threads(omp_num_threads)
+
+        print(f'Setting omp_num_threads to {omp_num_threads}')
+
 
-            if multiprocessor_mode == 4:
-                self.set_gpu_interface()
-        else:
-            raise Exception('multiprocessor mode {multiprocessor_mode} not supported')
-    
     def set_gpu_interface(self):
 
-        if self.multiprocessor_mode == 4 and self.gpu_interface is None:
+        if self.multiprocessor_mode == 2 and self.gpu_interface is None:
 
             # first check that cupy is available
             try:
@@ -2887,10 +3218,10 @@ def set_gpu_interface(self):
             except:
                 print('+==============================================================================+')
                 print('|                                                                              |')
-                print('| WARNING: cupy or gpu not available, so falling back to multiprocessor_mode 0 |')
+                print('| WARNING: cupy or gpu not available, so falling back to multiprocessor_mode 1 |')
                 print('|                                                                              |')
                 print('+==============================================================================+')
-                self.set_multiprocessor_mode(0)
+                self.set_multiprocessor_mode(1)
                 return
 
             from .sw_domain_cuda import GPU_interface
diff --git a/anuga/shallow_water/sw_domain.h b/anuga/shallow_water/sw_domain.h
index bf0170659..d794253da 100644
--- a/anuga/shallow_water/sw_domain.h
+++ b/anuga/shallow_water/sw_domain.h
@@ -9,25 +9,28 @@
 
 #include <stdint.h>
 #include <stdio.h>
+#include <stdbool.h>
+#include <inttypes.h>
+#include "anuga_typedefs.h"
 
 // structures
 struct domain {
     // Changing these don't change the data in python object
-    int64_t    number_of_elements;
-    int64_t    boundary_length;
-    int64_t    number_of_riverwall_edges;
-    double  epsilon;
-    double  H0;
-    double  g;
-    int64_t    optimise_dry_cells;
-    double  evolve_max_timestep;
-    int64_t    extrapolate_velocity_second_order;
-    double  minimum_allowed_height;
-    double  maximum_allowed_speed;
-    int64_t    low_froude;
-
-
-    int64_t timestep_fluxcalls;
+    anuga_int    number_of_elements;
+    anuga_int    boundary_length;
+    anuga_int    number_of_riverwall_edges;
+    double     epsilon;
+    double     H0;
+    double     g;
+    anuga_int    optimise_dry_cells;
+    double     evolve_max_timestep;
+    anuga_int    extrapolate_velocity_second_order;
+    double     minimum_allowed_height;
+    double     maximum_allowed_speed;
+    anuga_int    low_froude;
+
+
+    anuga_int timestep_fluxcalls;
 
     double beta_w;
     double beta_w_dry;
@@ -36,34 +39,36 @@ struct domain {
     double beta_vh;
     double beta_vh_dry;
 
-    int64_t max_flux_update_frequency;
-    int64_t ncol_riverwall_hydraulic_properties;
+    anuga_int max_flux_update_frequency;
+    anuga_int ncol_riverwall_hydraulic_properties;
 
     // Changing values in these arrays will change the values in the python object
-    int64_t*   neighbours;
-    int64_t*   neighbour_edges;
-    int64_t*   surrogate_neighbours;
-    double* normals;
-    double* edgelengths;
-    double* radii;
-    double* areas;
+    anuga_int*   neighbours;
+    anuga_int*   neighbour_edges;
+    anuga_int*   surrogate_neighbours;
+    double*    normals;
+    double*    edgelengths;
+    double*    radii;
+    double*    areas;
 
-    int64_t* edge_flux_type;
+    anuga_int*   edge_flux_type;
 
-    int64_t*   tri_full_flag;
-    int64_t*   already_computed_flux;
-    double* max_speed;
+    anuga_int*   tri_full_flag;
+    anuga_int*   already_computed_flux;
+    double*    max_speed;
 
     double* vertex_coordinates;
     double* edge_coordinates;
     double* centroid_coordinates;
 
-    int64_t*   number_of_boundaries;
+    anuga_int* number_of_boundaries;
     double* stage_edge_values;
     double* xmom_edge_values;
     double* ymom_edge_values;
     double* bed_edge_values;
     double* height_edge_values;
+    double* xvelocity_edge_values;
+    double* yvelocity_edge_values;
 
     double* stage_centroid_values;
     double* xmom_centroid_values;
@@ -82,14 +87,17 @@ struct domain {
     double* xmom_boundary_values;
     double* ymom_boundary_values;
     double* bed_boundary_values;
+    double* height_boundary_values;
+    double* xvelocity_boundary_values;
+    double* yvelocity_boundary_values;
 
     double* stage_explicit_update;
     double* xmom_explicit_update;
     double* ymom_explicit_update;
 
-    int64_t* flux_update_frequency;
-    int64_t* update_next_flux;
-    int64_t* update_extrapolation;
+    anuga_int* flux_update_frequency;
+    anuga_int* update_next_flux;
+    anuga_int* update_extrapolation;
     double* edge_timestep;
     double* edge_flux_work;
     double* neigh_work;
@@ -98,25 +106,30 @@ struct domain {
     double* y_centroid_work;
     double* boundary_flux_sum;
 
-    int64_t* allow_timestep_increase;
+    anuga_int* allow_timestep_increase;
 
-    int64_t* edge_river_wall_counter;
+    anuga_int* edge_river_wall_counter;
     double* riverwall_elevation;
-    int64_t* riverwall_rowIndex;
+    anuga_int* riverwall_rowIndex;
     double* riverwall_hydraulic_properties;
 
     double* stage_semi_implicit_update;
     double* xmom_semi_implicit_update;
-    double* ymom_semi_implicit_update;    
-
+    double* ymom_semi_implicit_update; 
     
+    double* friction_centroid_values;
+
+    double* stage_backup_values;
+    double* xmom_backup_values;
+    double* ymom_backup_values;
+
 };
 
 
 struct edge {
 
-    int64_t cell_id;
-    int64_t edge_id;
+    anuga_int cell_id;
+    anuga_int edge_id;
 
     // mid point values
     double w;
@@ -147,10 +160,10 @@ struct edge {
 };
 
 
-void get_edge_data(struct edge *E, struct domain *D, int64_t k, int64_t i) {
+void get_edge_data(struct edge *E, struct domain *D, anuga_int k, anuga_int i) {
     // fill edge data (conserved and bed) for ith edge of kth triangle
 
-    int64_t k3i, k3i1, k3i2;
+    anuga_int k3i, k3i1, k3i2;
 
     k3i = 3 * k + i;
     k3i1 = 3 * k + (i + 1) % 3;
@@ -180,21 +193,21 @@ void get_edge_data(struct edge *E, struct domain *D, int64_t k, int64_t i) {
 
 }
 
-int64_t print_domain_struct(struct domain *D) {
+anuga_int print_domain_struct(struct domain *D) {
 
 
-    printf("D->number_of_elements     %ld  \n", D->number_of_elements);
-    printf("D->boundary_length        %ld  \n", D->boundary_length);
-    printf("D->number_of_riverwall_edges %ld  \n", D->number_of_riverwall_edges);
+    printf("D->number_of_elements     %" PRId64 "  \n", D->number_of_elements);
+    printf("D->boundary_length        %" PRId64 "  \n", D->boundary_length);
+    printf("D->number_of_riverwall_edges %" PRId64 "  \n", D->number_of_riverwall_edges);
     printf("D->epsilon                %g \n", D->epsilon);
     printf("D->H0                     %g \n", D->H0);
     printf("D->g                      %g \n", D->g);
-    printf("D->optimise_dry_cells     %ld \n", D->optimise_dry_cells);
+    printf("D->optimise_dry_cells     %" PRId64 " \n", D->optimise_dry_cells);
     printf("D->evolve_max_timestep    %g \n", D->evolve_max_timestep);
     printf("D->minimum_allowed_height %g \n", D->minimum_allowed_height);
     printf("D->maximum_allowed_speed  %g \n", D->maximum_allowed_speed);
-    printf("D->low_froude             %ld \n", D->low_froude);
-    printf("D->extrapolate_velocity_second_order %ld \n", D->extrapolate_velocity_second_order);
+    printf("D->low_froude             %" PRId64 " \n", D->low_froude);
+    printf("D->extrapolate_velocity_second_order %" PRId64 " \n", D->extrapolate_velocity_second_order);
     printf("D->beta_w                 %g \n", D->beta_w);
     printf("D->beta_w_dry             %g \n", D->beta_w_dry);
     printf("D->beta_uh                %g \n", D->beta_uh);
@@ -204,47 +217,121 @@ int64_t print_domain_struct(struct domain *D) {
 
 
 
-    printf("D->neighbours             %p \n", D->neighbours);
-    printf("D->surrogate_neighbours   %p \n", D->surrogate_neighbours);
-    printf("D->neighbour_edges        %p \n", D->neighbour_edges);
-    printf("D->normals                %p \n", D->normals);
-    printf("D->edgelengths            %p \n", D->edgelengths);
-    printf("D->radii                  %p \n", D->radii);
-    printf("D->areas                  %p \n", D->areas);
-    printf("D->tri_full_flag          %p \n", D->tri_full_flag);
-    printf("D->already_computed_flux  %p \n", D->already_computed_flux);
-    printf("D->vertex_coordinates     %p \n", D->vertex_coordinates);
-    printf("D->edge_coordinates       %p \n", D->edge_coordinates);
-    printf("D->centroid_coordinates   %p \n", D->centroid_coordinates);
-    printf("D->max_speed              %p \n", D->max_speed);
-    printf("D->number_of_boundaries   %p \n", D->number_of_boundaries);
-    printf("D->stage_edge_values      %p \n", D->stage_edge_values);
-    printf("D->xmom_edge_values       %p \n", D->xmom_edge_values);
-    printf("D->ymom_edge_values       %p \n", D->ymom_edge_values);
-    printf("D->bed_edge_values        %p \n", D->bed_edge_values);
-    printf("D->stage_centroid_values  %p \n", D->stage_centroid_values);
-    printf("D->xmom_centroid_values   %p \n", D->xmom_centroid_values);
-    printf("D->ymom_centroid_values   %p \n", D->ymom_centroid_values);
-    printf("D->bed_centroid_values    %p \n", D->bed_centroid_values);
-    printf("D->stage_vertex_values    %p \n", D->stage_vertex_values);
-    printf("D->xmom_vertex_values     %p \n", D->xmom_vertex_values);
-    printf("D->ymom_vertex_values     %p \n", D->ymom_vertex_values);
-    printf("D->bed_vertex_values      %p \n", D->bed_vertex_values);
-    printf("D->height_vertex_values      %p \n", D->height_vertex_values);
-    printf("D->stage_boundary_values  %p \n", D->stage_boundary_values);
-    printf("D->xmom_boundary_values   %p \n", D->xmom_boundary_values);
-    printf("D->ymom_boundary_values   %p \n", D->ymom_boundary_values);
-    printf("D->bed_boundary_values    %p \n", D->bed_boundary_values);
-    printf("D->stage_explicit_update  %p \n", D->stage_explicit_update);
-    printf("D->xmom_explicit_update   %p \n", D->xmom_explicit_update);
-    printf("D->ymom_explicit_update   %p \n", D->ymom_explicit_update);
-    printf("D->edge_river_wall_counter   %p \n", D->edge_river_wall_counter);
-    printf("D->stage_semi_implicit_update  %p \n", D->stage_semi_implicit_update);
-    printf("D->xmom_semi_implicit_update   %p \n", D->xmom_semi_implicit_update);
-    printf("D->ymom_semi_implicit_update   %p \n", D->ymom_semi_implicit_update);
+    printf("D->neighbours             %p \n", (void *) D->neighbours);
+    printf("D->surrogate_neighbours   %p \n", (void *) D->surrogate_neighbours);
+    printf("D->neighbour_edges        %p \n", (void *) D->neighbour_edges);
+    printf("D->normals                %p \n", (void *) D->normals);
+    printf("D->edgelengths            %p \n", (void *) D->edgelengths);
+    printf("D->radii                  %p \n", (void *) D->radii);
+    printf("D->areas                  %p \n", (void *) D->areas);
+    printf("D->tri_full_flag          %p \n", (void *) D->tri_full_flag);
+    printf("D->already_computed_flux  %p \n", (void *) D->already_computed_flux);
+    printf("D->vertex_coordinates     %p \n", (void *) D->vertex_coordinates);
+    printf("D->edge_coordinates       %p \n", (void *) D->edge_coordinates);
+    printf("D->centroid_coordinates   %p \n", (void *) D->centroid_coordinates);
+    printf("D->max_speed              %p \n", (void *) D->max_speed);
+    printf("D->number_of_boundaries   %p \n", (void *) D->number_of_boundaries);
+    printf("D->stage_edge_values      %p \n", (void *) D->stage_edge_values);
+    printf("D->xmom_edge_values       %p \n", (void *) D->xmom_edge_values);
+    printf("D->ymom_edge_values       %p \n", (void *) D->ymom_edge_values);
+    printf("D->bed_edge_values        %p \n", (void *) D->bed_edge_values);
+    printf("D->stage_centroid_values  %p \n", (void *) D->stage_centroid_values);
+    printf("D->xmom_centroid_values   %p \n", (void *) D->xmom_centroid_values);
+    printf("D->ymom_centroid_values   %p \n", (void *) D->ymom_centroid_values);
+    printf("D->bed_centroid_values    %p \n", (void *) D->bed_centroid_values);
+    printf("D->stage_vertex_values    %p \n", (void *) D->stage_vertex_values);
+    printf("D->xmom_vertex_values     %p \n", (void *) D->xmom_vertex_values);
+    printf("D->ymom_vertex_values     %p \n", (void *) D->ymom_vertex_values);
+    printf("D->bed_vertex_values      %p \n", (void *) D->bed_vertex_values);
+    printf("D->height_vertex_values      %p \n", (void *) D->height_vertex_values);
+    printf("D->stage_boundary_values  %p \n", (void *) D->stage_boundary_values);
+    printf("D->xmom_boundary_values   %p \n", (void *) D->xmom_boundary_values);
+    printf("D->ymom_boundary_values   %p \n", (void *) D->ymom_boundary_values);
+    printf("D->bed_boundary_values    %p \n", (void *) D->bed_boundary_values);
+    printf("D->stage_explicit_update  %p \n", (void *) D->stage_explicit_update);
+    printf("D->xmom_explicit_update   %p \n", (void *) D->xmom_explicit_update);
+    printf("D->ymom_explicit_update   %p \n", (void *) D->ymom_explicit_update);
+    printf("D->edge_river_wall_counter   %p \n",   (void *) D->edge_river_wall_counter);
+    printf("D->stage_semi_implicit_update  %p \n", (void *) D->stage_semi_implicit_update);
+    printf("D->xmom_semi_implicit_update   %p \n", (void *) D->xmom_semi_implicit_update);
+    printf("D->ymom_semi_implicit_update   %p \n", (void *) D->ymom_semi_implicit_update);
+    printf("D->friction_centroid_values   %p \n", (void *) D->friction_centroid_values);
 
 
     return 0;
 }
 
+
+typedef struct {
+    double ql[3], qr[3];       // left/right edge state: stage, xmom, ymom
+    double zl, zr;             // left/right bed elevation at the edge
+    double hle, hre;           // left/right water height at the edge
+    double h_left, h_right;    // heights above z_half, clamped at zero
+    double hc, zc, hc_n, zc_n; // centroid height/bed of this triangle and of its neighbour
+    double z_half;             // max of zl and zr (and of the riverwall elevation, if any)
+    double normal_x, normal_y; // the two normal components stored for this edge
+    double length;             // edge length
+    int n; // neighbour triangle index (negative for a boundary edge)
+    int ki, ki2;               // 3*k + i and 2*ki
+    bool is_boundary;          // n < 0
+    bool is_riverwall;         // edge_flux_type == 1
+    int riverwall_index;       // only assigned when is_riverwall is true
+} EdgeData;
+
+// Fill E with the left (this triangle) and right (neighbour or boundary) state for edge i of triangle k
+static inline void get_edge_data_central_flux(const struct domain * __restrict D, const int k, const int i, EdgeData * __restrict E) {
+    E->ki = 3 * k + i;
+    E->ki2 = 2 * E->ki;
+    // Left state: this triangle's own edge values
+    E->ql[0] = D->stage_edge_values[E->ki];
+    E->ql[1] = D->xmom_edge_values[E->ki];
+    E->ql[2] = D->ymom_edge_values[E->ki];
+    E->zl = D->bed_edge_values[E->ki];
+    E->hle = D->height_edge_values[E->ki];
+    E->length = D->edgelengths[E->ki];
+    // Neighbour index, negative on a boundary. NOTE(review): narrows anuga_int to int -- confirm range
+    E->n = D->neighbours[E->ki];
+    E->is_boundary = (E->n < 0);
+    E->normal_x = D->normals[E->ki2];
+    E->normal_y = D->normals[E->ki2 + 1];
+    // Centroid values; the neighbour's copies default to this triangle's own
+    E->hc = D->height_centroid_values[k];
+    E->zc = D->bed_centroid_values[k];
+    E->hc_n=E->hc;
+    E->zc_n=D->bed_centroid_values[k];
+    // Right state: boundary values at index -n-1, or the neighbour's matching edge
+    if (E->is_boundary) {
+        int m = -E->n - 1;
+        E->qr[0] = D->stage_boundary_values[m];
+        E->qr[1] = D->xmom_boundary_values[m];
+        E->qr[2] = D->ymom_boundary_values[m];
+        E->zr = E->zl;
+        E->hre = fmax(E->qr[0] - E->zr, 0.0);
+    } else {
+        E->hc_n = D->height_centroid_values[E->n];
+        E->zc_n = D->bed_centroid_values[E->n];
+        int m = D->neighbour_edges[E->ki];
+        int nm = E->n * 3 + m;
+        E->qr[0] = D->stage_edge_values[nm];
+        E->qr[1] = D->xmom_edge_values[nm];
+        E->qr[2] = D->ymom_edge_values[nm];
+        E->zr = D->bed_edge_values[nm];
+        E->hre = D->height_edge_values[nm];
+    }
+    // Interface bed elevation: the higher of the two edge beds
+    E->z_half = fmax(E->zl, E->zr);
+    // edge_flux_type == 1 marks a riverwall edge (its counter appears to be 1-based)
+    // Check for riverwall elevation override
+    E->is_riverwall = (D->edge_flux_type[E->ki] == 1);
+    if (E->is_riverwall) {
+        E->riverwall_index = D->edge_river_wall_counter[E->ki] - 1;
+        double zwall = D->riverwall_elevation[E->riverwall_index];
+        E->z_half = fmax(zwall, E->z_half);
+    }
+    // Depths above z_half, clamped at zero. NOTE(review): fmax needs <math.h> -- confirm it is included upstream
+    E->h_left = fmax(E->hle + E->zl - E->z_half, 0.0);
+    E->h_right = fmax(E->hre + E->zr - E->z_half, 0.0);
+}
+
+
 #endif
diff --git a/anuga/shallow_water/sw_domain_cuda.py b/anuga/shallow_water/sw_domain_cuda.py
index 3774c3e39..d4694dcf4 100644
--- a/anuga/shallow_water/sw_domain_cuda.py
+++ b/anuga/shallow_water/sw_domain_cuda.py
@@ -147,7 +147,7 @@ def __init__(self, domain):
         self.cpu_timestep_array = np.zeros(self.cpu_number_of_elements, dtype=float) 
         self.cpu_num_negative_cells = np.zeros(1, dtype=np.int32)
 
-        # mass_error attribute of protect_kernal for returning the value
+        # mass_error attribute of protect_kernel for returning the value
         self.cpu_mass_error = np.zeros([] , dtype=float)
 
         #for compute forcing terms
@@ -225,13 +225,13 @@ def compile_gpu_kernels(self):
         self.extrapolate_kernel3 = self.mod.get_function("_cuda_extrapolate_second_order_edge_sw_loop3")
         self.extrapolate_kernel4 = self.mod.get_function("_cuda_extrapolate_second_order_edge_sw_loop4")
 
-        self.update_kernal = self.mod.get_function("_cuda_update_sw")
-        self.fix_negative_cells_kernal = self.mod.get_function("_cuda_fix_negative_cells_sw")        
+        self.update_kernel = self.mod.get_function("_cuda_update_sw")
+        self.fix_negative_cells_kernel = self.mod.get_function("_cuda_fix_negative_cells_sw")        
 
-        self.protect_kernal = self.mod.get_function("_cuda_protect_against_infinitesimal_and_negative_heights")
+        self.protect_kernel = self.mod.get_function("_cuda_protect_against_infinitesimal_and_negative_heights")
         
-        self.manning_flat_kernal = self.mod.get_function("cft_manning_friction_flat")
-        self.manning_sloped_kernal = self.mod.get_function("cft_manning_friction_sloped")
+        self.manning_flat_kernel = self.mod.get_function("cft_manning_friction_flat")
+        self.manning_sloped_kernel = self.mod.get_function("cft_manning_friction_sloped")
 
 
     #-----------------------------------------------------
@@ -241,7 +241,7 @@ def allocate_gpu_arrays(self):
 
         import cupy as cp
         
-        nvtxRangePush('to gpu')
+        nvtxRangePush('allocate gpu arrays')
 
         # FIXME SR: we should probably allocate all the cpu numpy arrays with 
         # pinned memory to speed movement of data from host to device
@@ -249,6 +249,7 @@ def allocate_gpu_arrays(self):
         self.gpu_number_of_boundaries = cp.array(self.cpu_number_of_boundaries)
 
         # these are just used for the reduction operation of the flux calculation
+        # probably should use atomic operations
         self.gpu_timestep_array          = cp.array(self.cpu_timestep_array)
         self.gpu_local_boundary_flux_sum = cp.array(self.cpu_local_boundary_flux_sum )
 
@@ -321,11 +322,11 @@ def allocate_gpu_arrays(self):
 
         self.gpu_num_negative_cells = cp.array(self.cpu_num_negative_cells)
 
-        nvtxRangePop()
+        self.gpu_x = cp.array(self.cpu_x)
 
         self.gpu_arrays_allocated = True
 
-        self.gpu_x = cp.array(self.cpu_x)
+        nvtxRangePop()
 
     def cpu_to_gpu_centroid_values(self):
         """
@@ -351,6 +352,7 @@ def gpu_to_cpu_centroid_values(self):
 
         nvtxRangePush('gpu_to_cpu_centroid_values')
         # FIXME SR: Do we need to transfer height and bed centroid values
+        # FIXME SR: Probably should use pinned memory buffer.
         cp.asnumpy(self.gpu_stage_centroid_values,  out = self.cpu_stage_centroid_values)
         cp.asnumpy(self.gpu_xmom_centroid_values,  out = self.cpu_xmom_centroid_values)
         cp.asnumpy(self.gpu_ymom_centroid_values,  out = self.cpu_ymom_centroid_values)
@@ -445,11 +447,11 @@ def gpu_to_cpu_explicit_update(self):
         cp.asnumpy(self.gpu_ymom_explicit_update, out = self.cpu_ymom_explicit_update)
         nvtxRangePop()
 
-    def cpu_to_gpu_semi_explicit_update(self):
+    def cpu_to_gpu_semi_implicit_update(self):
         """
         Move semi_explicit_update data from cpu to gpu
         """
-        nvtxRangePush("cpu_to_gpu_semi_explicit_update")
+        nvtxRangePush("cpu_to_gpu_semi_implicit_update")
         self.gpu_stage_semi_implicit_update.set(self.cpu_stage_semi_implicit_update)
         self.gpu_xmom_semi_implicit_update.set(self.cpu_xmom_semi_implicit_update)
         self.gpu_ymom_semi_implicit_update.set(self.cpu_ymom_semi_implicit_update)
@@ -492,7 +494,12 @@ def compute_fluxes_ext_central_kernel(self, timestep,  transfer_from_cpu=True, t
             self.gpu_bed_edge_values.set(self.cpu_bed_edge_values)
             self.gpu_height_edge_values.set(self.cpu_height_edge_values)            
             
-            #FIXME SR: Want about boundary values!
+            self.gpu_stage_boundary_values.set(self.cpu_stage_boundary_values)
+            self.gpu_xmom_boundary_values.set(self.cpu_xmom_boundary_values)
+            self.gpu_ymom_boundary_values.set(self.cpu_ymom_boundary_values)
+
+            # FIXME SR: Do we need to transfer this?
+            self.gpu_max_speed.set(self.cpu_max_speed)
             nvtxRangePop()
 
 
@@ -519,7 +526,7 @@ def compute_fluxes_ext_central_kernel(self, timestep,  transfer_from_cpu=True, t
 
         THREADS_PER_BLOCK = 128
         NO_OF_BLOCKS = int(math.ceil(self.cpu_number_of_elements/THREADS_PER_BLOCK))
-        nvtxRangePush('calculate flux: kernal')
+        nvtxRangePush('calculate flux: kernel')
         self.flux_kernel( (NO_OF_BLOCKS, 0, 0), 
                 (THREADS_PER_BLOCK, 0, 0), 
                 (  
@@ -775,7 +782,11 @@ def extrapolate_second_order_edge_sw_kernel(self, transfer_from_cpu=True, transf
             nvtxRangePop()
 
 
-    def update_conserved_quantities_kernal(self, transfer_from_cpu=True, transfer_gpu_results=True, verbose=False):
+    def update_conserved_quantities_kernel(self,
+                                           timestep,
+                                           transfer_from_cpu=True, 
+                                           transfer_gpu_results=True, 
+                                           verbose=False):
         """
         update conserved quantities
 
@@ -783,22 +794,24 @@ def update_conserved_quantities_kernal(self, transfer_from_cpu=True, transfer_gp
         """
         if transfer_from_cpu:
             self.cpu_to_gpu_centroid_values()
+
             self.cpu_to_gpu_explicit_update()
-            self.cpu_to_gpu_semi_explicit_update()
+            self.cpu_to_gpu_semi_implicit_update()
         
 
-        nvtxRangePush("Update_Kernal")
+        nvtxRangePush("Update_kernel")
         import math
         THREADS_PER_BLOCK = 128
         NO_OF_BLOCKS = int(math.ceil(self.cpu_number_of_elements/THREADS_PER_BLOCK))
 
 
-        # """  Commented this for the three kernal approach
-        # Here we're calling the update kernal for stage,xmom,ymom quantity
+        # """  Commented this for the three kernel approach
+        # Here we're calling the update kernel for stage,xmom,ymom quantity
+
         # nvtxRangePush("update : stage")
-        self.update_kernal((NO_OF_BLOCKS, 0, 0), (THREADS_PER_BLOCK, 0, 0), (
+        self.update_kernel((NO_OF_BLOCKS, 0, 0), (THREADS_PER_BLOCK, 0, 0), (
                 np.int64(self.cpu_number_of_elements),
-                np.float64(self.cpu_timestep),
+                np.float64(timestep),
                 self.gpu_stage_centroid_values,
                 self.gpu_stage_explicit_update,
                 self.gpu_stage_semi_implicit_update                
@@ -806,9 +819,9 @@ def update_conserved_quantities_kernal(self, transfer_from_cpu=True, transfer_gp
         # nvtxRangePop()
 
         # nvtxRangePush("update : xmom")
-        self.update_kernal((NO_OF_BLOCKS, 0, 0), (THREADS_PER_BLOCK, 0, 0), (
+        self.update_kernel((NO_OF_BLOCKS, 0, 0), (THREADS_PER_BLOCK, 0, 0), (
                 np.int64(self.cpu_number_of_elements),
-                np.float64(self.cpu_timestep),
+                np.float64(timestep),
                 self.gpu_xmom_centroid_values,
                 self.gpu_xmom_explicit_update,
                 self.gpu_xmom_semi_implicit_update                
@@ -816,9 +829,9 @@ def update_conserved_quantities_kernal(self, transfer_from_cpu=True, transfer_gp
         # nvtxRangePop()
 
         # nvtxRangePush("update : ymom")
-        self.update_kernal((NO_OF_BLOCKS, 0, 0), (THREADS_PER_BLOCK, 0, 0), (
+        self.update_kernel((NO_OF_BLOCKS, 0, 0), (THREADS_PER_BLOCK, 0, 0), (
                 np.int64(self.cpu_number_of_elements),
-                np.float64(self.cpu_timestep),
+                np.float64(timestep),
                 self.gpu_ymom_centroid_values,
                 self.gpu_ymom_explicit_update,
                 self.gpu_ymom_semi_implicit_update
@@ -840,9 +853,10 @@ def update_conserved_quantities_kernal(self, transfer_from_cpu=True, transfer_gp
         # if transfer_from_cpu:
         #     self.cpu_to_gpu_centroid_values()
 
-        nvtxRangePush("fix_negative_cells : kernal")
+        nvtxRangePush("fix_negative_cells : kernel")
         
-        self.fix_negative_cells_kernal((NO_OF_BLOCKS, 0, 0), (THREADS_PER_BLOCK, 0, 0), (
+        # FIXME SR: num_negative_cells should be calculated via an atomic operation
+        self.fix_negative_cells_kernel((NO_OF_BLOCKS, 0, 0), (THREADS_PER_BLOCK, 0, 0), (
             np.int64(self.cpu_number_of_elements),
             self.gpu_tri_full_flag,
             self.gpu_stage_centroid_values,
@@ -862,34 +876,34 @@ def update_conserved_quantities_kernal(self, transfer_from_cpu=True, transfer_gp
         nvtxRangePop()    
         
         if verbose:
-            print('gpu_stage_centroid_values after fix_negative_cells_kernal -> ', self.gpu_stage_centroid_values)
-            print('gpu_xmom_centroid_values after fix_negative_cells_kernal -> ', self.gpu_xmom_centroid_values)
-            print('gpu_ymom_centroid_values after fix_negative_cells_kernal -> ', self.gpu_ymom_centroid_values)
+            print('gpu_stage_centroid_values after fix_negative_cells_kernel -> ', self.gpu_stage_centroid_values)
+            print('gpu_xmom_centroid_values after fix_negative_cells_kernel -> ', self.gpu_xmom_centroid_values)
+            print('gpu_ymom_centroid_values after fix_negative_cells_kernel -> ', self.gpu_ymom_centroid_values)
         if verbose:
-            print('gpu_stage_centroid_values after fix_negative_cells_kernal -> ', self.gpu_stage_centroid_values)
-            print('gpu_xmom_centroid_values after fix_negative_cells_kernal -> ', self.gpu_xmom_centroid_values)
-            print('gpu_ymom_centroid_values after fix_negative_cells_kernal -> ', self.gpu_ymom_centroid_values)
+            print('gpu_stage_centroid_values after fix_negative_cells_kernel -> ', self.gpu_stage_centroid_values)
+            print('gpu_xmom_centroid_values after fix_negative_cells_kernel -> ', self.gpu_xmom_centroid_values)
+            print('gpu_ymom_centroid_values after fix_negative_cells_kernel -> ', self.gpu_ymom_centroid_values)
         
         return np.sum(self.cpu_num_negative_cells)
     
-    def protect_against_infinitesimal_and_negative_heights_kernal(self, transfer_from_cpu=True, transfer_gpu_results=True, verbose=False):
+    def protect_against_infinitesimal_and_negative_heights_kernel(self, transfer_from_cpu=True, transfer_gpu_results=True, verbose=False):
 
         """
         protect against infinities
         Testing against the CPU version
         Ensure transient data has been copied to the GPU via cpu_to_gpu routines
         """
-        nvtxRangePush("protect against infinities - kernal")
+        nvtxRangePush("protect against infinities - kernel")
 
         if transfer_from_cpu:
             self.cpu_to_gpu_centroid_values()
             self.cpu_to_gpu_explicit_update()
-            self.cpu_to_gpu_semi_explicit_update()
+            self.cpu_to_gpu_semi_implicit_update()
         
         import math
         THREADS_PER_BLOCK = 128
         NO_OF_BLOCKS = int(math.ceil(self.cpu_number_of_elements/THREADS_PER_BLOCK))
-        self.protect_kernal((NO_OF_BLOCKS, 0, 0), (THREADS_PER_BLOCK, 0, 0), 
+        self.protect_kernel((NO_OF_BLOCKS, 0, 0), (THREADS_PER_BLOCK, 0, 0), 
                             (
                                 np.float64(self.cpu_minimum_allowed_height),
                                 np.int64(self.cpu_number_of_elements),
@@ -906,20 +920,23 @@ def protect_against_infinitesimal_and_negative_heights_kernal(self, transfer_fro
 
 
     def compute_forcing_terms_manning_friction_flat(self, transfer_from_cpu=True, transfer_gpu_results=True, verbose=False):
-        nvtxRangePush("compute forcing manning flat - kernal")
+        nvtxRangePush("compute forcing manning flat - kernel")
     
-        self.gpu_stage_centroid_values.set(self.cpu_stage_centroid_values)
-        self.gpu_bed_centroid_values.set(self.cpu_bed_vertex_values)
-        self.gpu_xmom_centroid_values.set(self.cpu_xmom_centroid_values)
-        self.gpu_ymom_centroid_values.set(self.cpu_ymom_centroid_values)
-        self.gpu_friction_centroid_values.set(self.cpu_friction_centroid_values)
-        self.gpu_xmom_semi_implicit_update.set(self.cpu_xmom_semi_implicit_update)
-        self.gpu_ymom_semi_implicit_update.set(self.cpu_ymom_semi_implicit_update)
+        if transfer_from_cpu:
+            nvtxRangePush('CFT: transfer from CPU')
+            self.gpu_stage_centroid_values.set(self.cpu_stage_centroid_values)
+            self.gpu_bed_centroid_values.set(self.cpu_bed_vertex_values)
+            self.gpu_xmom_centroid_values.set(self.cpu_xmom_centroid_values)
+            self.gpu_ymom_centroid_values.set(self.cpu_ymom_centroid_values)
+            self.gpu_friction_centroid_values.set(self.cpu_friction_centroid_values)
+            self.gpu_xmom_semi_implicit_update.set(self.cpu_xmom_semi_implicit_update)
+            self.gpu_ymom_semi_implicit_update.set(self.cpu_ymom_semi_implicit_update)
+            nvtxRangePop()
 
         import math
         THREADS_PER_BLOCK = 128
         NO_OF_BLOCKS = int(math.ceil(self.cpu_number_of_elements/THREADS_PER_BLOCK))
-        self.manning_flat_kernal(
+        self.manning_flat_kernel(
             (NO_OF_BLOCKS, 0, 0), 
             (THREADS_PER_BLOCK, 0, 0),
             (
@@ -949,21 +966,24 @@ def compute_forcing_terms_manning_friction_flat(self, transfer_from_cpu=True, tr
 
 
     def compute_forcing_terms_manning_friction_sloped(self, transfer_from_cpu=True, transfer_gpu_results=True, verbose=False):
-        nvtxRangePush("compute forcing manning sloped - kernal")
+        nvtxRangePush("compute forcing manning sloped - kernel")
 
-        self.gpu_x.set(self.cpu_x)
-        self.gpu_stage_centroid_values.set(self.cpu_stage_centroid_values)
-        self.gpu_bed_centroid_values.set(self.cpu_bed_vertex_values)
-        self.gpu_xmom_centroid_values.set(self.cpu_xmom_centroid_values)
-        self.gpu_ymom_centroid_values.set(self.cpu_ymom_centroid_values)
-        self.gpu_friction_centroid_values.set(self.cpu_friction_centroid_values)
-        self.gpu_xmom_semi_implicit_update.set(self.cpu_xmom_semi_implicit_update)
-        self.gpu_ymom_semi_implicit_update.set(self.cpu_ymom_semi_implicit_update)
+        if transfer_from_cpu:
+            self.gpu_x.set(self.cpu_x)
+
+            self.gpu_stage_centroid_values.set(self.cpu_stage_centroid_values)
+            self.gpu_bed_centroid_values.set(self.cpu_bed_centroid_values)
+            self.gpu_xmom_centroid_values.set(self.cpu_xmom_centroid_values)
+            self.gpu_ymom_centroid_values.set(self.cpu_ymom_centroid_values)
+            self.gpu_friction_centroid_values.set(self.cpu_friction_centroid_values)
+
+            self.gpu_xmom_semi_implicit_update.set(self.cpu_xmom_semi_implicit_update)
+            self.gpu_ymom_semi_implicit_update.set(self.cpu_ymom_semi_implicit_update)
 
         import math
         THREADS_PER_BLOCK = 128
         NO_OF_BLOCKS = int(math.ceil(self.cpu_number_of_elements/THREADS_PER_BLOCK))
-        self.manning_flat_kernal(
+        self.manning_flat_kernel(
             (NO_OF_BLOCKS, 0, 0), (THREADS_PER_BLOCK, 0, 0),
             np.float64(self.cpu_g),
             np.float64(self.cpu_eps) ,
@@ -979,12 +999,10 @@ def compute_forcing_terms_manning_friction_sloped(self, transfer_from_cpu=True,
         )    
 
         nvtxRangePush('CFT: transfer from GPU')
-
         if transfer_gpu_results:
             self.gpu_to_cpu_centroid_values()
             cp.asnumpy(self.gpu_xmom_semi_implicit_update,    out = self.cpu_xmom_semi_implicit_update)
             cp.asnumpy(self.gpu_ymom_semi_implicit_update,    out = self.cpu_ymom_semi_implicit_update)
-
         nvtxRangePop()   
 
 
@@ -993,7 +1011,7 @@ def compute_forcing_terms_manning_friction_sloped(self, transfer_from_cpu=True,
 
 
     # This function serves functionality of assigning updated values back to Domain object for further calculation that occur off the GPU.
-    # Call this function after the kernal call to update the Domain
+    # Call this function after the kernel call to update the Domain
     # this method accepts the domain object as an argument and updates only the relevant attributes. It returns the updated domain object, keeping the rest of its attributes intact.
     def update_domain_values(self, domain):
         """
@@ -1034,6 +1052,10 @@ def update_domain_values(self, domain):
         bed = quantities["elevation"]
         height = quantities["height"]
 
+
+        # FIXME SR: I don't think we need to do this as the
+        # cpu_ variables are just references to the domain and quantity 
+        # arrays
         stage.explicit_update = self.cpu_stage_explicit_update
         xmom.explicit_update = self.cpu_xmom_explicit_update
         ymom.explicit_update = self.cpu_ymom_explicit_update
diff --git a/anuga/shallow_water/sw_domain_math.h b/anuga/shallow_water/sw_domain_math.h
new file mode 100644
index 000000000..09944e4ec
--- /dev/null
+++ b/anuga/shallow_water/sw_domain_math.h
@@ -0,0 +1,45 @@
+#ifndef ANUGA_SHALLOW_WATER_SW_DOMAIN_MATH_H
+#define ANUGA_SHALLOW_WATER_SW_DOMAIN_MATH_H
+#include "math.h"
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+
+#ifdef USE_LIB_BLAS
+#include <cblas.h>
+#endif
+
+#include "anuga_runtime.h"
+#include "anuga_typedefs.h"
+
+// Strided AXPY: Y[i*incY] += alpha * X[i*incX] for i = 0..N-1 (delegates to cblas_daxpy when USE_LIB_BLAS is defined).
+// static inline: the definition lives in a header, so external linkage would clash if two translation units include it.
+static inline void anuga_daxpy(const anuga_int N, const double alpha, const double *X, const anuga_int incX, double *Y, const anuga_int incY)
+{
+#ifdef USE_LIB_BLAS
+  cblas_daxpy(N, alpha, X, incX, Y, incY);
+#else
+#pragma omp parallel for simd schedule(static)
+  for (anuga_int i = 0; i < N; i++)
+  {
+    Y[i*incY] += alpha * X[i*incX];
+  }
+#endif
+}
+
+// Strided scale: X[i*incX] *= alpha for i = 0..N-1 (delegates to cblas_dscal when USE_LIB_BLAS is defined); static inline because it is defined in a header.
+static inline void anuga_dscal(const anuga_int N, const double alpha, double *X, const anuga_int incX)
+{
+#ifdef USE_LIB_BLAS
+  cblas_dscal(N, alpha, X, incX);
+#else
+#pragma omp parallel for simd schedule(static)
+  for (anuga_int i = 0; i < N; i++)
+  {
+    X[i*incX] *= alpha;
+  }
+#endif
+}
+#endif // ANUGA_SHALLOW_WATER_SW_DOMAIN_MATH_H 
\ No newline at end of file
diff --git a/anuga/shallow_water/sw_domain_openacc.c b/anuga/shallow_water/sw_domain_openacc.c
deleted file mode 100644
index ebb661cf4..000000000
--- a/anuga/shallow_water/sw_domain_openacc.c
+++ /dev/null
@@ -1,1936 +0,0 @@
-// Python - C extension module for shallow_water.py
-//
-// To compile (Python2.6):
-//  gcc -c swb2_domain_ext.c -I/usr/include/python2.6 -o domain_ext.o -Wall -O
-//  gcc -shared swb2_domain_ext.o  -o swb2_domain_ext.so
-//
-// or use python compile.py
-//
-// See the module swb_domain.py for more documentation on
-// how to use this module
-//
-//
-// Stephen Roberts, ANU 2009
-// Ole Nielsen, GA 2004
-// Gareth Davies, GA 2011
-
-#include "math.h"
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <stdint.h>
-
-// #if defined(__APPLE__)
-// // clang doesn't have openmp
-// #else
-// #include "omp.h"
-// #endif
-
-#include "sw_domain.h"
-
-const double pi = 3.14159265358979;
-
-// Trick to compute n modulo d (n%d in python) when d is a power of 2
-uint64_t __mod_of_power_2(uint64_t n, uint64_t d)
-{
-  return (n & (d - 1));
-}
-
-// Computational function for rotation
-int64_t __rotate(double *q, double n1, double n2)
-{
-  /*Rotate the last  2 coordinates of q (q[1], q[2])
-    from x,y coordinates to coordinates based on normal vector (n1, n2).
-
-    Result is returned in array 2x1 r
-    To rotate in opposite direction, call rotate with (q, n1, -n2)
-
-    Contents of q are changed by this function */
-
-  double q1, q2;
-
-  // Shorthands
-  q1 = q[1]; // x coordinate
-  q2 = q[2]; // y coordinate
-
-  // Rotate
-  q[1] = n1 * q1 + n2 * q2;
-  q[2] = -n2 * q1 + n1 * q2;
-
-  return 0;
-}
-
-// Innermost flux function (using stage w=z+h)
-int64_t __flux_function_central(double *q_left, double *q_right,
-                            double h_left, double h_right,
-                            double hle, double hre,
-                            double n1, double n2,
-                            double epsilon,
-                            double ze,
-                            double limiting_threshold,
-                            double g,
-                            double *edgeflux, double *max_speed,
-                            double *pressure_flux, double hc,
-                            double hc_n,
-                            int64_t low_froude)
-{
-
-  /*Compute fluxes between volumes for the shallow water wave equation
-    cast in terms of the 'stage', w = h+z using
-    the 'central scheme' as described in
-
-    Kurganov, Noelle, Petrova. 'Semidiscrete Central-Upwind Schemes For
-    Hyperbolic Conservation Laws and Hamilton-Jacobi Equations'.
-    Siam J. Sci. Comput. Vol. 23, No. 3, pp. 707-740.
-
-    The implemented formula is given in equation (3.15) on page 714
-
-    FIXME: Several variables in this interface are no longer used, clean up
-  */
-
-  int64_t i;
-
-  double uh_left, vh_left, u_left;
-  double uh_right, vh_right, u_right;
-  double s_min, s_max, soundspeed_left, soundspeed_right;
-  double denom, inverse_denominator;
-  double tmp, local_fr, v_right, v_left;
-  double q_left_rotated[3], q_right_rotated[3], flux_right[3], flux_left[3];
-
-  if (h_left == 0. && h_right == 0.)
-  {
-    // Quick exit
-    memset(edgeflux, 0, 3 * sizeof(double));
-    *max_speed = 0.0;
-    *pressure_flux = 0.;
-    return 0;
-  }
-  // Copy conserved quantities to protect from modification
-  q_left_rotated[0] = q_left[0];
-  q_right_rotated[0] = q_right[0];
-  q_left_rotated[1] = q_left[1];
-  q_right_rotated[1] = q_right[1];
-  q_left_rotated[2] = q_left[2];
-  q_right_rotated[2] = q_right[2];
-
-  // Align x- and y-momentum with x-axis
-  __rotate(q_left_rotated, n1, n2);
-  __rotate(q_right_rotated, n1, n2);
-
-  // Compute speeds in x-direction
-  // w_left = q_left_rotated[0];
-  uh_left = q_left_rotated[1];
-  vh_left = q_left_rotated[2];
-  if (hle > 0.0)
-  {
-    tmp = 1.0 / hle;
-    u_left = uh_left * tmp; // max(h_left, 1.0e-06);
-    uh_left = h_left * u_left;
-    v_left = vh_left * tmp; // Only used to define local_fr
-    vh_left = h_left * tmp * vh_left;
-  }
-  else
-  {
-    u_left = 0.;
-    uh_left = 0.;
-    vh_left = 0.;
-    v_left = 0.;
-  }
-
-  // u_left = _compute_speed(&uh_left, &hle,
-  //             epsilon, h0, limiting_threshold);
-
-  // w_right = q_right_rotated[0];
-  uh_right = q_right_rotated[1];
-  vh_right = q_right_rotated[2];
-  if (hre > 0.0)
-  {
-    tmp = 1.0 / hre;
-    u_right = uh_right * tmp; // max(h_right, 1.0e-06);
-    uh_right = h_right * u_right;
-    v_right = vh_right * tmp; // Only used to define local_fr
-    vh_right = h_right * tmp * vh_right;
-  }
-  else
-  {
-    u_right = 0.;
-    uh_right = 0.;
-    vh_right = 0.;
-    v_right = 0.;
-  }
-  // u_right = _compute_speed(&uh_right, &hre,
-  //               epsilon, h0, limiting_threshold);
-
-  // Maximal and minimal wave speeds
-  soundspeed_left = sqrt(g * h_left);
-  soundspeed_right = sqrt(g * h_right);
-  // soundspeed_left  = sqrt(g*hle);
-  // soundspeed_right = sqrt(g*hre);
-
-  // Something that scales like the Froude number
-  // We will use this to scale the diffusive component of the UH/VH fluxes.
-
-  // local_fr = sqrt(
-  //     max(0.001, min(1.0,
-  //         (u_right*u_right + u_left*u_left + v_right*v_right + v_left*v_left)/
-  //         (soundspeed_left*soundspeed_left + soundspeed_right*soundspeed_right + 1.0e-10))));
-  if (low_froude == 1)
-  {
-    local_fr = sqrt(
-        fmax(0.001, fmin(1.0,
-                         (u_right * u_right + u_left * u_left + v_right * v_right + v_left * v_left) /
-                             (soundspeed_left * soundspeed_left + soundspeed_right * soundspeed_right + 1.0e-10))));
-  }
-  else if (low_froude == 2)
-  {
-    local_fr = sqrt((u_right * u_right + u_left * u_left + v_right * v_right + v_left * v_left) /
-                    (soundspeed_left * soundspeed_left + soundspeed_right * soundspeed_right + 1.0e-10));
-    local_fr = sqrt(fmin(1.0, 0.01 + fmax(local_fr - 0.01, 0.0)));
-  }
-  else
-  {
-    local_fr = 1.0;
-  }
-  // printf("local_fr %e \n:", local_fr);
-
-  s_max = fmax(u_left + soundspeed_left, u_right + soundspeed_right);
-  if (s_max < 0.0)
-  {
-    s_max = 0.0;
-  }
-
-  // if( hc < 1.0e-03){
-  //   s_max = 0.0;
-  // }
-
-  s_min = fmin(u_left - soundspeed_left, u_right - soundspeed_right);
-  if (s_min > 0.0)
-  {
-    s_min = 0.0;
-  }
-
-  // if( hc_n < 1.0e-03){
-  //   s_min = 0.0;
-  // }
-
-  // Flux formulas
-  flux_left[0] = u_left * h_left;
-  flux_left[1] = u_left * uh_left; //+ 0.5*g*h_left*h_left;
-  flux_left[2] = u_left * vh_left;
-
-  flux_right[0] = u_right * h_right;
-  flux_right[1] = u_right * uh_right; //+ 0.5*g*h_right*h_right;
-  flux_right[2] = u_right * vh_right;
-
-  // Flux computation
-  denom = s_max - s_min;
-  if (denom < epsilon)
-  {
-    // Both wave speeds are very small
-    memset(edgeflux, 0, 3 * sizeof(double));
-
-    *max_speed = 0.0;
-    //*pressure_flux = 0.0;
-    *pressure_flux = 0.5 * g * 0.5 * (h_left * h_left + h_right * h_right);
-  }
-  else
-  {
-    // Maximal wavespeed
-    *max_speed = fmax(s_max, -s_min);
-
-    inverse_denominator = 1.0 / fmax(denom, 1.0e-100);
-    for (i = 0; i < 3; i++)
-    {
-      edgeflux[i] = s_max * flux_left[i] - s_min * flux_right[i];
-
-      // Standard smoothing term
-      // edgeflux[i] += 1.0*(s_max*s_min)*(q_right_rotated[i] - q_left_rotated[i]);
-      // Smoothing by stage alone can cause high velocities / slow draining for nearly dry cells
-      if (i == 0)
-        edgeflux[i] += (s_max * s_min) * (fmax(q_right_rotated[i], ze) - fmax(q_left_rotated[i], ze));
-      // if(i==0) edgeflux[i] += (s_max*s_min)*(h_right - h_left);
-      if (i == 1)
-        edgeflux[i] += local_fr * (s_max * s_min) * (uh_right - uh_left);
-      if (i == 2)
-        edgeflux[i] += local_fr * (s_max * s_min) * (vh_right - vh_left);
-
-      edgeflux[i] *= inverse_denominator;
-    }
-    // Separate pressure flux, so we can apply different wet-dry hacks to it
-    *pressure_flux = 0.5 * g * (s_max * h_left * h_left - s_min * h_right * h_right) * inverse_denominator;
-
-    // Rotate back
-    __rotate(edgeflux, n1, -n2);
-  }
-
-  return 0;
-}
-
-int64_t __openacc__flux_function_central(double q_left0, double q_left1, double q_left2,
-                                   double q_right0, double q_right1, double q_right2,
-                                   double h_left, double h_right,
-                                   double hle, double hre,
-                                   double n1, double n2,
-                                   double epsilon,
-                                   double ze,
-                                   double limiting_threshold,
-                                   double g,
-                                   double *edgeflux0, double *edgeflux1, double *edgeflux2,
-                                   double *max_speed,
-                                   double *pressure_flux, double hc,
-                                   double hc_n,
-                                   int64_t low_froude)
-{
-
-  double edgeflux[3];
-  double q_left[3];
-  double q_right[3];
-
-  int64_t ierr;
-
-  edgeflux[0] = *edgeflux0;
-  edgeflux[1] = *edgeflux1;
-  edgeflux[2] = *edgeflux2;
-
-  q_left[0] = q_left0;
-  q_left[1] = q_left1;
-  q_left[2] = q_left2;
-
-  q_right[0] = q_right0;
-  q_right[1] = q_right1;
-  q_right[2] = q_right2;
-
-  ierr = __flux_function_central(q_left, q_right,
-                                 h_left, h_right,
-                                 hle, hre,
-                                 n1, n2,
-                                 epsilon,
-                                 ze,
-                                 limiting_threshold,
-                                 g,
-                                 edgeflux, max_speed,
-                                 pressure_flux, hc,
-                                 hc_n,
-                                 low_froude);
-
-  *edgeflux0 = edgeflux[0];
-  *edgeflux1 = edgeflux[1];
-  *edgeflux2 = edgeflux[2];
-
-  return ierr;
-}
-
-double __adjust_edgeflux_with_weir(double *edgeflux,
-                                   double h_left, double h_right,
-                                   double g, double weir_height,
-                                   double Qfactor,
-                                   double s1, double s2,
-                                   double h1, double h2,
-                                   double *max_speed_local)
-{
-  // Adjust the edgeflux to agree with a weir relation [including
-  // subergence], but smoothly vary to shallow water solution when
-  // the flow over the weir is much deeper than the weir, or the
-  // upstream/downstream water elevations are too similar
-  double rw, rw2; // 'Raw' weir fluxes
-  double rwRat, hdRat, hdWrRat, scaleFlux, minhd, maxhd;
-  double w1, w2; // Weights for averaging
-  double newFlux;
-  double twothirds = (2.0 / 3.0);
-  // Following constants control the 'blending' with the shallow water solution
-  // They are now user-defined
-  // double s1=0.9; // At this submergence ratio, begin blending with shallow water solution
-  // double s2=0.95; // At this submergence ratio, completely use shallow water solution
-  // double h1=1.0; // At this (tailwater height above weir) / (weir height) ratio, begin blending with shallow water solution
-  // double h2=1.5; // At this (tailwater height above weir) / (weir height) ratio, completely use the shallow water solution
-
-  if ((h_left <= 0.0) && (h_right <= 0.0))
-  {
-    return 0;
-  }
-
-  minhd = fmin(h_left, h_right);
-  maxhd = fmax(h_left, h_right);
-  // 'Raw' weir discharge = Qfactor*2/3*H*(2/3*g*H)**0.5
-  rw = Qfactor * twothirds * maxhd * sqrt(twothirds * g * maxhd);
-  // Factor for villemonte correction
-  rw2 = Qfactor * twothirds * minhd * sqrt(twothirds * g * minhd);
-  // Useful ratios
-  rwRat = rw2 / fmax(rw, 1.0e-100);
-  hdRat = minhd / fmax(maxhd, 1.0e-100);
-
-  // (tailwater height above weir)/weir_height ratio
-  hdWrRat = minhd / fmax(weir_height, 1.0e-100);
-
-  // Villemonte (1947) corrected weir flow with submergence
-  // Q = Q1*(1-Q2/Q1)**0.385
-  rw = rw * pow(1.0 - rwRat, 0.385);
-
-  if (h_right > h_left)
-  {
-    rw *= -1.0;
-  }
-
-  if ((hdRat < s2) & (hdWrRat < h2))
-  {
-    // Rescale the edge fluxes so that the mass flux = desired flux
-    // Linearly shift to shallow water solution between hdRat = s1 and s2
-    // and between hdWrRat = h1 and h2
-
-    //
-    // WEIGHT WITH RAW SHALLOW WATER FLUX BELOW
-    // This ensures that as the weir gets very submerged, the
-    // standard shallow water equations smoothly take over
-    //
-
-    // Weighted average constants to transition to shallow water eqn flow
-    w1 = fmin(fmax(hdRat - s1, 0.) / (s2 - s1), 1.0);
-
-    // Adjust again when the head is too deep relative to the weir height
-    w2 = fmin(fmax(hdWrRat - h1, 0.) / (h2 - h1), 1.0);
-
-    newFlux = (rw * (1.0 - w1) + w1 * edgeflux[0]) * (1.0 - w2) + w2 * edgeflux[0];
-
-    if (fabs(edgeflux[0]) > 1.0e-100)
-    {
-      scaleFlux = newFlux / edgeflux[0];
-    }
-    else
-    {
-      scaleFlux = 0.;
-    }
-
-    scaleFlux = fmax(scaleFlux, 0.);
-
-    edgeflux[0] = newFlux;
-
-    // FIXME: Do this in a cleaner way
-    // IDEA: Compute momentum flux implied by weir relations, and use
-    //       those in a weighted average (rather than the rescaling trick here)
-    // If we allow the scaling to momentum to be unbounded,
-    // velocity spikes can arise for very-shallow-flooded walls
-    edgeflux[1] *= fmin(scaleFlux, 10.);
-    edgeflux[2] *= fmin(scaleFlux, 10.);
-  }
-
-  // Adjust the max speed
-  if (fabs(edgeflux[0]) > 0.)
-  {
-    *max_speed_local = sqrt(g * (maxhd + weir_height)) + fabs(edgeflux[0] / (maxhd + 1.0e-12));
-  }
-  //*max_speed_local += fabs(edgeflux[0])/(maxhd+1.0e-100);
-  //*max_speed_local *= fmax(scaleFlux, 1.0);
-
-  return 0;
-}
-
-double __openacc__adjust_edgeflux_with_weir(double *edgeflux0, double *edgeflux1, double *edgeflux2,
-                                          double h_left, double h_right,
-                                          double g, double weir_height,
-                                          double Qfactor,
-                                          double s1, double s2,
-                                          double h1, double h2,
-                                          double *max_speed_local)
-{
-
-  double edgeflux[3];
-  int64_t ierr;
-
-  edgeflux[0] = *edgeflux0;
-  edgeflux[1] = *edgeflux1;
-  edgeflux[2] = *edgeflux2;
-
-  ierr = __adjust_edgeflux_with_weir(edgeflux0, h_left, h_right,
-                                     g, weir_height,
-                                     Qfactor, s1, s2, h1, h2,
-                                     max_speed_local);
-  *edgeflux0 = edgeflux[0];
-  *edgeflux1 = edgeflux[1];
-  *edgeflux2 = edgeflux[2];
-
-  return ierr;
-}
-
-// Computational function for flux computation: a single OpenACC "parallel loop" over triangles that computes the three edge fluxes of each triangle, accumulates the explicit updates and min-reduces the timestep
-double _openacc_compute_fluxes_central(struct domain *D, // D: domain whose edge/centroid/boundary arrays are read; *_explicit_update, max_speed and boundary_flux_sum are written
-                                      double timestep)   // timestep: returned unchanged unless this call is substep 0 (see end of function)
-{
-  // Local variables
-  int64_t K = D->number_of_elements;
-  // int64_t KI, KI2, KI3, B, RW, RW5, SubSteps;
-  int64_t substep_count;
-
-  double max_speed_local, length, inv_area, zl, zr;
-  double h_left, h_right, z_half; // For Audusse scheme
-  // FIXME: limiting_threshold is not used for DE1
-  double limiting_threshold = 10 * D->H0;
-  int64_t low_froude = D->low_froude;
-  double g = D->g;
-  double epsilon = D->epsilon;
-  int64_t ncol_riverwall_hydraulic_properties = D->ncol_riverwall_hydraulic_properties;
-
-  // Workspace (making them static actually made function slightly slower (Ole))
-  double ql[3];
-  double qr[3];
-  double edgeflux[3]; // Work array for summing up fluxes
-  double pressuregrad_work;
-  double edge_timestep;
-  double normal_x, normal_y;
-  // static double local_timestep;
-
-  double hle, hre, zc, zc_n, Qfactor, s1, s2, h1, h2;
-  double pressure_flux, hc, hc_n;
-  double h_left_tmp, h_right_tmp;
-  double speed_max_last, weir_height;
-  int64_t RiverWall_count;
-
-  //
-  int64_t k, i, m, n, ii;
-  int64_t ki, nm = 0, ki2; // Index shorthands
-
-  static int64_t call = 0; // Number of calls made to this function so far; static, so it persists across calls
-  static int64_t timestep_fluxcalls = 1; // Last value of D->timestep_fluxcalls seen by this function
-  static int64_t base_call = 1; // Value of 'call' when D->timestep_fluxcalls last changed
-
-  call++; // Flag 'id' of flux calculation for this timestep
-
-  if (D->timestep_fluxcalls != timestep_fluxcalls) // Number of flux calls per timestep changed: restart the substep numbering from this call
-  {
-    timestep_fluxcalls = D->timestep_fluxcalls;
-    base_call = call;
-  }
-
-  // Which substep of the timestepping method are we on?
-  substep_count = (call - base_call) % D->timestep_fluxcalls;
-
-  double local_timestep = 1.0e+100; // Large starting value; min-reduced over edges in the loop below
-  double boundary_flux_sum_substep = 0.0; // Sum-reduced edgeflux[0] over boundary/ghost-facing edges
-
-// For all triangles
-#pragma acc  parallel loop \
-                                     firstprivate(ncol_riverwall_hydraulic_properties, epsilon, g, low_froude, limiting_threshold) \
-                                     private(i, ki, ki2, n, m, nm, ii,                                                  \
-                                     max_speed_local, length, inv_area, zl, zr,                                         \
-                                     h_left, h_right,                                                                   \
-                                     z_half, ql,  pressuregrad_work,                                                    \
-                                     qr, edgeflux, edge_timestep, normal_x, normal_y,                                   \
-                                     hle, hre, zc, zc_n, Qfactor, s1, s2, h1, h2, pressure_flux, hc, hc_n,              \
-                                     h_left_tmp, h_right_tmp, speed_max_last, weir_height, RiverWall_count)             \
-                                     reduction(min : local_timestep) reduction(+:boundary_flux_sum_substep)
-
-
- 
-  for (k = 0; k < K; k++)
-  {
-    speed_max_last = 0.0;
-    // Set explicit_update to zero for all conserved_quantities.
-    // This assumes compute_fluxes called before forcing terms
-    D->stage_explicit_update[k] = 0.0;
-    D->xmom_explicit_update[k] = 0.0;
-    D->ymom_explicit_update[k] = 0.0;
-
-    // Loop through neighbours and compute edge flux for each
-    for (i = 0; i < 3; i++)
-    {
-      ki = 3 * k + i; // Linear index to edge i of triangle k
-      ki2 = 2 * ki;   // k*6 + i*2
-
-      // Get left hand side values from triangle k, edge i
-      ql[0] = D->stage_edge_values[ki];
-      ql[1] = D->xmom_edge_values[ki];
-      ql[2] = D->ymom_edge_values[ki];
-      zl    = D->bed_edge_values[ki];
-      hle   = D->height_edge_values[ki];
-
-      hc = D->height_centroid_values[k];
-      zc = D->bed_centroid_values[k];
-
-      // Get right hand side values either from neighbouring triangle
-      // or from boundary array (Quantities at neighbour on nearest face).
-      n = D->neighbours[ki];
-      hc_n = hc; // Default: reuse this triangle's centroid height (kept when n < 0)
-      zc_n = D->bed_centroid_values[k]; // Default: reuse this triangle's centroid bed (kept when n < 0)
-      if (n < 0)
-      {
-        // Neighbour is a boundary condition
-        m = -n - 1; // Convert negative flag to boundary index
-
-        qr[0] = D->stage_boundary_values[m];
-        qr[1] = D->xmom_boundary_values[m];
-        qr[2] = D->ymom_boundary_values[m];
-        zr = zl;                   // Extend bed elevation to boundary
-        hre = fmax(qr[0] - zr, 0.0); // Depth implied by the boundary stage, clipped at zero (alternative was hle)
-      }
-      else
-      {
-        // Neighbour is a real triangle
-        hc_n = D->height_centroid_values[n];
-        zc_n = D->bed_centroid_values[n];
-
-        m = D->neighbour_edges[ki];
-        nm = n * 3 + m; // Linear index (triangle n, edge m)
-
-        qr[0] = D->stage_edge_values[nm];
-        qr[1] = D->xmom_edge_values[nm];
-        qr[2] = D->ymom_edge_values[nm];
-        zr = D->bed_edge_values[nm];
-        hre = D->height_edge_values[nm];
-      }
-
-      // Audusse magic for well balancing: use the higher of the two edge beds as the interface bed
-      z_half = fmax(zl, zr);
-
-      // Account for riverwalls
-      if (D->edge_flux_type[ki] == 1)
-      {
-        RiverWall_count = D->edge_river_wall_counter[ki]; // Counter is used below as index + 1 into the riverwall arrays
-
-        // Set central bed to riverwall elevation
-        z_half = fmax(D->riverwall_elevation[RiverWall_count - 1], z_half);
-      }
-
-      // Define h left/right for Audusse flux method
-      h_left = fmax(hle + zl - z_half, 0.);
-      h_right = fmax(hre + zr - z_half, 0.);
-
-      normal_x = D->normals[ki2];
-      normal_y = D->normals[ki2 + 1];
-
-      // Edge flux computation (triangle k, edge i)
-      __flux_function_central(ql, qr,
-                              h_left, h_right,
-                              hle, hre,
-                              normal_x, normal_y,
-                              epsilon, z_half, limiting_threshold, g,
-                              edgeflux, &max_speed_local, &pressure_flux,
-                              hc, hc_n, low_froude);
-
-      // Force weir discharge to match weir theory
-      if (D->edge_flux_type[ki] == 1)
-      {
-
-        RiverWall_count = D->edge_river_wall_counter[ki];
-
-        // printf("RiverWall_count %ld\n", RiverWall_count);
-
-        ii = D->riverwall_rowIndex[RiverWall_count - 1] * ncol_riverwall_hydraulic_properties; // Offset of this riverwall's row in the flattened properties table
-
-        // Get Qfactor index - multiply the idealised weir discharge by this constant factor
-        // Get s1, submergence ratio at which we start blending with the shallow water solution
-        // Get s2, submergence ratio at which we entirely use the shallow water solution
-        // Get h1, tailwater head / weir height at which we start blending with the shallow water solution
-        // Get h2, tailwater head / weir height at which we entirely use the shallow water solution
-        Qfactor = D->riverwall_hydraulic_properties[ii];
-        s1 = D->riverwall_hydraulic_properties[ii + 1];
-        s2 = D->riverwall_hydraulic_properties[ii + 2];
-        h1 = D->riverwall_hydraulic_properties[ii + 3];
-        h2 = D->riverwall_hydraulic_properties[ii + 4];
-
-        weir_height = fmax(D->riverwall_elevation[RiverWall_count - 1] - fmin(zl, zr), 0.); // Reference weir height
-
-        // Use first-order h's for weir -- as the 'upstream/downstream' heads are
-        //  measured away from the weir itself
-        h_left_tmp = fmax(D->stage_centroid_values[k] - z_half, 0.);
-
-        if (n >= 0)
-        {
-          h_right_tmp = fmax(D->stage_centroid_values[n] - z_half, 0.);
-        }
-        else
-        {
-          h_right_tmp = fmax(hc_n + zr - z_half, 0.); // Boundary edge: hc_n still equals hc and zr equals zl here
-        }
-
-        // If the weir is not higher than both neighbouring cells, then
-        // do not try to match the weir equation. If we do, it seems we
-        // can get mass conservation issues (caused by large weir
-        // fluxes in such situations)
-        if (D->riverwall_elevation[RiverWall_count - 1] > fmax(zc, zc_n))
-        {
-          // Weir flux adjustment
-          __adjust_edgeflux_with_weir(edgeflux, h_left_tmp, h_right_tmp, g,
-                                             weir_height, Qfactor,
-                                             s1, s2, h1, h2, &max_speed_local);
-        }
-      }
-
-      // Multiply edgeflux by edgelength and flip its sign
-      length = D->edgelengths[ki];
-      edgeflux[0] = -edgeflux[0]*length;
-      edgeflux[1] = -edgeflux[1]*length;
-      edgeflux[2] = -edgeflux[2]*length;
-
-      // pressuregrad_work contains all gravity related terms
-      pressuregrad_work = length * (-g * 0.5 * (h_left * h_left - hle * hle - (hle + hc) * (zl - zc)) + pressure_flux);
-
-      // Update timestep based on edge i and possibly neighbour n
-      // NOTE: We should only change the timestep on the 'first substep'
-      // of the timestepping method [substep_count==0]
-      if (substep_count == 0)
-      {
-
-        // Compute the 'edge-timesteps' (useful for setting flux_update_frequency)
-        edge_timestep = D->radii[k] *1.0 / fmax(max_speed_local, epsilon);
-
-        // Update the timestep
-        if ((D->tri_full_flag[k] == 1)) // Only triangles with tri_full_flag == 1 constrain the timestep
-        {
-          if (max_speed_local > epsilon)
-          {
-            // Apply CFL condition for triangles joining this edge (triangle k and triangle n)
-
-            // CFL for triangle k
-            local_timestep = fmin(local_timestep, edge_timestep);
-
-            speed_max_last = fmax(speed_max_last, max_speed_local);
-          }
-        }
-      }
-
-
-      D->stage_explicit_update[k] += edgeflux[0];
-      D->xmom_explicit_update[k]  += edgeflux[1];
-      D->ymom_explicit_update[k]  += edgeflux[2];
-
-      // If this cell is not a ghost, and the neighbour is a
-      // boundary condition OR a ghost cell, then add the flux to the
-      // boundary_flux_integral
-      if (((n < 0) & (D->tri_full_flag[k] == 1)) | ((n >= 0) && ((D->tri_full_flag[k] == 1) & (D->tri_full_flag[n] == 0))))
-      {
-        // boundary_flux_sum is an array with length = timestep_fluxcalls
-        // For each sub-step, we put the boundary flux sum in.
-        boundary_flux_sum_substep += edgeflux[0];
-      }
-
-      D->xmom_explicit_update[k] -= D->normals[ki2] * pressuregrad_work;
-      D->ymom_explicit_update[k] -= D->normals[ki2 + 1] * pressuregrad_work;
-
-    } // End edge i (and neighbour n)
-
-    // Keep track of maximal speeds
-    if (substep_count == 0)
-      D->max_speed[k] = speed_max_last; // Stays 0.0 unless a full triangle had an edge speed above epsilon
-
-    // Normalise triangle k by area and store for when all conserved
-    // quantities get updated
-    inv_area = 1.0 / D->areas[k];
-    D->stage_explicit_update[k] *= inv_area;
-    D->xmom_explicit_update[k] *= inv_area;
-    D->ymom_explicit_update[k] *= inv_area;  
-
-  } // End triangle k
-
-
-
-  // Store the reduced boundary flux for this substep (overwrites the previous entry)
-  D->boundary_flux_sum[substep_count] = boundary_flux_sum_substep;  
-
-  // Ensure we only update the timestep on the first call within each rk2/rk3 step
-  if (substep_count == 0)
-    timestep = local_timestep;
-
-  return timestep; // local_timestep on substep 0, otherwise the value passed in
-}
-
-// Computational function for flux computation, split into separate passes: zero the updates, number the riverwall edges, compute per-edge fluxes into work arrays, then sum them per triangle
-// with riverWall_count pulled out of triangle loop (stored per edge in D->edge_river_wall_counter)
-double _compute_fluxes_central_parallel_data_flow(struct domain *D, double timestep) // Returns local_timestep on substep 0, otherwise the timestep passed in
-{
-
-  // Local variables
-  double max_speed_local, length, inv_area, zl, zr;
-  double h_left, h_right, z_half; // For Audusse scheme
-  // FIXME: limiting_threshold is not used for DE1
-  double limiting_threshold = 10 * D->H0;
-  int64_t low_froude = D->low_froude;
-  //
-  int64_t k, i, m, n, ii;
-  int64_t ki, nm = 0, ki2, ki3; // Index shorthands
-  // Workspace (making them static actually made function slightly slower (Ole))
-  double ql[3], qr[3], edgeflux[3]; // Work array for summing up fluxes
-  double bedslope_work;
-  static double local_timestep; // Static: keeps its value between calls unless reset below
-  int64_t RiverWall_count, substep_count;
-  double hle, hre, zc, zc_n, Qfactor, s1, s2, h1, h2;
-  double pressure_flux, hc, hc_n, tmp;
-  double h_left_tmp, h_right_tmp;
-  static int64_t call = 0; // Number of calls made to this function so far; static, so it persists across calls
-  static int64_t timestep_fluxcalls = 1; // Last value of D->timestep_fluxcalls seen by this function
-  static int64_t base_call = 1; // Value of 'call' when D->timestep_fluxcalls last changed
-  double speed_max_last, weir_height;
-
-  call++; // Flag 'id' of flux calculation for this timestep
-
-  if (D->timestep_fluxcalls != timestep_fluxcalls) // Number of flux calls per timestep changed: restart the substep numbering from this call
-  {
-    timestep_fluxcalls = D->timestep_fluxcalls;
-    base_call = call;
-  }
-
-  // Set explicit_update to zero for all conserved_quantities.
-  // This assumes compute_fluxes called before forcing terms
-
-  // #pragma omp parallel for private(k)
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    D->stage_explicit_update[k] = 0.0;
-    D->xmom_explicit_update[k] = 0.0;
-    D->ymom_explicit_update[k] = 0.0;
-  }
-  // memset((char*) D->stage_explicit_update, 0, D->number_of_elements * sizeof (double));
-  // memset((char*) D->xmom_explicit_update, 0, D->number_of_elements * sizeof (double));
-  // memset((char*) D->ymom_explicit_update, 0, D->number_of_elements * sizeof (double));
-
-  // Counter for riverwall edges
-  RiverWall_count = 0;
-  // Which substep of the timestepping method are we on?
-  substep_count = (call - base_call) % D->timestep_fluxcalls;
-
-  // printf("call = %d substep_count = %d base_call = %d \n",call,substep_count, base_call);
-
-  // Fluxes are not updated every timestep,
-  // but all fluxes ARE updated when the following condition holds
-  if (D->allow_timestep_increase[0] == 1)
-  {
-    // We can only increase the timestep if all fluxes are allowed to be updated
-    // If this is not done the timestep can't increase (since local_timestep is static)
-    local_timestep = 1.0e+100;
-  }
-
-  // For all triangles
-  // Pull the edge_river_wall count outside parallel loop as it needs to be done sequentially
-  // move it to the initialisation of the riverwall so it is only calculated once
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    for (i = 0; i < 3; i++)
-    {
-      ki = 3 * k + i;
-      D->edge_river_wall_counter[ki] = 0; // 0 for edges that are not riverwall edges
-      if (D->edge_flux_type[ki] == 1)
-      {
-        // Update counter of riverwall edges
-        RiverWall_count += 1;
-        D->edge_river_wall_counter[ki] = RiverWall_count; // 1-based running number in edge order
-
-        // printf("RiverWall_count %d   edge_counter %d \n", RiverWall_count, D->edge_river_wall_counter[ki]);
-      }
-    }
-  }
-
-  RiverWall_count = 0;
-
-  // For all triangles
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    speed_max_last = 0.0;
-
-    // Loop through neighbours and compute edge flux for each
-    for (i = 0; i < 3; i++)
-    {
-      ki = 3 * k + i; // Linear index to edge i of triangle k
-      ki2 = 2 * ki;   // k*6 + i*2
-      ki3 = 3 * ki;   // Start of this edge's three entries in D->edge_flux_work
-
-      // Get left hand side values from triangle k, edge i
-      ql[0] = D->stage_edge_values[ki];
-      ql[1] = D->xmom_edge_values[ki];
-      ql[2] = D->ymom_edge_values[ki];
-      zl = D->bed_edge_values[ki];
-      hc = D->height_centroid_values[k];
-      zc = D->bed_centroid_values[k];
-      hle = D->height_edge_values[ki];
-
-      // Get right hand side values either from neighbouring triangle
-      // or from boundary array (Quantities at neighbour on nearest face).
-      n = D->neighbours[ki];
-      hc_n = hc; // Default: reuse this triangle's centroid height (kept when n < 0)
-      zc_n = D->bed_centroid_values[k]; // Default: reuse this triangle's centroid bed (kept when n < 0)
-      if (n < 0)
-      {
-        // Neighbour is a boundary condition
-        m = -n - 1; // Convert negative flag to boundary index
-
-        qr[0] = D->stage_boundary_values[m];
-        qr[1] = D->xmom_boundary_values[m];
-        qr[2] = D->ymom_boundary_values[m];
-        zr = zl;                    // Extend bed elevation to boundary
-        hre = fmax(qr[0] - zr, 0.); // Depth implied by the boundary stage, clipped at zero (alternative was hle)
-      }
-      else
-      {
-        // Neighbour is a real triangle
-        hc_n = D->height_centroid_values[n];
-        zc_n = D->bed_centroid_values[n];
-        m = D->neighbour_edges[ki];
-        nm = n * 3 + m; // Linear index (triangle n, edge m)
-
-        qr[0] = D->stage_edge_values[nm];
-        qr[1] = D->xmom_edge_values[nm];
-        qr[2] = D->ymom_edge_values[nm];
-        zr = D->bed_edge_values[nm];
-        hre = D->height_edge_values[nm];
-      }
-
-      // Audusse magic: use the higher of the two edge beds as the interface bed
-      z_half = fmax(zl, zr);
-
-      //// Account for riverwalls
-      if (D->edge_flux_type[ki] == 1)
-      {
-        if (n >= 0 && D->edge_flux_type[nm] != 1) // The matching edge of the neighbour is not flagged as a riverwall: report it, then carry on
-        {
-          printf("Riverwall Error\n");
-        }
-        // Update counter of riverwall edges == index of
-        // riverwall_elevation + riverwall_rowIndex
-
-        // RiverWall_count += 1;
-        RiverWall_count = D->edge_river_wall_counter[ki];
-
-        // Set central bed to riverwall elevation
-        z_half = fmax(D->riverwall_elevation[RiverWall_count - 1], z_half);
-      }
-
-      // Define h left/right for Audusse flux method
-      h_left = fmax(hle + zl - z_half, 0.);
-      h_right = fmax(hre + zr - z_half, 0.);
-
-      // Edge flux computation (triangle k, edge i)
-      __flux_function_central(ql, qr,
-                              h_left, h_right,
-                              hle, hre,
-                              D->normals[ki2], D->normals[ki2 + 1],
-                              D->epsilon, z_half, limiting_threshold, D->g,
-                              edgeflux, &max_speed_local, &pressure_flux, hc, hc_n, low_froude);
-
-      // Force weir discharge to match weir theory
-      if (D->edge_flux_type[ki] == 1)
-      {
-        ii = D->riverwall_rowIndex[RiverWall_count - 1] * D->ncol_riverwall_hydraulic_properties; // Offset of this riverwall's row in the flattened properties table
-
-        // Get Qfactor index - multiply the idealised weir discharge by this constant factor
-        // Get s1, submergence ratio at which we start blending with the shallow water solution
-        // Get s2, submergence ratio at which we entirely use the shallow water solution
-        // Get h1, tailwater head / weir height at which we start blending with the shallow water solution
-        // Get h2, tailwater head / weir height at which we entirely use the shallow water solution
-        Qfactor = D->riverwall_hydraulic_properties[ii];
-        s1 = D->riverwall_hydraulic_properties[ii + 1];
-        s2 = D->riverwall_hydraulic_properties[ii + 2];
-        h1 = D->riverwall_hydraulic_properties[ii + 3];
-        h2 = D->riverwall_hydraulic_properties[ii + 4];
-
-        weir_height = fmax(D->riverwall_elevation[RiverWall_count - 1] - fmin(zl, zr), 0.); // Reference weir height
-
-        // Use first-order h's for weir -- as the 'upstream/downstream' heads are
-        //  measured away from the weir itself
-        h_left_tmp = fmax(D->stage_centroid_values[k] - z_half, 0.);
-
-        if (n >= 0)
-        {
-          h_right_tmp = fmax(D->stage_centroid_values[n] - z_half, 0.);
-        }
-        else
-        {
-          h_right_tmp = fmax(hc_n + zr - z_half, 0.); // Boundary edge: hc_n still equals hc and zr equals zl here
-        }
-
-        // If the weir is not higher than both neighbouring cells, then
-        // do not try to match the weir equation. If we do, it seems we
-        // can get mass conservation issues (caused by large weir
-        // fluxes in such situations)
-        if (D->riverwall_elevation[RiverWall_count - 1] > fmax(zc, zc_n))
-        {
-          // Weir flux adjustment
-          __adjust_edgeflux_with_weir(edgeflux, h_left_tmp, h_right_tmp, D->g,
-                                      weir_height, Qfactor,
-                                      s1, s2, h1, h2, &max_speed_local);
-        }
-      }
-
-      // Multiply edgeflux by edgelength
-      length = D->edgelengths[ki];
-      edgeflux[0] *= length;
-      edgeflux[1] *= length;
-      edgeflux[2] *= length;
-
-      D->edge_flux_work[ki3 + 0] = -edgeflux[0]; // Stored with the sign flipped; summed per triangle in the final loop
-      D->edge_flux_work[ki3 + 1] = -edgeflux[1];
-      D->edge_flux_work[ki3 + 2] = -edgeflux[2];
-
-      // bedslope_work contains all gravity related terms
-      bedslope_work = length * (-D->g * 0.5 * (h_left * h_left - hle * hle - (hle + hc) * (zl - zc)) + pressure_flux);
-
-      D->pressuregrad_work[ki] = bedslope_work;
-
-      // Update timestep based on edge i and possibly neighbour n
-      // NOTE: We should only change the timestep on the 'first substep'
-      //  of the timestepping method [substep_count==0]
-      if (substep_count == 0)
-      {
-
-        // Compute the 'edge-timesteps' (useful for setting flux_update_frequency)
-        tmp = 1.0 / fmax(max_speed_local, D->epsilon);
-        D->edge_timestep[ki] = D->radii[k] * tmp;
-
-        // Update the timestep
-        if ((D->tri_full_flag[k] == 1)) // Only triangles with tri_full_flag == 1 constrain the timestep
-        {
-
-          speed_max_last = fmax(speed_max_last, max_speed_local);
-
-          if (max_speed_local > D->epsilon)
-          {
-            // Apply CFL condition for triangles joining this edge (triangle k and triangle n)
-
-            // CFL for triangle k
-            local_timestep = fmin(local_timestep, D->edge_timestep[ki]);
-
-            // if (n >= 0) {
-            //     // Apply CFL condition for neighbour n (which is on the ith edge of triangle k)
-            //    local_timestep = fmin(local_timestep, D->edge_timestep[nm]);
-            // }
-          }
-        }
-      }
-
-    } // End edge i (and neighbour n)
-
-    // Keep track of maximal speeds
-    if (substep_count == 0)
-      D->max_speed[k] = speed_max_last; // Largest edge speed seen for this triangle (0.0 if it is not a full triangle)
-
-  } // End triangle k
-
-  // Now add up stage, xmom, ymom explicit updates
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    hc = fmax(D->stage_centroid_values[k] - D->bed_centroid_values[k], 0.); // NOTE(review): hc is not read again inside this loop
-
-    for (i = 0; i < 3; i++)
-    {
-      // FIXME: Make use of neighbours to efficiently set things
-      ki = 3 * k + i;
-      ki2 = ki * 2;
-      ki3 = ki * 3;
-      n = D->neighbours[ki];
-
-      D->stage_explicit_update[k] += D->edge_flux_work[ki3 + 0];
-      D->xmom_explicit_update[k] += D->edge_flux_work[ki3 + 1];
-      D->ymom_explicit_update[k] += D->edge_flux_work[ki3 + 2];
-
-      // If this cell is not a ghost, and the neighbour is a
-      // boundary condition OR a ghost cell, then add the flux to the
-      // boundary_flux_integral
-      if (((n < 0) & (D->tri_full_flag[k] == 1)) | ((n >= 0) && ((D->tri_full_flag[k] == 1) & (D->tri_full_flag[n] == 0))))
-      {
-        // boundary_flux_sum is an array with length = timestep_fluxcalls
-        // For each sub-step, we put the boundary flux sum in.
-        D->boundary_flux_sum[substep_count] += D->edge_flux_work[ki3]; // NOTE(review): accumulated with +=; this function never resets the entry, presumably the caller does - confirm
-      }
-
-      D->xmom_explicit_update[k] -= D->normals[ki2] * D->pressuregrad_work[ki];
-      D->ymom_explicit_update[k] -= D->normals[ki2 + 1] * D->pressuregrad_work[ki];
-
-    } // end edge i
-
-    // Normalise triangle k by area and store for when all conserved
-    // quantities get updated
-    inv_area = 1.0 / D->areas[k];
-    D->stage_explicit_update[k] *= inv_area;
-    D->xmom_explicit_update[k] *= inv_area;
-    D->ymom_explicit_update[k] *= inv_area;
-
-  } // end cell k
-
-  // Ensure we only update the timestep on the first call within each rk2/rk3 step
-  if (substep_count == 0)
-    timestep = local_timestep;
-
-  return timestep;
-}
-
-
-
-
-
-// Protect against the water elevation falling below the triangle bed.
-//
-// For every triangle whose centroid depth (stage - bed) is below
-// D->minimum_allowed_height, both momentum components are set to zero.
-// If in addition the centroid stage lies below the centroid bed, the stage
-// (centroid value and the three vertex values) is raised to the bed level.
-//
-// Returns the volume added by raising stages, i.e. the sum of
-// (bed - stage) * area over all triangles that were corrected.
-double _openacc_protect(struct domain *D)
-{
-
-  int64_t k, k3, K;
-  double hc, bmin;
-  double mass_error = 0.;
-  double minimum_allowed_height;
-
-  minimum_allowed_height = D->minimum_allowed_height;
-
-  K = D->number_of_elements;
-
-  // Protect against infinitesimal and negative heights
-// #pragma omp parallel for private(k, k3, hc, bmin ) schedule(static) reduction(+ : mass_error) firstprivate (minimum_allowed_height)
-  for (k = 0; k < K; k++)
-  {
-    k3 = 3 * k; // Index of the first of the three vertex values of triangle k
-    hc = D->stage_centroid_values[k] - D->bed_centroid_values[k];
-    if (hc < minimum_allowed_height)
-    {
-      // Set momentum to zero and ensure h is non negative.
-      // Both components must be cleared: the previous code zeroed xmom
-      // twice and left ymom untouched.
-      D->xmom_centroid_values[k] = 0.;
-      D->ymom_centroid_values[k] = 0.;
-      if (hc <= 0.0)
-      {
-        bmin = D->bed_centroid_values[k];
-        // Minimum allowed stage = bmin
-
-        // WARNING: ADDING MASS if the centroid stage is below bmin
-        if (D->stage_centroid_values[k] < bmin)
-        {
-          mass_error += (bmin - D->stage_centroid_values[k]) * D->areas[k];
-
-          D->stage_centroid_values[k] = bmin;
-
-          // FIXME: Set vertex values as well. Seems that this shouldn't be
-          // needed. However, from memory this is important at the first
-          // time step, for 'dry' areas where the designated stage is
-          // less than the bed centroid value
-          D->stage_vertex_values[k3] = bmin;
-          D->stage_vertex_values[k3 + 1] = bmin;
-          D->stage_vertex_values[k3 + 2] = bmin;
-        }
-      }
-    }
-  }
-
-  return mass_error;
-}
-
-
-static inline int64_t __find_qmin_and_qmax(double dq0, double dq1, double dq2,
-                         double *qmin, double *qmax)
-{
-  // Inputs describe an FV triangle centroid and the three vertices of its
-  // auxiliary triangle:
-  //   dq0 = q(vertex0) - q(centroid of FV triangle)
-  //   dq1 = q(vertex1) - q(vertex0)
-  //   dq2 = q(vertex2) - q(vertex0)
-  //
-  // Outputs are the extreme jumps relative to the centroid value qc, taken
-  // over the centroid itself and the three auxiliary vertices:
-  //   *qmax = max(q) - qc   (never negative)
-  //   *qmin = min(q) - qc   (never positive)
-  //
-  // Always returns 0.
-
-  // Jumps from the centroid to each auxiliary vertex
-  const double jump_v0 = dq0;
-  const double jump_v1 = dq0 + dq1;
-  const double jump_v2 = dq0 + dq2;
-
-  const double largest_jump = fmax(jump_v0, fmax(jump_v1, jump_v2));
-  const double smallest_jump = fmin(jump_v0, fmin(jump_v1, jump_v2));
-
-  // The centroid itself contributes a jump of zero
-  *qmax = fmax(largest_jump, 0.0);
-  *qmin = fmin(smallest_jump, 0.0);
-
-  return 0;
-}
-
-static inline int64_t __limit_gradient(double *dqv, double qmin, double qmax, double beta_w)
-{
-  // dqv holds three provisional jumps from the FV triangle centroid to its
-  // vertices/edges. qmin (qmax) is the jump from the centroid to the minimum
-  // (maximum) of the values at the auxiliary triangle vertices. All three
-  // entries of dqv are scaled in place by one factor phi <= 1.
-  // Always returns 0.
-
-  // Jumps with magnitude <= tiny do not produce a ratio of their own.
-  // FIXME: Perhaps use the epsilon used elsewhere.
-  const double tiny = 1.0e-100;
-
-  // 'ratio' is deliberately not reset inside the loop: for a tiny jump the
-  // value from the previous entry (initially 1.0) is used again.
-  double ratio = 1.0;
-  double smallest_ratio = 1000.0;
-  double phi;
-  int64_t j;
-
-  for (j = 0; j < 3; j++)
-  {
-    const double jump = dqv[j];
-
-    if (jump < -tiny)
-    {
-      ratio = qmin / jump;
-    }
-    else if (jump > tiny)
-    {
-      ratio = qmax / jump;
-    }
-
-    smallest_ratio = fmin(ratio, smallest_ratio);
-  }
-
-  // Limiting factor, capped at 1 so jumps are never amplified
-  phi = fmin(smallest_ratio * beta_w, 1.0);
-
-  for (j = 0; j < 3; j++)
-  {
-    dqv[j] = dqv[j] * phi;
-  }
-
-  return 0;
-}
-
-static inline void __calc_edge_values(double beta_tmp, double cv_k, double cv_k0, double cv_k1, double cv_k2,
-                        double dxv0, double dxv1, double dxv2, double dyv0, double dyv1, double dyv2,
-                        double dx1, double dx2, double dy1, double dy2, double inv_area2,
-                        double *edge_values)
-{
-  // Writes three limited edge values for triangle k into edge_values[0..2].
-  //   cv_k              centroid value of triangle k
-  //   cv_k0/1/2         values at the auxiliary triangle vertices
-  //                     (centroids of neighbouring triangles)
-  //   dxv*, dyv*        offsets multiplied with the gradient to get the jumps
-  //   dx1, dx2, dy1, dy2, inv_area2
-  //                     geometry terms used to form the gradient
-  //   beta_tmp          limiter parameter; when it is not > 0 the centroid
-  //                     value is copied to all three edges
-  double jumps[3];
-  double diff0, diff1, diff2;
-  double grad_x, grad_y;
-  double jump_lo, jump_hi;
-  int j;
-
-  if (!(beta_tmp > 0.))
-  {
-    // Fast alternative when beta_tmp==0
-    for (j = 0; j < 3; j++)
-    {
-      edge_values[j] = cv_k;
-    }
-    return;
-  }
-
-  // Difference between vertex 0 of the auxiliary triangle and the centroid
-  // of triangle k
-  diff0 = cv_k0 - cv_k;
-
-  // Differences between the vertices of the auxiliary triangle
-  diff1 = cv_k1 - cv_k0;
-  diff2 = cv_k2 - cv_k0;
-
-  // Gradient on the auxiliary triangle
-  grad_x = dy2 * diff1 - dy1 * diff2;
-  grad_x = grad_x * inv_area2;
-  grad_y = dx1 * diff2 - dx2 * diff1;
-  grad_y = grad_y * inv_area2;
-
-  // Provisional jumps from the centroid of triangle k, to be limited
-  jumps[0] = grad_x * dxv0 + grad_y * dyv0;
-  jumps[1] = grad_x * dxv1 + grad_y * dyv1;
-  jumps[2] = grad_x * dxv2 + grad_y * dyv2;
-
-  // Extreme jumps from the centroid to the auxiliary vertices
-  __find_qmin_and_qmax(diff0, diff1, diff2, &jump_lo, &jump_hi);
-
-  // Limit the gradient
-  __limit_gradient(jumps, jump_lo, jump_hi, beta_tmp);
-
-  for (j = 0; j < 3; j++)
-  {
-    edge_values[j] = cv_k + jumps[j];
-  }
-}
-
-static inline void __calc_edge_values_2_bdy(double beta, double cv_k, double cv_k0, 
-                        double dxv0, double dxv1, double dxv2, double dyv0, double dyv1, double dyv2,
-                        double dx1, double dx2, double dy1, double dy2, double inv_area2,
-                        double *edge_values)
-{
-  // Writes three limited edge values for triangle k into edge_values[0..2],
-  // using only the centroid value cv_k and one other value cv_k0.
-  //   dx2, dy2          multiplied with (cv_k0 - cv_k) to form the gradient
-  //   dxv*, dyv*        offsets multiplied with the gradient to get the jumps
-  //   beta              limiter parameter passed to __limit_gradient
-  //   dx1, dy1, inv_area2 are accepted but not used by this function
-
-  // Difference between the other value and the centroid of triangle k
-  const double diff = cv_k0 - cv_k;
-
-  // Gradient between the centroid of triangle k and that of its neighbour
-  const double grad_x = diff * dx2;
-  const double grad_y = diff * dy2;
-
-  // Bounds for the limiter: zero on one side, the difference on the other
-  const double jump_lo = (diff >= 0.0) ? 0.0 : diff;
-  const double jump_hi = (diff >= 0.0) ? diff : 0.0;
-
-  // Provisional edge jumps, to be limited
-  double jumps[3];
-  int j;
-
-  jumps[0] = grad_x * dxv0 + grad_y * dyv0;
-  jumps[1] = grad_x * dxv1 + grad_y * dyv1;
-  jumps[2] = grad_x * dxv2 + grad_y * dyv2;
-
-  // Limit the gradient
-  __limit_gradient(jumps, jump_lo, jump_hi, beta);
-
-  for (j = 0; j < 3; j++)
-  {
-    edge_values[j] = cv_k + jumps[j];
-  }
-
-}
-
-
-
-
-
-// Computational routine
-int64_t _openacc_extrapolate_second_order_edge_sw(struct domain *D)
-{
-
-  // Local variables
-  double a, b; // Gradient vector used to calculate edge values from centroids
-  int64_t k, k0, k1, k2, k3, k6, coord_index, i;
-  double x, y, x0, y0, x1, y1, x2, y2, xv0, yv0, xv1, yv1, xv2, yv2; // Vertices of the auxiliary triangle
-  double dx1, dx2, dy1, dy2, dxv0, dxv1, dxv2, dyv0, dyv1, dyv2, dq1, area2, inv_area2;
-  double dqv[3], qmin, qmax, hmin, hmax;
-  double hc, h0, h1, h2, beta_tmp, hfactor;
-  double dk, dk_inv, a_tmp, b_tmp, c_tmp, d_tmp;
-  double edge_values[3];
-  double cv_k, cv_k0, cv_k1, cv_k2;
-
-  double x_centroid_work;
-  double xmom_centroid_values;
-  double y_centroid_work;
-  double ymom_centroid_values;
-
-  double minimum_allowed_height = D->minimum_allowed_height;
-  int64_t number_of_elements = D->number_of_elements;
-  int64_t extrapolate_velocity_second_order = D->extrapolate_velocity_second_order;
-
-
-  // Parameters used to control how the limiter is forced to first-order near
-  // wet-dry regions
-  a_tmp = 0.3; // Highest depth ratio with hfactor=1
-  b_tmp = 0.1; // Highest depth ratio with hfactor=0
-  c_tmp = 1.0 / (a_tmp - b_tmp);
-  d_tmp = 1.0 - (c_tmp * a_tmp);
-
-  // Replace momentum centroid with velocity centroid to allow velocity
-  // extrapolation This will be changed back at the end of the routine
-
-  // Need to calculate height xmom and ymom centroid values for all triangles 
-  // before extrapolation and limiting
-
-// #pragma omp parallel for simd shared(D) default(none) private(dk, dk_inv) firstprivate(number_of_elements, minimum_allowed_height, extrapolate_velocity_second_order)
-    for (k = 0; k < number_of_elements; k++)
-    {
-    dk = fmax(D->stage_centroid_values[k] - D->bed_centroid_values[k], 0.0);
-
-    D->height_centroid_values[k] = dk;
-    D->x_centroid_work[k] = 0.0;
-    D->y_centroid_work[k] = 0.0;
-
-    if (dk <= minimum_allowed_height)
-      {
-        D->x_centroid_work[k] = 0.0;
-        D->xmom_centroid_values[k] = 0.0;
-        D->y_centroid_work[k] = 0.0;
-        D->ymom_centroid_values[k] = 0.0;
-      }
-
-    if (extrapolate_velocity_second_order == 1)
-    {
-      if (dk > minimum_allowed_height)
-      {
-        dk_inv = 1.0 / dk;
-        D->x_centroid_work[k] = D->xmom_centroid_values[k];
-        D->xmom_centroid_values[k] = D->xmom_centroid_values[k] * dk_inv;
-
-        D->y_centroid_work[k] = D->ymom_centroid_values[k];
-        D->ymom_centroid_values[k] = D->ymom_centroid_values[k] * dk_inv;
-      }
-    }
-    } // end of for
-
-
-
-  // Begin extrapolation routine
-
-// #pragma omp parallel for simd private(k0, k1, k2, k3, k6, coord_index, i, \
-                          dx1, dx2, dy1, dy2, dxv0, dxv1, dxv2, dyv0, dyv1, dyv2, \
-                          x_centroid_work, xmom_centroid_values, y_centroid_work, ymom_centroid_values, \
-                          dq1, area2, inv_area2, \
-                          cv_k, cv_k0, cv_k1, cv_k2, edge_values, \
-                          x, y, x0, y0, x1, y1, x2, y2, xv0, yv0, xv1, yv1, xv2, yv2, \
-                          dqv, qmin, qmax, hmin, hmax, \
-                          hc, h0, h1, h2, beta_tmp, hfactor, \
-                          dk, dk_inv, a, b) default(none) shared(D) \
-                          firstprivate(number_of_elements, minimum_allowed_height, extrapolate_velocity_second_order, c_tmp, d_tmp)
-  for (k = 0; k < number_of_elements; k++)
-  {
-
-    //printf("%ld, %e \n",k, D->height_centroid_values[k]);
-    //printf("%ld,  %e, %e, %e, %e \n",k, x_centroid_work,xmom_centroid_values,y_centroid_work,ymom_centroid_values);
-    //printf("%ld,  %e, %e, %e, %e \n",k, D->x_centroid_work[k],D->xmom_centroid_values[k],D->y_centroid_work[k],D->ymom_centroid_values[k]);
-
-
-    // Useful indices
-    k2 = k * 2;
-    k3 = k * 3;
-    k6 = k * 6;
-
-    // Get the edge coordinates
-    xv0 = D->edge_coordinates[k6 + 0];
-    yv0 = D->edge_coordinates[k6 + 1];
-    xv1 = D->edge_coordinates[k6 + 2];
-    yv1 = D->edge_coordinates[k6 + 3];
-    xv2 = D->edge_coordinates[k6 + 4];
-    yv2 = D->edge_coordinates[k6 + 5];
-
-    // Get the centroid coordinates
-    x = D->centroid_coordinates[k2 + 0];
-    y = D->centroid_coordinates[k2 + 1];
-
-    // Store x- and y- differentials for the edges of
-    // triangle k relative to the centroid
-    dxv0 = xv0 - x;
-    dxv1 = xv1 - x;
-    dxv2 = xv2 - x;
-    dyv0 = yv0 - y;
-    dyv1 = yv1 - y;
-    dyv2 = yv2 - y;
-
-    // If no boundaries, auxiliary triangle is formed
-    // from the centroids of the three neighbours
-    // If one boundary, auxiliary triangle is formed
-    // from this centroid and its two neighbours
-
-    k0 = D->surrogate_neighbours[k3 + 0];
-    k1 = D->surrogate_neighbours[k3 + 1];
-    k2 = D->surrogate_neighbours[k3 + 2];
-
-    // Get the auxiliary triangle's vertex coordinates
-    // (normally the centroids of neighbouring triangles)
-    coord_index = 2 * k0;
-    x0 = D->centroid_coordinates[coord_index + 0];
-    y0 = D->centroid_coordinates[coord_index + 1];
-
-    coord_index = 2 * k1;
-    x1 = D->centroid_coordinates[coord_index + 0];
-    y1 = D->centroid_coordinates[coord_index + 1];
-
-    coord_index = 2 * k2;
-    x2 = D->centroid_coordinates[coord_index + 0];
-    y2 = D->centroid_coordinates[coord_index + 1];
-
-    // Store x- and y- differentials for the vertices
-    // of the auxiliary triangle
-    dx1 = x1 - x0;
-    dx2 = x2 - x0;
-    dy1 = y1 - y0;
-    dy2 = y2 - y0;
-
-    // Calculate 2*area of the auxiliary triangle
-    // The triangle is guaranteed to be counter-clockwise
-    area2 = dy2 * dx1 - dy1 * dx2;
-
-    if (((D->height_centroid_values[k0] < minimum_allowed_height) | (k0 == k)) &
-        ((D->height_centroid_values[k1] < minimum_allowed_height) | (k1 == k)) &
-        ((D->height_centroid_values[k2] < minimum_allowed_height) | (k2 == k)))
-    {
-      // printf("Surrounded by dry cells\n");
-      D->x_centroid_work[k] = 0.;
-      D->xmom_centroid_values[k] = 0.;
-      D->y_centroid_work[k] = 0.;
-      D->ymom_centroid_values[k] = 0.;
-    }
-
-    // Limit the edge values
-    if (D->number_of_boundaries[k] == 3)
-    {
-      // Very unlikely
-      // No neighbours, set gradient on the triangle to zero
-
-      //printf("%ld 3 boundaries\n",k);
-
-      D->stage_edge_values[k3 + 0] = D->stage_centroid_values[k];
-      D->stage_edge_values[k3 + 1] = D->stage_centroid_values[k];
-      D->stage_edge_values[k3 + 2] = D->stage_centroid_values[k];
-
-      D->xmom_edge_values[k3 + 0] = D->xmom_centroid_values[k];
-      D->xmom_edge_values[k3 + 1] = D->xmom_centroid_values[k];
-      D->xmom_edge_values[k3 + 2] = D->xmom_centroid_values[k];
-
-      D->ymom_edge_values[k3 + 0] = D->ymom_centroid_values[k];
-      D->ymom_edge_values[k3 + 1] = D->ymom_centroid_values[k];
-      D->ymom_edge_values[k3 + 2] = D->ymom_centroid_values[k];
-
-      dk = D->height_centroid_values[k];
-      D->height_edge_values[k3 + 0] = dk;
-      D->height_edge_values[k3 + 1] = dk;
-      D->height_edge_values[k3 + 2] = dk;
-
-    }
-    else if (D->number_of_boundaries[k] <= 1)
-    {
-      //==============================================
-      // Number of boundaries <= 1
-      // 'Typical case'
-      //==============================================
-      //printf("%ld boundaries <= 1\n",k);
-
-      // Calculate heights of neighbouring cells
-      hc = D->height_centroid_values[k];
-      h0 = D->height_centroid_values[k0];
-      h1 = D->height_centroid_values[k1];
-      h2 = D->height_centroid_values[k2];
-
-      hmin = fmin(fmin(h0, fmin(h1, h2)), hc);
-      hmax = fmax(fmax(h0, fmax(h1, h2)), hc);
-
-      // Look for strong changes in cell depth as an indicator of near-wet-dry
-      // Reduce hfactor linearly from 1-0 between depth ratio (hmin/hc) of [a_tmp , b_tmp]
-      // NOTE: If we have a more 'second order' treatment in near dry areas (e.g. with b_tmp being negative), then
-      //       the water tends to dry more rapidly (which is in agreement with analytical results),
-      //       but is also more 'artefacty' in important cases (tendency for high velocities, etc).
-      //
-      // So hfactor = depth_ratio*(c_tmp) + d_tmp, but is clipped between 0 and 1.
-      hfactor = fmax(0., fmin(c_tmp * fmax(hmin, 0.0) / fmax(hc, 1.0e-06) + d_tmp,
-                              fmin(c_tmp * fmax(hc, 0.) / fmax(hmax, 1.0e-06) + d_tmp, 1.0)));
-      // Set hfactor to zero smothly as hmin--> minimum_allowed_height. This
-      // avoids some 'chatter' for very shallow flows
-      hfactor = fmin(1.2 * fmax(hmin - D->minimum_allowed_height, 0.) / (fmax(hmin, 0.) + 1. * D->minimum_allowed_height), hfactor);
-
-      inv_area2 = 1.0 / area2;
-
-      //-----------------------------------
-      // stage
-      //-----------------------------------
-      beta_tmp = D->beta_w_dry + (D->beta_w - D->beta_w_dry) * hfactor;
-
-      cv_k  = D->stage_centroid_values[k];
-      cv_k0 = D->stage_centroid_values[k0];
-      cv_k1 = D->stage_centroid_values[k1];
-      cv_k2 = D->stage_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->stage_edge_values[k3 + 0] = edge_values[0];
-      D->stage_edge_values[k3 + 1] = edge_values[1];
-      D->stage_edge_values[k3 + 2] = edge_values[2];  
-
-      //-----------------------------------
-      // height
-      //-----------------------------------
-
-      cv_k  = D->height_centroid_values[k];
-      cv_k0 = D->height_centroid_values[k0];
-      cv_k1 = D->height_centroid_values[k1];
-      cv_k2 = D->height_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->height_edge_values[k3 + 0] = edge_values[0];
-      D->height_edge_values[k3 + 1] = edge_values[1];
-      D->height_edge_values[k3 + 2] = edge_values[2]; 
-
-    
-      //-----------------------------------
-      // xmomentum
-      //-----------------------------------
-
-      beta_tmp = D->beta_uh_dry + (D->beta_uh - D->beta_uh_dry) * hfactor;
-
-      cv_k  = D->xmom_centroid_values[k];
-      cv_k0 = D->xmom_centroid_values[k0];
-      cv_k1 = D->xmom_centroid_values[k1];
-      cv_k2 = D->xmom_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->xmom_edge_values[k3 + 0] = edge_values[0];
-      D->xmom_edge_values[k3 + 1] = edge_values[1];
-      D->xmom_edge_values[k3 + 2] = edge_values[2]; 
-
-      //-----------------------------------
-      // ymomentum
-      //-----------------------------------
-
-      beta_tmp = D->beta_vh_dry + (D->beta_vh - D->beta_vh_dry) * hfactor;
-
-      cv_k  = D->ymom_centroid_values[k];
-      cv_k0 = D->ymom_centroid_values[k0];
-      cv_k1 = D->ymom_centroid_values[k1];
-      cv_k2 = D->ymom_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->ymom_edge_values[k3 + 0] = edge_values[0];
-      D->ymom_edge_values[k3 + 1] = edge_values[1];
-      D->ymom_edge_values[k3 + 2] = edge_values[2]; 
-
-    } // End number_of_boundaries <=1
-    else
-    {
-      //printf("%ld 2 boundaries\n",k);
-      //==============================================
-      // Number of boundaries == 2
-      //==============================================
-
-      // One internal neighbour and gradient is in direction of the neighbour's centroid
-
-      // Find the only internal neighbour (k1?)
-      for (k2 = k3; k2 < k3 + 3; k2++)
-      {
-        // Find internal neighbour of triangle k
-        // k2 indexes the edges of triangle k
-
-        if (D->surrogate_neighbours[k2] != k)
-        {
-          break;
-        }
-      }
-
-      // if ((k2 == k3 + 3))
-      // {
-      //   // If we didn't find an internal neighbour
-      //   // report_python_error(AT, "Internal neighbour not found");
-      //   return -1;
-      // }
-
-      k1 = D->surrogate_neighbours[k2];
-
-      // The coordinates of the triangle are already (x,y).
-      // Get centroid of the neighbour (x1,y1)
-      coord_index = 2 * k1;
-      x1 = D->centroid_coordinates[coord_index + 0];
-      y1 = D->centroid_coordinates[coord_index + 1];
-
-      // Compute x- and y- distances between the centroid of
-      // triangle k and that of its neighbour
-      dx1 = x1 - x;
-      dy1 = y1 - y;
-
-      // Set area2 as the square of the distance
-      area2 = dx1 * dx1 + dy1 * dy1;
-
-      // Set dx2=(x1-x0)/((x1-x0)^2+(y1-y0)^2)
-      // and dy2=(y1-y0)/((x1-x0)^2+(y1-y0)^2) which
-      // respectively correspond to the x- and y- gradients
-      // of the conserved quantities
-      dx2 = 1.0 / area2;
-      dy2 = dx2 * dy1;
-      dx2 *= dx1;
-
-      //-----------------------------------
-      // stage
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->stage_centroid_values[k1] - D->stage_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
-
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      D->stage_edge_values[k3 + 0] = D->stage_centroid_values[k] + dqv[0];
-      D->stage_edge_values[k3 + 1] = D->stage_centroid_values[k] + dqv[1];
-      D->stage_edge_values[k3 + 2] = D->stage_centroid_values[k] + dqv[2];
-
-      //-----------------------------------
-      // height
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->height_centroid_values[k1] - D->height_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
-
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      D->height_edge_values[k3 + 0] = D->height_centroid_values[k] + dqv[0];
-      D->height_edge_values[k3 + 1] = D->height_centroid_values[k] + dqv[1];
-      D->height_edge_values[k3 + 2] = D->height_centroid_values[k] + dqv[2];
-
-      //-----------------------------------
-      // xmomentum
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->xmom_centroid_values[k1] - D->xmom_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
-
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      D->xmom_edge_values[k3 + 0] = D->xmom_centroid_values[k] + dqv[0];
-      D->xmom_edge_values[k3 + 1] = D->xmom_centroid_values[k] + dqv[1];
-      D->xmom_edge_values[k3 + 2] = D->xmom_centroid_values[k] + dqv[2];
-
-      //-----------------------------------
-      // ymomentum
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->ymom_centroid_values[k1] - D->ymom_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
-
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      D->ymom_edge_values[k3 + 0] = D->ymom_centroid_values[k] + dqv[0];
-      D->ymom_edge_values[k3 + 1] = D->ymom_centroid_values[k] + dqv[1];
-      D->ymom_edge_values[k3 + 2] = D->ymom_centroid_values[k] + dqv[2];
-
-    } // else [number_of_boundaries]
-
-  // printf("%ld, bed    %e, %e, %e\n",k, D->bed_edge_values[k3],D->bed_edge_values[k3 + 1],D->bed_edge_values[k3 + 2] );
-  // printf("%ld, stage  %e, %e, %e\n",k, D->stage_edge_values[k3],D->stage_edge_values[k3 + 1],D->stage_edge_values[k3 + 2] );
-  // printf("%ld, height %e, %e, %e\n",k, D->height_edge_values[k3],D->height_edge_values[k3 + 1],D->height_edge_values[k3 + 2] );
-  // printf("%ld, xmom   %e, %e, %e\n",k, D->xmom_edge_values[k3],D->xmom_edge_values[k3 + 1],D->xmom_edge_values[k3 + 2] );
-  // printf("%ld, ymom   %e, %e, %e\n",k, D->ymom_edge_values[k3],D->ymom_edge_values[k3 + 1],D->ymom_edge_values[k3 + 2] );
-
-    // If needed, convert from velocity to momenta
-    if (D->extrapolate_velocity_second_order == 1)
-    {
-      // Re-compute momenta at edges
-      for (i = 0; i < 3; i++)
-      {
-        dk = D->height_edge_values[k3 + i];
-        D->xmom_edge_values[k3 + i] = D->xmom_edge_values[k3 + i] * dk;
-        D->ymom_edge_values[k3 + i] = D->ymom_edge_values[k3 + i] * dk;
-      }
-    }
-
-    // Compute new bed elevation
-    D->bed_edge_values[k3 + 0] = D->stage_edge_values[k3 + 0] - D->height_edge_values[k3 + 0];
-    D->bed_edge_values[k3 + 1] = D->stage_edge_values[k3 + 1] - D->height_edge_values[k3 + 1];
-    D->bed_edge_values[k3 + 2] = D->stage_edge_values[k3 + 2] - D->height_edge_values[k3 + 2];
-
-
-    // FIXME SR: Do we need vertex values every inner timestep?
-
-    // Compute stage vertex values
-    D->stage_vertex_values[k3 + 0] = D->stage_edge_values[k3 + 1] + D->stage_edge_values[k3 + 2] - D->stage_edge_values[k3 + 0];
-    D->stage_vertex_values[k3 + 1] = D->stage_edge_values[k3 + 0] + D->stage_edge_values[k3 + 2] - D->stage_edge_values[k3 + 1];
-    D->stage_vertex_values[k3 + 2] = D->stage_edge_values[k3 + 0] + D->stage_edge_values[k3 + 1] - D->stage_edge_values[k3 + 2];
-
-    // Compute height vertex values
-    D->height_vertex_values[k3 + 0] = D->height_edge_values[k3 + 1] + D->height_edge_values[k3 + 2] - D->height_edge_values[k3 + 0];
-    D->height_vertex_values[k3 + 1] = D->height_edge_values[k3 + 0] + D->height_edge_values[k3 + 2] - D->height_edge_values[k3 + 1];
-    D->height_vertex_values[k3 + 2] = D->height_edge_values[k3 + 0] + D->height_edge_values[k3 + 1] - D->height_edge_values[k3 + 2];
-
-    // Compute momenta at vertices
-    D->xmom_vertex_values[k3 + 0] = D->xmom_edge_values[k3 + 1] + D->xmom_edge_values[k3 + 2] - D->xmom_edge_values[k3 + 0];
-    D->xmom_vertex_values[k3 + 1] = D->xmom_edge_values[k3 + 0] + D->xmom_edge_values[k3 + 2] - D->xmom_edge_values[k3 + 1];
-    D->xmom_vertex_values[k3 + 2] = D->xmom_edge_values[k3 + 0] + D->xmom_edge_values[k3 + 1] - D->xmom_edge_values[k3 + 2];
-
-    D->ymom_vertex_values[k3 + 0] = D->ymom_edge_values[k3 + 1] + D->ymom_edge_values[k3 + 2] - D->ymom_edge_values[k3 + 0];
-    D->ymom_vertex_values[k3 + 1] = D->ymom_edge_values[k3 + 0] + D->ymom_edge_values[k3 + 2] - D->ymom_edge_values[k3 + 1];
-    D->ymom_vertex_values[k3 + 2] = D->ymom_edge_values[k3 + 0] + D->ymom_edge_values[k3 + 1] - D->ymom_edge_values[k3 + 2];
-
-    
-    D->bed_vertex_values[k3 + 0] = D->bed_edge_values[k3 + 1] + D->bed_edge_values[k3 + 2] - D->bed_edge_values[k3 + 0];
-    D->bed_vertex_values[k3 + 1] = D->bed_edge_values[k3 + 0] + D->bed_edge_values[k3 + 2] - D->bed_edge_values[k3 + 1];
-    D->bed_vertex_values[k3 + 2] = D->bed_edge_values[k3 + 0] + D->bed_edge_values[k3 + 1] - D->bed_edge_values[k3 + 2];
-
-
-
-  }   // for k=0 to number_of_elements-1
-
-// Fix xmom and ymom centroid values
-// #pragma omp parallel for simd private(k3, i, dk) firstprivate(extrapolate_velocity_second_order)
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    if (extrapolate_velocity_second_order == 1)
-    {
-      // Convert velocity back to momenta at centroids
-      D->xmom_centroid_values[k] = D->x_centroid_work[k];
-      D->ymom_centroid_values[k] = D->y_centroid_work[k];
-    }
-
-    // // Compute stage vertex values
-    // D->stage_vertex_values[k3] = D->stage_edge_values[k3 + 1] + D->stage_edge_values[k3 + 2] - D->stage_edge_values[k3];
-    // D->stage_vertex_values[k3 + 1] = D->stage_edge_values[k3] + D->stage_edge_values[k3 + 2] - D->stage_edge_values[k3 + 1];
-    // D->stage_vertex_values[k3 + 2] = D->stage_edge_values[k3] + D->stage_edge_values[k3 + 1] - D->stage_edge_values[k3 + 2];
-
-    // // Compute height vertex values
-    // D->height_vertex_values[k3] = D->height_edge_values[k3 + 1] + D->height_edge_values[k3 + 2] - D->height_edge_values[k3];
-    // D->height_vertex_values[k3 + 1] = D->height_edge_values[k3] + D->height_edge_values[k3 + 2] - D->height_edge_values[k3 + 1];
-    // D->height_vertex_values[k3 + 2] = D->height_edge_values[k3] + D->height_edge_values[k3 + 1] - D->height_edge_values[k3 + 2];
-
-    // // If needed, convert from velocity to momenta
-    // if (D->extrapolate_velocity_second_order == 1)
-    // {
-    //   // Re-compute momenta at edges
-    //   for (i = 0; i < 3; i++)
-    //   {
-    //     dk = D->height_edge_values[k3 + i];
-    //     D->xmom_edge_values[k3 + i] = D->xmom_edge_values[k3 + i] * dk;
-    //     D->ymom_edge_values[k3 + i] = D->ymom_edge_values[k3 + i] * dk;
-    //   }
-    // }
-
-    // // Compute momenta at vertices
-    // D->xmom_vertex_values[k3 + 0] = D->xmom_edge_values[k3 + 1] + D->xmom_edge_values[k3 + 2] - D->xmom_edge_values[k3 + 0];
-    // D->xmom_vertex_values[k3 + 1] = D->xmom_edge_values[k3 + 0] + D->xmom_edge_values[k3 + 2] - D->xmom_edge_values[k3 + 1];
-    // D->xmom_vertex_values[k3 + 2] = D->xmom_edge_values[k3 + 0] + D->xmom_edge_values[k3 + 1] - D->xmom_edge_values[k3 + 2];
-
-    // D->ymom_vertex_values[k3 + 0] = D->ymom_edge_values[k3 + 1] + D->ymom_edge_values[k3 + 2] - D->ymom_edge_values[k3 + 0];
-    // D->ymom_vertex_values[k3 + 1] = D->ymom_edge_values[k3 + 0] + D->ymom_edge_values[k3 + 2] - D->ymom_edge_values[k3 + 1];
-    // D->ymom_vertex_values[k3 + 2] = D->ymom_edge_values[k3 + 0] + D->ymom_edge_values[k3 + 1] - D->ymom_edge_values[k3 + 2];
-
-    // // Compute new bed elevation
-    // D->bed_edge_values[k3 + 0] = D->stage_edge_values[k3 + 0] - D->height_edge_values[k3 + 0];
-    // D->bed_edge_values[k3 + 1] = D->stage_edge_values[k3 + 1] - D->height_edge_values[k3 + 1];
-    // D->bed_edge_values[k3 + 2] = D->stage_edge_values[k3 + 2] - D->height_edge_values[k3 + 2];
-
-    // D->bed_vertex_values[k3 + 0] = D->bed_edge_values[k3 + 1] + D->bed_edge_values[k3 + 2] - D->bed_edge_values[k3 + 0];
-    // D->bed_vertex_values[k3 + 1] = D->bed_edge_values[k3 + 0] + D->bed_edge_values[k3 + 2] - D->bed_edge_values[k3 + 1];
-    // D->bed_vertex_values[k3 + 2] = D->bed_edge_values[k3 + 0] + D->bed_edge_values[k3 + 1] - D->bed_edge_values[k3 + 2];
-
-  }
-
-  return 0;
-}
-
-
-// Computational function for flux computation
-int64_t _openacc_fix_negative_cells(struct domain *D)
-{
-  int64_t k;
-  int64_t tff;
-  int64_t num_negative_cells = 0;
-
-  // #pragma omp parallel for private(k, tff) reduction(+:num_negative_cells)
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    tff = D->tri_full_flag[k];
-    if ((D->stage_centroid_values[k] - D->bed_centroid_values[k] < 0.0) & (tff > 0)) 
-    {
-      num_negative_cells = num_negative_cells + 1;
-      D->stage_centroid_values[k] = D->bed_centroid_values[k];
-      D->xmom_centroid_values[k] = 0.0;
-      D->ymom_centroid_values[k] = 0.0;
-    }
-  }
-  return num_negative_cells;
-}
diff --git a/anuga/shallow_water/sw_domain_openacc_ext.pyx b/anuga/shallow_water/sw_domain_openacc_ext.pyx
deleted file mode 100644
index 3776769c5..000000000
--- a/anuga/shallow_water/sw_domain_openacc_ext.pyx
+++ /dev/null
@@ -1,416 +0,0 @@
-#cython: wraparound=False, boundscheck=True, cdivision=True, profile=False, nonecheck=False, overflowcheck=False, cdivision_warnings=False, unraisable_tracebacks=False
-
-import cython
-from libc.stdint cimport int64_t
-
-# import both numpy and the Cython declarations for numpy
-import numpy as np
-cimport numpy as np
-
-cdef extern from "sw_domain_openacc.c" nogil:
-	struct domain:
-		int64_t number_of_elements
-		int64_t boundary_length
-		int64_t number_of_riverwall_edges
-		double epsilon
-		double H0
-		double g
-		int64_t optimise_dry_cells
-		double evolve_max_timestep
-		int64_t extrapolate_velocity_second_order
-		double minimum_allowed_height
-		double maximum_allowed_speed
-		int64_t low_froude
-		int64_t timestep_fluxcalls
-		double beta_w
-		double beta_w_dry
-		double beta_uh
-		double beta_uh_dry
-		double beta_vh
-		double beta_vh_dry
-		int64_t max_flux_update_frequency
-		int64_t ncol_riverwall_hydraulic_properties
-		int64_t* neighbours
-		int64_t* neighbour_edges
-		int64_t* surrogate_neighbours
-		double* normals
-		double* edgelengths
-		double* radii
-		double* areas
-		int64_t* edge_flux_type
-		int64_t* tri_full_flag
-		int64_t* already_computed_flux
-		double* max_speed
-		double* vertex_coordinates
-		double* edge_coordinates
-		double* centroid_coordinates
-		int64_t* number_of_boundaries
-		double* stage_edge_values
-		double* xmom_edge_values
-		double* ymom_edge_values
-		double* bed_edge_values
-		double* height_edge_values
-		double* stage_centroid_values
-		double* xmom_centroid_values
-		double* ymom_centroid_values
-		double* bed_centroid_values
-		double* height_centroid_values
-		double* stage_vertex_values
-		double* xmom_vertex_values
-		double* ymom_vertex_values
-		double* bed_vertex_values
-		double* height_vertex_values
-		double* stage_boundary_values
-		double* xmom_boundary_values
-		double* ymom_boundary_values
-		double* bed_boundary_values
-		double* stage_explicit_update
-		double* xmom_explicit_update
-		double* ymom_explicit_update
-		int64_t* flux_update_frequency
-		int64_t* update_next_flux
-		int64_t* update_extrapolation
-		double* edge_timestep
-		double* edge_flux_work
-		double* neigh_work
-		double* pressuregrad_work
-		double* x_centroid_work
-		double* y_centroid_work
-		double* boundary_flux_sum
-		int64_t* allow_timestep_increase
-		double* riverwall_elevation
-		int64_t* riverwall_rowIndex
-		double* riverwall_hydraulic_properties
-		int64_t* edge_river_wall_counter
-		double* stage_semi_implicit_update
-		double* xmom_semi_implicit_update
-		double* ymom_semi_implicit_update
-
-	struct edge:
-		pass
-
-	double _openacc_compute_fluxes_central(domain* D, double timestep)
-	double _openacc_protect(domain* D)
-	int64_t _openacc_extrapolate_second_order_edge_sw(domain* D)
-	int64_t _openacc_fix_negative_cells(domain* D)
-
-
-
-cdef int64_t pointer_flag = 0
-cdef int64_t parameter_flag = 0
-
-cdef inline get_python_domain_parameters(domain *D, object domain_object):
-
-	D.number_of_elements = domain_object.number_of_elements
-	D.boundary_length = domain_object.boundary_length 
-	D.number_of_riverwall_edges = domain_object.number_of_riverwall_edges
-	D.epsilon = domain_object.epsilon
-	D.H0 = domain_object.H0
-	D.g = domain_object.g
-	D.optimise_dry_cells = domain_object.optimise_dry_cells
-	D.evolve_max_timestep = domain_object.evolve_max_timestep
-	D.minimum_allowed_height = domain_object.minimum_allowed_height
-	D.maximum_allowed_speed = domain_object.maximum_allowed_speed
-	D.timestep_fluxcalls = domain_object.timestep_fluxcalls
-	D.low_froude = domain_object.low_froude
-	D.extrapolate_velocity_second_order = domain_object.extrapolate_velocity_second_order
-	D.beta_w = domain_object.beta_w
-	D.beta_w_dry = domain_object.beta_w_dry
-	D.beta_uh = domain_object.beta_uh
-	D.beta_uh_dry = domain_object.beta_uh_dry
-	D.beta_vh = domain_object.beta_vh
-	D.beta_vh_dry = domain_object.beta_vh_dry
-	D.max_flux_update_frequency = domain_object.max_flux_update_frequency
-		
-
-cdef inline get_python_domain_pointers(domain *D, object domain_object):
-
-	cdef int64_t[:,::1]   neighbours
-	cdef int64_t[:,::1]   neighbour_edges
-	cdef double[:,::1] normals
-	cdef double[:,::1] edgelengths
-	cdef double[::1]   radii
-	cdef double[::1]   areas
-	cdef int64_t[::1]     edge_flux_type
-	cdef int64_t[::1]     tri_full_flag
-	cdef int64_t[:,::1]   already_computed_flux
-	cdef double[:,::1] vertex_coordinates
-	cdef double[:,::1] edge_coordinates
-	cdef double[:,::1] centroid_coordinates
-	cdef int64_t[::1]     number_of_boundaries
-	cdef int64_t[:,::1]   surrogate_neighbours
-	cdef double[::1]   max_speed
-	cdef int64_t[::1]     flux_update_frequency
-	cdef int64_t[::1]     update_next_flux
-	cdef int64_t[::1]     update_extrapolation
-	cdef int64_t[::1]     allow_timestep_increase
-	cdef double[::1]   edge_timestep
-	cdef double[::1]   edge_flux_work
-	cdef double[::1]   neigh_work
-	cdef double[::1]   pressuregrad_work
-	cdef double[::1]   x_centroid_work
-	cdef double[::1]   y_centroid_work
-	cdef double[::1]   boundary_flux_sum
-	cdef double[::1]   riverwall_elevation
-	cdef int64_t[::1]     riverwall_rowIndex
-	cdef double[:,::1] riverwall_hydraulic_properties
-	cdef int64_t[::1]     edge_river_wall_counter
-	cdef double[:,::1] edge_values
-	cdef double[::1]   centroid_values
-	cdef double[:,::1] vertex_values
-	cdef double[::1]   boundary_values
-	cdef double[::1]   explicit_update
-	cdef double[::1]   semi_implicit_update	
-	
-	cdef object quantities
-	cdef object riverwallData
-
-	#------------------------------------------------------
-	# Domain structures
-	#------------------------------------------------------
-	neighbours = domain_object.neighbours
-	D.neighbours = &neighbours[0,0]
-	
-	surrogate_neighbours = domain_object.surrogate_neighbours
-	D.surrogate_neighbours = &surrogate_neighbours[0,0]
-
-	neighbour_edges = domain_object.neighbour_edges
-	D.neighbour_edges = &neighbour_edges[0,0]
-
-	normals = domain_object.normals
-	D.normals = &normals[0,0]
-
-	edgelengths = domain_object.edgelengths
-	D.edgelengths = &edgelengths[0,0]
-
-	radii = domain_object.radii
-	D.radii = &radii[0]
-
-	areas = domain_object.areas
-	D.areas = &areas[0]
-
-	edge_flux_type = domain_object.edge_flux_type
-	D.edge_flux_type = &edge_flux_type[0]
-
-	tri_full_flag = domain_object.tri_full_flag
-	D.tri_full_flag = &tri_full_flag[0]
-
-	already_computed_flux = domain_object.already_computed_flux
-	D.already_computed_flux = &already_computed_flux[0,0]
-
-	vertex_coordinates = domain_object.vertex_coordinates
-	D.vertex_coordinates = &vertex_coordinates[0,0]
-
-	edge_coordinates = domain_object.edge_coordinates
-	D.edge_coordinates = &edge_coordinates[0,0]
-
-	centroid_coordinates = domain_object.centroid_coordinates
-	D.centroid_coordinates = &centroid_coordinates[0,0]
-
-	max_speed = domain_object.max_speed
-	D.max_speed = &max_speed[0]
-
-	number_of_boundaries = domain_object.number_of_boundaries
-	D.number_of_boundaries = &number_of_boundaries[0]
-
-	flux_update_frequency = domain_object.flux_update_frequency
-	D.flux_update_frequency = &flux_update_frequency[0]
-
-	update_next_flux = domain_object.update_next_flux
-	D.update_next_flux = &update_next_flux[0]
-
-	update_extrapolation = domain_object.update_extrapolation
-	D.update_extrapolation = &update_extrapolation[0]
-
-	allow_timestep_increase = domain_object.allow_timestep_increase
-	D.allow_timestep_increase = &allow_timestep_increase[0]
-
-	edge_timestep = domain_object.edge_timestep
-	D.edge_timestep = &edge_timestep[0]
-
-	edge_flux_work = domain_object.edge_flux_work
-	D.edge_flux_work = &edge_flux_work[0]
-
-	neigh_work = domain_object.neigh_work
-	D.neigh_work = &neigh_work[0]
-
-	pressuregrad_work = domain_object.pressuregrad_work
-	D.pressuregrad_work = &pressuregrad_work[0]
-
-	x_centroid_work = domain_object.x_centroid_work
-	D.x_centroid_work = &x_centroid_work[0]
-
-	y_centroid_work = domain_object.y_centroid_work
-	D.y_centroid_work = &y_centroid_work[0]
-
-	boundary_flux_sum = domain_object.boundary_flux_sum
-	D.boundary_flux_sum = &boundary_flux_sum[0]
-
-	edge_river_wall_counter = domain_object.edge_river_wall_counter
-	D.edge_river_wall_counter  = &edge_river_wall_counter[0]
-
-	#------------------------------------------------------
-	# Quantity structures
-	#------------------------------------------------------
-	quantities = domain_object.quantities
-	stage = quantities["stage"]
-	xmomentum = quantities["xmomentum"]
-	ymomentum = quantities["ymomentum"]
-	elevation = quantities["elevation"]
-	height = quantities["height"]
-
-	edge_values = stage.edge_values
-	D.stage_edge_values = &edge_values[0,0]
-
-	edge_values = xmomentum.edge_values
-	D.xmom_edge_values = &edge_values[0,0]
-
-	edge_values = ymomentum.edge_values
-	D.ymom_edge_values = &edge_values[0,0]
-
-	edge_values = elevation.edge_values
-	D.bed_edge_values = &edge_values[0,0]
-
-	edge_values = height.edge_values
-	D.height_edge_values = &edge_values[0,0]
-
-	centroid_values = stage.centroid_values
-	D.stage_centroid_values = &centroid_values[0]
-
-	centroid_values = xmomentum.centroid_values
-	D.xmom_centroid_values = &centroid_values[0]
-
-	centroid_values = ymomentum.centroid_values
-	D.ymom_centroid_values = &centroid_values[0]
-
-	centroid_values = elevation.centroid_values
-	D.bed_centroid_values = &centroid_values[0]
-
-	centroid_values = height.centroid_values
-	D.height_centroid_values = &centroid_values[0]
-
-	vertex_values = stage.vertex_values
-	D.stage_vertex_values = &vertex_values[0,0]
-
-	vertex_values = xmomentum.vertex_values
-	D.xmom_vertex_values = &vertex_values[0,0]
-
-	vertex_values = ymomentum.vertex_values
-	D.ymom_vertex_values = &vertex_values[0,0]
-
-	vertex_values = elevation.vertex_values
-	D.bed_vertex_values = &vertex_values[0,0]
-
-	vertex_values = height.vertex_values
-	D.height_vertex_values = &vertex_values[0,0]
-
-	boundary_values = stage.boundary_values
-	D.stage_boundary_values = &boundary_values[0]
-
-	boundary_values = xmomentum.boundary_values
-	D.xmom_boundary_values = &boundary_values[0]
-
-	boundary_values = ymomentum.boundary_values
-	D.ymom_boundary_values = &boundary_values[0]
-
-	boundary_values = elevation.boundary_values
-	D.bed_boundary_values = &boundary_values[0]
-
-	explicit_update = stage.explicit_update
-	D.stage_explicit_update = &explicit_update[0]
-
-	explicit_update = xmomentum.explicit_update
-	D.xmom_explicit_update = &explicit_update[0]
-
-	explicit_update = ymomentum.explicit_update
-	D.ymom_explicit_update = &explicit_update[0]
-
-	semi_implicit_update = stage.semi_implicit_update
-	D.stage_semi_implicit_update = &semi_implicit_update[0]
-
-	semi_implicit_update = xmomentum.semi_implicit_update
-	D.xmom_semi_implicit_update = &semi_implicit_update[0]
-
-	semi_implicit_update = ymomentum.semi_implicit_update
-	D.ymom_semi_implicit_update = &semi_implicit_update[0]	
-
-	#------------------------------------------------------
-	# Riverwall structures
-	#------------------------------------------------------
-	riverwallData = domain_object.riverwallData
-
-	riverwall_elevation = riverwallData.riverwall_elevation
-	D.riverwall_elevation = &riverwall_elevation[0]
-
-	riverwall_rowIndex = riverwallData.hydraulic_properties_rowIndex
-	D.riverwall_rowIndex = &riverwall_rowIndex[0]
-
-	D.ncol_riverwall_hydraulic_properties = riverwallData.ncol_hydraulic_properties
-
-	riverwall_hydraulic_properties = riverwallData.hydraulic_properties
-	D.riverwall_hydraulic_properties = &riverwall_hydraulic_properties[0,0]
-
-
-
-#===============================================================================
-
-def compute_fluxes_ext_central(object domain_object, double timestep):
-
-	cdef domain D
-
-	get_python_domain_parameters(&D, domain_object)
-	get_python_domain_pointers(&D, domain_object)
-
-	with nogil:
-		timestep =  _openacc_compute_fluxes_central(&D, timestep)
-
-	return timestep
-
-def extrapolate_second_order_edge_sw(object domain_object):
-
-	cdef domain D
-	cdef int64_t e
-
-	get_python_domain_parameters(&D, domain_object)
-	get_python_domain_pointers(&D, domain_object)
-
-	with nogil:
-		e = _openacc_extrapolate_second_order_edge_sw(&D)
-
-	if e == -1:
-		return None
-
-def protect_new(object domain_object):
-
-	cdef domain D
-
-	cdef double mass_error
-
-	get_python_domain_parameters(&D, domain_object)
-	get_python_domain_pointers(&D, domain_object)
-
-	with nogil:
-		mass_error = _openacc_protect(&D)
-
-
-	return mass_error
-
-def compute_flux_update_frequency(object domain_object, double timestep):
-
-	pass
-
-def fix_negative_cells(object domain_object):
-
-	cdef domain D
-	cdef int64_t num_negative_cells
-
-	get_python_domain_parameters(&D, domain_object)
-	get_python_domain_pointers(&D, domain_object)
-
-	with nogil:
-		num_negative_cells = _openacc_fix_negative_cells(&D)
-
-	return num_negative_cells
-
-
-
diff --git a/anuga/shallow_water/sw_domain_openmp.c b/anuga/shallow_water/sw_domain_openmp.c
index ff23db666..62c5acbf1 100644
--- a/anuga/shallow_water/sw_domain_openmp.c
+++ b/anuga/shallow_water/sw_domain_openmp.c
@@ -15,30 +15,27 @@
 // Gareth Davies, GA 2011
 
 #include "math.h"
+#include <math.h>
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>
 #include <stdint.h>
 
-#if defined(__APPLE__)
-// clang doesn't have openmp
-#else
-#include "omp.h"
-#endif
-
+#include "sw_domain_math.h"
 #include "util_ext.h"
 #include "sw_domain.h"
+#include "anuga_constants.h"
 
-const double pi = 3.14159265358979;
+// FIXME: Perhaps use the epsilon used elsewhere.
 
 // Trick to compute n modulo d (n%d in python) when d is a power of 2
-uint64_t __mod_of_power_2(uint64_t n, uint64_t d)
+anuga_uint __mod_of_power_2(anuga_uint n, anuga_uint d)
 {
   return (n & (d - 1));
 }
 
 // Computational function for rotation
-int64_t __rotate(double *q, double n1, double n2)
+anuga_int __rotate(double *q, const double n1, const double n2)
 {
   /*Rotate the last  2 coordinates of q (q[1], q[2])
     from x,y coordinates to coordinates based on normal vector (n1, n2).
@@ -60,20 +57,82 @@ int64_t __rotate(double *q, double n1, double n2)
 
   return 0;
 }
+// Convert raw edge momenta into velocities (u, v) and depth-rescaled momenta (uh, vh); all four outputs are 0 unless h_edge > 0
+static inline void compute_velocity_terms(
+    const double h, const double h_edge,  // h rescales the momenta; h_edge is the depth the raw momenta are divided by
+    const double uh_raw, const double vh_raw,  // raw momentum components
+    double *__restrict u, double *__restrict uh, double *__restrict v, double *__restrict vh)  // outputs, always written
+{
+  if (h_edge > 0.0)
+  {
+    double inv_h_edge = 1.0 / h_edge;  // single division shared by both components
+
+    *u = uh_raw * inv_h_edge;
+    *uh = h * (*u);
+
+    *v = vh_raw * inv_h_edge;
+    *vh = h * inv_h_edge * vh_raw;  // same rescaling as *uh, evaluated in the order used by the previous inline code
+  }
+  else
+  {
+    *u = 0.0;  // h_edge is not positive: report zero velocity and zero momentum
+    *uh = 0.0;
+    *v = 0.0;
+    *vh = 0.0;
+  }
+}
+
+static inline double compute_local_froude(  // Froude-like factor in (0, 1]; the formula is selected by low_froude
+    const anuga_int low_froude,
+    const double u_left, const double u_right,
+    const double v_left, const double v_right,
+    const double soundspeed_left, const double soundspeed_right)
+{
+  double numerator = u_right * u_right + u_left * u_left +  // sum of squared velocity components on both sides
+                     v_right * v_right + v_left * v_left;
+  double denominator = soundspeed_left * soundspeed_left +  // sum of squared sound speeds
+                       soundspeed_right * soundspeed_right + 1.0e-10;  // 1.0e-10 avoids division by zero when both sound speeds are 0
+
+  if (low_froude == 1)
+  {
+    return sqrt(fmax(0.001, fmin(1.0, numerator / denominator)));  // ratio clamped to [0.001, 1.0] before the square root
+  }
+  else if (low_froude == 2)
+  {
+    double fr = sqrt(numerator / denominator);
+    return sqrt(fmin(1.0, 0.01 + fmax(fr - 0.01, 0.0)));  // fr raised to at least 0.01, capped at 1.0, then square-rooted again
+  }
+  else
+  {
+    return 1.0;  // any other low_froude value: no scaling
+  }
+}
+
+static inline double compute_s_max(const double u_left, const double u_right,  // larger of (u + c) over both sides, floored at 0
+                                   const double c_left, const double c_right)
+{
+  double s = fmax(u_left + c_left, u_right + c_right);
+  return (s < 0.0) ? 0.0 : s;  // a negative maximum is replaced by 0
+}
+
+static inline double compute_s_min(const double u_left, const double u_right,  // smaller of (u - c) over both sides, capped at 0
+                                   const double c_left, const double c_right)
+{
+  double s = fmin(u_left - c_left, u_right - c_right);
+  return (s > 0.0) ? 0.0 : s;  // a positive minimum is replaced by 0
+}
 
 // Innermost flux function (using stage w=z+h)
-int64_t __flux_function_central(double *q_left, double *q_right,
-                            double h_left, double h_right,
-                            double hle, double hre,
-                            double n1, double n2,
-                            double epsilon,
-                            double ze,
-                            double limiting_threshold,
-                            double g,
-                            double *edgeflux, double *max_speed,
-                            double *pressure_flux, double hc,
-                            double hc_n,
-                            int64_t low_froude)
+anuga_int __flux_function_central(double *__restrict q_left, double *__restrict q_right,
+                                const double h_left, const double h_right,
+                                const double hle, const double hre,
+                                const double n1, const double n2,
+                                const double epsilon,
+                                const double ze,
+                                const double g,
+                                double *__restrict edgeflux, double *__restrict max_speed,
+                                double *__restrict pressure_flux,
+                                const anuga_int low_froude)
 {
 
   /*Compute fluxes between volumes for the shallow water wave equation
@@ -89,30 +148,21 @@ int64_t __flux_function_central(double *q_left, double *q_right,
     FIXME: Several variables in this interface are no longer used, clean up
   */
 
-  int64_t i;
-
   double uh_left, vh_left, u_left;
   double uh_right, vh_right, u_right;
-  double s_min, s_max, soundspeed_left, soundspeed_right;
-  double denom, inverse_denominator;
-  double tmp, local_fr, v_right, v_left;
+  double soundspeed_left, soundspeed_right;
+  double denom;
+  double v_right, v_left;
   double q_left_rotated[3], q_right_rotated[3], flux_right[3], flux_left[3];
 
-  if (h_left == 0. && h_right == 0.)
+
+  for (anuga_int i = 0; i < 3; i++)
   {
-    // Quick exit
-    memset(edgeflux, 0, 3 * sizeof(double));
-    *max_speed = 0.0;
-    *pressure_flux = 0.;
-    return 0;
+    // Copy the conserved quantities into local work arrays so that the
+    // caller's q_left/q_right are not modified by the in-place rotation below
+    q_left_rotated[i] = q_left[i];
+    q_right_rotated[i] = q_right[i];
   }
-  // Copy conserved quantities to protect from modification
-  q_left_rotated[0] = q_left[0];
-  q_right_rotated[0] = q_right[0];
-  q_left_rotated[1] = q_left[1];
-  q_right_rotated[1] = q_right[1];
-  q_left_rotated[2] = q_left[2];
-  q_right_rotated[2] = q_right[2];
 
   // Align x- and y-momentum with x-axis
   __rotate(q_left_rotated, n1, n2);
@@ -122,97 +172,25 @@ int64_t __flux_function_central(double *q_left, double *q_right,
   // w_left = q_left_rotated[0];
   uh_left = q_left_rotated[1];
   vh_left = q_left_rotated[2];
-  if (hle > 0.0)
-  {
-    tmp = 1.0 / hle;
-    u_left = uh_left * tmp; // max(h_left, 1.0e-06);
-    uh_left = h_left * u_left;
-    v_left = vh_left * tmp; // Only used to define local_fr
-    vh_left = h_left * tmp * vh_left;
-  }
-  else
-  {
-    u_left = 0.;
-    uh_left = 0.;
-    vh_left = 0.;
-    v_left = 0.;
-  }
-
-  // u_left = _compute_speed(&uh_left, &hle,
-  //             epsilon, h0, limiting_threshold);
+  compute_velocity_terms(h_left, hle, q_left_rotated[1], q_left_rotated[2],
+                         &u_left, &uh_left, &v_left, &vh_left);
 
-  // w_right = q_right_rotated[0];
   uh_right = q_right_rotated[1];
   vh_right = q_right_rotated[2];
-  if (hre > 0.0)
-  {
-    tmp = 1.0 / hre;
-    u_right = uh_right * tmp; // max(h_right, 1.0e-06);
-    uh_right = h_right * u_right;
-    v_right = vh_right * tmp; // Only used to define local_fr
-    vh_right = h_right * tmp * vh_right;
-  }
-  else
-  {
-    u_right = 0.;
-    uh_right = 0.;
-    vh_right = 0.;
-    v_right = 0.;
-  }
-  // u_right = _compute_speed(&uh_right, &hre,
-  //               epsilon, h0, limiting_threshold);
+  compute_velocity_terms(h_right, hre, q_right_rotated[1], q_right_rotated[2],
+                         &u_right, &uh_right, &v_right, &vh_right);
 
   // Maximal and minimal wave speeds
   soundspeed_left = sqrt(g * h_left);
   soundspeed_right = sqrt(g * h_right);
-  // soundspeed_left  = sqrt(g*hle);
-  // soundspeed_right = sqrt(g*hre);
-
   // Something that scales like the Froude number
   // We will use this to scale the diffusive component of the UH/VH fluxes.
+  double local_fr = compute_local_froude(
+      low_froude, u_left, u_right, v_left, v_right,
+      soundspeed_left, soundspeed_right);
 
-  // local_fr = sqrt(
-  //     max(0.001, min(1.0,
-  //         (u_right*u_right + u_left*u_left + v_right*v_right + v_left*v_left)/
-  //         (soundspeed_left*soundspeed_left + soundspeed_right*soundspeed_right + 1.0e-10))));
-  if (low_froude == 1)
-  {
-    local_fr = sqrt(
-        fmax(0.001, fmin(1.0,
-                         (u_right * u_right + u_left * u_left + v_right * v_right + v_left * v_left) /
-                             (soundspeed_left * soundspeed_left + soundspeed_right * soundspeed_right + 1.0e-10))));
-  }
-  else if (low_froude == 2)
-  {
-    local_fr = sqrt((u_right * u_right + u_left * u_left + v_right * v_right + v_left * v_left) /
-                    (soundspeed_left * soundspeed_left + soundspeed_right * soundspeed_right + 1.0e-10));
-    local_fr = sqrt(fmin(1.0, 0.01 + fmax(local_fr - 0.01, 0.0)));
-  }
-  else
-  {
-    local_fr = 1.0;
-  }
-  // printf("local_fr %e \n:", local_fr);
-
-  s_max = fmax(u_left + soundspeed_left, u_right + soundspeed_right);
-  if (s_max < 0.0)
-  {
-    s_max = 0.0;
-  }
-
-  // if( hc < 1.0e-03){
-  //   s_max = 0.0;
-  // }
-
-  s_min = fmin(u_left - soundspeed_left, u_right - soundspeed_right);
-  if (s_min > 0.0)
-  {
-    s_min = 0.0;
-  }
-
-  // if( hc_n < 1.0e-03){
-  //   s_min = 0.0;
-  // }
+  double s_max = compute_s_max(u_left, u_right, soundspeed_left, soundspeed_right);
+  double s_min = compute_s_min(u_left, u_right, soundspeed_left, soundspeed_right);
 
   // Flux formulas
   flux_left[0] = u_left * h_left;
@@ -225,6 +203,8 @@ int64_t __flux_function_central(double *q_left, double *q_right,
 
   // Flux computation
   denom = s_max - s_min;
+  double inverse_denominator = 1.0 / fmax(denom, 1.0e-100);
+  double s_max_s_min = s_max * s_min;
   if (denom < epsilon)
   {
     // Both wave speeds are very small
@@ -238,25 +218,20 @@ int64_t __flux_function_central(double *q_left, double *q_right,
   {
     // Maximal wavespeed
     *max_speed = fmax(s_max, -s_min);
-
-    inverse_denominator = 1.0 / fmax(denom, 1.0e-100);
-    for (i = 0; i < 3; i++)
     {
-      edgeflux[i] = s_max * flux_left[i] - s_min * flux_right[i];
-
-      // Standard smoothing term
-      // edgeflux[i] += 1.0*(s_max*s_min)*(q_right_rotated[i] - q_left_rotated[i]);
-      // Smoothing by stage alone can cause high velocities / slow draining for nearly dry cells
-      if (i == 0)
-        edgeflux[i] += (s_max * s_min) * (fmax(q_right_rotated[i], ze) - fmax(q_left_rotated[i], ze));
-      // if(i==0) edgeflux[i] += (s_max*s_min)*(h_right - h_left);
-      if (i == 1)
-        edgeflux[i] += local_fr * (s_max * s_min) * (uh_right - uh_left);
-      if (i == 2)
-        edgeflux[i] += local_fr * (s_max * s_min) * (vh_right - vh_left);
-
-      edgeflux[i] *= inverse_denominator;
+      double flux_0 = s_max * flux_left[0] - s_min * flux_right[0];
+      flux_0 += s_max_s_min * (fmax(q_right_rotated[0], ze) - fmax(q_left_rotated[0], ze));
+      edgeflux[0] = flux_0 * inverse_denominator;
+
+      double flux_1 = s_max * flux_left[1] - s_min * flux_right[1];
+      flux_1 += local_fr * s_max_s_min * (uh_right - uh_left);
+      edgeflux[1] = flux_1 * inverse_denominator;
+
+      double flux_2 = s_max * flux_left[2] - s_min * flux_right[2];
+      flux_2 += local_fr * s_max_s_min * (vh_right - vh_left);
+      edgeflux[2] = flux_2 * inverse_denominator;
     }
+
     // Separate pressure flux, so we can apply different wet-dry hacks to it
     *pressure_flux = 0.5 * g * (s_max * h_left * h_left - s_min * h_right * h_right) * inverse_denominator;
 
@@ -267,27 +242,25 @@ int64_t __flux_function_central(double *q_left, double *q_right,
   return 0;
 }
 
-int64_t __openmp__flux_function_central(double q_left0, double q_left1, double q_left2,
-                                   double q_right0, double q_right1, double q_right2,
-                                   double h_left, double h_right,
-                                   double hle, double hre,
-                                   double n1, double n2,
-                                   double epsilon,
-                                   double ze,
-                                   double limiting_threshold,
-                                   double g,
-                                   double *edgeflux0, double *edgeflux1, double *edgeflux2,
-                                   double *max_speed,
-                                   double *pressure_flux, double hc,
-                                   double hc_n,
-                                   int64_t low_froude)
+anuga_int __openmp__flux_function_central(double q_left0, double q_left1, double q_left2,
+                                        double q_right0, double q_right1, double q_right2,
+                                        double h_left, double h_right,
+                                        double hle, double hre,
+                                        double n1, double n2,
+                                        double epsilon,
+                                        double ze,
+                                        double g,
+                                        double *edgeflux0, double *edgeflux1, double *edgeflux2,
+                                        double *max_speed,
+                                        double *pressure_flux,
+                                        anuga_int low_froude)
 {
 
   double edgeflux[3];
   double q_left[3];
   double q_right[3];
 
-  int64_t ierr;
+  anuga_int ierr;
 
   edgeflux[0] = *edgeflux0;
   edgeflux[1] = *edgeflux1;
@@ -307,11 +280,9 @@ int64_t __openmp__flux_function_central(double q_left0, double q_left1, double q
                                  n1, n2,
                                  epsilon,
                                  ze,
-                                 limiting_threshold,
                                  g,
                                  edgeflux, max_speed,
-                                 pressure_flux, hc,
-                                 hc_n,
+                                 pressure_flux,
                                  low_froude);
 
   *edgeflux0 = edgeflux[0];
@@ -322,11 +293,11 @@ int64_t __openmp__flux_function_central(double q_left0, double q_left1, double q
 }
 
 double __adjust_edgeflux_with_weir(double *edgeflux,
-                                   double h_left, double h_right,
-                                   double g, double weir_height,
-                                   double Qfactor,
-                                   double s1, double s2,
-                                   double h1, double h2,
+                                   const double h_left, double h_right,
+                                   const double g, double weir_height,
+                                   const double Qfactor,
+                                   const double s1, double s2,
+                                   const double h1, double h2,
                                    double *max_speed_local)
 {
   // Adjust the edgeflux to agree with a weir relation [including
@@ -426,22 +397,22 @@ double __adjust_edgeflux_with_weir(double *edgeflux,
 }
 
 double __openmp__adjust_edgeflux_with_weir(double *edgeflux0, double *edgeflux1, double *edgeflux2,
-                                          double h_left, double h_right,
-                                          double g, double weir_height,
-                                          double Qfactor,
-                                          double s1, double s2,
-                                          double h1, double h2,
-                                          double *max_speed_local)
+                                           double h_left, double h_right,
+                                           double g, double weir_height,
+                                           double Qfactor,
+                                           double s1, double s2,
+                                           double h1, double h2,
+                                           double *max_speed_local)
 {
 
   double edgeflux[3];
-  int64_t ierr;
+  anuga_int ierr;
 
   edgeflux[0] = *edgeflux0;
   edgeflux[1] = *edgeflux1;
   edgeflux[2] = *edgeflux2;
 
-  ierr = __adjust_edgeflux_with_weir(edgeflux0, h_left, h_right,
+  ierr = __adjust_edgeflux_with_weir(edgeflux, h_left, h_right,
                                      g, weir_height,
                                      Qfactor, s1, s2, h1, h2,
                                      max_speed_local);
@@ -452,46 +423,50 @@ double __openmp__adjust_edgeflux_with_weir(double *edgeflux0, double *edgeflux1,
   return ierr;
 }
 
-// Computational function for flux computation
-double _openmp_compute_fluxes_central(struct domain *D,
+// Force the edge flux (and max_speed) of a riverwall edge to match weir theory via __adjust_edgeflux_with_weir
+void apply_weir_discharge_correction(const struct domain * __restrict D, const EdgeData * __restrict E,
+                                     const anuga_int k, const anuga_int ncol_riverwall_hydraulic_properties,
+                                     const double g, double * __restrict edgeflux, double * __restrict max_speed) {
+    // RiverWall_count - 1 indexes the riverwall arrays; ii is the offset of this wall's row of hydraulic properties
+    anuga_int RiverWall_count = D->edge_river_wall_counter[E->ki];
+    anuga_int ii = D->riverwall_rowIndex[RiverWall_count - 1] * ncol_riverwall_hydraulic_properties;
+
+    double Qfactor = D->riverwall_hydraulic_properties[ii];  // constant factor multiplying the idealised weir discharge
+    double s1 = D->riverwall_hydraulic_properties[ii + 1];  // submergence ratio at which blending with the shallow water solution starts
+    double s2 = D->riverwall_hydraulic_properties[ii + 2];  // submergence ratio at which the shallow water solution is used entirely
+    double h1 = D->riverwall_hydraulic_properties[ii + 3];  // tailwater head / weir height at which blending starts
+    double h2 = D->riverwall_hydraulic_properties[ii + 4];  // tailwater head / weir height at which the shallow water solution is used entirely
+    // Reference weir height: wall elevation above fmin(E->zl, E->zr), never negative
+    double weir_height = fmax(D->riverwall_elevation[RiverWall_count - 1] - fmin(E->zl, E->zr), 0.);
+    // Use first-order (centroid) h's for the weir, as the upstream/downstream heads are measured away from the weir itself
+    double h_left_tmp = fmax(D->stage_centroid_values[k] - E->z_half, 0.);
+    double h_right_tmp = E->is_boundary  // boundary edges use E->hc_n + E->zr instead of a neighbour centroid stage
+                         ? fmax(E->hc_n + E->zr - E->z_half, 0.)
+                         : fmax(D->stage_centroid_values[E->n] - E->z_half, 0.);
+    // If the weir is not higher than both neighbouring cells, do not try to match the weir equation (doing so can cause mass conservation issues)
+    if (D->riverwall_elevation[RiverWall_count - 1] > fmax(E->zc, E->zc_n)) {
+        __adjust_edgeflux_with_weir(edgeflux, h_left_tmp, h_right_tmp, g,
+                                    weir_height, Qfactor, s1, s2, h1, h2, max_speed);
+    }
+}
+
+double _openmp_compute_fluxes_central(const struct domain *__restrict D,
                                       double timestep)
 {
-  // Local variables
-  int64_t K = D->number_of_elements;
-  // int64_t KI, KI2, KI3, B, RW, RW5, SubSteps;
-  int64_t substep_count;
-
-  double max_speed_local, length, inv_area, zl, zr;
-  double h_left, h_right, z_half; // For andusse scheme
-  // FIXME: limiting_threshold is not used for DE1
-  double limiting_threshold = 10 * D->H0;
-  int64_t low_froude = D->low_froude;
+  // Local variables 
+  anuga_int number_of_elements = D->number_of_elements;
+  // anuga_int KI, KI2, KI3, B, RW, RW5, SubSteps;
+  anuga_int substep_count;
+
+  // NOTE: limiting_threshold (which was not used for DE1) is no longer computed or passed to the flux function
+  anuga_int low_froude = D->low_froude;
   double g = D->g;
   double epsilon = D->epsilon;
-  int64_t ncol_riverwall_hydraulic_properties = D->ncol_riverwall_hydraulic_properties;
-
-  // Workspace (making them static actually made function slightly slower (Ole))
-  double ql[3];
-  double qr[3];
-  double edgeflux[3]; // Work array for summing up fluxes
-  double pressuregrad_work;
-  double edge_timestep;
-  double normal_x, normal_y;
-  // static double local_timestep;
-
-  double hle, hre, zc, zc_n, Qfactor, s1, s2, h1, h2;
-  double pressure_flux, hc, hc_n;
-  double h_left_tmp, h_right_tmp;
-  double speed_max_last, weir_height;
-  int64_t RiverWall_count;
-
-  //
-  int64_t k, i, m, n, ii;
-  int64_t ki, nm = 0, ki2; // Index shorthands
+  anuga_int ncol_riverwall_hydraulic_properties = D->ncol_riverwall_hydraulic_properties;
 
-  static int64_t call = 0; // Static local variable flagging already computed flux
-  static int64_t timestep_fluxcalls = 1;
-  static int64_t base_call = 1;
+  static anuga_int call = 0; // Static local variable flagging already computed flux
+  static anuga_int timestep_fluxcalls = 1;
+  static anuga_int base_call = 1;
 
   call++; // Flag 'id' of flux calculation for this timestep
 
@@ -505,22 +480,21 @@ double _openmp_compute_fluxes_central(struct domain *D,
   substep_count = (call - base_call) % D->timestep_fluxcalls;
 
   double local_timestep = 1.0e+100;
-  double boundary_flux_sum_substep = 0.0; 
+  double boundary_flux_sum_substep = 0.0;
+  // Per-edge work variables, declared outside the loop so they can be listed in the OpenMP private() clause
 
+      double edgeflux[3];
+      double pressure_flux;
+      double max_speed_local;
+      EdgeData edge_data;
 // For all triangles
-#pragma omp parallel for simd default(none) schedule(static) shared(D, substep_count, K) \
-                                     firstprivate(ncol_riverwall_hydraulic_properties, epsilon, g, low_froude, limiting_threshold) \
-                                     private(i, ki, ki2, n, m, nm, ii,                                                  \
-                                     max_speed_local, length, inv_area, zl, zr,                                         \
-                                     h_left, h_right,                                                                   \
-                                     z_half, ql,  pressuregrad_work,                                                    \
-                                     qr, edgeflux, edge_timestep, normal_x, normal_y,                                   \
-                                     hle, hre, zc, zc_n, Qfactor, s1, s2, h1, h2, pressure_flux, hc, hc_n,              \
-                                     h_left_tmp, h_right_tmp, speed_max_last, weir_height, RiverWall_count)             \
-                                     reduction(min : local_timestep) reduction(+:boundary_flux_sum_substep)
-  for (k = 0; k < K; k++)
+#pragma omp parallel for simd default(none) schedule(static) shared(D, substep_count, number_of_elements) \
+    firstprivate(ncol_riverwall_hydraulic_properties, epsilon, g, low_froude)                              \
+    private(edgeflux, pressure_flux, max_speed_local, edge_data) \
+    reduction(min : local_timestep) reduction(+ : boundary_flux_sum_substep)
+  for (anuga_int k = 0; k < number_of_elements; k++)
   {
-    speed_max_last = 0.0;
+    double speed_max_last = 0.0;
     // Set explicit_update to zero for all conserved_quantities.
     // This assumes compute_fluxes called before forcing terms
     D->stage_explicit_update[k] = 0.0;
@@ -528,590 +502,111 @@ double _openmp_compute_fluxes_central(struct domain *D,
     D->ymom_explicit_update[k] = 0.0;
 
     // Loop through neighbours and compute edge flux for each
-    for (i = 0; i < 3; i++)
+    for (anuga_int i = 0; i < 3; i++)
     {
-      ki = 3 * k + i; // Linear index to edge i of triangle k
-      ki2 = 2 * ki;   // k*6 + i*2
-
-      // Get left hand side values from triangle k, edge i
-      ql[0] = D->stage_edge_values[ki];
-      ql[1] = D->xmom_edge_values[ki];
-      ql[2] = D->ymom_edge_values[ki];
-      zl    = D->bed_edge_values[ki];
-      hle   = D->height_edge_values[ki];
-
-      hc = D->height_centroid_values[k];
-      zc = D->bed_centroid_values[k];
-
-      // Get right hand side values either from neighbouring triangle
-      // or from boundary array (Quantities at neighbour on nearest face).
-      n = D->neighbours[ki];
-      hc_n = hc;
-      zc_n = D->bed_centroid_values[k];
-      if (n < 0)
+      get_edge_data_central_flux(D,k,i,&edge_data);
+
+      // Edge flux computation (triangle k, edge i)
+      if (edge_data.h_left == 0.0 && edge_data.h_right == 0.0)
       {
-        // Neighbour is a boundary condition
-        m = -n - 1; // Convert negative flag to boundary index
-
-        qr[0] = D->stage_boundary_values[m];
-        qr[1] = D->xmom_boundary_values[m];
-        qr[2] = D->ymom_boundary_values[m];
-        zr = zl;                   // Extend bed elevation to boundary
-        hre = fmax(qr[0] - zr, 0.0); // hle;
+        // If both heights are zero, then no flux
+        edgeflux[0] = 0.0;
+        edgeflux[1] = 0.0;
+        edgeflux[2] = 0.0;
+        max_speed_local = 0.0;
+        pressure_flux = 0.0;
       }
       else
       {
-        // Neighbour is a real triangle
-        hc_n = D->height_centroid_values[n];
-        zc_n = D->bed_centroid_values[n];
-
-        m = D->neighbour_edges[ki];
-        nm = n * 3 + m; // Linear index (triangle n, edge m)
-
-        qr[0] = D->stage_edge_values[nm];
-        qr[1] = D->xmom_edge_values[nm];
-        qr[2] = D->ymom_edge_values[nm];
-        zr = D->bed_edge_values[nm];
-        hre = D->height_edge_values[nm];
-      }
-
-      // Audusse magic for well balancing
-      z_half = fmax(zl, zr);
-
-      // Account for riverwalls
-      if (D->edge_flux_type[ki] == 1)
-      {
-        RiverWall_count = D->edge_river_wall_counter[ki];
-
-        // Set central bed to riverwall elevation
-        z_half = fmax(D->riverwall_elevation[RiverWall_count - 1], z_half);
+        // Compute the fluxes using the central scheme
+        __flux_function_central(edge_data.ql, edge_data.qr,
+                                edge_data.h_left, edge_data.h_right,
+                                edge_data.hle, edge_data.hre,
+                                edge_data.normal_x, edge_data.normal_y,
+                                epsilon, edge_data.z_half, g,
+                                edgeflux, &max_speed_local, &pressure_flux,
+                                low_froude);
       }
 
-      // Define h left/right for Audusse flux method
-      h_left = fmax(hle + zl - z_half, 0.);
-      h_right = fmax(hre + zr - z_half, 0.);
-
-      normal_x = D->normals[ki2];
-      normal_y = D->normals[ki2 + 1];
+    // Weir flux adjustment
+    if (edge_data.is_riverwall) {
+      apply_weir_discharge_correction(D, &edge_data, k, ncol_riverwall_hydraulic_properties, g, edgeflux, &max_speed_local);
+    }
 
-      // Edge flux computation (triangle k, edge i)
-      __flux_function_central(ql, qr,
-                              h_left, h_right,
-                              hle, hre,
-                              normal_x, normal_y,
-                              epsilon, z_half, limiting_threshold, g,
-                              edgeflux, &max_speed_local, &pressure_flux,
-                              hc, hc_n, low_froude);
-
-      // Force weir discharge to match weir theory
-      if (D->edge_flux_type[ki] == 1)
+      // Multiply edgeflux by edgelength
+      for (anuga_int j = 0; j < 3; j++)
       {
-
-        RiverWall_count = D->edge_river_wall_counter[ki];
-
-        // printf("RiverWall_count %ld\n", RiverWall_count);
-
-        ii = D->riverwall_rowIndex[RiverWall_count - 1] * ncol_riverwall_hydraulic_properties;
-
-        // Get Qfactor index - multiply the idealised weir discharge by this constant factor
-        // Get s1, submergence ratio at which we start blending with the shallow water solution
-        // Get s2, submergence ratio at which we entirely use the shallow water solution
-        // Get h1, tailwater head / weir height at which we start blending with the shallow water solution
-        // Get h2, tailwater head / weir height at which we entirely use the shallow water solution
-        Qfactor = D->riverwall_hydraulic_properties[ii];
-        s1 = D->riverwall_hydraulic_properties[ii + 1];
-        s2 = D->riverwall_hydraulic_properties[ii + 2];
-        h1 = D->riverwall_hydraulic_properties[ii + 3];
-        h2 = D->riverwall_hydraulic_properties[ii + 4];
-
-        weir_height = fmax(D->riverwall_elevation[RiverWall_count - 1] - fmin(zl, zr), 0.); // Reference weir height
-
-        // Use first-order h's for weir -- as the 'upstream/downstream' heads are
-        //  measured away from the weir itself
-        h_left_tmp = fmax(D->stage_centroid_values[k] - z_half, 0.);
-
-        if (n >= 0)
-        {
-          h_right_tmp = fmax(D->stage_centroid_values[n] - z_half, 0.);
-        }
-        else
-        {
-          h_right_tmp = fmax(hc_n + zr - z_half, 0.);
-        }
-
-        // If the weir is not higher than both neighbouring cells, then
-        // do not try to match the weir equation. If we do, it seems we
-        // can get mass conservation issues (caused by large weir
-        // fluxes in such situations)
-        if (D->riverwall_elevation[RiverWall_count - 1] > fmax(zc, zc_n))
-        {
-          // Weir flux adjustment
-          __adjust_edgeflux_with_weir(edgeflux, h_left_tmp, h_right_tmp, g,
-                                             weir_height, Qfactor,
-                                             s1, s2, h1, h2, &max_speed_local);
-        }
+        edgeflux[j] *= -1.0 * edge_data.length;
       }
-
-      // Multiply edgeflux by edgelength
-      length = D->edgelengths[ki];
-      edgeflux[0] = -edgeflux[0]*length;
-      edgeflux[1] = -edgeflux[1]*length;
-      edgeflux[2] = -edgeflux[2]*length;
-
-      // bedslope_work contains all gravity related terms
-      pressuregrad_work = length * (-g * 0.5 * (h_left * h_left - hle * hle - (hle + hc) * (zl - zc)) + pressure_flux);
-
       // Update timestep based on edge i and possibly neighbour n
       // NOTE: We should only change the timestep on the 'first substep'
       // of the timestepping method [substep_count==0]
-      if (substep_count == 0)
+      if (substep_count == 0 && D->tri_full_flag[k] == 1 && max_speed_local > epsilon)
       {
-
         // Compute the 'edge-timesteps' (useful for setting flux_update_frequency)
-        edge_timestep = D->radii[k] *1.0 / fmax(max_speed_local, epsilon);
-
+        double edge_timestep = D->radii[k] * 1.0 / fmax(max_speed_local, epsilon);
         // Update the timestep
-        if ((D->tri_full_flag[k] == 1))
-        {
-          if (max_speed_local > epsilon)
-          {
-            // Apply CFL condition for triangles joining this edge (triangle k and triangle n)
-
-            // CFL for triangle k
-            local_timestep = fmin(local_timestep, edge_timestep);
-
-            speed_max_last = fmax(speed_max_last, max_speed_local);
-          }
-        }
+        // Apply CFL condition for triangles joining this edge (triangle k and triangle n)
+        // CFL for triangle k
+        local_timestep = fmin(local_timestep, edge_timestep);
+        speed_max_last = fmax(speed_max_last, max_speed_local);
       }
 
-
       D->stage_explicit_update[k] += edgeflux[0];
-      D->xmom_explicit_update[k]  += edgeflux[1];
-      D->ymom_explicit_update[k]  += edgeflux[2];
-
+      D->xmom_explicit_update[k] += edgeflux[1];
+      D->ymom_explicit_update[k] += edgeflux[2];
       // If this cell is not a ghost, and the neighbour is a
       // boundary condition OR a ghost cell, then add the flux to the
       // boundary_flux_integral
-      if (((n < 0) & (D->tri_full_flag[k] == 1)) | ((n >= 0) && ((D->tri_full_flag[k] == 1) & (D->tri_full_flag[n] == 0))))
+      if (((edge_data.n < 0) & (D->tri_full_flag[k] == 1)) | ((edge_data.n >= 0) && ((D->tri_full_flag[k] == 1) & (D->tri_full_flag[edge_data.n] == 0))))
       {
         // boundary_flux_sum is an array with length = timestep_fluxcalls
         // For each sub-step, we put the boundary flux sum in.
         boundary_flux_sum_substep += edgeflux[0];
       }
 
-      D->xmom_explicit_update[k] -= D->normals[ki2] * pressuregrad_work;
-      D->ymom_explicit_update[k] -= D->normals[ki2 + 1] * pressuregrad_work;
+      // bedslope_work contains all gravity related terms
+      double pressuregrad_work = edge_data.length * (-g * 0.5 * (edge_data.h_left * edge_data.h_left - edge_data.hle * edge_data.hle - (edge_data.hle + edge_data.hc) * (edge_data.zl - edge_data.zc)) + pressure_flux);
+      D->xmom_explicit_update[k] -= D->normals[edge_data.ki2] * pressuregrad_work;
+      D->ymom_explicit_update[k] -= D->normals[edge_data.ki2 + 1] * pressuregrad_work;
 
     } // End edge i (and neighbour n)
 
     // Keep track of maximal speeds
-    if (substep_count == 0)
+    if (substep_count == 0){
       D->max_speed[k] = speed_max_last; // max_speed;
-
+    }
     // Normalise triangle k by area and store for when all conserved
     // quantities get updated
-    inv_area = 1.0 / D->areas[k];
+    double inv_area = 1.0 / D->areas[k];
     D->stage_explicit_update[k] *= inv_area;
     D->xmom_explicit_update[k] *= inv_area;
-    D->ymom_explicit_update[k] *= inv_area;  
+    D->ymom_explicit_update[k] *= inv_area;
 
   } // End triangle k
 
-
-
-//   // Now add up stage, xmom, ymom explicit updates
-
-// #pragma omp parallel for private(k, i, ki, ki2, ki3, n, inv_area) reduction(+:boundary_flux_sum_substep)
-//   for (k = 0; k < K; k++)
-//   {
-//     for (i = 0; i < 3; i++)
-//     {
-//       // FIXME: Make use of neighbours to efficiently set things
-//       ki = 3 * k + i;
-//       ki2 = ki * 2;
-//       ki3 = ki * 3;
-//       n = D->neighbours[ki];
-
-//       D->stage_explicit_update[k] += D->edge_flux_work[ki3 + 0];
-//       D->xmom_explicit_update[k] += D->edge_flux_work[ki3 + 1];
-//       D->ymom_explicit_update[k] += D->edge_flux_work[ki3 + 2];
-
-//       // If this cell is not a ghost, and the neighbour is a
-//       // boundary condition OR a ghost cell, then add the flux to the
-//       // boundary_flux_integral
-//       if (((n < 0) & (D->tri_full_flag[k] == 1)) | ((n >= 0) && ((D->tri_full_flag[k] == 1) & (D->tri_full_flag[n] == 0))))
-//       {
-//         // boundary_flux_sum is an array with length = timestep_fluxcalls
-//         // For each sub-step, we put the boundary flux sum in.
-//         boundary_flux_sum_substep += D->edge_flux_work[ki3];
-//       }
-
-//       D->xmom_explicit_update[k] -= D->normals[ki2] * D->pressuregrad_work[ki];
-//       D->ymom_explicit_update[k] -= D->normals[ki2 + 1] * D->pressuregrad_work[ki];
-
-//     } // end edge i
-
-//     // Normalise triangle k by area and store for when all conserved
-//     // quantities get updated
-//     inv_area = 1.0 / D->areas[k];
-//     D->stage_explicit_update[k] *= inv_area;
-//     D->xmom_explicit_update[k] *= inv_area;
-//     D->ymom_explicit_update[k] *= inv_area;
-
-//   } // end cell k
+  // Stage, xmom and ymom explicit updates were already accumulated and normalised inside the triangle loop above
 
   // variable to accumulate D->boundary_flux_sum[substep_count]
-  D->boundary_flux_sum[substep_count] = boundary_flux_sum_substep;  
+  D->boundary_flux_sum[substep_count] = boundary_flux_sum_substep;
 
   // Ensure we only update the timestep on the first call within each rk2/rk3 step
-  if (substep_count == 0)
+  if (substep_count == 0){
     timestep = local_timestep;
-
-  return timestep;
-}
-
-// Computational function for flux computation
-// with riverWall_count pulled out of triangle loop
-double _compute_fluxes_central_parallel_data_flow(struct domain *D, double timestep)
-{
-
-  // Local variables
-  double max_speed_local, length, inv_area, zl, zr;
-  double h_left, h_right, z_half; // For andusse scheme
-  // FIXME: limiting_threshold is not used for DE1
-  double limiting_threshold = 10 * D->H0;
-  int64_t low_froude = D->low_froude;
-  //
-  int64_t k, i, m, n, ii;
-  int64_t ki, nm = 0, ki2, ki3; // Index shorthands
-  // Workspace (making them static actually made function slightly slower (Ole))
-  double ql[3], qr[3], edgeflux[3]; // Work array for summing up fluxes
-  double bedslope_work;
-  static double local_timestep;
-  int64_t RiverWall_count, substep_count;
-  double hle, hre, zc, zc_n, Qfactor, s1, s2, h1, h2;
-  double pressure_flux, hc, hc_n, tmp;
-  double h_left_tmp, h_right_tmp;
-  static int64_t call = 0; // Static local variable flagging already computed flux
-  static int64_t timestep_fluxcalls = 1;
-  static int64_t base_call = 1;
-  double speed_max_last, weir_height;
-
-  call++; // Flag 'id' of flux calculation for this timestep
-
-  if (D->timestep_fluxcalls != timestep_fluxcalls)
-  {
-    timestep_fluxcalls = D->timestep_fluxcalls;
-    base_call = call;
-  }
-
-  // Set explicit_update to zero for all conserved_quantities.
-  // This assumes compute_fluxes called before forcing terms
-
-  #pragma omp parallel for private(k)
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    D->stage_explicit_update[k] = 0.0;
-    D->xmom_explicit_update[k] = 0.0;
-    D->ymom_explicit_update[k] = 0.0;
-  }
-  // memset((char*) D->stage_explicit_update, 0, D->number_of_elements * sizeof (double));
-  // memset((char*) D->xmom_explicit_update, 0, D->number_of_elements * sizeof (double));
-  // memset((char*) D->ymom_explicit_update, 0, D->number_of_elements * sizeof (double));
-
-  // Counter for riverwall edges
-  RiverWall_count = 0;
-  // Which substep of the timestepping method are we on?
-  substep_count = (call - base_call) % D->timestep_fluxcalls;
-
-  // printf("call = %d substep_count = %d base_call = %d \n",call,substep_count, base_call);
-
-  // Fluxes are not updated every timestep,
-  // but all fluxes ARE updated when the following condition holds
-  if (D->allow_timestep_increase[0] == 1)
-  {
-    // We can only increase the timestep if all fluxes are allowed to be updated
-    // If this is not done the timestep can't increase (since local_timestep is static)
-    local_timestep = 1.0e+100;
-  }
-
-  // For all triangles
-  // Pull the edge_river_wall count outside parallel loop as in needs to be done sequentially
-  // move it to the initiation of the riverwall so only calculated once
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    for (i = 0; i < 3; i++)
-    {
-      ki = 3 * k + i;
-      D->edge_river_wall_counter[ki] = 0;
-      if (D->edge_flux_type[ki] == 1)
-      {
-        // Update counter of riverwall edges
-        RiverWall_count += 1;
-        D->edge_river_wall_counter[ki] = RiverWall_count;
-
-        // printf("RiverWall_count %d   edge_counter %d \n", RiverWall_count, D->edge_river_wall_counter[ki]);
-      }
-    }
   }
 
-  RiverWall_count = 0;
-
-  // For all triangles
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    speed_max_last = 0.0;
-
-    // Loop through neighbours and compute edge flux for each
-    for (i = 0; i < 3; i++)
-    {
-      ki = 3 * k + i; // Linear index to edge i of triangle k
-      ki2 = 2 * ki;   // k*6 + i*2
-      ki3 = 3 * ki;
-
-      // Get left hand side values from triangle k, edge i
-      ql[0] = D->stage_edge_values[ki];
-      ql[1] = D->xmom_edge_values[ki];
-      ql[2] = D->ymom_edge_values[ki];
-      zl = D->bed_edge_values[ki];
-      hc = D->height_centroid_values[k];
-      zc = D->bed_centroid_values[k];
-      hle = D->height_edge_values[ki];
-
-      // Get right hand side values either from neighbouring triangle
-      // or from boundary array (Quantities at neighbour on nearest face).
-      n = D->neighbours[ki];
-      hc_n = hc;
-      zc_n = D->bed_centroid_values[k];
-      if (n < 0)
-      {
-        // Neighbour is a boundary condition
-        m = -n - 1; // Convert negative flag to boundary index
-
-        qr[0] = D->stage_boundary_values[m];
-        qr[1] = D->xmom_boundary_values[m];
-        qr[2] = D->ymom_boundary_values[m];
-        zr = zl;                    // Extend bed elevation to boundary
-        hre = fmax(qr[0] - zr, 0.); // hle;
-      }
-      else
-      {
-        // Neighbour is a real triangle
-        hc_n = D->height_centroid_values[n];
-        zc_n = D->bed_centroid_values[n];
-        m = D->neighbour_edges[ki];
-        nm = n * 3 + m; // Linear index (triangle n, edge m)
-
-        qr[0] = D->stage_edge_values[nm];
-        qr[1] = D->xmom_edge_values[nm];
-        qr[2] = D->ymom_edge_values[nm];
-        zr = D->bed_edge_values[nm];
-        hre = D->height_edge_values[nm];
-      }
-
-      // Audusse magic
-      z_half = fmax(zl, zr);
-
-      //// Account for riverwalls
-      if (D->edge_flux_type[ki] == 1)
-      {
-        if (n >= 0 && D->edge_flux_type[nm] != 1)
-        {
-          printf("Riverwall Error\n");
-        }
-        // Update counter of riverwall edges == index of
-        // riverwall_elevation + riverwall_rowIndex
-
-        // RiverWall_count += 1;
-        RiverWall_count = D->edge_river_wall_counter[ki];
-
-        // Set central bed to riverwall elevation
-        z_half = fmax(D->riverwall_elevation[RiverWall_count - 1], z_half);
-      }
-
-      // Define h left/right for Audusse flux method
-      h_left = fmax(hle + zl - z_half, 0.);
-      h_right = fmax(hre + zr - z_half, 0.);
-
-      // Edge flux computation (triangle k, edge i)
-      __flux_function_central(ql, qr,
-                              h_left, h_right,
-                              hle, hre,
-                              D->normals[ki2], D->normals[ki2 + 1],
-                              D->epsilon, z_half, limiting_threshold, D->g,
-                              edgeflux, &max_speed_local, &pressure_flux, hc, hc_n, low_froude);
-
-      // Force weir discharge to match weir theory
-      if (D->edge_flux_type[ki] == 1)
-      {
-        ii = D->riverwall_rowIndex[RiverWall_count - 1] * D->ncol_riverwall_hydraulic_properties;
-
-        // Get Qfactor index - multiply the idealised weir discharge by this constant factor
-        // Get s1, submergence ratio at which we start blending with the shallow water solution
-        // Get s2, submergence ratio at which we entirely use the shallow water solution
-        // Get h1, tailwater head / weir height at which we start blending with the shallow water solution
-        // Get h2, tailwater head / weir height at which we entirely use the shallow water solution
-        Qfactor = D->riverwall_hydraulic_properties[ii];
-        s1 = D->riverwall_hydraulic_properties[ii + 1];
-        s2 = D->riverwall_hydraulic_properties[ii + 2];
-        h1 = D->riverwall_hydraulic_properties[ii + 3];
-        h2 = D->riverwall_hydraulic_properties[ii + 4];
-
-        weir_height = fmax(D->riverwall_elevation[RiverWall_count - 1] - fmin(zl, zr), 0.); // Reference weir height
-
-        // Use first-order h's for weir -- as the 'upstream/downstream' heads are
-        //  measured away from the weir itself
-        h_left_tmp = fmax(D->stage_centroid_values[k] - z_half, 0.);
-
-        if (n >= 0)
-        {
-          h_right_tmp = fmax(D->stage_centroid_values[n] - z_half, 0.);
-        }
-        else
-        {
-          h_right_tmp = fmax(hc_n + zr - z_half, 0.);
-        }
-
-        // If the weir is not higher than both neighbouring cells, then
-        // do not try to match the weir equation. If we do, it seems we
-        // can get mass conservation issues (caused by large weir
-        // fluxes in such situations)
-        if (D->riverwall_elevation[RiverWall_count - 1] > fmax(zc, zc_n))
-        {
-          // Weir flux adjustment
-          __adjust_edgeflux_with_weir(edgeflux, h_left_tmp, h_right_tmp, D->g,
-                                      weir_height, Qfactor,
-                                      s1, s2, h1, h2, &max_speed_local);
-        }
-      }
-
-      // Multiply edgeflux by edgelength
-      length = D->edgelengths[ki];
-      edgeflux[0] *= length;
-      edgeflux[1] *= length;
-      edgeflux[2] *= length;
-
-      D->edge_flux_work[ki3 + 0] = -edgeflux[0];
-      D->edge_flux_work[ki3 + 1] = -edgeflux[1];
-      D->edge_flux_work[ki3 + 2] = -edgeflux[2];
-
-      // bedslope_work contains all gravity related terms
-      bedslope_work = length * (-D->g * 0.5 * (h_left * h_left - hle * hle - (hle + hc) * (zl - zc)) + pressure_flux);
-
-      D->pressuregrad_work[ki] = bedslope_work;
-
-      // Update timestep based on edge i and possibly neighbour n
-      // NOTE: We should only change the timestep on the 'first substep'
-      //  of the timestepping method [substep_count==0]
-      if (substep_count == 0)
-      {
-
-        // Compute the 'edge-timesteps' (useful for setting flux_update_frequency)
-        tmp = 1.0 / fmax(max_speed_local, D->epsilon);
-        D->edge_timestep[ki] = D->radii[k] * tmp;
-
-        // Update the timestep
-        if ((D->tri_full_flag[k] == 1))
-        {
-
-          speed_max_last = fmax(speed_max_last, max_speed_local);
-
-          if (max_speed_local > D->epsilon)
-          {
-            // Apply CFL condition for triangles joining this edge (triangle k and triangle n)
-
-            // CFL for triangle k
-            local_timestep = fmin(local_timestep, D->edge_timestep[ki]);
-
-            // if (n >= 0) {
-            //     // Apply CFL condition for neigbour n (which is on the ith edge of triangle k)
-            //    local_timestep = fmin(local_timestep, D->edge_timestep[nm]);
-            // }
-          }
-        }
-      }
-
-    } // End edge i (and neighbour n)
-
-    // Keep track of maximal speeds
-    if (substep_count == 0)
-      D->max_speed[k] = speed_max_last; // max_speed;
-
-  } // End triangle k
-
-  // Now add up stage, xmom, ymom explicit updates
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    hc = fmax(D->stage_centroid_values[k] - D->bed_centroid_values[k], 0.);
-
-    for (i = 0; i < 3; i++)
-    {
-      // FIXME: Make use of neighbours to efficiently set things
-      ki = 3 * k + i;
-      ki2 = ki * 2;
-      ki3 = ki * 3;
-      n = D->neighbours[ki];
-
-      D->stage_explicit_update[k] += D->edge_flux_work[ki3 + 0];
-      D->xmom_explicit_update[k] += D->edge_flux_work[ki3 + 1];
-      D->ymom_explicit_update[k] += D->edge_flux_work[ki3 + 2];
-
-      // If this cell is not a ghost, and the neighbour is a
-      // boundary condition OR a ghost cell, then add the flux to the
-      // boundary_flux_integral
-      if (((n < 0) & (D->tri_full_flag[k] == 1)) | ((n >= 0) && ((D->tri_full_flag[k] == 1) & (D->tri_full_flag[n] == 0))))
-      {
-        // boundary_flux_sum is an array with length = timestep_fluxcalls
-        // For each sub-step, we put the boundary flux sum in.
-        D->boundary_flux_sum[substep_count] += D->edge_flux_work[ki3];
-      }
-
-      D->xmom_explicit_update[k] -= D->normals[ki2] * D->pressuregrad_work[ki];
-      D->ymom_explicit_update[k] -= D->normals[ki2 + 1] * D->pressuregrad_work[ki];
-
-    } // end edge i
-
-    // Normalise triangle k by area and store for when all conserved
-    // quantities get updated
-    inv_area = 1.0 / D->areas[k];
-    D->stage_explicit_update[k] *= inv_area;
-    D->xmom_explicit_update[k] *= inv_area;
-    D->ymom_explicit_update[k] *= inv_area;
-
-  } // end cell k
-
-  // Ensure we only update the timestep on the first call within each rk2/rk3 step
-  if (substep_count == 0)
-    timestep = local_timestep;
-
   return timestep;
 }
 
-
-
-
-
 // Protect against the water elevation falling below the triangle bed
-double _openmp_protect(struct domain *D)
+double _openmp_protect(const struct domain *__restrict D)
 {
 
-  int64_t k, k3, K;
-  double hc, bmin;
   double mass_error = 0.;
 
-  // double *wc;
-  // double *zc;
-  // double *wv;
-  // double *xmomc;
-  // double *ymomc;
-  // double *areas;
-
-  double minimum_allowed_height;
-
-  minimum_allowed_height = D->minimum_allowed_height;
+  double minimum_allowed_height = D->minimum_allowed_height;
 
-  K = D->number_of_elements;
+  anuga_int number_of_elements = D->number_of_elements;
 
   // wc = D->stage_centroid_values;
   // zc = D->bed_centroid_values;
@@ -1124,15 +619,15 @@ double _openmp_protect(struct domain *D)
   // distance between the bed_centroid_value and the max bed_edge_value of
   // every triangle.
   // double minimum_relative_height=0.05;
-  // int64_t mass_added = 0;
+  // anuga_int mass_added = 0;
 
   // Protect against inifintesimal and negative heights
   // if (maximum_allowed_speed < epsilon) {
-#pragma omp parallel for private(k, k3, hc, bmin ) schedule(static) reduction(+ : mass_error) firstprivate (minimum_allowed_height)
-  for (k = 0; k < K; k++)
+#pragma omp parallel for schedule(static) reduction(+ : mass_error) firstprivate(minimum_allowed_height)
+  for (anuga_int k = 0; k < number_of_elements; k++)
   {
-    k3 = 3*k;
-    hc = D->stage_centroid_values[k] - D->bed_centroid_values[k];
+    anuga_int k3 = 3 * k;
+    double hc = D->stage_centroid_values[k] - D->bed_centroid_values[k];
     if (hc < minimum_allowed_height * 1.0)
     {
       // Set momentum to zero and ensure h is non negative
@@ -1140,7 +635,7 @@ double _openmp_protect(struct domain *D)
       D->xmom_centroid_values[k] = 0.;
       if (hc <= 0.0)
       {
-        bmin = D->bed_centroid_values[k];
+        double bmin = D->bed_centroid_values[k];
         // Minimum allowed stage = bmin
 
         // WARNING: ADDING MASS if wc[k]<bmin
@@ -1170,9 +665,8 @@ double _openmp_protect(struct domain *D)
   return mass_error;
 }
 
-
-static inline int64_t __find_qmin_and_qmax(double dq0, double dq1, double dq2,
-                         double *qmin, double *qmax)
+static inline anuga_int __find_qmin_and_qmax_dq1_dq2(const double dq0, const double dq1, const double dq2,
+                                                   double *qmin, double *qmax)
 {
   // Considering the centroid of an FV triangle and the vertices of its
   // auxiliary triangle, find
@@ -1192,7 +686,7 @@ static inline int64_t __find_qmin_and_qmax(double dq0, double dq1, double dq2,
   return 0;
 }
 
-static inline int64_t __limit_gradient(double *dqv, double qmin, double qmax, double beta_w)
+static inline anuga_int __limit_gradient(double *__restrict dqv, double qmin, double qmax, const double beta_w)
 {
   // Given provisional jumps dqv from the FV triangle centroid to its
   // vertices/edges, and jumps qmin (qmax) between the centroid of the FV
@@ -1201,324 +695,443 @@ static inline int64_t __limit_gradient(double *dqv, double qmin, double qmax, do
   // multiplicative factor phi by which the provisional vertex jumps are to be
   // limited
 
-  int64_t i;
-  double r = 1000.0, r0 = 1.0, phi = 1.0;
-  static double TINY = 1.0e-100; // to avoid machine accuracy problems.
-  // FIXME: Perhaps use the epsilon used elsewhere.
-
-  // Any provisional jump with magnitude < TINY does not contribute to
-  // the limiting process.
-  // return 0;
+  double r = 1000.0;
+  //#pragma omp parallel for simd reduction(min : r) default(none) shared(dqv, qmin, qmax, beta_w, TINY)
+  double dq_x = dqv[0];
+  double dq_y = dqv[1];
+  double dq_z = dqv[2];
 
-  for (i = 0; i < 3; i++)
+  if(dq_x < -TINY)
   {
-    if (dqv[i] < -TINY)
-      r0 = qmin / dqv[i];
-
-    if (dqv[i] > TINY)
-      r0 = qmax / dqv[i];
-
-    r = fmin(r0, r);
+    double r0 = qmin / dq_x;
+    r = fmin(r, r0);
+  }
+  else if (dq_x > TINY)
+  {
+    double r0 = qmax / dq_x;
+    r = fmin(r, r0);
+  }
+  if(dq_y < -TINY)
+  {
+    double r0 = qmin / dq_y;
+    r = fmin(r, r0);
+  }
+  else if (dq_y > TINY)
+  {
+    double r0 = qmax / dq_y;
+    r = fmin(r, r0);
+  }
+  if(dq_z < -TINY)
+  {
+    double r0 = qmin / dq_z;
+    r = fmin(r, r0);
   }
+  else if (dq_z > TINY)
+  {
+    double r0 = qmax / dq_z;
+    r = fmin(r, r0);
+  }
+
 
-  phi = fmin(r * beta_w, 1.0);
-  // phi=1.;
-  dqv[0] = dqv[0] * phi;
-  dqv[1] = dqv[1] * phi;
-  dqv[2] = dqv[2] * phi;
+  double phi = fmin(r * beta_w, 1.0);
 
+  for (anuga_int i = 0; i < 3; i++)
+  {
+    dqv[i] *= phi;
+  }
   return 0;
 }
 
-static inline void __calc_edge_values(double beta_tmp, double cv_k, double cv_k0, double cv_k1, double cv_k2,
-                        double dxv0, double dxv1, double dxv2, double dyv0, double dyv1, double dyv2,
-                        double dx1, double dx2, double dy1, double dy2, double inv_area2,
-                        double *edge_values)
+#pragma omp declare simd
+static inline void __calc_edge_values_with_gradient( // Gradient-limited reconstruction of three edge values around the centroid value cv_k
+    const double cv_k, const double cv_k0, const double cv_k1, const double cv_k2,
+    const double dxv0, const double dxv1, const double dxv2, const double dyv0, const double dyv1, const double dyv2,
+    const double dx1, const double dx2, const double dy1, const double dy2, const double inv_area2,
+    const double beta_tmp, double *__restrict edge_values) // beta_tmp is handed to __limit_gradient; edge_values receives the 3 results
 {
   double dqv[3];
-  double dq0, dq1, dq2;
-  double a, b;
+  double dq0 = cv_k0 - cv_k; // jump from the centroid value to auxiliary-triangle vertex 0
+  double dq1 = cv_k1 - cv_k0; // differences between the auxiliary-triangle vertices
+  double dq2 = cv_k2 - cv_k0;
+
+  double a = (dy2 * dq1 - dy1 * dq2) * inv_area2; // gradient on the auxiliary triangle: a multiplies x offsets, b multiplies y offsets
+  double b = (dx1 * dq2 - dx2 * dq1) * inv_area2;
+
+  dqv[0] = a * dxv0 + b * dyv0; // provisional jumps from the centroid, limited below
+  dqv[1] = a * dxv1 + b * dyv1;
+  dqv[2] = a * dxv2 + b * dyv2;
+
   double qmin, qmax;
+  __find_qmin_and_qmax_dq1_dq2(dq0, dq1, dq2, &qmin, &qmax);
+  __limit_gradient(dqv, qmin, qmax, beta_tmp); // scales dqv in place by the limiter factor
 
-  if (beta_tmp > 0.)
+  edge_values[0] = cv_k + dqv[0]; // edge value = centroid value + limited jump
+  edge_values[1] = cv_k + dqv[1];
+  edge_values[2] = cv_k + dqv[2];
+}
+
+#pragma omp declare simd
+static inline void __set_constant_edge_values(const double cv_k, double *edge_values) // Copies the centroid value cv_k into all three edge values
+{
+  edge_values[0] = cv_k;
+  edge_values[1] = cv_k;
+  edge_values[2] = cv_k;
+}
+
+#pragma omp declare simd
+static inline void compute_qmin_qmax_from_dq1(const double dq1, double *qmin, double *qmax) // Writes the range [min(dq1, 0), max(dq1, 0)] to *qmin and *qmax
+{
+  if (dq1 >= 0.0) // non-negative jump: range is [0, dq1]
   {
-    // Calculate the difference between vertex 0 of the auxiliary
-    // triangle and the centroid of triangle k
-    dq0 = cv_k0 - cv_k;
-
-    // Calculate differentials between the vertices
-    // of the auxiliary triangle (centroids of neighbouring triangles)
-    dq1 = cv_k1 - cv_k0;
-    dq2 = cv_k2 - cv_k0;
-
-    // Calculate the gradient of stage on the auxiliary triangle
-    a = dy2 * dq1 - dy1 * dq2;
-    a *= inv_area2;
-    b = dx1 * dq2 - dx2 * dq1;
-    b *= inv_area2;
-    // Calculate provisional jumps in stage from the centroid
-    // of triangle k to its vertices, to be limited
-    dqv[0] = a * dxv0 + b * dyv0;
-    dqv[1] = a * dxv1 + b * dyv1;
-    dqv[2] = a * dxv2 + b * dyv2;
-
-    // Now we want to find min and max of the centroid and the
-    // vertices of the auxiliary triangle and compute jumps
-    // from the centroid to the min and max
-    __find_qmin_and_qmax(dq0, dq1, dq2, &qmin, &qmax);
-
-    // Limit the gradient
-    __limit_gradient(dqv, qmin, qmax, beta_tmp);
-
-    edge_values[0] = cv_k + dqv[0];
-    edge_values[1] = cv_k + dqv[1];
-    edge_values[2] = cv_k + dqv[2];
+    *qmin = 0.0;
+    *qmax = dq1;
   }
   else
   {
-    // Fast alternative when beta_tmp==0
-    edge_values[0] = cv_k;
-    edge_values[1] = cv_k;
-    edge_values[2] = cv_k;
+    *qmin = dq1; // negative jump: range is [dq1, 0]
+    *qmax = 0.0;
   }
 }
 
-static inline void __calc_edge_values_2_bdy(double beta, double cv_k, double cv_k0, 
-                        double dxv0, double dxv1, double dxv2, double dyv0, double dyv1, double dyv2,
-                        double dx1, double dx2, double dy1, double dy2, double inv_area2,
-                        double *edge_values)
+
+static inline void update_centroid_values(struct domain *__restrict D, // Per-triangle pre-pass: stores water depth, zeroes momentum in dry cells, optionally converts momentum to velocity
+                                          const anuga_int number_of_elements,
+                                          const double minimum_allowed_height,
+                                          const anuga_int extrapolate_velocity_second_order)
 {
-  double dqv[3];
-  double dq0, dq1, dq2;
-  double a, b;
-  double qmin, qmax;
+#pragma omp parallel for simd default(none) shared(D) schedule(static) \
+    firstprivate(number_of_elements, minimum_allowed_height, extrapolate_velocity_second_order)
+  for (anuga_int k = 0; k < number_of_elements; ++k)
+  {
+    double stage = D->stage_centroid_values[k];
+    double bed   = D->bed_centroid_values[k];
+    double xmom  = D->xmom_centroid_values[k];
+    double ymom  = D->ymom_centroid_values[k];
+
+    double dk_local = fmax(stage - bed, 0.0); // water depth, clamped at zero
+    D->height_centroid_values[k] = dk_local;
+
+    anuga_int is_dry = (dk_local <= minimum_allowed_height); // at or below the threshold counts as dry
+    anuga_int extrapolate = (extrapolate_velocity_second_order == 1) & (dk_local > minimum_allowed_height); // both operands are 0/1, so bitwise & acts as logical and
+
+    // Prepare outputs branchless
+    double xmom_out = (is_dry) ? 0.0 : xmom; // dry cells get zero momentum
+    double ymom_out = (is_dry) ? 0.0 : ymom;
+
+    double inv_dk = (extrapolate) ? (1.0 / dk_local) : 1.0; // divide by depth only when converting to velocity
+
+    D->x_centroid_work[k] = (extrapolate) ? xmom_out : 0.0; // keeps the momentum when it is replaced by velocity (presumably restored by the caller later - confirm)
+    D->y_centroid_work[k] = (extrapolate) ? ymom_out : 0.0;
+    D->xmom_centroid_values[k] = xmom_out * inv_dk; // velocity when extrapolating, otherwise the (possibly zeroed) momentum
+    D->ymom_centroid_values[k] = ymom_out * inv_dk;
+  }
+}
+
+
+
+#pragma omp declare simd
+static inline void set_all_edge_values_from_centroid(struct domain *__restrict D, const anuga_int k) // Copies the centroid values of triangle k to each of its three edges
+{
+
+  const double stage = D->stage_centroid_values[k];
+  const double xmom = D->xmom_centroid_values[k];
+  const double ymom = D->ymom_centroid_values[k];
+  const double height = D->height_centroid_values[k];
 
+  for (anuga_int i = 0; i < 3; i++)
+  {
+    anuga_int ki = 3 * k + i; // linear index of edge i of triangle k
+    D->stage_edge_values[ki] = stage;
+    D->xmom_edge_values[ki] = xmom;
+    D->ymom_edge_values[ki] = ymom;
+    D->height_edge_values[ki] = height;
+    D->bed_edge_values[ki] = D->bed_centroid_values[k]; // bed is copied as well, read directly from the centroid array
+  }
+}
 
-  // Compute differentials
-  dq1 = cv_k0 - cv_k;
+#pragma omp declare simd
+static inline anuga_int get_internal_neighbour(const struct domain *__restrict D, const anuga_int k) // Returns the first surrogate neighbour of triangle k that differs from k
+{
+  for (anuga_int i = 0; i < 3; i++)
+  {
+    anuga_int n = D->surrogate_neighbours[3 * k + i];
+    if (n != k) // an entry equal to k is skipped
+    {
+      return n;
+    }
+  }
+  return -1; // Indicates failure: all three surrogate neighbours equal k
+}
 
+#pragma omp declare simd
+static inline void compute_dqv_from_gradient(const double dq1, const double dx2, const double dy2, // Fills dqv with provisional (not yet limited) jumps
+                                             const double dxv0, const double dxv1, const double dxv2,
+                                             const double dyv0, const double dyv1, const double dyv2,
+                                             double dqv[3])
+{
   // Calculate the gradient between the centroid of triangle k
   // and that of its neighbour
-  a = dq1 * dx2;
-  b = dq1 * dy2;
+  double a = dq1 * dx2; // a multiplies the x offsets dxv*, b multiplies the y offsets dyv*
+  double b = dq1 * dy2;
 
-  // Calculate provisional edge jumps, to be limited
   dqv[0] = a * dxv0 + b * dyv0;
   dqv[1] = a * dxv1 + b * dyv1;
   dqv[2] = a * dxv2 + b * dyv2;
+}
 
-  // Now limit the jumps
-  if (dq1 >= 0.0)
+#pragma omp declare simd
+static inline void compute_gradient_projection_between_centroids( // Writes the centroid offset (k -> k1) divided by its squared length to *dx2, *dy2
+    const struct domain *__restrict D, const anuga_int k, const anuga_int k1,
+    double *__restrict dx2, double *__restrict dy2)
+{
+  double x = D->centroid_coordinates[2 * k + 0]; // centroid coordinates are stored as interleaved (x, y) pairs
+  double y = D->centroid_coordinates[2 * k + 1];
+  double x1 = D->centroid_coordinates[2 * k1 + 0];
+  double y1 = D->centroid_coordinates[2 * k1 + 1];
+
+  double dx = x1 - x;
+  double dy = y1 - y;
+  double area2 = dx * dx + dy * dy; // squared distance between the two centroids
+
+  if (area2 > 0.0)
   {
-    qmin = 0.0;
-    qmax = dq1;
+    *dx2 = dx / area2;
+    *dy2 = dy / area2;
   }
   else
   {
-    qmin = dq1;
-    qmax = 0.0;
+    *dx2 = 0.0; // coincident centroids: return a zero vector instead of dividing by zero
+    *dy2 = 0.0;
   }
-
-  // Limit the gradient
-  __limit_gradient(dqv, qmin, qmax, beta);
-
-  edge_values[0] = cv_k + dqv[0];
-  edge_values[1] = cv_k + dqv[1];
-  edge_values[2] = cv_k + dqv[2];
-
 }
 
-
-
-
-
-// Computational routine
-int64_t _openmp_extrapolate_second_order_edge_sw(struct domain *D)
+#pragma omp declare simd
+static inline void extrapolate_gradient_limited( // Limited extrapolation of one quantity using the single centroid difference between k and k1
+    const double *__restrict centroid_values, double *__restrict edge_values,
+    const anuga_int k, const anuga_int k1, const anuga_int k3,
+    const double dx2, const double dy2,
+    const double dxv0, const double dxv1, const double dxv2,
+    const double dyv0, const double dyv1, const double dyv2,
+    const double beta)
 {
+  double dq1 = centroid_values[k1] - centroid_values[k]; // jump from centroid k to centroid k1
 
-  // Local variables
-  double a, b; // Gradient vector used to calculate edge values from centroids
-  int64_t k, k0, k1, k2, k3, k6, coord_index, i;
-  double x, y, x0, y0, x1, y1, x2, y2, xv0, yv0, xv1, yv1, xv2, yv2; // Vertices of the auxiliary triangle
-  double dx1, dx2, dy1, dy2, dxv0, dxv1, dxv2, dyv0, dyv1, dyv2, dq1, area2, inv_area2;
-  double dqv[3], qmin, qmax, hmin, hmax;
-  double hc, h0, h1, h2, beta_tmp, hfactor;
-  double dk, dk_inv, a_tmp, b_tmp, c_tmp, d_tmp;
-  double edge_values[3];
-  double cv_k, cv_k0, cv_k1, cv_k2;
-
-  double x_centroid_work;
-  double xmom_centroid_values;
-  double y_centroid_work;
-  double ymom_centroid_values;
+  double dqv[3];
+  compute_dqv_from_gradient(dq1, dx2, dy2, // provisional jumps, limited below
+                            dxv0, dxv1, dxv2,
+                            dyv0, dyv1, dyv2, dqv);
 
-  double minimum_allowed_height = D->minimum_allowed_height;
-  int64_t number_of_elements = D->number_of_elements;
-  int64_t extrapolate_velocity_second_order = D->extrapolate_velocity_second_order;
+  double qmin, qmax;
+  compute_qmin_qmax_from_dq1(dq1, &qmin, &qmax); // limiting range is the interval between 0 and dq1
 
+  __limit_gradient(dqv, qmin, qmax, beta); // scales dqv in place
 
-  // Parameters used to control how the limiter is forced to first-order near
-  // wet-dry regions
-  a_tmp = 0.3; // Highest depth ratio with hfactor=1
-  b_tmp = 0.1; // Highest depth ratio with hfactor=0
-  c_tmp = 1.0 / (a_tmp - b_tmp);
-  d_tmp = 1.0 - (c_tmp * a_tmp);
+  for (anuga_int i = 0; i < 3; i++)
+  {
+    edge_values[k3 + i] = centroid_values[k] + dqv[i]; // k3 is the base index into edge_values (presumably 3 * k - confirm at the call site)
+  }
+}
 
-  // Replace momentum centroid with velocity centroid to allow velocity
-  // extrapolation This will be changed back at the end of the routine
+#pragma omp declare simd
+static inline void interpolate_edges_with_beta( // Writes three edge values at edge_values[k3 .. k3 + 2] for one quantity
+    const double *__restrict centroid_values,
+    double *__restrict edge_values,
+    const anuga_int k, const anuga_int k0, const anuga_int k1, const anuga_int k2, const anuga_int k3,
+    const double dxv0, const double dxv1, const double dxv2,
+    const double dyv0, const double dyv1, const double dyv2,
+    const double dx1, const double dx2, const double dy1, const double dy2,
+    const double inv_area2,
+    const double beta_dry, const double beta_wet, const double hfactor)
+{
+  double beta = beta_dry + (beta_wet - beta_dry) * hfactor; // linear blend: hfactor = 0 gives beta_dry, hfactor = 1 gives beta_wet
 
-  // Need to calculate height xmom and ymom centroid values for all triangles 
-  // before extrapolation and limiting
+  double edge_vals[3];
+  if (beta > 0.0) // positive beta: gradient-limited reconstruction
+  {
+    __calc_edge_values_with_gradient(
+        centroid_values[k],
+        centroid_values[k0],
+        centroid_values[k1],
+        centroid_values[k2],
+        dxv0, dxv1, dxv2,
+        dyv0, dyv1, dyv2,
+        dx1, dx2, dy1, dy2,
+        inv_area2,
+        beta,
+        edge_vals);
+  }
+  else // otherwise all three edges take the centroid value
+  {
+    __set_constant_edge_values(centroid_values[k], edge_vals);
+  }
+  for (anuga_int i = 0; i < 3; i++)
+  {
+    edge_values[k3 + i] = edge_vals[i]; // copy the local result into the output array
+  }
+}
 
-#pragma omp parallel for simd shared(D) default(none) schedule(static) private(dk, dk_inv) firstprivate(number_of_elements, minimum_allowed_height, extrapolate_velocity_second_order)
-    for (k = 0; k < number_of_elements; k++)
-    {
-    dk = fmax(D->stage_centroid_values[k] - D->bed_centroid_values[k], 0.0);
+#pragma omp declare simd
+static inline void compute_hfactor_and_inv_area( // Computes the depth-based factor *hfactor (kept within [0, 1]) and *inv_area2 = 1 / area2
+    const struct domain *__restrict D,
+    const anuga_int k, const anuga_int k0, const anuga_int k1, const anuga_int k2,
+    const double area2, const double c_tmp, const double d_tmp,
+    double *__restrict hfactor, double *__restrict inv_area2)
+{
+  double hc = D->height_centroid_values[k]; // depth at triangle k and at the three triangles k0, k1, k2
+  double h0 = D->height_centroid_values[k0];
+  double h1 = D->height_centroid_values[k1];
+  double h2 = D->height_centroid_values[k2];
 
-    D->height_centroid_values[k] = dk;
-    D->x_centroid_work[k] = 0.0;
-    D->y_centroid_work[k] = 0.0;
+  double hmin = fmin(fmin(h0, fmin(h1, h2)), hc); // smallest and largest of the four depths
+  double hmax = fmax(fmax(h0, fmax(h1, h2)), hc);
 
-    if (dk <= minimum_allowed_height)
-      {
-        D->x_centroid_work[k] = 0.0;
-        D->xmom_centroid_values[k] = 0.0;
-        D->y_centroid_work[k] = 0.0;
-        D->ymom_centroid_values[k] = 0.0;
-      }
+  double tmp1 = c_tmp * fmax(hmin, 0.0) / fmax(hc, 1.0e-06) + d_tmp; // linear in the ratio hmin / hc; denominator floored at 1e-6
+  double tmp2 = c_tmp * fmax(hc, 0.0) / fmax(hmax, 1.0e-06) + d_tmp; // linear in the ratio hc / hmax; denominator floored at 1e-6
 
-    if (extrapolate_velocity_second_order == 1)
-    {
-      if (dk > minimum_allowed_height)
-      {
-        dk_inv = 1.0 / dk;
-        D->x_centroid_work[k] = D->xmom_centroid_values[k];
-        D->xmom_centroid_values[k] = D->xmom_centroid_values[k] * dk_inv;
+  *hfactor = fmax(0.0, fmin(tmp1, fmin(tmp2, 1.0))); // smallest of tmp1, tmp2 and 1, clamped below at 0
 
-        D->y_centroid_work[k] = D->ymom_centroid_values[k];
-        D->ymom_centroid_values[k] = D->ymom_centroid_values[k] * dk_inv;
-      }
-    }
-    } // end of for
+  // Smooth shutoff near dry areas
+  *hfactor = fmin(1.2 * fmax(hmin - D->minimum_allowed_height, 0.0) / // evaluates to 0 once hmin <= minimum_allowed_height
+                      (fmax(hmin, 0.0) + D->minimum_allowed_height),
+                  *hfactor);
 
+  *inv_area2 = 1.0 / area2; // no zero check on area2 is done here
+}
 
+#pragma omp declare simd
+static inline void reconstruct_vertex_values(double *__restrict edge_values, double *__restrict vertex_values, const anuga_int k3)
+{ // Writes vertex_values[k3 .. k3 + 2] from edge_values[k3 .. k3 + 2]; k3 is the base index of one triangle's three entries.
+  vertex_values[k3 + 0] = edge_values[k3 + 1] + edge_values[k3 + 2] - edge_values[k3 + 0]; // vertex i = sum of the other two edge values minus edge value i
+  vertex_values[k3 + 1] = edge_values[k3 + 2] + edge_values[k3 + 0] - edge_values[k3 + 1];
+  vertex_values[k3 + 2] = edge_values[k3 + 0] + edge_values[k3 + 1] - edge_values[k3 + 2];
+}
 
-  // Begin extrapolation routine
+#pragma omp declare simd
+static inline void compute_edge_diffs(const double x, const double y, // reference point (the caller in this file passes the triangle centroid)
+                                      const double xv0, const double yv0, // point 0 (the caller in this file passes edge coordinates)
+                                      const double xv1, const double yv1, // point 1
+                                      const double xv2, const double yv2, // point 2
+                                      double *__restrict dxv0, double *__restrict dxv1, double *__restrict dxv2, // out: x-offsets of points 0..2 from x
+                                      double *__restrict dyv0, double *__restrict dyv1, double *__restrict dyv2) // out: y-offsets of points 0..2 from y
+{ // Stores the offset of each of the three points relative to (x, y).
+  *dxv0 = xv0 - x;
+  *dxv1 = xv1 - x;
+  *dxv2 = xv2 - x;
+  *dyv0 = yv0 - y;
+  *dyv1 = yv1 - y;
+  *dyv2 = yv2 - y;
+}
 
-#pragma omp parallel for simd private(k0, k1, k2, k3, k6, coord_index, i, \
-                          dx1, dx2, dy1, dy2, dxv0, dxv1, dxv2, dyv0, dyv1, dyv2, \
-                          x_centroid_work, xmom_centroid_values, y_centroid_work, ymom_centroid_values, \
-                          dq1, area2, inv_area2, \
-                          cv_k, cv_k0, cv_k1, cv_k2, edge_values, \
-                          x, y, x0, y0, x1, y1, x2, y2, xv0, yv0, xv1, yv1, xv2, yv2, \
-                          dqv, qmin, qmax, hmin, hmax, \
-                          hc, h0, h1, h2, beta_tmp, hfactor, \
-                          dk, dk_inv, a, b) default(none) shared(D) schedule(static) \
-                          firstprivate(number_of_elements, minimum_allowed_height, extrapolate_velocity_second_order, c_tmp, d_tmp)
-  for (k = 0; k < number_of_elements; k++)
-  {
+// Computational routine
+// Extrapolate second order edge values from centroid values
+// This is the current procedure used in evolve loop.
+void _openmp_extrapolate_second_order_edge_sw(struct domain *__restrict D)
+{
+  double minimum_allowed_height = D->minimum_allowed_height;
+  anuga_int number_of_elements = D->number_of_elements;
+  anuga_int extrapolate_velocity_second_order = D->extrapolate_velocity_second_order;
 
-    //printf("%ld, %e \n",k, D->height_centroid_values[k]);
-    //printf("%ld,  %e, %e, %e, %e \n",k, x_centroid_work,xmom_centroid_values,y_centroid_work,ymom_centroid_values);
-    //printf("%ld,  %e, %e, %e, %e \n",k, D->x_centroid_work[k],D->xmom_centroid_values[k],D->y_centroid_work[k],D->ymom_centroid_values[k]);
+  // Parameters used to control how the limiter is forced to first-order near
+  // wet-dry regions
+  double a_tmp = 0.3; // Highest depth ratio with hfactor=1
+  double b_tmp = 0.1; // Highest depth ratio with hfactor=0
+  double c_tmp = 1.0 / (a_tmp - b_tmp);
+  double d_tmp = 1.0 - (c_tmp * a_tmp);
 
+  update_centroid_values(D, number_of_elements, minimum_allowed_height, extrapolate_velocity_second_order);
 
-    // Useful indices
-    k2 = k * 2;
-    k3 = k * 3;
-    k6 = k * 6;
+#pragma omp parallel for simd default(none) schedule(static) \
+    shared(D)                                                 \
+    firstprivate(number_of_elements, minimum_allowed_height, extrapolate_velocity_second_order, c_tmp, d_tmp)
+  for (anuga_int k = 0; k < number_of_elements; k++)
+  {
+    // Useful indices
+    anuga_int k2 = k * 2;
+    anuga_int k3 = k * 3;
+    anuga_int k6 = k * 6;
 
     // Get the edge coordinates
-    xv0 = D->edge_coordinates[k6 + 0];
-    yv0 = D->edge_coordinates[k6 + 1];
-    xv1 = D->edge_coordinates[k6 + 2];
-    yv1 = D->edge_coordinates[k6 + 3];
-    xv2 = D->edge_coordinates[k6 + 4];
-    yv2 = D->edge_coordinates[k6 + 5];
+    const double xv0 = D->edge_coordinates[k6 + 0];
+    const double yv0 = D->edge_coordinates[k6 + 1];
+    const double xv1 = D->edge_coordinates[k6 + 2];
+    const double yv1 = D->edge_coordinates[k6 + 3];
+    const double xv2 = D->edge_coordinates[k6 + 4];
+    const double yv2 = D->edge_coordinates[k6 + 5];
 
     // Get the centroid coordinates
-    x = D->centroid_coordinates[k2 + 0];
-    y = D->centroid_coordinates[k2 + 1];
-
-    // Store x- and y- differentials for the edges of
-    // triangle k relative to the centroid
-    dxv0 = xv0 - x;
-    dxv1 = xv1 - x;
-    dxv2 = xv2 - x;
-    dyv0 = yv0 - y;
-    dyv1 = yv1 - y;
-    dyv2 = yv2 - y;
-
-    // If no boundaries, auxiliary triangle is formed
-    // from the centroids of the three neighbours
-    // If one boundary, auxiliary triangle is formed
-    // from this centroid and its two neighbours
-
-    k0 = D->surrogate_neighbours[k3 + 0];
-    k1 = D->surrogate_neighbours[k3 + 1];
+    const double x = D->centroid_coordinates[k2 + 0];
+    const double y = D->centroid_coordinates[k2 + 1];
+
+    // needed in the boundaries section
+    double dxv0, dxv1, dxv2;
+    double dyv0, dyv1, dyv2;
+    compute_edge_diffs(x, y,
+                       xv0, yv0,
+                       xv1, yv1,
+                       xv2, yv2,
+                       &dxv0, &dxv1, &dxv2,
+                       &dyv0, &dyv1, &dyv2);
+    // dxv0 = dxv0;
+    // dxv1 = dxv1;
+    // dxv2 = dxv2;
+    // dyv0 = dyv0;
+    // dyv1 = dyv1;
+    // dyv2 = dyv2;
+
+    anuga_int k0 = D->surrogate_neighbours[k3 + 0];
+    anuga_int k1 = D->surrogate_neighbours[k3 + 1];
     k2 = D->surrogate_neighbours[k3 + 2];
 
-    // Get the auxiliary triangle's vertex coordinates
-    // (normally the centroids of neighbouring triangles)
-    coord_index = 2 * k0;
-    x0 = D->centroid_coordinates[coord_index + 0];
-    y0 = D->centroid_coordinates[coord_index + 1];
+    anuga_int coord_index = 2 * k0;
+    double x0 = D->centroid_coordinates[coord_index + 0];
+    double y0 = D->centroid_coordinates[coord_index + 1];
 
     coord_index = 2 * k1;
-    x1 = D->centroid_coordinates[coord_index + 0];
-    y1 = D->centroid_coordinates[coord_index + 1];
+    double x1 = D->centroid_coordinates[coord_index + 0];
+    double y1 = D->centroid_coordinates[coord_index + 1];
 
     coord_index = 2 * k2;
-    x2 = D->centroid_coordinates[coord_index + 0];
-    y2 = D->centroid_coordinates[coord_index + 1];
-
-    // Store x- and y- differentials for the vertices
-    // of the auxiliary triangle
-    dx1 = x1 - x0;
-    dx2 = x2 - x0;
-    dy1 = y1 - y0;
-    dy2 = y2 - y0;
-
-    // Calculate 2*area of the auxiliary triangle
-    // The triangle is guaranteed to be counter-clockwise
-    area2 = dy2 * dx1 - dy1 * dx2;
-
-    if (((D->height_centroid_values[k0] < minimum_allowed_height) | (k0 == k)) &
+    double x2 = D->centroid_coordinates[coord_index + 0];
+    double y2 = D->centroid_coordinates[coord_index + 1];
+
+    // needed in the boundaries section
+    double dx1 = x1 - x0;
+    double dx2 = x2 - x0;
+    double dy1 = y1 - y0;
+    double dy2 = y2 - y0;
+    // dx1 = dx1;
+    // dx2 = dx2;
+    // dy1 = dy1;
+    // dy2 = dy2;
+    // needed in the boundaries section
+    double area2 = dy2 * dx1 - dy1 * dx2;
+    // area2 = area2;
+    // NOTE: dx1, dx2, dy1, dy2 and area2 could be calculated once and stored
+    // in the domain structure.
+
+
+    const anuga_int dry =
+        ((D->height_centroid_values[k0] < minimum_allowed_height) | (k0 == k)) &
         ((D->height_centroid_values[k1] < minimum_allowed_height) | (k1 == k)) &
-        ((D->height_centroid_values[k2] < minimum_allowed_height) | (k2 == k)))
+        ((D->height_centroid_values[k2] < minimum_allowed_height) | (k2 == k));
+
+    if (dry)
     {
-      // printf("Surrounded by dry cells\n");
-      D->x_centroid_work[k] = 0.;
-      D->xmom_centroid_values[k] = 0.;
-      D->y_centroid_work[k] = 0.;
-      D->ymom_centroid_values[k] = 0.;
+      D->x_centroid_work[k] = 0.0;
+      D->xmom_centroid_values[k] = 0.0;
+      D->y_centroid_work[k] = 0.0;
+      D->ymom_centroid_values[k] = 0.0;
     }
 
-    // Limit the edge values
+    // int k0 = D->surrogate_neighbours[k3 + 0];
+    // int k1 = D->surrogate_neighbours[k3 + 1];
+    // k2 = D->surrogate_neighbours[k3 + 2];
+
     if (D->number_of_boundaries[k] == 3)
     {
       // Very unlikely
-      // No neighbours, set gradient on the triangle to zero
-
-      //printf("%ld 3 boundaries\n",k);
-
-      D->stage_edge_values[k3 + 0] = D->stage_centroid_values[k];
-      D->stage_edge_values[k3 + 1] = D->stage_centroid_values[k];
-      D->stage_edge_values[k3 + 2] = D->stage_centroid_values[k];
-
-      D->xmom_edge_values[k3 + 0] = D->xmom_centroid_values[k];
-      D->xmom_edge_values[k3 + 1] = D->xmom_centroid_values[k];
-      D->xmom_edge_values[k3 + 2] = D->xmom_centroid_values[k];
-
-      D->ymom_edge_values[k3 + 0] = D->ymom_centroid_values[k];
-      D->ymom_edge_values[k3 + 1] = D->ymom_centroid_values[k];
-      D->ymom_edge_values[k3 + 2] = D->ymom_centroid_values[k];
-
-      dk = D->height_centroid_values[k];
-      D->height_edge_values[k3 + 0] = dk;
-      D->height_edge_values[k3 + 1] = dk;
-      D->height_edge_values[k3 + 2] = dk;
-
+      // No neighbours, set gradient on the triangle to zero
+      set_all_edge_values_from_centroid(D, k);
     }
     else if (D->number_of_boundaries[k] <= 1)
     {
@@ -1526,525 +1139,431 @@ int64_t _openmp_extrapolate_second_order_edge_sw(struct domain *D)
       // Number of boundaries <= 1
       // 'Typical case'
       //==============================================
-      //printf("%ld boundaries <= 1\n",k);
-
-      // Calculate heights of neighbouring cells
-      hc = D->height_centroid_values[k];
-      h0 = D->height_centroid_values[k0];
-      h1 = D->height_centroid_values[k1];
-      h2 = D->height_centroid_values[k2];
-
-      hmin = fmin(fmin(h0, fmin(h1, h2)), hc);
-      hmax = fmax(fmax(h0, fmax(h1, h2)), hc);
-
-      // Look for strong changes in cell depth as an indicator of near-wet-dry
-      // Reduce hfactor linearly from 1-0 between depth ratio (hmin/hc) of [a_tmp , b_tmp]
-      // NOTE: If we have a more 'second order' treatment in near dry areas (e.g. with b_tmp being negative), then
-      //       the water tends to dry more rapidly (which is in agreement with analytical results),
-      //       but is also more 'artefacty' in important cases (tendency for high velocities, etc).
-      //
-      // So hfactor = depth_ratio*(c_tmp) + d_tmp, but is clipped between 0 and 1.
-      hfactor = fmax(0., fmin(c_tmp * fmax(hmin, 0.0) / fmax(hc, 1.0e-06) + d_tmp,
-                              fmin(c_tmp * fmax(hc, 0.) / fmax(hmax, 1.0e-06) + d_tmp, 1.0)));
-      // Set hfactor to zero smothly as hmin--> minimum_allowed_height. This
-      // avoids some 'chatter' for very shallow flows
-      hfactor = fmin(1.2 * fmax(hmin - D->minimum_allowed_height, 0.) / (fmax(hmin, 0.) + 1. * D->minimum_allowed_height), hfactor);
-
-      inv_area2 = 1.0 / area2;
-
-      //-----------------------------------
+      double hfactor, inv_area2;
+      compute_hfactor_and_inv_area(D, k, k0, k1, k2, area2, c_tmp, d_tmp, &hfactor, &inv_area2);
       // stage
-      //-----------------------------------
-      beta_tmp = D->beta_w_dry + (D->beta_w - D->beta_w_dry) * hfactor;
-
-      cv_k  = D->stage_centroid_values[k];
-      cv_k0 = D->stage_centroid_values[k0];
-      cv_k1 = D->stage_centroid_values[k1];
-      cv_k2 = D->stage_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->stage_edge_values[k3 + 0] = edge_values[0];
-      D->stage_edge_values[k3 + 1] = edge_values[1];
-      D->stage_edge_values[k3 + 2] = edge_values[2];  
-
-      //-----------------------------------
+      interpolate_edges_with_beta(D->stage_centroid_values, D->stage_edge_values,
+                                  k, k0, k1, k2, k3,
+                                  dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
+                                  dx1, dx2, dy1, dy2, inv_area2,
+                                  D->beta_w_dry, D->beta_w, hfactor);
       // height
-      //-----------------------------------
-
-      cv_k  = D->height_centroid_values[k];
-      cv_k0 = D->height_centroid_values[k0];
-      cv_k1 = D->height_centroid_values[k1];
-      cv_k2 = D->height_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->height_edge_values[k3 + 0] = edge_values[0];
-      D->height_edge_values[k3 + 1] = edge_values[1];
-      D->height_edge_values[k3 + 2] = edge_values[2]; 
-
-    
-      //-----------------------------------
-      // xmomentum
-      //-----------------------------------
-
-      beta_tmp = D->beta_uh_dry + (D->beta_uh - D->beta_uh_dry) * hfactor;
-
-      cv_k  = D->xmom_centroid_values[k];
-      cv_k0 = D->xmom_centroid_values[k0];
-      cv_k1 = D->xmom_centroid_values[k1];
-      cv_k2 = D->xmom_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->xmom_edge_values[k3 + 0] = edge_values[0];
-      D->xmom_edge_values[k3 + 1] = edge_values[1];
-      D->xmom_edge_values[k3 + 2] = edge_values[2]; 
-
-      //-----------------------------------
-      // ymomentum
-      //-----------------------------------
-
-      beta_tmp = D->beta_vh_dry + (D->beta_vh - D->beta_vh_dry) * hfactor;
-
-      cv_k  = D->ymom_centroid_values[k];
-      cv_k0 = D->ymom_centroid_values[k0];
-      cv_k1 = D->ymom_centroid_values[k1];
-      cv_k2 = D->ymom_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->ymom_edge_values[k3 + 0] = edge_values[0];
-      D->ymom_edge_values[k3 + 1] = edge_values[1];
-      D->ymom_edge_values[k3 + 2] = edge_values[2]; 
+      interpolate_edges_with_beta(D->height_centroid_values, D->height_edge_values,
+                                  k, k0, k1, k2, k3,
+                                  dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
+                                  dx1, dx2, dy1, dy2, inv_area2,
+                                  D->beta_w_dry, D->beta_w, hfactor);
+      // xmom
+      interpolate_edges_with_beta(D->xmom_centroid_values, D->xmom_edge_values,
+                                  k, k0, k1, k2, k3,
+                                  dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
+                                  dx1, dx2, dy1, dy2, inv_area2,
+                                  D->beta_uh_dry, D->beta_uh, hfactor);
+      // ymom
+      interpolate_edges_with_beta(D->ymom_centroid_values, D->ymom_edge_values,
+                                  k, k0, k1, k2, k3,
+                                  dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
+                                  dx1, dx2, dy1, dy2, inv_area2,
+                                  D->beta_vh_dry, D->beta_vh, hfactor);
 
     } // End number_of_boundaries <=1
     else
     {
-      //printf("%ld 2 boundaries\n",k);
       //==============================================
-      // Number of boundaries == 2
+      //  Number of boundaries == 2
       //==============================================
-
       // One internal neighbour and gradient is in direction of the neighbour's centroid
-
       // Find the only internal neighbour (k1?)
-      for (k2 = k3; k2 < k3 + 3; k2++)
-      {
-        // Find internal neighbour of triangle k
-        // k2 indexes the edges of triangle k
-
-        if (D->surrogate_neighbours[k2] != k)
-        {
-          break;
-        }
-      }
-
-      // if ((k2 == k3 + 3))
-      // {
-      //   // If we didn't find an internal neighbour
-      //   // report_python_error(AT, "Internal neighbour not found");
-      //   return -1;
-      // }
-
-      k1 = D->surrogate_neighbours[k2];
-
-      // The coordinates of the triangle are already (x,y).
-      // Get centroid of the neighbour (x1,y1)
-      coord_index = 2 * k1;
-      x1 = D->centroid_coordinates[coord_index + 0];
-      y1 = D->centroid_coordinates[coord_index + 1];
-
-      // Compute x- and y- distances between the centroid of
-      // triangle k and that of its neighbour
-      dx1 = x1 - x;
-      dy1 = y1 - y;
-
-      // Set area2 as the square of the distance
-      area2 = dx1 * dx1 + dy1 * dy1;
-
-      // Set dx2=(x1-x0)/((x1-x0)^2+(y1-y0)^2)
-      // and dy2=(y1-y0)/((x1-x0)^2+(y1-y0)^2) which
-      // respectively correspond to the x- and y- gradients
-      // of the conserved quantities
-      dx2 = 1.0 / area2;
-      dy2 = dx2 * dy1;
-      dx2 *= dx1;
-
-      //-----------------------------------
+      k1 = get_internal_neighbour(D, k);
+      compute_gradient_projection_between_centroids(D, k, k1, &dx2, &dy2);
       // stage
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->stage_centroid_values[k1] - D->stage_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
+      extrapolate_gradient_limited(D->stage_centroid_values, D->stage_edge_values,
+                                   k, k1, k3, dx2, dy2,
+                                   dxv0, dxv1, dxv2,
+                                   dyv0, dyv1, dyv2, D->beta_w);
+      // height
+      extrapolate_gradient_limited(D->height_centroid_values, D->height_edge_values,
+                                   k, k1, k3, dx2, dy2,
+                                   dxv0, dxv1, dxv2,
+                                   dyv0, dyv1, dyv2, D->beta_w);
+      // xmom
+      extrapolate_gradient_limited(D->xmom_centroid_values, D->xmom_edge_values,
+                                   k, k1, k3, dx2, dy2,
+                                   dxv0, dxv1, dxv2,
+                                   dyv0, dyv1, dyv2, D->beta_w);
+      // ymom
+      extrapolate_gradient_limited(D->ymom_centroid_values, D->ymom_edge_values,
+                                   k, k1, k3, dx2, dy2,
+                                   dxv0, dxv1, dxv2,
+                                   dyv0, dyv1, dyv2, D->beta_w);
 
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
+    } // else [number_of_boundaries]
 
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
+    // If needed, convert from velocity to momenta
+    if (D->extrapolate_velocity_second_order == 1)
+    {
+      // Re-compute momenta at edges
+      for (anuga_int i = 0; i < 3; i++)
       {
-        qmin = dq1;
-        qmax = 0.0;
+        double dk = D->height_edge_values[k3 + i];
+        D->xmom_edge_values[k3 + i] = D->xmom_edge_values[k3 + i] * dk;
+        D->ymom_edge_values[k3 + i] = D->ymom_edge_values[k3 + i] * dk;
       }
+    }
 
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      D->stage_edge_values[k3 + 0] = D->stage_centroid_values[k] + dqv[0];
-      D->stage_edge_values[k3 + 1] = D->stage_centroid_values[k] + dqv[1];
-      D->stage_edge_values[k3 + 2] = D->stage_centroid_values[k] + dqv[2];
-
-      //-----------------------------------
-      // height
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->height_centroid_values[k1] - D->height_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
+    for (anuga_int i = 0; i < 3; i++)
+    {
+      D->bed_edge_values[k3 + i] = D->stage_edge_values[k3 + i] - D->height_edge_values[k3 + i];
+    }
 
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
+    // This should not be needed, as now the evolve loop should just depend
+    // on the edge values, which are reconstructed from the centroid values
+    // reconstruct_vertex_values(D->stage_edge_values, D->stage_vertex_values, k3);
+    // reconstruct_vertex_values(D->height_edge_values, D->height_vertex_values, k3);
+    // reconstruct_vertex_values(D->xmom_edge_values, D->xmom_vertex_values, k3);
+    // reconstruct_vertex_values(D->ymom_edge_values, D->ymom_vertex_values, k3);
+    // reconstruct_vertex_values(D->bed_edge_values, D->bed_vertex_values, k3);
+  }
+  // for k=0 to number_of_elements-1
+// Restore the xmom and ymom centroid values from the x/y centroid work arrays
+if(extrapolate_velocity_second_order == 1)
+{
+#pragma omp parallel for simd schedule(static) firstprivate(extrapolate_velocity_second_order)
+  for (anuga_int k = 0; k < D->number_of_elements; k++)
+  {
+      // Convert velocity back to momenta at centroids
+      D->xmom_centroid_values[k] = D->x_centroid_work[k];
+      D->ymom_centroid_values[k] = D->y_centroid_work[k];
+  }
+}
 
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
+}
 
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
+void _openmp_distribute_edges_to_vertices(struct domain *__restrict D)
+{
+  // Rebuild the stage, height, xmom, ymom and bed vertex values of every triangle from its edge values
+  anuga_int number_of_elements = D->number_of_elements;
 
-      D->height_edge_values[k3 + 0] = D->height_centroid_values[k] + dqv[0];
-      D->height_edge_values[k3 + 1] = D->height_centroid_values[k] + dqv[1];
-      D->height_edge_values[k3 + 2] = D->height_centroid_values[k] + dqv[2];
+#pragma omp parallel for simd default(none) shared(D) schedule(static) firstprivate(number_of_elements)
+  for (anuga_int k = 0; k < number_of_elements; k++)
+  {
+    anuga_int k3 = 3 * k; // base index of triangle k's three edge/vertex entries
+
+    // Each call fills the three vertex values of one quantity for triangle k
+    reconstruct_vertex_values(D->stage_edge_values, D->stage_vertex_values, k3);
+    reconstruct_vertex_values(D->height_edge_values, D->height_vertex_values, k3);
+    reconstruct_vertex_values(D->xmom_edge_values, D->xmom_vertex_values, k3);
+    reconstruct_vertex_values(D->ymom_edge_values, D->ymom_vertex_values, k3);
+    reconstruct_vertex_values(D->bed_edge_values, D->bed_vertex_values, k3);
+  
+  }
+}
 
-      //-----------------------------------
-      // xmomentum
-      //-----------------------------------
+void _openmp_manning_friction_flat_semi_implicit(const struct domain *__restrict D)
+{ // Adds the friction term S * (uh, vh) to the semi-implicit momentum updates of every cell, using the centroid depth only.
 
-      // Compute differentials
-      dq1 = D->xmom_centroid_values[k1] - D->xmom_centroid_values[k];
+  anuga_int k;
 
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
+  const anuga_int N = D->number_of_elements;
+  const double eps = D->minimum_allowed_height; // depth below which no friction is applied
+  const double g = D->g;
+  const double seven_thirds = 7.0 / 3.0;
 
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
+ 
+#pragma omp parallel for simd default(none) shared(D) schedule(static) \
+        firstprivate(N, eps, g, seven_thirds)
 
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
+  for (k = 0; k < N; k++)
+  {
+    double S = 0.0; // friction factor; stays 0 when eta or h is below its threshold
+    double h;
+    double uh = D->xmom_centroid_values[k];
+    double vh = D->ymom_centroid_values[k];
+    double eta = D->friction_centroid_values[k]; // presumably Manning's roughness coefficient - TODO confirm
+    double abs_mom = sqrt( uh*uh + vh*vh ); // magnitude of the momentum vector (uh, vh)
+
+    if (eta > 1.0e-15)
+    {
+      h = D->stage_centroid_values[k] - D->bed_centroid_values[k]; // depth = stage - bed at the centroid
+      if (h >= eps)
+       {
+        S = -g * eta * eta * abs_mom;
+        S /= pow(h, seven_thirds); // S = -g * eta^2 * |(uh, vh)| / h^(7/3)
+       }
       }
+    D->xmom_semi_implicit_update[k] += S * D->xmom_centroid_values[k];
+    D->ymom_semi_implicit_update[k] += S * D->ymom_centroid_values[k];
+  }
+}
 
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
 
-      D->xmom_edge_values[k3 + 0] = D->xmom_centroid_values[k] + dqv[0];
-      D->xmom_edge_values[k3 + 1] = D->xmom_centroid_values[k] + dqv[1];
-      D->xmom_edge_values[k3 + 2] = D->xmom_centroid_values[k] + dqv[2];
 
-      //-----------------------------------
-      // ymomentum
-      //-----------------------------------
 
-      // Compute differentials
-      dq1 = D->ymom_centroid_values[k1] - D->ymom_centroid_values[k];
+    
 
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
+void _openmp_manning_friction_sloped_semi_implicit(const struct domain *__restrict D)
+{ // Adds the friction term S * (uh, vh) to the semi-implicit momentum updates of every cell, with S scaled by a bed-slope factor from the bed vertex values.
+  anuga_int k;
+  const double one_third = 1.0 / 3.0;
+  const double seven_thirds = 7.0 / 3.0;
+
+  anuga_int N = D->number_of_elements;
+  const double  g = D->g;
+  const double  eps = D->minimum_allowed_height; // depth below which no friction is applied
+  
+#pragma omp parallel for simd default(none) shared(D) schedule(static) \
+        firstprivate(N, eps, g, seven_thirds, one_third)
+for (k = 0; k < N; k++)
+  {
+    double S, h, z, z0, z1, z2, zs, zx, zy;
+    double x0, y0, x1, y1, x2, y2;
+    anuga_int k3, k6;
 
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
+    double w = D->stage_centroid_values[k];
+    double uh = D->xmom_centroid_values[k];
+    double vh = D->ymom_centroid_values[k];
+    double eta = D->friction_centroid_values[k]; // presumably Manning's roughness coefficient - TODO confirm
 
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
+    S = 0.0; // friction factor; stays 0 when eta or h is below its threshold
+    k3 = 3 * k; // base index of triangle k's three vertex values
+    
+    // Get bathymetry: bed values at the three vertices of triangle k
+    z0 = D->bed_vertex_values[k3 + 0];
+    z1 = D->bed_vertex_values[k3 + 1];
+    z2 = D->bed_vertex_values[k3 + 2];
 
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
+    // Compute bed slope from the vertex coordinates and the bed values read above
+    k6 = 6 * k; // base index of triangle k's six vertex coordinates (x0, y0, x1, y1, x2, y2)
 
-      D->ymom_edge_values[k3 + 0] = D->ymom_centroid_values[k] + dqv[0];
-      D->ymom_edge_values[k3 + 1] = D->ymom_centroid_values[k] + dqv[1];
-      D->ymom_edge_values[k3 + 2] = D->ymom_centroid_values[k] + dqv[2];
+    
+    x0 = D->vertex_coordinates[k6 + 0];
+    y0 = D->vertex_coordinates[k6 + 1];
+    x1 = D->vertex_coordinates[k6 + 2];
+    y1 = D->vertex_coordinates[k6 + 3];
+    x2 = D->vertex_coordinates[k6 + 4];
+    y2 = D->vertex_coordinates[k6 + 5];
 
-    } // else [number_of_boundaries]
+    
+    if (eta > 1.0e-16)
+    {
+      _gradient(x0, y0, x1, y1, x2, y2, z0, z1, z2, &zx, &zy); // NOTE(review): defined elsewhere; presumably stores the bed slope components in zx, zy - confirm
 
-  // printf("%ld, bed    %e, %e, %e\n",k, D->bed_edge_values[k3],D->bed_edge_values[k3 + 1],D->bed_edge_values[k3 + 2] );
-  // printf("%ld, stage  %e, %e, %e\n",k, D->stage_edge_values[k3],D->stage_edge_values[k3 + 1],D->stage_edge_values[k3 + 2] );
-  // printf("%ld, height %e, %e, %e\n",k, D->height_edge_values[k3],D->height_edge_values[k3 + 1],D->height_edge_values[k3 + 2] );
-  // printf("%ld, xmom   %e, %e, %e\n",k, D->xmom_edge_values[k3],D->xmom_edge_values[k3 + 1],D->xmom_edge_values[k3 + 2] );
-  // printf("%ld, ymom   %e, %e, %e\n",k, D->ymom_edge_values[k3],D->ymom_edge_values[k3 + 1],D->ymom_edge_values[k3 + 2] );
+      zs = sqrt(1.0 + zx * zx + zy * zy); // slope factor that multiplies S below
+      z = (z0 + z1 + z2) * one_third; // mean of the three bed values
 
-    // If needed, convert from velocity to momenta
-    if (D->extrapolate_velocity_second_order == 1)
-    {
-      // Re-compute momenta at edges
-      for (i = 0; i < 3; i++)
+      h = w - z; // depth = centroid stage - mean bed value
+      if (h >= eps)
       {
-        dk = D->height_edge_values[k3 + i];
-        D->xmom_edge_values[k3 + i] = D->xmom_edge_values[k3 + i] * dk;
-        D->ymom_edge_values[k3 + i] = D->ymom_edge_values[k3 + i] * dk;
+        S = -g*eta*eta*zs * sqrt((uh*uh + vh*vh));
+        S /= pow(h, seven_thirds); // S = -g * eta^2 * zs * |(uh, vh)| / h^(7/3)
       }
     }
+    D->xmom_semi_implicit_update[k] += S * uh;
+    D->ymom_semi_implicit_update[k] += S * vh;
+  }
+}
 
-    // Compute new bed elevation
-    D->bed_edge_values[k3 + 0] = D->stage_edge_values[k3 + 0] - D->height_edge_values[k3 + 0];
-    D->bed_edge_values[k3 + 1] = D->stage_edge_values[k3 + 1] - D->height_edge_values[k3 + 1];
-    D->bed_edge_values[k3 + 2] = D->stage_edge_values[k3 + 2] - D->height_edge_values[k3 + 2];
-
-
-    // FIXME SR: Do we need vertex values every inner timestep?
+void _openmp_manning_friction_sloped_semi_implicit_edge_based(const struct domain *__restrict D)
+{ // Same computation as _openmp_manning_friction_sloped_semi_implicit, but reads bed_edge_values and edge_coordinates instead of the vertex arrays.
+  anuga_int k;
+  const double one_third = 1.0 / 3.0;
+  const double seven_thirds = 7.0 / 3.0;
+
+  anuga_int N = D->number_of_elements;
+  const double  g = D->g;
+  const double  eps = D->minimum_allowed_height; // depth below which no friction is applied
+  
+#pragma omp parallel for simd default(none) shared(D) schedule(static) \
+        firstprivate(N, eps, g, seven_thirds, one_third)
+for (k = 0; k < N; k++)
+  {
+    double S, h, z, z0, z1, z2, zs, zx, zy;
+    double x0, y0, x1, y1, x2, y2;
+    anuga_int k3, k6;
 
-    // Compute stage vertex values
-    D->stage_vertex_values[k3 + 0] = D->stage_edge_values[k3 + 1] + D->stage_edge_values[k3 + 2] - D->stage_edge_values[k3 + 0];
-    D->stage_vertex_values[k3 + 1] = D->stage_edge_values[k3 + 0] + D->stage_edge_values[k3 + 2] - D->stage_edge_values[k3 + 1];
-    D->stage_vertex_values[k3 + 2] = D->stage_edge_values[k3 + 0] + D->stage_edge_values[k3 + 1] - D->stage_edge_values[k3 + 2];
+    double w = D->stage_centroid_values[k];
+    double uh = D->xmom_centroid_values[k];
+    double vh = D->ymom_centroid_values[k];
+    double eta = D->friction_centroid_values[k]; // presumably Manning's roughness coefficient - TODO confirm
 
-    // Compute height vertex values
-    D->height_vertex_values[k3 + 0] = D->height_edge_values[k3 + 1] + D->height_edge_values[k3 + 2] - D->height_edge_values[k3 + 0];
-    D->height_vertex_values[k3 + 1] = D->height_edge_values[k3 + 0] + D->height_edge_values[k3 + 2] - D->height_edge_values[k3 + 1];
-    D->height_vertex_values[k3 + 2] = D->height_edge_values[k3 + 0] + D->height_edge_values[k3 + 1] - D->height_edge_values[k3 + 2];
+    S = 0.0; // friction factor; stays 0 when eta or h is below its threshold
+    k3 = 3 * k; // base index of triangle k's three edge values
+    
+    // Get bathymetry: bed values at the three edges of triangle k
+    z0 = D->bed_edge_values[k3 + 0];
+    z1 = D->bed_edge_values[k3 + 1];
+    z2 = D->bed_edge_values[k3 + 2];
 
-    // Compute momenta at vertices
-    D->xmom_vertex_values[k3 + 0] = D->xmom_edge_values[k3 + 1] + D->xmom_edge_values[k3 + 2] - D->xmom_edge_values[k3 + 0];
-    D->xmom_vertex_values[k3 + 1] = D->xmom_edge_values[k3 + 0] + D->xmom_edge_values[k3 + 2] - D->xmom_edge_values[k3 + 1];
-    D->xmom_vertex_values[k3 + 2] = D->xmom_edge_values[k3 + 0] + D->xmom_edge_values[k3 + 1] - D->xmom_edge_values[k3 + 2];
+    // Compute bed slope from the edge coordinates and the bed values read above
+    k6 = 6 * k; // base index of triangle k's six edge coordinates (x0, y0, x1, y1, x2, y2)
 
-    D->ymom_vertex_values[k3 + 0] = D->ymom_edge_values[k3 + 1] + D->ymom_edge_values[k3 + 2] - D->ymom_edge_values[k3 + 0];
-    D->ymom_vertex_values[k3 + 1] = D->ymom_edge_values[k3 + 0] + D->ymom_edge_values[k3 + 2] - D->ymom_edge_values[k3 + 1];
-    D->ymom_vertex_values[k3 + 2] = D->ymom_edge_values[k3 + 0] + D->ymom_edge_values[k3 + 1] - D->ymom_edge_values[k3 + 2];
+    
+    x0 = D->edge_coordinates[k6 + 0];
+    y0 = D->edge_coordinates[k6 + 1];
+    x1 = D->edge_coordinates[k6 + 2];
+    y1 = D->edge_coordinates[k6 + 3];
+    x2 = D->edge_coordinates[k6 + 4];
+    y2 = D->edge_coordinates[k6 + 5];
 
     
-    D->bed_vertex_values[k3 + 0] = D->bed_edge_values[k3 + 1] + D->bed_edge_values[k3 + 2] - D->bed_edge_values[k3 + 0];
-    D->bed_vertex_values[k3 + 1] = D->bed_edge_values[k3 + 0] + D->bed_edge_values[k3 + 2] - D->bed_edge_values[k3 + 1];
-    D->bed_vertex_values[k3 + 2] = D->bed_edge_values[k3 + 0] + D->bed_edge_values[k3 + 1] - D->bed_edge_values[k3 + 2];
+    if (eta > 1.0e-16)
+    {
+      _gradient(x0, y0, x1, y1, x2, y2, z0, z1, z2, &zx, &zy); // NOTE(review): defined elsewhere; presumably stores the bed slope components in zx, zy - confirm
 
+      zs = sqrt(1.0 + zx * zx + zy * zy); // slope factor that multiplies S below
+      z = (z0 + z1 + z2) * one_third; // mean of the three bed values
 
+      h = w - z; // depth = centroid stage - mean bed value
+      if (h >= eps)
+      {
+        S = -g*eta*eta*zs * sqrt((uh*uh + vh*vh));
+        S /= pow(h, seven_thirds); // S = -g * eta^2 * zs * |(uh, vh)| / h^(7/3)
+      }
+    }
+    D->xmom_semi_implicit_update[k] += S * uh;
+    D->ymom_semi_implicit_update[k] += S * vh;
+  }
+}
 
-  }   // for k=0 to number_of_elements-1
+// Manning friction, flat bed: xmom_update/ymom_update += S * (uh, vh), S = -g * eta^2 * |(uh, vh)| / h^(7/3), h = w - z_centroid
+void _openmp_manning_friction_flat(const double g, const double eps, const anuga_int N,
+                                   double *__restrict w, double *__restrict z_centroid,
+                                   double *__restrict uh, double *__restrict vh,
+                                   double *__restrict eta, double *__restrict xmom_update, double *__restrict ymom_update)
+{
 
-// Fix xmom and ymom centroid values
-#pragma omp parallel for simd schedule(static) private(k3, i, dk) firstprivate(extrapolate_velocity_second_order)
-  for (k = 0; k < D->number_of_elements; k++)
+  anuga_int k;
+  const double seven_thirds = 7.0 / 3.0;
+  // Every iteration reads and writes index k only; the working variables are declared inside the loop body
+#pragma omp parallel for schedule(static) firstprivate(eps, g, seven_thirds)
+  for (k = 0; k < N; k++)
   {
-    if (extrapolate_velocity_second_order == 1)
+    double S, h, z, abs_mom;
+    abs_mom = sqrt((uh[k] * uh[k] + vh[k] * vh[k]));
+    S = 0.0;
+    // S stays 0 (updates unchanged) unless eta[k] > eps and the depth h >= eps
+    if (eta[k] > eps)
     {
-      // Convert velocity back to momenta at centroids
-      D->xmom_centroid_values[k] = D->x_centroid_work[k];
-      D->ymom_centroid_values[k] = D->y_centroid_work[k];
+      z = z_centroid[k];
+      h = w[k] - z;
+      if (h >= eps)
+      {
+        S = -g * eta[k] * eta[k] * abs_mom;
+        S /= pow(h, seven_thirds); 
+      }
     }
-
-    // // Compute stage vertex values
-    // D->stage_vertex_values[k3] = D->stage_edge_values[k3 + 1] + D->stage_edge_values[k3 + 2] - D->stage_edge_values[k3];
-    // D->stage_vertex_values[k3 + 1] = D->stage_edge_values[k3] + D->stage_edge_values[k3 + 2] - D->stage_edge_values[k3 + 1];
-    // D->stage_vertex_values[k3 + 2] = D->stage_edge_values[k3] + D->stage_edge_values[k3 + 1] - D->stage_edge_values[k3 + 2];
-
-    // // Compute height vertex values
-    // D->height_vertex_values[k3] = D->height_edge_values[k3 + 1] + D->height_edge_values[k3 + 2] - D->height_edge_values[k3];
-    // D->height_vertex_values[k3 + 1] = D->height_edge_values[k3] + D->height_edge_values[k3 + 2] - D->height_edge_values[k3 + 1];
-    // D->height_vertex_values[k3 + 2] = D->height_edge_values[k3] + D->height_edge_values[k3 + 1] - D->height_edge_values[k3 + 2];
-
-    // // If needed, convert from velocity to momenta
-    // if (D->extrapolate_velocity_second_order == 1)
-    // {
-    //   // Re-compute momenta at edges
-    //   for (i = 0; i < 3; i++)
-    //   {
-    //     dk = D->height_edge_values[k3 + i];
-    //     D->xmom_edge_values[k3 + i] = D->xmom_edge_values[k3 + i] * dk;
-    //     D->ymom_edge_values[k3 + i] = D->ymom_edge_values[k3 + i] * dk;
-    //   }
-    // }
-
-    // // Compute momenta at vertices
-    // D->xmom_vertex_values[k3 + 0] = D->xmom_edge_values[k3 + 1] + D->xmom_edge_values[k3 + 2] - D->xmom_edge_values[k3 + 0];
-    // D->xmom_vertex_values[k3 + 1] = D->xmom_edge_values[k3 + 0] + D->xmom_edge_values[k3 + 2] - D->xmom_edge_values[k3 + 1];
-    // D->xmom_vertex_values[k3 + 2] = D->xmom_edge_values[k3 + 0] + D->xmom_edge_values[k3 + 1] - D->xmom_edge_values[k3 + 2];
-
-    // D->ymom_vertex_values[k3 + 0] = D->ymom_edge_values[k3 + 1] + D->ymom_edge_values[k3 + 2] - D->ymom_edge_values[k3 + 0];
-    // D->ymom_vertex_values[k3 + 1] = D->ymom_edge_values[k3 + 0] + D->ymom_edge_values[k3 + 2] - D->ymom_edge_values[k3 + 1];
-    // D->ymom_vertex_values[k3 + 2] = D->ymom_edge_values[k3 + 0] + D->ymom_edge_values[k3 + 1] - D->ymom_edge_values[k3 + 2];
-
-    // // Compute new bed elevation
-    // D->bed_edge_values[k3 + 0] = D->stage_edge_values[k3 + 0] - D->height_edge_values[k3 + 0];
-    // D->bed_edge_values[k3 + 1] = D->stage_edge_values[k3 + 1] - D->height_edge_values[k3 + 1];
-    // D->bed_edge_values[k3 + 2] = D->stage_edge_values[k3 + 2] - D->height_edge_values[k3 + 2];
-
-    // D->bed_vertex_values[k3 + 0] = D->bed_edge_values[k3 + 1] + D->bed_edge_values[k3 + 2] - D->bed_edge_values[k3 + 0];
-    // D->bed_vertex_values[k3 + 1] = D->bed_edge_values[k3 + 0] + D->bed_edge_values[k3 + 2] - D->bed_edge_values[k3 + 1];
-    // D->bed_vertex_values[k3 + 2] = D->bed_edge_values[k3 + 0] + D->bed_edge_values[k3 + 1] - D->bed_edge_values[k3 + 2];
-
+    xmom_update[k] += S * uh[k];
+    ymom_update[k] += S * vh[k];
   }
-
-  return 0;
 }
 
-void _openmp_manning_friction_flat(double g, double eps, int64_t N,
-  double* w, double* zv,
-  double* uh, double* vh,
-  double* eta, double* xmom_update, double* ymom_update) {
-
-int64_t k, k3;
-double S, h, z, abs_mom;
-const double one_third = 1.0/3.0; 
-const double seven_thirds = 7.0/3.0;
 
-#pragma omp parallel for schedule(static) private(k,k3,z,h,S) firstprivate(eps,g,seven_thirds)
-for (k = 0; k < N; k++) {
-  abs_mom = sqrt((uh[k] * uh[k] + vh[k] * vh[k]));
-  S = 0.0;
+void _openmp_manning_friction_sloped(const double g, const double eps, const anuga_int N,
+                                     double *__restrict x_vertex, double *__restrict w, double *__restrict z_vertex,
+                                     double *__restrict uh, double *__restrict vh,
+                                     double *__restrict eta, double *__restrict xmom_update, double *__restrict ymom_update)
+{ // Manning friction with bed-slope factor zs: xmom_update/ymom_update += S * (uh, vh) for each of the N triangles
 
-  if (eta[k] > eps) {
-      z = zv[k];
-      h = w[k] - z;
-      if (h >= eps) {
-          S = -g * eta[k] * eta[k] * abs_mom;
-          S /= pow(h, seven_thirds); //Expensive (on Ole's home computer)
-          //S /= exp((7.0/3.0)*log(h));      //seems to save about 15% over manning_friction
-          //S /= h*h*(1 + h/3.0 - h*h/9.0); //FIXME: Could use a Taylor expansion
+  const double one_third = 1.0 / 3.0; // averages z0, z1, z2 into the mean bed elevation z
+  const double seven_thirds = 7.0 / 3.0; // power of the depth h that S is divided by
 
+#pragma omp parallel for schedule(static) firstprivate(eps, g, one_third, seven_thirds)
+  for (anuga_int k = 0; k < N; k++)
+  {
+    double S = 0.0;
+    anuga_int k3 = 3 * k;
+    // Get bathymetry: three bed values per triangle, starting at index 3 * k
+    double z0 = z_vertex[k3 + 0];
+    double z1 = z_vertex[k3 + 1];
+    double z2 = z_vertex[k3 + 2];
+
+    // Compute bed slope from the three points and their bed values
+    anuga_int k6 = 6 * k; // base index
+    // x_vertex stores six values per triangle, read as (x0, y0, x1, y1, x2, y2)
+    double x0 = x_vertex[k6 + 0];
+    double y0 = x_vertex[k6 + 1];
+    double x1 = x_vertex[k6 + 2];
+    double y1 = x_vertex[k6 + 3];
+    double x2 = x_vertex[k6 + 4];
+    double y2 = x_vertex[k6 + 5];
+    // S = -g * eta^2 * zs * |(uh, vh)| / h^(7/3) where eta[k] > eps and h >= eps; otherwise S stays 0
+    if (eta[k] > eps)
+    {
+      double zx, zy, zs, z, h;
+      _gradient(x0, y0, x1, y1, x2, y2, z0, z1, z2, &zx, &zy);
 
-          //Update momentum
-          
+      zs = sqrt(1.0 + zx * zx + zy * zy);
+      z = (z0 + z1 + z2) * one_third;
+      h = w[k] - z;
+      if (h >= eps)
+      {
+        S = -g * eta[k] * eta[k] * zs * sqrt((uh[k] * uh[k] + vh[k] * vh[k]));
+        S /= pow(h, seven_thirds); 
       }
+    }
+    xmom_update[k] += S * uh[k];
+    ymom_update[k] += S * vh[k];
   }
-  xmom_update[k] += S * uh[k];
-  ymom_update[k] += S * vh[k];
-}
 }
 
-void _openmp_manning_friction_sloped(double g, double eps, int64_t N,
-  double* x, double* w, double* zv,
-  double* uh, double* vh,
-  double* eta, double* xmom_update, double* ymom_update) {
-
-int64_t k, k3, k6;
-double S, h, z, z0, z1, z2, zs, zx, zy;
-double x0, y0, x1, y1, x2, y2;
-const double one_third = 1.0/3.0; 
-const double seven_thirds = 7.0/3.0;
-
-#pragma omp parallel for schedule(static) private(k,k3,z0,z1,z2,x0,y0,x1,y1,x2,y2,zs,zx,zy,h,S) firstprivate(eps,g,one_third,seven_thirds)
-for (k = 0; k < N; k++) {
-  S = 0.0;
-  k3 = 3 * k;
-  // Get bathymetry
-  z0 = zv[k3 + 0];
-  z1 = zv[k3 + 1];
-  z2 = zv[k3 + 2];
-
-  // Compute bed slope
-  k6 = 6 * k; // base index
-
-  x0 = x[k6 + 0];
-  y0 = x[k6 + 1];
-  x1 = x[k6 + 2];
-  y1 = x[k6 + 3];
-  x2 = x[k6 + 4];
-  y2 = x[k6 + 5]; 
-
-  if (eta[k] > eps) {
+void _openmp_manning_friction_sloped_edge_based(const double g, const double eps, const anuga_int N,
+                                     double *__restrict x_edge, double *__restrict w, double *__restrict z_edge,
+                                     double *__restrict uh, double *__restrict vh,
+                                     double *__restrict eta, double *__restrict xmom_update, double *__restrict ymom_update)
+{
+  // Manning friction with bed-slope factor zs, slope taken from x_edge/z_edge: xmom_update/ymom_update += S * (uh, vh)
+  const double one_third = 1.0 / 3.0;
+  const double seven_thirds = 7.0 / 3.0;
+
+#pragma omp parallel for schedule(static) firstprivate(eps, g, one_third, seven_thirds)
+  for (anuga_int k = 0; k < N; k++)
+  {
+    double S = 0.0;
+    anuga_int k3 = 3 * k;
+    // Get bathymetry: three bed values per triangle, starting at index 3 * k
+    double z0 = z_edge[k3 + 0];
+    double z1 = z_edge[k3 + 1];
+    double z2 = z_edge[k3 + 2];
+
+    // Compute bed slope from the three points and their bed values
+    anuga_int k6 = 6 * k; // base index
+    // x_edge stores six values per triangle, read as (x0, y0, x1, y1, x2, y2)
+    double x0 = x_edge[k6 + 0];
+    double y0 = x_edge[k6 + 1];
+    double x1 = x_edge[k6 + 2];
+    double y1 = x_edge[k6 + 3];
+    double x2 = x_edge[k6 + 4];
+    double y2 = x_edge[k6 + 5];
+    // S = -g * eta^2 * zs * |(uh, vh)| / h^(7/3) where eta[k] > eps and h >= eps; otherwise S stays 0
+    if (eta[k] > eps)
+    {
+      double zx, zy, zs, z, h;
       _gradient(x0, y0, x1, y1, x2, y2, z0, z1, z2, &zx, &zy);
 
       zs = sqrt(1.0 + zx * zx + zy * zy);
       z = (z0 + z1 + z2) * one_third;
       h = w[k] - z;
-      if (h >= eps) {
-          S = -g * eta[k] * eta[k] * zs * sqrt((uh[k] * uh[k] + vh[k] * vh[k]));
-          S /= pow(h, seven_thirds); //Expensive (on Ole's home computer)
-          //S /= exp((7.0/3.0)*log(h));      //seems to save about 15% over manning_friction
-          //S /= h*h*(1 + h/3.0 - h*h/9.0); //FIXME: Could use a Taylor expansion
+      if (h >= eps)
+      {
+        S = -g * eta[k] * eta[k] * zs * sqrt((uh[k] * uh[k] + vh[k] * vh[k]));
+        S /= pow(h, seven_thirds); 
       }
+    }
+    xmom_update[k] += S * uh[k];
+    ymom_update[k] += S * vh[k];
   }
-  xmom_update[k] += S * uh[k];
-  ymom_update[k] += S * vh[k];
-}
 }
 
+
 // Computational function for flux computation
-int64_t _openmp_fix_negative_cells(struct domain *D)
+anuga_int _openmp_fix_negative_cells(const struct domain *__restrict D)
 {
-  int64_t k;
-  int64_t tff;
-  int64_t num_negative_cells = 0;
+  anuga_int num_negative_cells = 0;
 
-  #pragma omp parallel for schedule(static) private(k, tff) reduction(+:num_negative_cells)
-  for (k = 0; k < D->number_of_elements; k++)
+#pragma omp parallel for schedule(static) reduction(+ : num_negative_cells)
+  for (anuga_int k = 0; k < D->number_of_elements; k++)
   {
-    tff = D->tri_full_flag[k];
-    if ((D->stage_centroid_values[k] - D->bed_centroid_values[k] < 0.0) & (tff > 0)) 
+    if ((D->stage_centroid_values[k] - D->bed_centroid_values[k] < 0.0) & (D->tri_full_flag[k] > 0))
     {
       num_negative_cells = num_negative_cells + 1;
       D->stage_centroid_values[k] = D->bed_centroid_values[k];
@@ -2054,3 +1573,960 @@ int64_t _openmp_fix_negative_cells(struct domain *D)
   }
   return num_negative_cells;
 }
+
+
+// Gravity source term: for each triangle add -g * h * (dz/dx, dz/dy) to the
+// explicit momentum updates, with h the centroid depth (stage - bed) and the
+// bed slope from _gradient() on vertex values. Serial loop; always returns 0.
+anuga_int _openmp_gravity(const struct domain *__restrict D) {
+
+    const double grav = D->g;
+    const anuga_int n_tri = D->number_of_elements;
+
+    // Read-only per-triangle arrays
+    const double *bed_v = D->bed_vertex_values;
+    const double *xy_v = D->vertex_coordinates;
+    const double *stage_c = D->stage_centroid_values;
+    const double *bed_c = D->bed_centroid_values;
+
+    for (anuga_int tri = 0; tri < n_tri; tri++) {
+        const anuga_int v = 3 * tri;  // first of 3 vertex values
+        const anuga_int c = 6 * tri;  // first of 6 coordinates (x0, y0, x1, y1, x2, y2)
+        double bed_dx, bed_dy;
+
+        // Bathymetry at the three vertices
+        const double zb0 = bed_v[v + 0];
+        const double zb1 = bed_v[v + 1];
+        const double zb2 = bed_v[v + 2];
+
+        // Average depth from centroid values
+        const double depth = stage_c[tri] - bed_c[tri];
+
+        // Triangle vertex coordinates
+        const double ax = xy_v[c + 0];
+        const double ay = xy_v[c + 1];
+        const double bx = xy_v[c + 2];
+        const double by = xy_v[c + 3];
+        const double cx = xy_v[c + 4];
+        const double cy = xy_v[c + 5];
+
+        // Bed slope over the triangle
+        _gradient(ax, ay, bx, by, cx, cy, zb0, zb1, zb2, &bed_dx, &bed_dy);
+
+        // Update momentum
+        (D->xmom_explicit_update)[tri] += -grav * bed_dx * depth;
+        (D->ymom_explicit_update)[tri] += -grav * bed_dy * depth;
+    }
+    return 0;
+}
+
+// Well-balanced gravity term. For each triangle, two contributions are added
+// to the explicit x/y momentum updates:
+//   1. -g * h * grad(w), where grad(w) comes from _gradient() on the vertex
+//      stage values and h is the centroid depth (stage - bed);
+//   2. -side / area, where side = sum_i(-0.5 * g * h_i^2 * l_i * n_i) over the
+//      three edges (h_i edge depth, l_i edge length, n_i entry of D->normals).
+// The loop is serial (no OpenMP pragma). Always returns 0.
+anuga_int _openmp_gravity_wb(const struct domain *__restrict D) {
+
+    const double grav = D->g;
+    const anuga_int n_tri = D->number_of_elements;
+
+    // Read-only per-triangle arrays used for the gradient term
+    const double *stage_v = D->stage_vertex_values;
+    const double *xy_v = D->vertex_coordinates;
+    const double *stage_c = D->stage_centroid_values;
+    const double *bed_c = D->bed_centroid_values;
+
+    for (anuga_int tri = 0; tri < n_tri; tri++) {
+        const anuga_int e = 3 * tri;  // first of 3 vertex/edge values
+        const anuga_int c = 6 * tri;  // first of 6 coordinate/normal components
+        double stage_dx, stage_dy;
+        double depth_e[3];
+        double side_x = 0.0;
+        double side_y = 0.0;
+
+        //------------------------------------
+        // Calculate the -g h grad(w) term
+        //------------------------------------
+
+        // Vertex stage values for the gradient calculation
+        const double ws0 = stage_v[e + 0];
+        const double ws1 = stage_v[e + 1];
+        const double ws2 = stage_v[e + 2];
+
+        // Triangle vertex coordinates
+        const double ax = xy_v[c + 0];
+        const double ay = xy_v[c + 1];
+        const double bx = xy_v[c + 2];
+        const double by = xy_v[c + 3];
+        const double cx = xy_v[c + 4];
+        const double cy = xy_v[c + 5];
+
+        // Stage slope over the triangle
+        _gradient(ax, ay, bx, by, cx, cy, ws0, ws1, ws2, &stage_dx, &stage_dy);
+
+        // Average depth from centroid values
+        const double depth = stage_c[tri] - bed_c[tri];
+        // Update using the -g h grad(w) term
+        (D->xmom_explicit_update)[tri] += -grav * stage_dx * depth;
+        (D->ymom_explicit_update)[tri] += -grav * stage_dy * depth;
+
+        //------------------------------------
+        // Calculate side terms \sum_i 0.5 g l_i h_i^2 n_i
+        //------------------------------------
+
+        // Edge depths (edge stage minus edge bed)
+        for (anuga_int edge = 0; edge < 3; edge++) {
+            depth_e[edge] = (D->stage_edge_values)[e + edge] - (D->bed_edge_values)[e + edge];
+        }
+
+        // Accumulate the side correction over the three edges
+        for (anuga_int edge = 0; edge < 3; edge++) {
+            const double nx = (D->normals)[c + 2 * edge];
+            const double ny = (D->normals)[c + 2 * edge + 1];
+            const double weight = -0.5 * grav * depth_e[edge] * depth_e[edge] * (D->edgelengths)[e + edge];
+            side_x += weight * nx;
+            side_y += weight * ny;
+        }
+
+        // Update momentum with side terms
+        const double area = (D->areas)[tri];
+        (D->xmom_explicit_update)[tri] += -side_x / area;
+        (D->ymom_explicit_update)[tri] += -side_y / area;
+    }
+    return 0;
+}
+
+
+// Old function for extrapolating second order edge values from centroid values
+// This function is now replaced by _openmp_extrapolate_second_order_edge_sw
+// which uses SIMD and OpenMP for parallelization
+// This function is kept for reference and compatibility
+void _openmp_extrapolate_second_order_sw(const struct domain *__restrict D) {
+
+
+  // Domain Variables
+    anuga_int number_of_elements;
+    double epsilon;
+    double minimum_allowed_height;
+    double beta_w;
+    double beta_w_dry;
+    double beta_uh;
+    double beta_uh_dry;
+    double beta_vh;
+    double beta_vh_dry;
+    anuga_int* surrogate_neighbours;
+    anuga_int* number_of_boundaries;
+    double* centroid_coordinates;
+    double* stage_centroid_values;
+    double* xmom_centroid_values;
+    double* ymom_centroid_values;
+    double* bed_centroid_values;
+    double* vertex_coordinates;
+    double* stage_vertex_values;
+    double* xmom_vertex_values;
+    double* ymom_vertex_values;
+    double* bed_vertex_values;
+    anuga_int optimise_dry_cells;
+    anuga_int extrapolate_velocity_second_order;
+
+    // Local variables
+    double a, b; // Gradient vector used to calculate edge values from centroids
+    anuga_int k, k0, k1, k2, k3, k6, coord_index, i;
+    double x, y, x0, y0, x1, y1, x2, y2, xv0, yv0, xv1, yv1, xv2, yv2; // Vertices of the auxiliary triangle
+    double dx1, dx2, dy1, dy2, dxv0, dxv1, dxv2, dyv0, dyv1, dyv2, dq0, dq1, dq2, area2, inv_area2;
+    double dqv[3], qmin, qmax, hmin, hmax;
+    double hc, h0, h1, h2, beta_tmp, hfactor;
+    //double dk, dv0, dv1, dv2, de[3], demin, dcmax, r0scale;
+    double dk, dv0, dv1, dv2;
+
+    double *xmom_centroid_store;
+    double *ymom_centroid_store;
+    //double *stage_centroid_store;
+
+
+    // Associate memory locations of Domain variables with local aliases
+    number_of_elements     = D->number_of_elements;
+    epsilon                = D->epsilon;
+    minimum_allowed_height = D->minimum_allowed_height;
+    beta_w                 = D->beta_w;
+    beta_w_dry             = D->beta_w_dry;
+    beta_uh                = D->beta_uh;
+    beta_uh_dry            = D->beta_uh_dry;
+    beta_vh                = D->beta_vh;
+    beta_vh_dry            = D->beta_vh_dry;
+    optimise_dry_cells     = D->optimise_dry_cells;
+
+    extrapolate_velocity_second_order = D->extrapolate_velocity_second_order;
+
+    surrogate_neighbours      = D->surrogate_neighbours;
+    number_of_boundaries      = D->number_of_boundaries;
+    centroid_coordinates      = D->centroid_coordinates;
+    stage_centroid_values     = D->stage_centroid_values;
+    xmom_centroid_values      = D->xmom_centroid_values;
+    ymom_centroid_values      = D->ymom_centroid_values;
+    bed_centroid_values       = D->bed_centroid_values;
+    vertex_coordinates        = D->vertex_coordinates;
+    stage_vertex_values       = D->stage_vertex_values;
+    xmom_vertex_values        = D->xmom_vertex_values;
+    ymom_vertex_values        = D->ymom_vertex_values;
+    bed_vertex_values         = D->bed_vertex_values;
+
+
+
+
+/*
+anuga_int _extrapolate_second_order_sw(anuga_int number_of_elements,
+        double epsilon,
+        double minimum_allowed_height,
+        double beta_w,
+        double beta_w_dry,
+        double beta_uh,
+        double beta_uh_dry,
+        double beta_vh,
+        double beta_vh_dry,
+        anuga_int* surrogate_neighbours,
+        anuga_int* number_of_boundaries,
+        double* centroid_coordinates,
+        double* stage_centroid_values,
+        double* xmom_centroid_values,
+        double* ymom_centroid_values,
+        double* elevation_centroid_values,
+        double* vertex_coordinates,
+        double* stage_vertex_values,
+        double* xmom_vertex_values,
+        double* ymom_vertex_values,
+        double* elevation_vertex_values,
+        anuga_int optimise_dry_cells,
+        anuga_int extrapolate_velocity_second_order) {
+
+
+
+    // Local variables
+    double a, b; // Gradient vector used to calculate vertex values from centroids
+    anuga_int k, k0, k1, k2, k3, k6, coord_index, i;
+    double x, y, x0, y0, x1, y1, x2, y2, xv0, yv0, xv1, yv1, xv2, yv2; // Vertices of the auxiliary triangle
+    double dx1, dx2, dy1, dy2, dxv0, dxv1, dxv2, dyv0, dyv1, dyv2, dq0, dq1, dq2, area2, inv_area2;
+    double dqv[3], qmin, qmax, hmin, hmax;
+    double hc, h0, h1, h2, beta_tmp, hfactor;
+    double xmom_centroid_store[number_of_elements], ymom_centroid_store[number_of_elements], dk, dv0, dv1, dv2;
+*/
+
+   // Use malloc to avoid putting these variables on the stack, which can cause
+   // segfaults in large model runs
+    xmom_centroid_store = malloc(number_of_elements*sizeof(double));
+    ymom_centroid_store = malloc(number_of_elements*sizeof(double));
+    // stage_centroid_store = malloc(number_of_elements*sizeof(double));
+
+    if (extrapolate_velocity_second_order == 1) {
+        // Replace momentum centroid with velocity centroid to allow velocity
+        // extrapolation This will be changed back at the end of the routine
+        for (k = 0; k < number_of_elements; k++) {
+
+            dk = fmax(stage_centroid_values[k] - bed_centroid_values[k], minimum_allowed_height);
+            xmom_centroid_store[k] = xmom_centroid_values[k];
+            xmom_centroid_values[k] = xmom_centroid_values[k] / dk;
+
+            ymom_centroid_store[k] = ymom_centroid_values[k];
+            ymom_centroid_values[k] = ymom_centroid_values[k] / dk;
+        }
+    }
+
+    // Begin extrapolation routine
+    for (k = 0; k < number_of_elements; k++) {
+        k3 = k * 3;
+        k6 = k * 6;
+
+        if (number_of_boundaries[k] == 3) {
+            // No neighbours, set gradient on the triangle to zero
+
+            stage_vertex_values[k3] = stage_centroid_values[k];
+            stage_vertex_values[k3 + 1] = stage_centroid_values[k];
+            stage_vertex_values[k3 + 2] = stage_centroid_values[k];
+            xmom_vertex_values[k3] = xmom_centroid_values[k];
+            xmom_vertex_values[k3 + 1] = xmom_centroid_values[k];
+            xmom_vertex_values[k3 + 2] = xmom_centroid_values[k];
+            ymom_vertex_values[k3] = ymom_centroid_values[k];
+            ymom_vertex_values[k3 + 1] = ymom_centroid_values[k];
+            ymom_vertex_values[k3 + 2] = ymom_centroid_values[k];
+
+            continue;
+        } else {
+            // Triangle k has one or more neighbours.
+            // Get centroid and vertex coordinates of the triangle
+
+            // Get the vertex coordinates
+            xv0 = vertex_coordinates[k6];
+            yv0 = vertex_coordinates[k6 + 1];
+            xv1 = vertex_coordinates[k6 + 2];
+            yv1 = vertex_coordinates[k6 + 3];
+            xv2 = vertex_coordinates[k6 + 4];
+            yv2 = vertex_coordinates[k6 + 5];
+
+            // Get the centroid coordinates
+            coord_index = 2 * k;
+            x = centroid_coordinates[coord_index];
+            y = centroid_coordinates[coord_index + 1];
+
+            // Store x- and y- differentials for the vertices of
+            // triangle k relative to the centroid
+            dxv0 = xv0 - x;
+            dxv1 = xv1 - x;
+            dxv2 = xv2 - x;
+            dyv0 = yv0 - y;
+            dyv1 = yv1 - y;
+            dyv2 = yv2 - y;
+        }
+
+
+
+
+        if (number_of_boundaries[k] <= 1) {
+            //==============================================
+            // Number of boundaries <= 1
+            //==============================================
+
+
+            // If no boundaries, auxiliary triangle is formed
+            // from the centroids of the three neighbours
+            // If one boundary, auxiliary triangle is formed
+            // from this centroid and its two neighbours
+
+            k0 = surrogate_neighbours[k3];
+            k1 = surrogate_neighbours[k3 + 1];
+            k2 = surrogate_neighbours[k3 + 2];
+
+            // Get the auxiliary triangle's vertex coordinates
+            // (really the centroids of neighbouring triangles)
+            coord_index = 2 * k0;
+            x0 = centroid_coordinates[coord_index];
+            y0 = centroid_coordinates[coord_index + 1];
+
+            coord_index = 2 * k1;
+            x1 = centroid_coordinates[coord_index];
+            y1 = centroid_coordinates[coord_index + 1];
+
+            coord_index = 2 * k2;
+            x2 = centroid_coordinates[coord_index];
+            y2 = centroid_coordinates[coord_index + 1];
+
+            // Store x- and y- differentials for the vertices
+            // of the auxiliary triangle
+            dx1 = x1 - x0;
+            dx2 = x2 - x0;
+            dy1 = y1 - y0;
+            dy2 = y2 - y0;
+
+            // Calculate 2*area of the auxiliary triangle
+            // The triangle is guaranteed to be counter-clockwise
+            area2 = dy2 * dx1 - dy1*dx2;
+
+            // If the mesh is 'weird' near the boundary,
+            // the triangle might be flat or clockwise
+            // Default to zero gradient
+            if (area2 <= 0) {
+                //printf("Error negative triangle area \n");
+                //return -1;
+
+                stage_vertex_values[k3] = stage_centroid_values[k];
+                stage_vertex_values[k3 + 1] = stage_centroid_values[k];
+                stage_vertex_values[k3 + 2] = stage_centroid_values[k];
+                xmom_vertex_values[k3] = xmom_centroid_values[k];
+                xmom_vertex_values[k3 + 1] = xmom_centroid_values[k];
+                xmom_vertex_values[k3 + 2] = xmom_centroid_values[k];
+                ymom_vertex_values[k3] = ymom_centroid_values[k];
+                ymom_vertex_values[k3 + 1] = ymom_centroid_values[k];
+                ymom_vertex_values[k3 + 2] = ymom_centroid_values[k];
+
+                continue;
+            }
+
+            // Calculate heights of neighbouring cells
+            hc = stage_centroid_values[k] - bed_centroid_values[k];
+            h0 = stage_centroid_values[k0] - bed_centroid_values[k0];
+            h1 = stage_centroid_values[k1] - bed_centroid_values[k1];
+            h2 = stage_centroid_values[k2] - bed_centroid_values[k2];
+            hmin = fmax(fmax(h0, fmax(h1, h2)), hc);
+            //hfactor = hc/(hc + 1.0);
+
+            hfactor = 0.0;
+            if (hmin > 0.001) {
+                hfactor = (hmin - 0.001) / (hmin + 0.004);
+            }
+
+            if (optimise_dry_cells) {
+                // Check if linear reconstruction is necessary for triangle k
+                // This check will exclude dry cells.
+
+                hmax = fmax(h0, fmax(h1, h2));
+                if (hmax < epsilon) {
+                    continue;
+                }
+            }
+
+            //-----------------------------------
+            // stage
+            //-----------------------------------
+
+            // Calculate the difference between vertex 0 of the auxiliary
+            // triangle and the centroid of triangle k
+            dq0 = stage_centroid_values[k0] - stage_centroid_values[k];
+
+            // Calculate differentials between the vertices
+            // of the auxiliary triangle (centroids of neighbouring triangles)
+            dq1 = stage_centroid_values[k1] - stage_centroid_values[k0];
+            dq2 = stage_centroid_values[k2] - stage_centroid_values[k0];
+
+            inv_area2 = 1.0 / area2;
+            // Calculate the gradient of stage on the auxiliary triangle
+            a = dy2 * dq1 - dy1*dq2;
+            a *= inv_area2;
+            b = dx1 * dq2 - dx2*dq1;
+            b *= inv_area2;
+
+            // Calculate provisional jumps in stage from the centroid
+            // of triangle k to its vertices, to be limited
+            dqv[0] = a * dxv0 + b*dyv0;
+            dqv[1] = a * dxv1 + b*dyv1;
+            dqv[2] = a * dxv2 + b*dyv2;
+
+            // Now we want to find min and max of the centroid and the
+            // vertices of the auxiliary triangle and compute jumps
+            // from the centroid to the min and max
+            __find_qmin_and_qmax_dq1_dq2(dq0, dq1, dq2, &qmin, &qmax);
+
+            // Playing with dry wet interface
+            //hmin = qmin;
+            //beta_tmp = beta_w_dry;
+            //if (hmin>minimum_allowed_height)
+            beta_tmp = beta_w_dry + (beta_w - beta_w_dry) * hfactor;
+
+            //printf("min_alled_height = %f\n",minimum_allowed_height);
+            //printf("hmin = %f\n",hmin);
+            //printf("beta_w = %f\n",beta_w);
+            //printf("beta_tmp = %f\n",beta_tmp);
+            // Limit the gradient
+            __limit_gradient(dqv, qmin, qmax, beta_tmp);
+
+            //for (i=0;i<3;i++)
+            stage_vertex_values[k3 + 0] = stage_centroid_values[k] + dqv[0];
+            stage_vertex_values[k3 + 1] = stage_centroid_values[k] + dqv[1];
+            stage_vertex_values[k3 + 2] = stage_centroid_values[k] + dqv[2];
+
+
+            //-----------------------------------
+            // xmomentum
+            //-----------------------------------
+
+            // Calculate the difference between vertex 0 of the auxiliary
+            // triangle and the centroid of triangle k
+            dq0 = xmom_centroid_values[k0] - xmom_centroid_values[k];
+
+            // Calculate differentials between the vertices
+            // of the auxiliary triangle
+            dq1 = xmom_centroid_values[k1] - xmom_centroid_values[k0];
+            dq2 = xmom_centroid_values[k2] - xmom_centroid_values[k0];
+
+            // Calculate the gradient of xmom on the auxiliary triangle
+            a = dy2 * dq1 - dy1*dq2;
+            a *= inv_area2;
+            b = dx1 * dq2 - dx2*dq1;
+            b *= inv_area2;
+
+            // Calculate provisional jumps in xmom from the centroid
+            // of triangle k to its vertices, to be limited
+            dqv[0] = a * dxv0 + b*dyv0;
+            dqv[1] = a * dxv1 + b*dyv1;
+            dqv[2] = a * dxv2 + b*dyv2;
+
+            // Now we want to find min and max of the centroid and the
+            // vertices of the auxiliary triangle and compute jumps
+            // from the centroid to the min and max
+            __find_qmin_and_qmax_dq1_dq2(dq0, dq1, dq2, &qmin, &qmax);
+            //beta_tmp = beta_uh;
+            //if (hmin<minimum_allowed_height)
+            //beta_tmp = beta_uh_dry;
+            beta_tmp = beta_uh_dry + (beta_uh - beta_uh_dry) * hfactor;
+
+            // Limit the gradient
+            __limit_gradient(dqv, qmin, qmax, beta_tmp);
+
+            for (i = 0; i < 3; i++) {
+                xmom_vertex_values[k3 + i] = xmom_centroid_values[k] + dqv[i];
+            }
+
+            //-----------------------------------
+            // ymomentum
+            //-----------------------------------
+
+            // Calculate the difference between vertex 0 of the auxiliary
+            // triangle and the centroid of triangle k
+            dq0 = ymom_centroid_values[k0] - ymom_centroid_values[k];
+
+            // Calculate differentials between the vertices
+            // of the auxiliary triangle
+            dq1 = ymom_centroid_values[k1] - ymom_centroid_values[k0];
+            dq2 = ymom_centroid_values[k2] - ymom_centroid_values[k0];
+
+            // Calculate the gradient of ymom on the auxiliary triangle
+            a = dy2 * dq1 - dy1*dq2;
+            a *= inv_area2;
+            b = dx1 * dq2 - dx2*dq1;
+            b *= inv_area2;
+
+            // Calculate provisional jumps in ymom from the centroid
+            // of triangle k to its vertices, to be limited
+            dqv[0] = a * dxv0 + b*dyv0;
+            dqv[1] = a * dxv1 + b*dyv1;
+            dqv[2] = a * dxv2 + b*dyv2;
+
+            // Now we want to find min and max of the centroid and the
+            // vertices of the auxiliary triangle and compute jumps
+            // from the centroid to the min and max
+            __find_qmin_and_qmax_dq1_dq2(dq0, dq1, dq2, &qmin, &qmax);
+
+            //beta_tmp = beta_vh;
+            //
+            //if (hmin<minimum_allowed_height)
+            //beta_tmp = beta_vh_dry;
+            beta_tmp = beta_vh_dry + (beta_vh - beta_vh_dry) * hfactor;
+
+            // Limit the gradient
+            __limit_gradient(dqv, qmin, qmax, beta_tmp);
+
+            for (i = 0; i < 3; i++) {
+                ymom_vertex_values[k3 + i] = ymom_centroid_values[k] + dqv[i];
+            }
+        }// End number_of_boundaries <=1
+        else {
+
+            //==============================================
+            // Number of boundaries == 2
+            //==============================================
+
+            // One internal neighbour and gradient is in direction of the neighbour's centroid
+
+            // Find the only internal neighbour (k1?)
+            for (k2 = k3; k2 < k3 + 3; k2++) {
+                // Find internal neighbour of triangle k
+                // k2 indexes the edges of triangle k
+
+                if (surrogate_neighbours[k2] != k) {
+                    break;
+                }
+            }
+
+            // if (k2 == k3 + 3) {
+            //     // If we didn't find an internal neighbour
+            //     return -1;
+            // }
+
+            k1 = surrogate_neighbours[k2];
+
+            // The coordinates of the triangle are already (x,y).
+            // Get centroid of the neighbour (x1,y1)
+            coord_index = 2 * k1;
+            x1 = centroid_coordinates[coord_index];
+            y1 = centroid_coordinates[coord_index + 1];
+
+            // Compute x- and y- distances between the centroid of
+            // triangle k and that of its neighbour
+            dx1 = x1 - x;
+            dy1 = y1 - y;
+
+            // Set area2 as the square of the distance
+            area2 = dx1 * dx1 + dy1*dy1;
+
+            // Set dx2=(x1-x0)/((x1-x0)^2+(y1-y0)^2)
+            // and dy2=(y1-y0)/((x1-x0)^2+(y1-y0)^2) which
+            // respectively correspond to the x- and y- gradients
+            // of the conserved quantities
+            dx2 = 1.0 / area2;
+            dy2 = dx2*dy1;
+            dx2 *= dx1;
+
+
+            //-----------------------------------
+            // stage
+            //-----------------------------------
+
+            // Compute differentials
+            dq1 = stage_centroid_values[k1] - stage_centroid_values[k];
+
+            // Calculate the gradient between the centroid of triangle k
+            // and that of its neighbour
+            a = dq1*dx2;
+            b = dq1*dy2;
+
+            // Calculate provisional vertex jumps, to be limited
+            dqv[0] = a * dxv0 + b*dyv0;
+            dqv[1] = a * dxv1 + b*dyv1;
+            dqv[2] = a * dxv2 + b*dyv2;
+
+            // Now limit the jumps
+            if (dq1 >= 0.0) {
+                qmin = 0.0;
+                qmax = dq1;
+            } else {
+                qmin = dq1;
+                qmax = 0.0;
+            }
+
+            // Limit the gradient
+            __limit_gradient(dqv, qmin, qmax, beta_w);
+
+            //for (i=0; i < 3; i++)
+            //{
+            stage_vertex_values[k3] = stage_centroid_values[k] + dqv[0];
+            stage_vertex_values[k3 + 1] = stage_centroid_values[k] + dqv[1];
+            stage_vertex_values[k3 + 2] = stage_centroid_values[k] + dqv[2];
+            //}
+
+            //-----------------------------------
+            // xmomentum
+            //-----------------------------------
+
+            // Compute differentials
+            dq1 = xmom_centroid_values[k1] - xmom_centroid_values[k];
+
+            // Calculate the gradient between the centroid of triangle k
+            // and that of its neighbour
+            a = dq1*dx2;
+            b = dq1*dy2;
+
+            // Calculate provisional vertex jumps, to be limited
+            dqv[0] = a * dxv0 + b*dyv0;
+            dqv[1] = a * dxv1 + b*dyv1;
+            dqv[2] = a * dxv2 + b*dyv2;
+
+            // Now limit the jumps
+            if (dq1 >= 0.0) {
+                qmin = 0.0;
+                qmax = dq1;
+            } else {
+                qmin = dq1;
+                qmax = 0.0;
+            }
+
+            // Limit the gradient
+            __limit_gradient(dqv, qmin, qmax, beta_w);
+
+            //for (i=0;i<3;i++)
+            //xmom_vertex_values[k3] = xmom_centroid_values[k] + dqv[0];
+            //xmom_vertex_values[k3 + 1] = xmom_centroid_values[k] + dqv[1];
+            //xmom_vertex_values[k3 + 2] = xmom_centroid_values[k] + dqv[2];
+
+            for (i = 0; i < 3; i++) {
+                xmom_vertex_values[k3 + i] = xmom_centroid_values[k] + dqv[i];
+            }
+
+            //-----------------------------------
+            // ymomentum
+            //-----------------------------------
+
+            // Compute differentials
+            dq1 = ymom_centroid_values[k1] - ymom_centroid_values[k];
+
+            // Calculate the gradient between the centroid of triangle k
+            // and that of its neighbour
+            a = dq1*dx2;
+            b = dq1*dy2;
+
+            // Calculate provisional vertex jumps, to be limited
+            dqv[0] = a * dxv0 + b*dyv0;
+            dqv[1] = a * dxv1 + b*dyv1;
+            dqv[2] = a * dxv2 + b*dyv2;
+
+            // Now limit the jumps
+            if (dq1 >= 0.0) {
+                qmin = 0.0;
+                qmax = dq1;
+            }
+            else {
+                qmin = dq1;
+                qmax = 0.0;
+            }
+
+            // Limit the gradient
+            __limit_gradient(dqv, qmin, qmax, beta_w);
+
+            //for (i=0;i<3;i++)
+            //ymom_vertex_values[k3] = ymom_centroid_values[k] + dqv[0];
+            //ymom_vertex_values[k3 + 1] = ymom_centroid_values[k] + dqv[1];
+            //ymom_vertex_values[k3 + 2] = ymom_centroid_values[k] + dqv[2];
+
+            for (i = 0; i < 3; i++) {
+                ymom_vertex_values[k3 + i] = ymom_centroid_values[k] + dqv[i];
+            }
+            //ymom_vertex_values[k3] = ymom_centroid_values[k] + dqv[0];
+            //ymom_vertex_values[k3 + 1] = ymom_centroid_values[k] + dqv[1];
+            //ymom_vertex_values[k3 + 2] = ymom_centroid_values[k] + dqv[2];
+        } // else [number_of_boundaries==2]
+
+
+
+
+    } // for k=0 to number_of_elements-1
+
+    if (extrapolate_velocity_second_order == 1) {
+        // Convert back from velocity to momentum
+        for (k = 0; k < number_of_elements; k++) {
+            k3 = 3 * k;
+            //dv0 = fmax(stage_vertex_values[k3]-bed_vertex_values[k3],minimum_allowed_height);
+            //dv1 = fmax(stage_vertex_values[k3+1]-bed_vertex_values[k3+1],minimum_allowed_height);
+            //dv2 = fmax(stage_vertex_values[k3+2]-bed_vertex_values[k3+2],minimum_allowed_height);
+            dv0 = fmax(stage_vertex_values[k3] - bed_vertex_values[k3], 0.);
+            dv1 = fmax(stage_vertex_values[k3 + 1] - bed_vertex_values[k3 + 1], 0.);
+            dv2 = fmax(stage_vertex_values[k3 + 2] - bed_vertex_values[k3 + 2], 0.);
+
+            //Correct centroid and vertex values
+            xmom_centroid_values[k] = xmom_centroid_store[k];
+            xmom_vertex_values[k3] = xmom_vertex_values[k3] * dv0;
+            xmom_vertex_values[k3 + 1] = xmom_vertex_values[k3 + 1] * dv1;
+            xmom_vertex_values[k3 + 2] = xmom_vertex_values[k3 + 2] * dv2;
+
+            ymom_centroid_values[k] = ymom_centroid_store[k];
+            ymom_vertex_values[k3] = ymom_vertex_values[k3] * dv0;
+            ymom_vertex_values[k3 + 1] = ymom_vertex_values[k3 + 1] * dv1;
+            ymom_vertex_values[k3 + 2] = ymom_vertex_values[k3 + 2] * dv2;
+
+        }
+    }
+
+
+    free(xmom_centroid_store);
+    free(ymom_centroid_store);
+    //free(stage_centroid_store);
+
+
+}
+
+
+anuga_int _openmp_update_conserved_quantities(const struct domain *__restrict D, 
+                                              const double timestep)
+      {
+	// Advance the stage, xmom and ymom centroid values of every element using
+	// the values stored in explicit_update and semi_implicit_update and the
+	// given timestep. The arrays reached through D are modified in place and
+	// every semi_implicit_update entry is left at zero. Always returns 0.
+	anuga_int k;
+  anuga_int N = D->number_of_elements;
+  
+	// Iteration k reads and writes only index k of each array; the range is
+	// divided statically between the OpenMP threads.
+	#pragma omp parallel for private(k) schedule(static) shared(D) firstprivate(N, timestep)
+	for (k=0; k<N; k++) {
+
+    double stage_c, xmom_c, ymom_c;
+
+    double denominator;
+		// Step 1: divide each semi-implicit update by the centroid value from
+		// before this update; an exactly zero centroid gives a zero update.
+		stage_c = D->stage_centroid_values[k];
+		if (stage_c == 0.0) {
+			D->stage_semi_implicit_update[k] = 0.0;
+		} else {
+			D->stage_semi_implicit_update[k] /= stage_c;
+		}
+ 
+
+    xmom_c = D->xmom_centroid_values[k];
+		if (xmom_c == 0.0) {
+			D->xmom_semi_implicit_update[k] = 0.0;
+		} else {
+			D->xmom_semi_implicit_update[k] /= xmom_c;
+		}
+
+    ymom_c = D->ymom_centroid_values[k];
+		if (ymom_c == 0.0) {
+			D->ymom_semi_implicit_update[k] = 0.0;
+		} else {
+			D->ymom_semi_implicit_update[k] /= ymom_c;
+		}
+
+		// Step 2: explicit updates, centroid += timestep * explicit_update
+		D->stage_centroid_values[k] += timestep*D->stage_explicit_update[k];
+    D->xmom_centroid_values[k]  += timestep*D->xmom_explicit_update[k];
+    D->ymom_centroid_values[k]  += timestep*D->ymom_explicit_update[k];
+		// Step 3: semi-implicit updates, centroid /= (1 - timestep*update).
+		// The division is skipped whenever that denominator is not positive.
+		denominator = 1.0 - timestep*D->stage_semi_implicit_update[k];
+		if (denominator > 0.0) {
+			// Update stage from its semi-implicit update
+			D->stage_centroid_values[k] /= denominator;
+		}
+
+    denominator = 1.0 - timestep*D->xmom_semi_implicit_update[k];
+		if (denominator > 0.0) {
+			// Update xmom from its semi-implicit update
+			D->xmom_centroid_values[k] /= denominator;
+		}
+
+    denominator = 1.0 - timestep*D->ymom_semi_implicit_update[k];
+		if (denominator > 0.0) {
+			// Update ymom from its semi-implicit update
+			D->ymom_centroid_values[k] /= denominator;
+		}
+		
+		// Step 4: reset semi_implicit_update here ready for next time step
+		D->stage_semi_implicit_update[k] = 0.0;
+    D->xmom_semi_implicit_update[k] = 0.0;
+    D->ymom_semi_implicit_update[k] = 0.0;
+	}
+
+	return 0;
+}
+
+anuga_int _openmp_saxpy_conserved_quantities(const struct domain *__restrict D, 
+                                             const double a, 
+                                             const double b, 
+                                             const double c)
+{
+  // Overwrite the stage, xmom and ymom centroid values with
+  //   (a*centroid + b*backup) / c
+  // element by element, where backup is the matching *_backup_values array.
+  // The division is done as a multiplication by 1/c in a second pass that is
+  // skipped when c == 1.0; 1/c is computed regardless, so c is expected to be
+  // non-zero. Per the original note, c is there to deal with numerical issues
+  // when using a = 1/3, b = 2/3 and keeping values positive. Always returns 0.
+  anuga_int N = D->number_of_elements;
+  // double a_c = a / c;
+  // double bc_a = b *c /a;
+  double c_inv = 1.0 / c;
+
+  #pragma omp parallel for simd schedule(static)
+  for (anuga_int i = 0; i < N; i++)
+  {
+    D->stage_centroid_values[i] = a*D->stage_centroid_values[i] + b*D->stage_backup_values[i];
+    D->xmom_centroid_values[i]  = a*D->xmom_centroid_values[i] + b*D->xmom_backup_values[i];
+    D->ymom_centroid_values[i]  = a*D->ymom_centroid_values[i] + b*D->ymom_backup_values[i];
+  }
+
+  if (c != 1.0)
+  {
+    #pragma omp parallel for simd schedule(static)
+    for (anuga_int i = 0; i < N; i++)
+    {
+      D->stage_centroid_values[i] *= c_inv;
+      D->xmom_centroid_values[i]  *= c_inv;
+      D->ymom_centroid_values[i]  *= c_inv;
+    }
+  }
+
+  // FIXME: Should get this to work as it should be faster than the above
+  // // stage
+  // anuga_dscal(N, a, D->stage_centroid_values, 1);
+  // anuga_daxpy(N, b, D->stage_backup_values, 1, D->stage_centroid_values, 1);
+  // if (c != 1.0) {
+  //   anuga_dscal(N, c_inv, D->stage_centroid_values, 1);
+  // }
+  
+  // // xmom
+  // anuga_dscal(N, a, D->xmom_centroid_values, 1);
+  // anuga_daxpy(N, b, D->xmom_backup_values, 1, D->xmom_centroid_values, 1);
+  // if (c != 1.0) {
+  //   anuga_dscal(N, c_inv, D->xmom_centroid_values, 1);
+  // }
+
+
+  // // ymom
+  // anuga_dscal(N, a, D->ymom_centroid_values, 1);
+  // anuga_daxpy(N, b, D->ymom_backup_values, 1, D->ymom_centroid_values, 1);
+  // if (c != 1.0) {
+  //   anuga_dscal(N, c_inv, D->ymom_centroid_values, 1);
+  // }
+
+  return 0;
+}
+
+anuga_int _openmp_backup_conserved_quantities(const struct domain *__restrict D)
+{
+  // Copy the stage, xmom and ymom centroid values of every element into the
+  // matching *_backup_values arrays reached through D. Always returns 0.
+  anuga_int k;
+  anuga_int N = D->number_of_elements;
+  // double stage_tmp[N];
+  // double xmom_tmp[N];
+  // double ymom_tmp[N];
+
+  #pragma omp parallel for simd default(none) shared(D) schedule(static) firstprivate(N)
+  for (k = 0; k < N; k++)
+  {
+    D->stage_backup_values[k] = D->stage_centroid_values[k];
+    D->xmom_backup_values[k]  = D->xmom_centroid_values[k];
+    D->ymom_backup_values[k]  = D->ymom_centroid_values[k];
+  }
+
+// #pragma omp parallel for simd default(none) shared(D, stage_tmp, xmom_tmp, ymom_tmp) \
+//         schedule(static) firstprivate(N)
+//   for (k = 0; k < N; k++)
+//   {
+//     stage_tmp[k] = D->stage_centroid_values[k];
+//     xmom_tmp[k]  = D->xmom_centroid_values[k];
+//     ymom_tmp[k]  = D->ymom_centroid_values[k];
+// }
+
+// #pragma omp parallel for simd default(none) shared(D, stage_tmp, xmom_tmp, ymom_tmp) \
+//         schedule(static) firstprivate(N)
+//   for (k = 0; k < N; k++)
+//   {
+//     D->stage_backup_values[k] = stage_tmp[k];
+//     D->xmom_backup_values[k]  = xmom_tmp[k];
+//     D->ymom_backup_values[k]  = ymom_tmp[k];
+// }
+  return 0;
+}
+
+void _openmp_set_omp_num_threads(anuga_int num_threads)
+{
+  // Forward num_threads to the OpenMP runtime call omp_set_num_threads().
+  // This is a global setting and will affect all subsequent OpenMP parallel regions
+  omp_set_num_threads(num_threads);
+}
+void _openmp_evaluate_reflective_segment(struct domain *D, anuga_int N,
+   anuga_int *edge_segment, anuga_int *vol_ids, anuga_int *edge_ids){
+    // For each k in [0, N): copy stage, bed and height from edge edge_ids[k] of
+    // triangle vol_ids[k] into boundary slot edge_segment[k], and store reflected
+    // momentum and velocity there. Assumes edge_segment entries are distinct.
+    #pragma omp parallel for schedule(static)
+     for(anuga_int k = 0; k < N; k++){
+      // Indices stay anuga_int: narrowing them to int could truncate them.
+      anuga_int edge_segment_id = edge_segment[k];
+      anuga_int vid = vol_ids[k];
+      anuga_int edge_id = edge_ids[k];
+      // (n1, n2): normal of that edge; D->normals holds 6 doubles per triangle.
+      double n1 = D->normals[vid * 6 + 2 * edge_id];
+      double n2 = D->normals[vid * 6 + 2 * edge_id + 1];
+      // Scalar quantities are copied unchanged (the bed is the elevation).
+      D->stage_boundary_values[edge_segment_id] = D->stage_edge_values[3 * vid + edge_id];
+      D->bed_boundary_values[edge_segment_id] = D->bed_edge_values[3 * vid + edge_id];
+      D->height_boundary_values[edge_segment_id] = D->height_edge_values[3 * vid + edge_id];
+
+      // Momentum: r1 is the component along the normal, negated; r2 is the
+      // tangential component, unchanged. The pair is then rotated back to x,y.
+      double q1 = D->xmom_edge_values[3 * vid + edge_id];
+      double q2 = D->ymom_edge_values[3 * vid + edge_id];
+      double r1 = -q1*n1 - q2*n2;
+      double r2 = -q1*n2 + q2*n1;
+
+      double x_mom_boundary_value = n1*r1 - n2*r2;
+      double y_mom_boundary_value = n2*r1 + n1*r2;
+
+      D->xmom_boundary_values[edge_segment_id] = x_mom_boundary_value;
+      D->ymom_boundary_values[edge_segment_id] = y_mom_boundary_value;
+
+      // Velocity: signs are the opposite of the momentum case (normal component
+      // kept, tangential negated). NOTE(review): confirm this is intended.
+      q1 = D->xvelocity_edge_values[3 * vid + edge_id];
+      q2 = D->yvelocity_edge_values[3 * vid + edge_id];
+      r1 = q1*n1 + q2*n2;
+      r2 = q1*n2 - q2*n1;
+      double x_vel_boundary_value = n1*r1 - n2*r2;
+      double y_vel_boundary_value = n2*r1 + n1*r2;
+
+      D->xvelocity_boundary_values[edge_segment_id] = x_vel_boundary_value;
+      D->yvelocity_boundary_values[edge_segment_id] = y_vel_boundary_value;
+     }
+
+}
\ No newline at end of file
diff --git a/anuga/shallow_water/sw_domain_openmp_ext.pyx b/anuga/shallow_water/sw_domain_openmp_ext.pyx
index 695cccdc6..4ad598671 100644
--- a/anuga/shallow_water/sw_domain_openmp_ext.pyx
+++ b/anuga/shallow_water/sw_domain_openmp_ext.pyx
@@ -1,4 +1,4 @@
-#cython: wraparound=False, boundscheck=True, cdivision=True, profile=False, nonecheck=False, overflowcheck=False, cdivision_warnings=False, unraisable_tracebacks=False
+#cython: wraparound=False, boundscheck=False, cdivision=True, profile=False, nonecheck=False, overflowcheck=False, cdivision_warnings=False, unraisable_tracebacks=False
 
 import cython
 from libc.stdint cimport int64_t
@@ -55,6 +55,8 @@ cdef extern from "sw_domain_openmp.c" nogil:
 		double* ymom_edge_values
 		double* bed_edge_values
 		double* height_edge_values
+		double* xvelocity_edge_values
+		double* yvelocity_edge_values
 		double* stage_centroid_values
 		double* xmom_centroid_values
 		double* ymom_centroid_values
@@ -69,6 +71,9 @@ cdef extern from "sw_domain_openmp.c" nogil:
 		double* xmom_boundary_values
 		double* ymom_boundary_values
 		double* bed_boundary_values
+		double* height_boundary_values
+		double* xvelocity_boundary_values
+		double* yvelocity_boundary_values
 		double* stage_explicit_update
 		double* xmom_explicit_update
 		double* ymom_explicit_update
@@ -90,20 +95,42 @@ cdef extern from "sw_domain_openmp.c" nogil:
 		double* stage_semi_implicit_update
 		double* xmom_semi_implicit_update
 		double* ymom_semi_implicit_update
+		double* friction_centroid_values
+		double* stage_backup_values
+		double* xmom_backup_values
+		double* ymom_backup_values
+
 
 
 	struct edge:
 		pass
 
+	int64_t __rotate(double *q, double n1, double n2)
+	void _openmp_set_omp_num_threads(int64_t num_threads)
 	double _openmp_compute_fluxes_central(domain* D, double timestep)
 	double _openmp_protect(domain* D)
-	int64_t _openmp_extrapolate_second_order_edge_sw(domain* D)
+	void _openmp_extrapolate_second_order_sw(domain* D)
+	void _openmp_extrapolate_second_order_edge_sw(domain* D)
 	int64_t _openmp_fix_negative_cells(domain* D)
-	# FIXME SR: Change over to domain* D argument
+	int64_t _openmp_gravity(domain *D)
+	int64_t _openmp_gravity_wb(domain *D) 
+	int64_t _openmp_update_conserved_quantities(domain* D, double timestep)
+	void _openmp_manning_friction_flat_semi_implicit(domain *D)
+	void _openmp_manning_friction_sloped_semi_implicit(domain *D)
+	void _openmp_manning_friction_sloped_semi_implicit_edge_based(domain *D)
+	int64_t _openmp_saxpy_conserved_quantities(domain *D, double a, double b, double c)
+	int64_t _openmp_backup_conserved_quantities(domain *D)
+	void _openmp_distribute_edges_to_vertices(domain *D)
+	# FIXME SR: Change over to domain* D argument ?
 	void _openmp_manning_friction_flat(double g, double eps, int64_t N, double* w, double* zv, double* uh, double* vh, double* eta, double* xmom, double* ymom)
 	void _openmp_manning_friction_sloped(double g, double eps, int64_t N, double* x, double* w, double* zv, double* uh, double* vh, double* eta, double* xmom_update, double* ymom_update)
-
-
+	void _openmp_manning_friction_sloped_edge_based(double g, double eps, int64_t N, double* x, double* w, double* zv, double* uh, double* vh, double* eta, double* xmom_update, double* ymom_update)
+	void _openmp_evaluate_reflective_segment(domain *D, int64_t N, int64_t *edge_ptr, int64_t *vol_ids_ptr, int64_t *edge_ids_ptr)
+	int64_t __flux_function_central(double* ql, double* qr, double h_left,
+	double h_right, double hle, double hre, double n1, double n2,
+	double epsilon, double ze, double g,
+	double* edgeflux, double* max_speed, double* pressure_flux,
+	int64_t low_froude)
 
 
 cdef int64_t pointer_flag = 0
@@ -272,6 +299,9 @@ cdef inline get_python_domain_pointers(domain *D, object domain_object):
 	ymomentum = quantities["ymomentum"]
 	elevation = quantities["elevation"]
 	height = quantities["height"]
+	friction = quantities["friction"]
+	xvelocity = quantities["xvelocity"]
+	yvelocity = quantities["yvelocity"]
 
 	edge_values = stage.edge_values
 	D.stage_edge_values = &edge_values[0,0]
@@ -288,6 +318,12 @@ cdef inline get_python_domain_pointers(domain *D, object domain_object):
 	edge_values = height.edge_values
 	D.height_edge_values = &edge_values[0,0]
 
+	edge_values = xvelocity.edge_values
+	D.xvelocity_edge_values = &edge_values[0,0]
+
+	edge_values = yvelocity.edge_values
+	D.yvelocity_edge_values = &edge_values[0,0]
+
 	centroid_values = stage.centroid_values
 	D.stage_centroid_values = &centroid_values[0]
 
@@ -303,6 +339,22 @@ cdef inline get_python_domain_pointers(domain *D, object domain_object):
 	centroid_values = height.centroid_values
 	D.height_centroid_values = &centroid_values[0]
 
+	centroid_values = friction.centroid_values
+	D.friction_centroid_values = &centroid_values[0]	
+
+	centroid_values = stage.centroid_backup_values
+	D.stage_backup_values = &centroid_values[0]	
+	
+	centroid_values = xmomentum.centroid_backup_values
+	D.xmom_backup_values = &centroid_values[0]		
+	
+	centroid_values = ymomentum.centroid_backup_values
+	D.ymom_backup_values = &centroid_values[0]	
+
+	#------------------------------------------------------
+	# Vertex values
+	#------------------------------------------------------
+
 	vertex_values = stage.vertex_values
 	D.stage_vertex_values = &vertex_values[0,0]
 
@@ -318,6 +370,11 @@ cdef inline get_python_domain_pointers(domain *D, object domain_object):
 	vertex_values = height.vertex_values
 	D.height_vertex_values = &vertex_values[0,0]
 
+
+	#------------------------------------------------------
+	# Boundary values
+	#------------------------------------------------------
+
 	boundary_values = stage.boundary_values
 	D.stage_boundary_values = &boundary_values[0]
 
@@ -330,6 +387,19 @@ cdef inline get_python_domain_pointers(domain *D, object domain_object):
 	boundary_values = elevation.boundary_values
 	D.bed_boundary_values = &boundary_values[0]
 
+	boundary_values = height.boundary_values
+	D.height_boundary_values = &boundary_values[0]
+
+	boundary_values = xvelocity.boundary_values
+	D.xvelocity_boundary_values = &boundary_values[0]
+
+	boundary_values = yvelocity.boundary_values
+	D.yvelocity_boundary_values = &boundary_values[0]
+
+	#------------------------------------------------------
+	# Explicit and semi-implicit update values
+	#------------------------------------------------------
+
 	explicit_update = stage.explicit_update
 	D.stage_explicit_update = &explicit_update[0]
 
@@ -346,28 +416,51 @@ cdef inline get_python_domain_pointers(domain *D, object domain_object):
 	D.xmom_semi_implicit_update = &semi_implicit_update[0]
 
 	semi_implicit_update = ymomentum.semi_implicit_update
-	D.ymom_semi_implicit_update = &semi_implicit_update[0]	
+	D.ymom_semi_implicit_update = &semi_implicit_update[0]
+
+
 
 	#------------------------------------------------------
 	# Riverwall structures
+	# 
+	# Deal with the case when create_riverwall is called
+	# but no riverwall edges are found.
 	#------------------------------------------------------
 	riverwallData = domain_object.riverwallData
 
-	riverwall_elevation = riverwallData.riverwall_elevation
-	D.riverwall_elevation = &riverwall_elevation[0]
-
-	riverwall_rowIndex = riverwallData.hydraulic_properties_rowIndex
-	D.riverwall_rowIndex = &riverwall_rowIndex[0]
+	try:
+		riverwall_elevation = riverwallData.riverwall_elevation
+		D.riverwall_elevation = &riverwall_elevation[0]
+	except:
+		D.riverwall_elevation = NULL
+
+	try:
+		riverwall_rowIndex = riverwallData.hydraulic_properties_rowIndex
+		D.riverwall_rowIndex = &riverwall_rowIndex[0]
+	except:
+		D.riverwall_rowIndex = NULL
+
+	try:
+		riverwall_hydraulic_properties = riverwallData.hydraulic_properties
+		D.riverwall_hydraulic_properties = &riverwall_hydraulic_properties[0,0]
+	except:
+		D.riverwall_hydraulic_properties = NULL
+		
 
 	D.ncol_riverwall_hydraulic_properties = riverwallData.ncol_hydraulic_properties
 
-	riverwall_hydraulic_properties = riverwallData.hydraulic_properties
-	D.riverwall_hydraulic_properties = &riverwall_hydraulic_properties[0,0]
 
 
 
 #===============================================================================
 
+def set_omp_num_threads(int64_t num_threads):
+	"""
+	Set the number of OpenMP threads to use by passing num_threads to the C helper _openmp_set_omp_num_threads.
+	"""
+	_openmp_set_omp_num_threads(num_threads)
+
+	
 def compute_fluxes_ext_central(object domain_object, double timestep):
 
 	cdef domain D
@@ -381,7 +474,7 @@ def compute_fluxes_ext_central(object domain_object, double timestep):
 
 	return timestep
 
-def extrapolate_second_order_edge_sw(object domain_object):
+def extrapolate_second_order_sw(object domain_object):
 
 	cdef domain D
 	cdef int64_t e
@@ -390,10 +483,39 @@ def extrapolate_second_order_edge_sw(object domain_object):
 	get_python_domain_pointers(&D, domain_object)
 
 	with nogil:
-		e = _openmp_extrapolate_second_order_edge_sw(&D)
+		_openmp_extrapolate_second_order_sw(&D)
+
+
+def distribute_edges_to_vertices(object domain_object):
+	"""Run _openmp_distribute_edges_to_vertices on domain_object with the GIL released."""
+	cdef domain D
+
+	# Populate the C struct D from the Python domain: parameters, then array pointers.
+	get_python_domain_parameters(&D, domain_object)
+	get_python_domain_pointers(&D, domain_object)
+
+	with nogil:
+		_openmp_distribute_edges_to_vertices(&D)
+
+	
+
+def extrapolate_second_order_edge_sw(object domain_object, distribute_to_vertices=True):
+	"""Run _openmp_extrapolate_second_order_edge_sw on domain_object and, when
+	distribute_to_vertices is true, _openmp_distribute_edges_to_vertices as well.
+	"""
+	cdef domain D
+	get_python_domain_parameters(&D, domain_object)
+	get_python_domain_pointers(&D, domain_object)
+
+	with nogil:
+		_openmp_extrapolate_second_order_edge_sw(&D)
+
+	# distribute_to_vertices is a Python object, so it is tested with the GIL held.
+	if distribute_to_vertices:
+		with nogil:
+			_openmp_distribute_edges_to_vertices(&D)
+
 
-	if e == -1:
-		return None
 
 def protect_new(object domain_object):
 
@@ -414,12 +536,42 @@ def compute_flux_update_frequency(object domain_object, double timestep):
 
 	pass
 
+def manning_friction_flat_semi_implicit(object domain_object):
+	"""Run _openmp_manning_friction_flat_semi_implicit on domain_object with the GIL released."""
+	cdef domain D
+
+	get_python_domain_parameters(&D, domain_object)
+	get_python_domain_pointers(&D, domain_object)
+
+	with nogil:
+		_openmp_manning_friction_flat_semi_implicit(&D)
+
+def manning_friction_sloped_semi_implicit(object domain_object):
+	"""Run _openmp_manning_friction_sloped_semi_implicit on domain_object with the GIL released."""
+	cdef domain D
+
+	get_python_domain_parameters(&D, domain_object)
+	get_python_domain_pointers(&D, domain_object)
+
+	with nogil:
+		_openmp_manning_friction_sloped_semi_implicit(&D)
+
+def manning_friction_sloped_semi_implicit_edge_based(object domain_object):
+	"""Run _openmp_manning_friction_sloped_semi_implicit_edge_based on domain_object with the GIL released."""
+	cdef domain D
+
+	get_python_domain_parameters(&D, domain_object)
+	get_python_domain_pointers(&D, domain_object)
 
+	with nogil:
+		_openmp_manning_friction_sloped_semi_implicit_edge_based(&D)
+
+# FIXME SR: Why is the order of arguments different from the C function?
 def manning_friction_flat(double g, double eps,
             np.ndarray[double, ndim=1, mode="c"] w not None,
 			np.ndarray[double, ndim=1, mode="c"] uh not None,
 			np.ndarray[double, ndim=1, mode="c"] vh not None,
-			np.ndarray[double, ndim=1, mode="c"] z not None,
+			np.ndarray[double, ndim=1, mode="c"] z_centroid not None,
 			np.ndarray[double, ndim=1, mode="c"] eta not None,
 			np.ndarray[double, ndim=1, mode="c"] xmom not None,
 			np.ndarray[double, ndim=1, mode="c"] ymom not None):
@@ -427,14 +579,31 @@ def manning_friction_flat(double g, double eps,
 	cdef int64_t N
 	
 	N = w.shape[0]
-	_openmp_manning_friction_flat(g, eps, N, &w[0], &z[0], &uh[0], &vh[0], &eta[0], &xmom[0], &ymom[0])
+	_openmp_manning_friction_flat(g, eps, N, &w[0], &z_centroid[0], &uh[0], &vh[0], &eta[0], &xmom[0], &ymom[0])
 
+# FIXME SR: Why is the order of arguments different from the C function?
 def manning_friction_sloped(double g, double eps,
-        np.ndarray[double, ndim=2, mode="c"] x not None,
+        np.ndarray[double, ndim=2, mode="c"] x_vertex not None,
+		np.ndarray[double, ndim=1, mode="c"] w not None,
+		np.ndarray[double, ndim=1, mode="c"] uh not None,
+		np.ndarray[double, ndim=1, mode="c"] vh not None,
+		np.ndarray[double, ndim=2, mode="c"] z_vertex not None,
+		np.ndarray[double, ndim=1, mode="c"] eta not None,
+		np.ndarray[double, ndim=1, mode="c"] xmom not None,
+		np.ndarray[double, ndim=1, mode="c"] ymom not None):
+		
+	cdef int64_t N
+	
+	N = w.shape[0]
+	_openmp_manning_friction_sloped(g, eps, N, &x_vertex[0,0], &w[0], &z_vertex[0,0], &uh[0], &vh[0], &eta[0], &xmom[0], &ymom[0])
+
+# FIXME SR: Why is the order of arguments different from the C function?
+def manning_friction_sloped_edge_based(double g, double eps,
+        np.ndarray[double, ndim=2, mode="c"] x_edge not None,
 		np.ndarray[double, ndim=1, mode="c"] w not None,
 		np.ndarray[double, ndim=1, mode="c"] uh not None,
 		np.ndarray[double, ndim=1, mode="c"] vh not None,
-		np.ndarray[double, ndim=2, mode="c"] z not None,
+		np.ndarray[double, ndim=2, mode="c"] z_edge not None,
 		np.ndarray[double, ndim=1, mode="c"] eta not None,
 		np.ndarray[double, ndim=1, mode="c"] xmom not None,
 		np.ndarray[double, ndim=1, mode="c"] ymom not None):
@@ -442,7 +611,7 @@ def manning_friction_sloped(double g, double eps,
 	cdef int64_t N
 	
 	N = w.shape[0]
-	_openmp_manning_friction_sloped(g, eps, N, &x[0,0], &w[0], &z[0,0], &uh[0], &vh[0], &eta[0], &xmom[0], &ymom[0])
+	_openmp_manning_friction_sloped_edge_based(g, eps, N, &x_edge[0,0], &w[0], &z_edge[0,0], &uh[0], &vh[0], &eta[0], &xmom[0], &ymom[0])
 
 
 def fix_negative_cells(object domain_object):
@@ -458,5 +627,123 @@ def fix_negative_cells(object domain_object):
 
 	return num_negative_cells
 
+def update_conserved_quantities(object domain_object, double timestep):
 
+	"""Run _openmp_update_conserved_quantities with the given timestep, then
+	_openmp_fix_negative_cells, and return the value the latter returns."""
+	cdef domain D
+	cdef int64_t num_negative_cells
+	get_python_domain_parameters(&D, domain_object)
+	get_python_domain_pointers(&D, domain_object)
+
+	with nogil:
+		_openmp_update_conserved_quantities(&D, timestep)
+		num_negative_cells = _openmp_fix_negative_cells(&D)
+
+	return num_negative_cells
+
+def saxpy_conserved_quantities(object domain_object, double a, double b, double c):
+	"""Run _openmp_saxpy_conserved_quantities(&D, a, b, c) on domain_object with
+	the GIL released; the C routine's return value is not propagated."""
+	cdef domain D
+
+	get_python_domain_parameters(&D, domain_object)
+	get_python_domain_pointers(&D, domain_object)
+
+	with nogil:
+		_openmp_saxpy_conserved_quantities(&D, a, b, c)
+
+
+def backup_conserved_quantities(object domain_object):
+	"""Run _openmp_backup_conserved_quantities on domain_object with the GIL
+	released; the C routine's return value is not propagated."""
+	cdef domain D
+
+	get_python_domain_parameters(&D, domain_object)
+	get_python_domain_pointers(&D, domain_object)
+
+	with nogil:
+		_openmp_backup_conserved_quantities(&D)	
+
+def evaluate_reflective_segment(object domain_object, np.ndarray[np.int64_t, ndim=1, mode="c"] segment_edges not None, np.ndarray[np.int64_t, ndim=1, mode="c"] vol_ids not None, np.ndarray[np.int64_t, ndim=1, mode="c"] edge_ids not None): 
+	"""Run _openmp_evaluate_reflective_segment for the N = len(segment_edges)
+	entries given by segment_edges, vol_ids and edge_ids (GIL released)."""
+	cdef domain D
+	cdef int64_t N
+	N = segment_edges.shape[0]
+	# The C loop reads N entries from each array and nothing bounds-checks them.
+	assert vol_ids.shape[0] >= N and edge_ids.shape[0] >= N, "vol_ids and edge_ids must have at least as many entries as segment_edges"
+	get_python_domain_parameters(&D, domain_object)
+	get_python_domain_pointers(&D, domain_object)
+	with nogil:
+		_openmp_evaluate_reflective_segment(&D, N, &segment_edges[0], &vol_ids[0], &edge_ids[0])
+
+
+def rotate(np.ndarray[double, ndim=1, mode="c"] q not None, np.ndarray[double, ndim=1, mode="c"] normal not None, int64_t direction):
+	"""Return a copy of q after __rotate has been applied to the copy with
+	(n1, n2) = normal; direction == -1 passes (n1, -n2) instead. q is unchanged.
+	"""
+	assert normal.shape[0] == 2, "Normal vector must have 2 components"
+	cdef double n1 = normal[0]
+	# direction == -1 flips the sign of the second normal component.
+	cdef double n2 = -normal[1] if direction == -1 else normal[1]
+	cdef np.ndarray[double, ndim=1, mode="c"] r = np.ascontiguousarray(np.copy(q))
+	__rotate(&r[0], n1, n2)
+	return r
+
+
+
+
+def flux_function_central(
+	np.ndarray[double, ndim=1, mode="c"] normal not None,
+	np.ndarray[double, ndim=1, mode="c"] ql not None,
+	np.ndarray[double, ndim=1, mode="c"] qr not None,
+	double h_left,
+	double h_right,
+	double hle,
+	double hre,
+	np.ndarray[double, ndim=1, mode="c"] edgeflux not None,
+	double epsilon,
+	double ze,
+	double g,
+	double H0,
+	double hc,
+	double hc_n,
+	int64_t low_froude
+):
+	"""Call __flux_function_central, passing it &edgeflux[0], and return
+	(max_speed, pressure_flux). H0, hc and hc_n are accepted but not used."""
+	cdef double max_speed, pressure_flux
+	cdef int64_t err
+	# normal[0] and normal[1] are read below with bounds checking disabled.
+	assert normal.shape[0] >= 2, "Normal vector must have at least 2 components"
+	err = __flux_function_central(
+		&ql[0], &qr[0],
+		h_left, h_right, hle, hre, normal[0], normal[1],
+		epsilon, ze, g,
+		&edgeflux[0], &max_speed, &pressure_flux,
+		low_froude
+	)
+
+	assert err >= 0, "Discontinuous Elevation"
+
+	return max_speed, pressure_flux
+
+def gravity(object domain_object):
+	"""Run _openmp_gravity on domain_object with the GIL released; returns None."""
+	cdef domain D
+	get_python_domain_parameters(&D, domain_object)
+	get_python_domain_pointers(&D, domain_object)
+
+	# Status code is discarded: the former `err == -1` check returned None on both paths.
+	with nogil:
+		_openmp_gravity(&D)
+
+def gravity_wb(object domain_object):
+	"""Run _openmp_gravity_wb on domain_object (GIL released); returns None. The status code is discarded, as the old `err == -1` check returned None on both paths."""
+	cdef domain D
+	get_python_domain_parameters(&D, domain_object)
+	get_python_domain_pointers(&D, domain_object)
+	with nogil:
+		_openmp_gravity_wb(&D)
 
diff --git a/anuga/shallow_water/sw_domain_orig.c b/anuga/shallow_water/sw_domain_orig.c
deleted file mode 100644
index 126f23f0c..000000000
--- a/anuga/shallow_water/sw_domain_orig.c
+++ /dev/null
@@ -1,2909 +0,0 @@
-// Python - C extension module for shallow_water.py
-//
-// To compile (Python2.6):
-//  gcc -c swb2_domain_ext.c -I/usr/include/python2.6 -o domain_ext.o -Wall -O
-//  gcc -shared swb2_domain_ext.o  -o swb2_domain_ext.so
-//
-// or use python compile.py
-//
-// See the module shallow_water_domain.py for more documentation on
-// how to use this module
-//
-//
-// Stephen Roberts, ANU 2009
-// Ole Nielsen, GA 2004
-// Gareth Davies, GA 2011
-
-
-#include "math.h"
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-// Shared code snippets
-#include "util_ext.h"
-#include "sw_domain.h"
-
-const double pi = 3.14159265358979;
-
-// Trick to compute n modulo d (n%d in python) when d is a power of 2
-uint64_t Mod_of_power_2(uint64_t n, uint64_t d)
-{
-  return ( n & (d-1) );
-}
-
-
-// Computational function for rotation
-int64_t _rotate(double *q, double n1, double n2) {
-  /*Rotate the last  2 coordinates of 3x1 array q (q[1], q[2])
-    from x,y coordinates to coordinates based on normal vector (n1, n2).
-
-    Result is returned in array q
-    To rotate in opposite direction, call rotate with (q, n1, -n2)
-
-    Contents of q are changed by this function */
-
-
-  double q1, q2;
-
-  // Shorthands
-  q1 = q[1];  // x coordinate
-  q2 = q[2];  // y coordinate
-
-  // Rotate
-  q[1] =  n1*q1 + n2*q2;
-  q[2] = -n2*q1 + n1*q2;
-
-  return 0;
-}
-
-//// Function to obtain speed from momentum and depth.
-//// This is used by flux functions
-//// Input parameters uh and h may be modified by this function.
-//// Tried to inline, but no speedup was achieved 27th May 2009 (Ole)
-////static double _compute_speed(double *uh,
-//double _compute_speed(double *uh,
-//		      double *h,
-//		      double epsilon,
-//		      double h0,
-//		      double limiting_threshold) {
-//
-//  double u;
-//
-//  if (*h < limiting_threshold) {
-//    // Apply limiting of speeds according to the ANUGA manual
-//    if (*h < epsilon) {
-//      //*h = max(0.0,*h);  // Could have been negative
-//      u = 0.0;
-//    } else {
-//      u = *uh/(*h + h0/ *h);
-//    }
-//
-//
-//    // Adjust momentum to be consistent with speed
-//    *uh = u * *h;
-//  } else {
-//    // We are in deep water - no need for limiting
-//    u = *uh/ *h;
-//  }
-//
-//  return u;
-//}
-//
-//// minmod limiter
-//int64_t _minmod(double a, double b){
-//    // Compute minmod
-//
-//    if(sign(a)!=sign(b)){
-//        return 0.0;
-//    }else{
-//        return fmax(fabs(a), fabs(b))*sign(a);
-//    }
-//
-//
-//}
-
-
-// Innermost flux function (using stage w=z+h)
-int64_t _flux_function_toro(double *q_left, double *q_right,
-                           double h_left, double h_right,
-                           double hle, double hre,
-                           double n1, double n2,
-                           double epsilon,
-                           double ze,
-                           double limiting_threshold,
-                           double g,
-                           double *edgeflux, double *max_speed,
-                           double *pressure_flux, double hc,
-                           double hc_n)
-{
-
-  /*Compute fluxes between volumes for the shallow water wave equation
-    cast in terms of the 'stage', w = h+z using
-
-	HLL scheme of Fraccarollo and Toro Experimental and numerical assessment of the shallow
-water model for two-dimensional dam-break type. Journal of Computational Physics,
-33(6):843–864, 1995.
-
-    FIXME: Several variables in this interface are no longer used, clean up
-  */
-
-  int64_t i;
-
-  double uh_left, vh_left, u_left;
-  double uh_right, vh_right, u_right;
-  double s_min, s_max, soundspeed_left, soundspeed_right;
-  double u_m, h_m, soundspeed_m;
-  double denom, inverse_denominator;
-  double tmp;
-  // Workspace (allocate once, use many)
-  static double q_left_rotated[3], q_right_rotated[3], flux_right[3], flux_left[3];
-
-
-  // Copy conserved quantities to protect from modification
-  q_left_rotated[0] = q_left[0];
-  q_right_rotated[0] = q_right[0];
-  q_left_rotated[1] = q_left[1];
-  q_right_rotated[1] = q_right[1];
-  q_left_rotated[2] = q_left[2];
-  q_right_rotated[2] = q_right[2];
-
-  // Align x- and y-momentum with x-axis
-  _rotate(q_left_rotated, n1, n2);
-  _rotate(q_right_rotated, n1, n2);
-
-
-  // Compute speeds in x-direction
-  //w_left = q_left_rotated[0];
-  uh_left=q_left_rotated[1];
-  vh_left=q_left_rotated[2];
-  if(hle>0.0){
-    tmp = 1.0 / hle;
-    u_left = uh_left * tmp ;
-    uh_left = h_left * u_left;
-    vh_left = h_left* tmp * vh_left;
-  }else{
-    u_left = 0.;
-    uh_left = 0.;
-    vh_left = 0.;
-  }
-
-  //u_left = _compute_speed(&uh_left, &hle,
-  //            epsilon, h0, limiting_threshold);
-
-  //w_right = q_right_rotated[0];
-  uh_right = q_right_rotated[1];
-  vh_right = q_right_rotated[2];
-  if(hre>0.0){
-    tmp = 1.0 / hre;
-    u_right = uh_right * tmp; //max(h_right, 1.0e-06);
-    uh_right = h_right * u_right;
-    vh_right = h_right * tmp * vh_right;
-  }else{
-    u_right = 0.;
-    uh_right = 0.;
-    vh_right = 0.;
-  }
-  //u_right = _compute_speed(&uh_right, &hre,
-  //              epsilon, h0, limiting_threshold);
-
-
-
-  // Maximal and minimal wave speeds
-  soundspeed_left  = sqrt(g*h_left);
-  soundspeed_right = sqrt(g*h_right);
-
-  // Toro for shallow water
-  u_m = 0.5*(u_left + u_right) + soundspeed_left - soundspeed_right;
-  h_m = (u_left + 2.0*soundspeed_left - u_right + 2.0*soundspeed_right);
-  h_m = h_m*h_m/(16.0*g);
-  soundspeed_m = sqrt(g*h_m);
-
-
-  if(h_left < 1.0e-10){
-	  s_min = u_right - 2.0*soundspeed_right;
-	  s_max = u_right + soundspeed_right;
-	  //s_m   = s_min;
-  }
-  else if (h_right < 1.0e-10){
-	  s_min = u_left - soundspeed_left;
-	  s_max = u_left + 2.0*soundspeed_left;
-	  //s_m = s_max;
-  }
-  else {
-	  s_max = fmax(u_right + soundspeed_right, u_m + soundspeed_right);
-	  s_min = fmin(u_left - soundspeed_left, u_m - soundspeed_m);
-  }
-
-  if (s_max < 0.0)
-  {
-    s_max = 0.0;
-  }
-
-  if (s_min > 0.0)
-  {
-    s_min = 0.0;
-  }
-
-
-  // Flux formulas
-  flux_left[0] = u_left*h_left;
-  flux_left[1] = u_left*uh_left; //+ 0.5*g*h_left*h_left;
-  flux_left[2] = u_left*vh_left;
-
-  flux_right[0] = u_right*h_right;
-  flux_right[1] = u_right*uh_right ; //+ 0.5*g*h_right*h_right;
-  flux_right[2] = u_right*vh_right;
-
-  // Flux computation
-  denom = s_max - s_min;
-  if (denom < epsilon)
-  {
-    // Both wave speeds are very small
-    memset(edgeflux, 0, 3*sizeof(double));
-
-    *max_speed = 0.0;
-    //*pressure_flux = 0.0;
-    *pressure_flux = 0.5*g*0.5*(h_left*h_left+h_right*h_right);
-  }
-  else
-  {
-    // Maximal wavespeed
-    *max_speed = fmax(s_max, -s_min);
-
-    inverse_denominator = 1.0/fmax(denom,1.0e-100);
-    for (i = 0; i < 3; i++)
-    {
-      edgeflux[i] = s_max*flux_left[i] - s_min*flux_right[i];
-
-      // Standard smoothing term
-      // edgeflux[i] += 1.0*(s_max*s_min)*(q_right_rotated[i] - q_left_rotated[i]);
-      // Smoothing by stage alone can cause high velocities / slow draining for nearly dry cells
-      if(i==0) edgeflux[i] += (s_max*s_min)*(fmax(q_right_rotated[i],ze) - fmax(q_left_rotated[i],ze));
-      if(i==1) edgeflux[i] += (s_max*s_min)*(uh_right - uh_left);
-      if(i==2) edgeflux[i] += (s_max*s_min)*(vh_right - vh_left);
-
-      edgeflux[i] *= inverse_denominator;
-    }
-    // Separate pressure flux, so we can apply different wet-dry hacks to it
-    *pressure_flux = 0.5*g*( s_max*h_left*h_left -s_min*h_right*h_right)*inverse_denominator;
-
-
-    // Rotate back
-    _rotate(edgeflux, n1, -n2);
-  }
-
-  return 0;
-}
-
-
-
-
-
-// Innermost flux function (using stage w=z+h)
-int64_t _flux_function_central(double *q_left, double *q_right,
-                           double h_left, double h_right,
-                           double hle, double hre,
-                           double n1, double n2,
-                           double epsilon,
-                           double ze,
-                           double limiting_threshold,
-                           double g,
-                           double *edgeflux, double *max_speed,
-                           double *pressure_flux, double hc,
-                           double hc_n,
-                           int64_t low_froude)
-{
-
-  /*Compute fluxes between volumes for the shallow water wave equation
-    cast in terms of the 'stage', w = h+z using
-    the 'central scheme' as described in
-
-    Kurganov, Noelle, Petrova. 'Semidiscrete Central-Upwind Schemes For
-    Hyperbolic Conservation Laws and Hamilton-Jacobi Equations'.
-    Siam J. Sci. Comput. Vol. 23, No. 3, pp. 707-740.
-
-    The implemented formula is given in equation (3.15) on page 714
-
-    FIXME: Several variables in this interface are no longer used, clean up
-
-    low_froude is either 1 or 2 - see comment inline
-  */
-
-  int64_t i;
-
-  double uh_left, vh_left, u_left;
-  double uh_right, vh_right, u_right;
-  double s_min, s_max, soundspeed_left, soundspeed_right;
-  double denom, inverse_denominator;
-  double tmp, local_fr, v_right, v_left;
-  // Workspace (allocate once, use many)
-  static double q_left_rotated[3], q_right_rotated[3], flux_right[3], flux_left[3];
-
-  //printf("Entering _flux_function_central with h_left = %f, h_right = %f\n", h_left, h_right);
-
-//   printf("%e %e %e\n%e %e %e\n%e %e\n%e %e\n%e %e\n%e\n%e\n%e\n%e\n %e %e %e\n%e\n%e\n%e\n%e\n%ld\n", \
-//                            q_left[0], q_left[1], q_left[2], \
-//                            q_right[0], q_right[1], q_right[2], \
-//                            h_left, h_right,\
-//                            hle, hre,\
-//                            n1, n2,\
-//                            epsilon,\
-//                            ze,\
-//                            limiting_threshold,\
-//                            g,\
-//                            edgeflux[0], edgeflux[1], edgeflux[2], \
-//                            max_speed[0],\
-//                            pressure_flux[0],\
-//                            hc,\
-//                            hc_n,\
-//                            low_froude);
-
-  if(h_left==0. && h_right==0.){
-    // Quick exit
-    memset(edgeflux, 0, 3*sizeof(double));
-    *max_speed = 0.0;
-    *pressure_flux = 0.;
-    return 0;
-  }
-  // Copy conserved quantities to protect from modification
-  q_left_rotated[0] = q_left[0];
-  q_right_rotated[0] = q_right[0];
-  q_left_rotated[1] = q_left[1];
-  q_right_rotated[1] = q_right[1];
-  q_left_rotated[2] = q_left[2];
-  q_right_rotated[2] = q_right[2];
-
-  // Align x- and y-momentum with x-axis
-  _rotate(q_left_rotated, n1, n2);
-  _rotate(q_right_rotated, n1, n2);
-
-
-  // Compute speeds in x-direction
-  //w_left = q_left_rotated[0];
-  uh_left=q_left_rotated[1];
-  vh_left=q_left_rotated[2];
-  if(hle>0.0){
-    tmp = 1.0/hle;
-    u_left = uh_left * tmp ; //max(h_left, 1.0e-06);
-    uh_left = h_left * u_left;
-    v_left = vh_left * tmp;  // Only used to define local_fr
-    vh_left = h_left * tmp * vh_left;
-  }else{
-    u_left = 0.;
-    uh_left = 0.;
-    vh_left = 0.;
-    v_left = 0.;
-  }
-
-  //u_left = _compute_speed(&uh_left, &hle,
-  //            epsilon, h0, limiting_threshold);
-
-  //w_right = q_right_rotated[0];
-  uh_right = q_right_rotated[1];
-  vh_right = q_right_rotated[2];
-  if(hre>0.0){
-    tmp = 1.0 / hre;
-    u_right = uh_right * tmp;//max(h_right, 1.0e-06);
-    uh_right=h_right*u_right;
-    v_right = vh_right * tmp; // Only used to define local_fr
-    vh_right=h_right * tmp * vh_right;
-  }else{
-    u_right=0.;
-    uh_right=0.;
-    vh_right=0.;
-    v_right = 0.;
-  }
-  //u_right = _compute_speed(&uh_right, &hre,
-  //              epsilon, h0, limiting_threshold);
-
-
-  // Maximal and minimal wave speeds
-  soundspeed_left  = sqrt(g*h_left);
-  soundspeed_right = sqrt(g*h_right);
-  //soundspeed_left  = sqrt(g*hle);
-  //soundspeed_right = sqrt(g*hre);
-
-  // Something that scales like the Froude number
-  // We will use this to scale the diffusive component of the UH/VH fluxes.
-
-  //local_fr = sqrt(
-  //    max(0.001, fmax(1.0,
-  //        (u_right*u_right + u_left*u_left + v_right*v_right + v_left*v_left)/
-  //        (soundspeed_left*soundspeed_left + soundspeed_right*soundspeed_right + 1.0e-10))));
-  if (low_froude == 1)
-  {
-    local_fr = sqrt(
-      fmax(0.001, fmin(1.0,
-          (u_right*u_right + u_left*u_left + v_right*v_right + v_left*v_left)/
-          (soundspeed_left*soundspeed_left + soundspeed_right*soundspeed_right + 1.0e-10))));
-  }
-  else if (low_froude == 2)
-  {
-    local_fr = sqrt((u_right*u_right + u_left*u_left + v_right*v_right + v_left*v_left)/
-          (soundspeed_left*soundspeed_left + soundspeed_right*soundspeed_right + 1.0e-10));
-    local_fr = sqrt(fmin(1.0, 0.01 + fmax(local_fr-0.01,0.0)));
-  }
-  else
-  {
-  local_fr = 1.0;
-  }
-  //printf("local_fr %e \n:", local_fr);
-
-  s_max = fmax(u_left + soundspeed_left, u_right + soundspeed_right);
-  if (s_max < 0.0)
-  {
-    s_max = 0.0;
-  }
-
-  //if( hc < 1.0e-03){
-  //  s_max = 0.0;
-  //}
-
-
-  s_min = fmin(u_left - soundspeed_left, u_right - soundspeed_right);
-  if (s_min > 0.0)
-  {
-    s_min = 0.0;
-  }
-
-  //if( hc_n < 1.0e-03){
-  //  s_min = 0.0;
-  //}
-
-  // Flux formulas
-  flux_left[0] = u_left*h_left;
-  flux_left[1] = u_left*uh_left; //+ 0.5*g*h_left*h_left;
-  flux_left[2] = u_left*vh_left;
-
-  flux_right[0] = u_right*h_right;
-  flux_right[1] = u_right*uh_right ; //+ 0.5*g*h_right*h_right;
-  flux_right[2] = u_right*vh_right;
-
-  // Flux computation
-  denom = s_max - s_min;
-  if (denom < epsilon)
-  {
-    // Both wave speeds are very small
-    memset(edgeflux, 0, 3*sizeof(double));
-
-    *max_speed = 0.0;
-    //*pressure_flux = 0.0;
-    *pressure_flux = 0.5*g*0.5*(h_left*h_left+h_right*h_right);
-  }
-  else
-  {
-    // Maximal wavespeed
-    *max_speed = fmax(s_max, -s_min);
-
-    inverse_denominator = 1.0/fmax(denom,1.0e-100);
-    for (i = 0; i < 3; i++)
-    {
-      edgeflux[i] = s_max*flux_left[i] - s_min*flux_right[i];
-
-      // Standard smoothing term
-      // edgeflux[i] += 1.0*(s_max*s_min)*(q_right_rotated[i] - q_left_rotated[i]);
-      // Smoothing by stage alone can cause high velocities / slow draining for nearly dry cells
-      if(i==0) edgeflux[i] += (s_max*s_min)*(fmax(q_right_rotated[i],ze) - fmax(q_left_rotated[i],ze));
-      //if(i==0) edgeflux[i] += (s_max*s_min)*(h_right - h_left);
-      if(i==1) edgeflux[i] += local_fr*(s_max*s_min)*(uh_right - uh_left);
-      if(i==2) edgeflux[i] += local_fr*(s_max*s_min)*(vh_right - vh_left);
-
-      edgeflux[i] *= inverse_denominator;
-    }
-    // Separate pressure flux, so we can apply different wet-dry hacks to it
-    *pressure_flux = 0.5*g*( s_max*h_left*h_left -s_min*h_right*h_right)*inverse_denominator;
-
-    // Rotate back
-    _rotate(edgeflux, n1, -n2);
-  }
-
-  //printf("Pressure flux = %f, s_max = %f, s_min = %f, h_left = %f, h_right = %f\n", *pressure_flux, s_max, s_min, h_left, h_right);
-
-
-  return 0;
-}
-
-////////////////////////////////////////////////////////////////
-
-int64_t _compute_flux_update_frequency(struct domain *D, double timestep){
-    // Compute the 'flux_update_frequency' for each edge.
-    //
-    // This determines how regularly we need
-    // to update the flux computation (not every timestep)
-    //
-    // Allowed values are 1,2,4,8,... max_flux_update_frequency.
-    //
-    // For example, an edge with flux_update_frequency = 4 would
-    // only have the flux updated every 4 timesteps
-    //
-    //
-    // Local variables
-    int64_t k, i, k3, ki, m, n, nm, ii, ii2;
-    int64_t fuf;
-    double notSoFast=1.0;
-    static int64_t cyclic_number_of_steps=-1;
-
-    // QUICK EXIT
-    if(D->max_flux_update_frequency==1){
-        return 0;
-    }
-
-    // Count the steps
-    cyclic_number_of_steps++;
-    if(cyclic_number_of_steps==D->max_flux_update_frequency){
-        // The flux was just updated in every cell
-        cyclic_number_of_steps=0;
-    }
-
-
-    // PART 1: ONLY OCCURS FOLLOWING FLUX UPDATE
-
-    for ( k = 0; k < D->number_of_elements; k++){
-        for ( i = 0; i < 3; i++){
-            ki = k*3 + i;
-            if((Mod_of_power_2(cyclic_number_of_steps, D->flux_update_frequency[ki])==0)){
-                // The flux was just updated, along with the edge_timestep
-                // So we better recompute the flux_update_frequency
-                n=D->neighbours[ki];
-                if(n>=0){
-                    m = D->neighbour_edges[ki];
-                    nm = n * 3 + m; // Linear index (triangle n, edge m)
-                }
-
-                // Check if we have already done this edge
-                // (Multiply already_computed_flux by -1 on the first update,
-                // and again on the 2nd)
-                if(D->already_computed_flux[ki] > 0 ){
-                    // We have not fixed this flux value yet
-                    D->already_computed_flux[ki] *=-1;
-                    if(n>=0){
-                        D->already_computed_flux[nm] *=-1;
-                    }
-                }else{
-                    // We have fixed this flux value already
-                    D->already_computed_flux[ki] *=-1;
-                    if(n>=0){
-                        D->already_computed_flux[nm] *=-1;
-                    }
-                    continue;
-                }
-
-                // Basically int64_t( edge_ki_timestep/timestep ) with upper limit + tweaks
-                // notSoFast is ideally = 1.0, but in practice values < 1.0 can enhance stability
-                // NOTE: edge_timestep[ki]/timestep can be very large [so int64_t overflows].
-                //       Do not pull the (int64_t) inside the min term
-                fuf = (int64_t)fmin((D->edge_timestep[ki]/timestep)*notSoFast,D->max_flux_update_frequency*1.);
-                // Account for neighbour
-                if(n>=0){
-                    fuf = fmin( (int64_t)fmin(D->edge_timestep[nm]/timestep*notSoFast, D->max_flux_update_frequency*1.), fuf);
-                }
-
-                // Deal with notSoFast<1.0
-                if(fuf<1){
-                    fuf=1;
-                }
-                // Deal with large fuf
-                if(fuf> D->max_flux_update_frequency){
-                    fuf = D->max_flux_update_frequency;
-                }
-                //// Deal with intermediate cases
-                ii=2;
-                while(ii< D->max_flux_update_frequency){
-                    // Set it to 1,2,4, 8, ...
-                    ii2=2*ii;
-                    if((fuf>ii) && (fuf<ii2)){
-                        fuf = ii;
-                        continue;
-                    }
-                    ii=ii2;
-                }
-
-                // Set the values
-                D->flux_update_frequency[ki]=fuf;
-                if(n>=0){
-                    D->flux_update_frequency[nm]= fuf;
-                }
-
-            }
-        }
-    }
-
-    //// PART 2 -- occcurs every timestep
-
-    // At this point, both edges have the same flux_update_frequency.
-    // Next, ensure that flux_update_frequency varies within a constant over each triangle
-    // Experiences suggests this is numerically important
-    // (But, it can result in the same edge having different flux_update_freq)
-    for( k=0; k< D->number_of_elements; k++){
-        k3=3*k;
-        ii = 1*fmin(D->flux_update_frequency[k3],
-                 fmin(D->flux_update_frequency[k3+1],
-                     D->flux_update_frequency[k3+2]));
-
-        D->flux_update_frequency[k3]=fmin(ii, D->flux_update_frequency[k3]);
-        D->flux_update_frequency[k3+1]=fmin(ii, D->flux_update_frequency[k3+1]);
-        D->flux_update_frequency[k3+2]=fmin(ii,D->flux_update_frequency[k3+2]);
-
-    }
-
-    // Now enforce the same flux_update_frequency on each edge
-    // (Could have been broken above when we limited the variation on each triangle)
-    // This seems to have nice behaviour. Notice how an edge
-    // with a large flux_update_frequency, near an edge with a small flux_update_frequency,
-    // will have its flux_update_frequency updated after a few timesteps (i.e. before max_flux_update_frequency timesteps)
-    // OTOH, could this cause oscillations in flux_update_frequency?
-    for( k = 0; k < D->number_of_elements; k++){
-        D->update_extrapolation[k]=0;
-        for( i = 0; i< 3; i++){
-            ki=3*k+i;
-            // Account for neighbour
-            n=D->neighbours[ki];
-            if(n>=0){
-                m = D->neighbour_edges[ki];
-                nm = n * 3 + m; // Linear index (triangle n, edge m)
-                D->flux_update_frequency[ki]=fmin(D->flux_update_frequency[ki], D->flux_update_frequency[nm]);
-            }
-            // Do we need to update the extrapolation?
-            // (We do if the next flux computation will actually compute a flux!)
-            if(Mod_of_power_2((cyclic_number_of_steps+1),D->flux_update_frequency[ki])==0){
-                D->update_next_flux[ki]=1;
-                D->update_extrapolation[k]=1;
-            }else{
-                D->update_next_flux[ki]=0;
-            }
-        }
-    }
-
-    // Check whether the timestep can be increased in the next compute_fluxes call
-    if(cyclic_number_of_steps+1==D->max_flux_update_frequency){
-        // All fluxes will be updated on the next timestep
-        // We also allow the timestep to increase then
-        D->allow_timestep_increase[0]=1;
-    }else{
-        D->allow_timestep_increase[0]=0;
-    }
-
-    return 0;
-}
-
-
-double adjust_edgeflux_with_weir(double *edgeflux,
-                                 double h_left, double h_right,
-                                 double g, double weir_height,
-                                 double Qfactor,
-                                 double s1, double s2,
-                                 double h1, double h2,
-                                 double *max_speed_local
-                                ){
-    // Adjust the edgeflux to agree with a weir relation [including
-    // subergence], but smoothly vary to shallow water solution when
-    // the flow over the weir is much deeper than the weir, or the
-    // upstream/downstream water elevations are too similar
-    double rw, rw2; // 'Raw' weir fluxes
-    double rwRat, hdRat,hdWrRat, scaleFlux, minhd, maxhd;
-    double w1,w2; // Weights for averaging
-    double newFlux;
-    double twothirds = (2.0/3.0);
-    // Following constants control the 'blending' with the shallow water solution
-    // They are now user-defined
-    //double s1=0.9; // At this submergence ratio, begin blending with shallow water solution
-    //double s2=0.95; // At this submergence ratio, completely use shallow water solution
-    //double h1=1.0; // At this (tailwater height above weir) / (weir height) ratio, begin blending with shallow water solution
-    //double h2=1.5; // At this (tailwater height above weir) / (weir height) ratio, completely use the shallow water solution
-
-    minhd = fmin(h_left, h_right);
-    maxhd = fmax(h_left, h_right);
-    // 'Raw' weir discharge = Qfactor*2/3*H*(2/3*g*H)**0.5
-    rw = Qfactor * twothirds * maxhd * sqrt(twothirds * g * maxhd);
-    // Factor for villemonte correction
-    rw2 = Qfactor * twothirds * minhd * sqrt(twothirds * g * minhd);
-    // Useful ratios
-    rwRat = rw2 / fmax(rw, 1.0e-100);
-    hdRat = minhd / fmax(maxhd, 1.0e-100);
-
-    // (tailwater height above weir)/weir_height ratio
-    hdWrRat = minhd / fmax(weir_height, 1.0e-100);
-
-    // Villemonte (1947) corrected weir flow with submergence
-    // Q = Q1*(1-Q2/Q1)**0.385
-    rw = rw*pow(1.0 - rwRat, 0.385);
-
-    if(h_right > h_left){
-        rw *= -1.0;
-    }
-
-    if( (hdRat<s2) & (hdWrRat< h2) ){
-        // Rescale the edge fluxes so that the mass flux = desired flux
-        // Linearly shift to shallow water solution between hdRat = s1 and s2
-        // and between hdWrRat = h1 and h2
-
-        //
-        // WEIGHT WITH RAW SHALLOW WATER FLUX BELOW
-        // This ensures that as the weir gets very submerged, the
-        // standard shallow water equations smoothly take over
-        //
-
-        // Weighted average constants to transition to shallow water eqn flow
-        w1 = fmin( fmax(hdRat-s1, 0.) / (s2-s1), 1.0);
-
-        // Adjust again when the head is too deep relative to the weir height
-        w2 = fmin( fmax(hdWrRat-h1,0.) / (h2-h1), 1.0);
-
-        newFlux = (rw*(1.0-w1)+w1*edgeflux[0])*(1.0-w2) + w2*edgeflux[0];
-
-        if(fabs(edgeflux[0]) > 1.0e-100){
-            scaleFlux = newFlux/edgeflux[0];
-        }else{
-            scaleFlux = 0.;
-        }
-
-        scaleFlux = fmax(scaleFlux, 0.);
-
-        edgeflux[0] = newFlux;
-
-        // FIXME: Do this in a cleaner way
-        // IDEA: Compute momentum flux implied by weir relations, and use
-        //       those in a weighted average (rather than the rescaling trick here)
-        // If we allow the scaling to momentum to be unbounded,
-        // velocity spikes can arise for very-shallow-flooded walls
-        edgeflux[1] *= fmin(scaleFlux, 10.);
-        edgeflux[2] *= fmin(scaleFlux, 10.);
-    }
-
-    // Adjust the max speed
-    if (fabs(edgeflux[0]) > 0.){
-        *max_speed_local = sqrt(g*(maxhd+weir_height)) + fabs(edgeflux[0]/(maxhd + 1.0e-12));
-    }
-    //*max_speed_local += fabs(edgeflux[0])/(maxhd+1.0e-100);
-    //*max_speed_local *= fmax(scaleFlux, 1.0);
-
-    return 0;
-}
-
-// Computational function for flux computation
-double _compute_fluxes_central(struct domain *D, double timestep){
-
-    // Local variables
-    double max_speed_local, length, inv_area, zl, zr;
-    double h_left, h_right, z_half ;  // For andusse scheme
-    // FIXME: limiting_threshold is not used for DE1
-    double limiting_threshold = 10*D->H0;
-    int64_t low_froude = D->low_froude;
-    //
-    int64_t k, i, m, n, ii;
-    int64_t ki, nm = 0, ki2,ki3, nm3; // Index shorthands
-    // Workspace (making them static actually made function slightly slower (Ole))
-    double ql[3], qr[3], edgeflux[3]; // Work array for summing up fluxes
-    double bedslope_work;
-    static double local_timestep;
-    int64_t RiverWall_count, substep_count;
-    double hle, hre, zc, zc_n, Qfactor, s1, s2, h1, h2;
-    double pressure_flux, hc, hc_n, tmp;
-    double h_left_tmp, h_right_tmp;
-    static int64_t call = 0; // Static local variable flagging already computed flux
-    static int64_t timestep_fluxcalls=1;
-    static int64_t base_call = 1;
-    double speed_max_last, weir_height;
-
-    call++; // Flag 'id' of flux calculation for this timestep
-
-    if (D->timestep_fluxcalls != timestep_fluxcalls) {
-    	timestep_fluxcalls = D->timestep_fluxcalls;
-    	base_call = call;
-    }
-
-    // Set explicit_update to zero for all conserved_quantities.
-    // This assumes compute_fluxes called before forcing terms
-    memset((char*) D->stage_explicit_update, 0, D->number_of_elements * sizeof (double));
-    memset((char*) D->xmom_explicit_update, 0, D->number_of_elements * sizeof (double));
-    memset((char*) D->ymom_explicit_update, 0, D->number_of_elements * sizeof (double));
-    memset((char*) D->max_speed, 0, D->number_of_elements * sizeof (double));
-
-
-    // Counter for riverwall edges
-    RiverWall_count=0;
-    // Which substep of the timestepping method are we on?
-    substep_count=(call-base_call)%D->timestep_fluxcalls;
-
-    //printf("call = %d substep_count = %d base_call = %d \n",call,substep_count, base_call);
-
-    // Fluxes are not updated every timestep,
-    // but all fluxes ARE updated when the following condition holds
-    if(D->allow_timestep_increase[0]==1){
-        // We can only increase the timestep if all fluxes are allowed to be updated
-        // If this is not done the timestep can't increase (since local_timestep is static)
-        local_timestep=1.0e+100;
-    }
-
-    // For all triangles
-    for (k = 0; k < D->number_of_elements; k++) {
-        speed_max_last = 0.0;
-
-        // Loop through neighbours and compute edge flux for each
-        for (i = 0; i < 3; i++) {
-            ki = k * 3 + i; // Linear index to edge i of triangle k
-            ki2 = 2 * ki; //k*6 + i*2
-            ki3 = 3*ki;
-
-            if ((D->already_computed_flux[ki] == call) || (D->update_next_flux[ki]!=1)) {
-                // We've already computed the flux across this edge
-                // Check if it is a riverwall
-                if(D->edge_flux_type[ki] == 1){
-                    // Update counter of riverwall edges == index of
-                    // riverwall_elevation + riverwall_rowIndex
-                    RiverWall_count += 1;
-                }
-                continue;
-            }
-
-            // Get left hand side values from triangle k, edge i
-            ql[0] = D->stage_edge_values[ki];
-            ql[1] = D->xmom_edge_values[ki];
-            ql[2] = D->ymom_edge_values[ki];
-            zl = D->bed_edge_values[ki];
-            hc = D->height_centroid_values[k];
-            zc = D->bed_centroid_values[k];
-            hle= D->height_edge_values[ki];
-
-            // Get right hand side values either from neighbouring triangle
-            // or from boundary array (Quantities at neighbour on nearest face).
-            n = D->neighbours[ki];
-            hc_n = hc;
-            zc_n = D->bed_centroid_values[k];
-            if (n < 0) {
-                // Neighbour is a boundary condition
-                m = -n - 1; // Convert negative flag to boundary index
-
-                qr[0] = D->stage_boundary_values[m];
-                qr[1] = D->xmom_boundary_values[m];
-                qr[2] = D->ymom_boundary_values[m];
-                zr = zl; // Extend bed elevation to boundary
-                hre= fmax(qr[0]-zr,0.);//hle;
-            } else {
-                // Neighbour is a real triangle
-                hc_n = D->height_centroid_values[n];
-                zc_n = D->bed_centroid_values[n];
-                m = D->neighbour_edges[ki];
-                nm = n * 3 + m; // Linear index (triangle n, edge m)
-                nm3 = nm*3;
-
-                qr[0] = D->stage_edge_values[nm];
-                qr[1] = D->xmom_edge_values[nm];
-                qr[2] = D->ymom_edge_values[nm];
-                zr = D->bed_edge_values[nm];
-                hre = D->height_edge_values[nm];
-            }
-
-            // Audusse magic
-            z_half = fmax(zl, zr);
-
-            //// Account for riverwalls
-            if(D->edge_flux_type[ki] == 1){
-                if( n>=0 && D->edge_flux_type[nm] != 1){
-                    printf("Riverwall Error\n");
-                }
-                // Update counter of riverwall edges == index of
-                // riverwall_elevation + riverwall_rowIndex
-                RiverWall_count += 1;
-
-                // Set central bed to riverwall elevation
-                z_half = fmax(D->riverwall_elevation[RiverWall_count-1], z_half) ;
-
-            }
-
-            // Define h left/right for Audusse flux method
-            h_left = fmax(hle+zl-z_half,0.);
-            h_right = fmax(hre+zr-z_half,0.);
-
-            // Edge flux computation (triangle k, edge i)
-            _flux_function_central(ql, qr,
-            //_flux_function_toro(ql, qr,
-                h_left, h_right,
-                hle, hre,
-                D->normals[ki2],D->normals[ki2 + 1],
-                D->epsilon, z_half, limiting_threshold, D->g,
-                edgeflux, &max_speed_local, &pressure_flux, hc, hc_n, low_froude);
-
-            // Force weir discharge to match weir theory
-            // FIXME: Switched off at the moment
-            if(D->edge_flux_type[ki]==1){
-                weir_height = fmax(D->riverwall_elevation[RiverWall_count-1] - fmin(zl, zr), 0.); // Reference weir height
-
-                // If the weir is not higher than both neighbouring cells, then
-                // do not try to match the weir equation. If we do, it seems we
-                // can get mass conservation issues (caused by large weir
-                // fluxes in such situations)
-                if(D->riverwall_elevation[RiverWall_count-1] > fmax(zc, zc_n)){
-                    ////////////////////////////////////////////////////////////////////////////////////
-                    // Use first-order h's for weir -- as the 'upstream/downstream' heads are
-                    //  measured away from the weir itself
-                    h_left_tmp = fmax(D->stage_centroid_values[k] - z_half, 0.);
-                    if(n >= 0){
-                        h_right_tmp = fmax(D->stage_centroid_values[n] - z_half, 0.);
-                    }else{
-                        h_right_tmp = fmax(hc_n + zr - z_half, 0.);
-                    }
-
-                    if( (h_left_tmp > 0.) || (h_right_tmp > 0.)){
-
-                        //////////////////////////////////////////////////////////////////////////////////
-                        // Get Qfactor index - multiply the idealised weir discharge by this constant factor
-                        ii = D->riverwall_rowIndex[RiverWall_count-1] * D->ncol_riverwall_hydraulic_properties;
-                        Qfactor = D->riverwall_hydraulic_properties[ii];
-
-                        // Get s1, submergence ratio at which we start blending with the shallow water solution
-                        ii+=1;
-                        s1 = D->riverwall_hydraulic_properties[ii];
-
-                        // Get s2, submergence ratio at which we entirely use the shallow water solution
-                        ii+=1;
-                        s2 = D->riverwall_hydraulic_properties[ii];
-
-                        // Get h1, tailwater head / weir height at which we start blending with the shallow water solution
-                        ii+=1;
-                        h1 = D->riverwall_hydraulic_properties[ii];
-
-                        // Get h2, tailwater head / weir height at which we entirely use the shallow water solution
-                        ii+=1;
-                        h2 = D->riverwall_hydraulic_properties[ii];
-
-                        // Weir flux adjustment
-                        // FIXME
-                        adjust_edgeflux_with_weir(edgeflux, h_left_tmp, h_right_tmp, D->g,
-                                                  weir_height, Qfactor,
-                                                  s1, s2, h1, h2, &max_speed_local);
-                    }
-                }
-            }
-
-            //printf("%d %d edgeflux %e %e %e \n", k,i, edgeflux[0],edgeflux[1],edgeflux[2]);
-
-            // Multiply edgeflux by edgelength
-            length = D->edgelengths[ki];
-            edgeflux[0] *= length;
-            edgeflux[1] *= length;
-            edgeflux[2] *= length;
-
-            //// Don't allow an outward advective flux if the cell centroid
-            ////   stage is < the edge value. Is this important (??). Seems not
-            ////   to be with DE algorithms
-            //if((hc<H0) && edgeflux[0] > 0.){
-            //    edgeflux[0] = 0.;
-            //    edgeflux[1] = 0.;
-            //    edgeflux[2] = 0.;
-            //    //max_speed_local=0.;
-            //    //pressure_flux=0.;
-            //}
-            ////
-            //if((hc_n<H0) && edgeflux[0] < 0.){
-            //    edgeflux[0] = 0.;
-            //    edgeflux[1] = 0.;
-            //    edgeflux[2] = 0.;
-            //    //max_speed_local=0.;
-            //    //pressure_flux=0.;
-            //}
-
-            D->edge_flux_work[ki3 + 0 ] = -edgeflux[0];
-            D->edge_flux_work[ki3 + 1 ] = -edgeflux[1];
-            D->edge_flux_work[ki3 + 2 ] = -edgeflux[2];
-
-            // bedslope_work contains all gravity related terms
-            bedslope_work = length*(- D->g *0.5*(h_left*h_left - hle*hle -(hle+hc)*(zl-zc))+pressure_flux);
-
-            D->pressuregrad_work[ki] = bedslope_work;
-
-            D->already_computed_flux[ki] = call; // #k Done
-
-            // Update neighbour n with same flux but reversed sign
-            if (n >= 0) {
-
-                D->edge_flux_work[nm3 + 0 ] = edgeflux[0];
-                D->edge_flux_work[nm3 + 1 ] = edgeflux[1];
-                D->edge_flux_work[nm3 + 2 ] = edgeflux[2];
-                bedslope_work = length*(-D->g * 0.5 *( h_right*h_right - hre*hre- (hre+hc_n)*(zr-zc_n)) + pressure_flux);
-                D->pressuregrad_work[nm] = bedslope_work;
-
-
-                D->already_computed_flux[nm] = call; // #n Done
-            }
-
-            // Update timestep based on edge i and possibly neighbour n
-            // NOTE: We should only change the timestep on the 'first substep'
-            //  of the timestepping method [substep_count==0]
-            if(substep_count==0){
-
-                // Compute the 'edge-timesteps' (useful for setting flux_update_frequency)
-                tmp = 1.0 / fmax(max_speed_local, D->epsilon);
-                D->edge_timestep[ki] = D->radii[k] * tmp ;
-                if (n >= 0) {
-                    D->edge_timestep[nm] = D->radii[n] * tmp;
-                    if ((D->tri_full_flag[k] == 1)) 
-                        D->max_speed[n] = fmax(D->max_speed[n], max_speed_local);
-                }
-
-                // Update the timestep
-                if ((D->tri_full_flag[k] == 1)) {
-
-                    speed_max_last = fmax(speed_max_last, max_speed_local);
-
-                    D->max_speed[k] = fmax(D->max_speed[k], max_speed_local);
-
-                    if (max_speed_local > D->epsilon) {
-                        // Apply CFL condition for triangles joining this edge (triangle k and triangle n)
-
-                        // CFL for triangle k
-                        local_timestep = fmin(local_timestep, D->edge_timestep[ki]);
-
-                        if (n >= 0) {
-                            // Apply CFL condition for neigbour n (which is on the ith edge of triangle k)
-                            local_timestep = fmin(local_timestep, D->edge_timestep[nm]);
-                        }
-                    }
-                }
-            }
-
-        } // End edge i (and neighbour n)
-        // Keep track of maximal speeds
-        //if(substep_count==0) D->max_speed[k] = speed_max_last; //max_speed;
-
-
-    } // End triangle k
-
-    //// Limit edgefluxes, for mass conservation near wet/dry cells
-    //// This doesn't seem to be needed anymore
-    //for(k=0; k< number_of_elements; k++){
-    //    //continue;
-    //    hc = height_centroid_values[k];
-    //    // Loop over every edge
-    //    for(i = 0; i<3; i++){
-    //        if(i==0){
-    //            // Add up the outgoing flux through the cell -- only do this once (i==0)
-    //            outgoing_mass_edges=0.0;
-    //            for(useint=0; useint<3; useint++){
-    //                if(edge_flux_work[3*(3*k+useint)]< 0.){
-    //                    //outgoing_mass_edges+=1.0;
-    //                    outgoing_mass_edges+=(edge_flux_work[3*(3*k+useint)]);
-    //                }
-    //            }
-    //            outgoing_mass_edges*=local_timestep;
-    //        }
-
-    //        ki=3*k+i;
-    //        ki2=ki*2;
-    //        ki3 = ki*3;
-    //
-    //        // Prevent outflow from 'seriously' dry cells
-    //        // Idea: The cell will not go dry if:
-    //        // total_outgoing_flux <= cell volume = Area_triangle*hc
-    //        vol=areas[k]*hc;
-    //        if((edge_flux_work[ki3]< 0.0) && (-outgoing_mass_edges> vol)){
-    //
-    //            // This bound could be improved (e.g. we could actually sum the
-    //            // + and - fluxes and check if they are too large).  However,
-    //            // the advantage of this method is that we don't have to worry
-    //            // about subsequent changes to the + edgeflux caused by
-    //            // constraints associated with neighbouring triangles.
-    //            tmp = vol/(-(outgoing_mass_edges)) ;
-    //            if(tmp< 1.0){
-    //                edge_flux_work[ki3+0]*=tmp;
-    //                edge_flux_work[ki3+1]*=tmp;
-    //                edge_flux_work[ki3+2]*=tmp;
-
-    //                // Compute neighbour edge index
-    //                n = neighbours[ki];
-    //                if(n>=0){
-    //                    nm = 3*n + neighbour_edges[ki];
-    //                    nm3 = nm*3;
-    //                    edge_flux_work[nm3+0]*=tmp;
-    //                    edge_flux_work[nm3+1]*=tmp;
-    //                    edge_flux_work[nm3+2]*=tmp;
-    //                }
-    //            }
-    //        }
-    //    }
-    // }
-
-    // Now add up stage, xmom, ymom explicit updates
-    for(k=0; k < D->number_of_elements; k++){
-        hc = fmax(D->stage_centroid_values[k] - D->bed_centroid_values[k],0.);
-
-        for(i=0;i<3;i++){
-            // FIXME: Make use of neighbours to efficiently set things
-            ki=3*k+i;
-            ki2=ki*2;
-            ki3 = ki*3;
-            n=D->neighbours[ki];
-
-            D->stage_explicit_update[k] += D->edge_flux_work[ki3+0];
-            D->xmom_explicit_update[k] += D->edge_flux_work[ki3+1];
-            D->ymom_explicit_update[k] += D->edge_flux_work[ki3+2];
-
-            // If this cell is not a ghost, and the neighbour is a boundary
-            // condition OR a ghost cell, then add the flux to the
-            // boundary_flux_integral
-            if( ((n<0) & (D->tri_full_flag[k]==1)) | ( (n>=0) && ((D->tri_full_flag[k]==1) & (D->tri_full_flag[n]==0) )) ){
-                // boundary_flux_sum is an array with length = timestep_fluxcalls
-                // For each sub-step, we put the boundary flux sum in.
-                D->boundary_flux_sum[substep_count] += D->edge_flux_work[ki3];
-            }
-
-            D->xmom_explicit_update[k] -= D->normals[ki2]*D->pressuregrad_work[ki];
-            D->ymom_explicit_update[k] -= D->normals[ki2+1]*D->pressuregrad_work[ki];
-
-
-        } // end edge i
-
-        // Normalise triangle k by area and store for when all conserved
-        // quantities get updated
-        inv_area = 1.0 / D->areas[k];
-        D->stage_explicit_update[k] *= inv_area;
-        D->xmom_explicit_update[k] *= inv_area;
-        D->ymom_explicit_update[k] *= inv_area;
-
-    }  // end cell k
-
-    // Ensure we only update the timestep on the first call within each rk2/rk3 step
-    if(substep_count == 0) timestep=local_timestep;
-
-    return timestep;
-}
-
-// Protect against the water elevation falling below the triangle bed
-double  _protect(int64_t N,
-         double minimum_allowed_height,
-         double maximum_allowed_speed,
-         double epsilon,
-         double* wc,
-         double* wv,
-         double* zc,
-         double* zv,
-         double* xmomc,
-         double* ymomc,
-         double* areas,
-         double* xc,
-         double* yc) {
-
-  int64_t k;
-  double hc, bmin;
-  double mass_error = 0.;
-  // This acts like minimum_allowed height, but scales with the vertical
-  // distance between the bed_centroid_value and the max bed_edge_value of
-  // every triangle.
-  //double minimum_relative_height=0.05;
-  //int64_t mass_added = 0;
-
-  // Protect against inifintesimal and negative heights
-  //if (maximum_allowed_speed < epsilon) {
-    for (k=0; k<N; k++) {
-      hc = wc[k] - zc[k];
-      if (hc < minimum_allowed_height*1.0 ){
-            // Set momentum to zero and ensure h is non negative
-            xmomc[k] = 0.;
-            ymomc[k] = 0.;
-        if (hc <= 0.0){
-             bmin = zc[k];
-             // Minimum allowed stage = bmin
-
-             // WARNING: ADDING MASS if wc[k]<bmin
-             if(wc[k] < bmin){
-                 mass_error += (bmin-wc[k])*areas[k];
-                 //mass_added = 1; //Flag to warn of added mass
-
-                 wc[k] = bmin;
-
-                 // FIXME: Set vertex values as well. Seems that this shouldn't be
-                 // needed. However, from memory this is important at the first
-                 // time step, for 'dry' areas where the designated stage is
-                 // less than the bed centroid value
-                 wv[3*k] = bmin; //min(bmin, wc[k]); //zv[3*k]-minimum_allowed_height);
-                 wv[3*k+1] = bmin; //min(bmin, wc[k]); //zv[3*k+1]-minimum_allowed_height);
-                 wv[3*k+2] = bmin; //min(bmin, wc[k]); //zv[3*k+2]-minimum_allowed_height);
-            }
-        }
-      }
-    }
-
-  //if(mass_added == 1){
-  //  printf("Cumulative mass protection: %f m^3 \n", mass_error);
-  //}
-
-  return mass_error;
-}
-
-// Protect against the water elevation falling below the triangle bed
-double  _protect_new(struct domain *D) {
-
-  int64_t k;
-  double hc, bmin;
-  double mass_error = 0.;
-
-  double* wc;
-  double* zc;
-  double* wv;
-  double* xmomc;
-  double* ymomc;
-  double* areas;
-
-  double minimum_allowed_height;
-
-  minimum_allowed_height = D->minimum_allowed_height;
-
-  wc = D->stage_centroid_values;
-  zc = D->bed_centroid_values;
-  wv = D->stage_vertex_values;
-  xmomc = D->xmom_centroid_values;
-  ymomc = D->ymom_centroid_values;
-  areas = D->areas;
-
-  // This acts like minimum_allowed height, but scales with the vertical
-  // distance between the bed_centroid_value and the max bed_edge_value of
-  // every triangle.
-  //double minimum_relative_height=0.05;
-  //int64_t mass_added = 0;
-
-  // Protect against inifintesimal and negative heights
-  //if (maximum_allowed_speed < epsilon) {
-    for (k=0; k<D->number_of_elements; k++) {
-      hc = wc[k] - zc[k];
-      if (hc < minimum_allowed_height*1.0 ){
-            // Set momentum to zero and ensure h is non negative
-            xmomc[k] = 0.;
-            ymomc[k] = 0.;
-        if (hc <= 0.0){
-             bmin = zc[k];
-             // Minimum allowed stage = bmin
-
-             // WARNING: ADDING MASS if wc[k]<bmin
-             if(wc[k] < bmin){
-                 mass_error += (bmin-wc[k])*areas[k];
-                 //mass_added = 1; //Flag to warn of added mass
-
-                 wc[k] = bmin;
-
-                 // FIXME: Set vertex values as well. Seems that this shouldn't be
-                 // needed. However, from memory this is important at the first
-                 // time step, for 'dry' areas where the designated stage is
-                 // less than the bed centroid value
-                 wv[3*k] = bmin; //min(bmin, wc[k]); //zv[3*k]-minimum_allowed_height);
-                 wv[3*k+1] = bmin; //min(bmin, wc[k]); //zv[3*k+1]-minimum_allowed_height);
-                 wv[3*k+2] = bmin; //min(bmin, wc[k]); //zv[3*k+2]-minimum_allowed_height);
-            }
-        }
-      }
-    }
-
-  //if(mass_added == 1){
-  //  printf("Cumulative mass protection: %f m^3 \n", mass_error);
-  //}
-
-  return mass_error;
-}
-
-
-
-
-int64_t find_qmin_and_qmax(double dq0, double dq1, double dq2,
-               double *qmin, double *qmax){
-  // Considering the centroid of an FV triangle and the vertices of its
-  // auxiliary triangle, find
-  // qmin=min(q)-qc and qmax=max(q)-qc,
-  // where min(q) and max(q) are respectively min and max over the
-  // four values (at the centroid of the FV triangle and the auxiliary
-  // triangle vertices),
-  // and qc is the centroid
-  // dq0=q(vertex0)-q(centroid of FV triangle)
-  // dq1=q(vertex1)-q(vertex0)
-  // dq2=q(vertex2)-q(vertex0)
-
-  // This is a simple implementation
-  *qmax = fmax(fmax(dq0, fmax(dq0+dq1, dq0+dq2)), 0.0) ;
-  *qmin = fmin(fmin(dq0, fmin(dq0+dq1, dq0+dq2)), 0.0) ;
-
-  return 0;
-}
-
-int64_t limit_gradient(double *dqv, double qmin, double qmax, double beta_w){
-  // Given provisional jumps dqv from the FV triangle centroid to its
-  // vertices/edges, and jumps qmin (qmax) between the centroid of the FV
-  // triangle and the minimum (maximum) of the values at the auxiliary triangle
-  // vertices (which are centroids of neighbour mesh triangles), calculate a
-  // multiplicative factor phi by which the provisional vertex jumps are to be
-  // limited
-
-  int64_t i;
-  double r=1000.0, r0=1.0, phi=1.0;
-  static double TINY = 1.0e-100; // to avoid machine accuracy problems.
-  // FIXME: Perhaps use the epsilon used elsewhere.
-
-  // Any provisional jump with magnitude < TINY does not contribute to
-  // the limiting process.
-  //return 0;
-
-  for (i=0;i<3;i++){
-    if (dqv[i] < -TINY)
-      r0=qmin/dqv[i];
-
-    if (dqv[i] > TINY)
-      r0=qmax/dqv[i];
-
-    r=fmin(r0,r);
-  }
-
-  phi=fmin(r*beta_w,1.0);
-  //phi=1.;
-  dqv[0]=dqv[0]*phi;
-  dqv[1]=dqv[1]*phi;
-  dqv[2]=dqv[2]*phi;
-
-  return 0;
-}
-
-
-
-// MIGRATED from shallow_water.c
-// FIXME (Ole): Maybe superseded by extrapolate_second_order_edge_sw below?
-// Computational routine
-int64_t _extrapolate_second_order_sw(struct domain *D) {
-
-
-  // Domain Variables
-    int64_t number_of_elements;
-    double epsilon;
-    double minimum_allowed_height;
-    double beta_w;
-    double beta_w_dry;
-    double beta_uh;
-    double beta_uh_dry;
-    double beta_vh;
-    double beta_vh_dry;
-    int64_t* surrogate_neighbours;
-    int64_t* number_of_boundaries;
-    double* centroid_coordinates;
-    double* stage_centroid_values;
-    double* xmom_centroid_values;
-    double* ymom_centroid_values;
-    double* bed_centroid_values;
-    double* edge_coordinates;
-    double* vertex_coordinates;
-    double* stage_edge_values;
-    double* xmom_edge_values;
-    double* ymom_edge_values;
-    double* bed_edge_values;
-    double* stage_vertex_values;
-    double* xmom_vertex_values;
-    double* ymom_vertex_values;
-    double* bed_vertex_values;
-    int64_t optimise_dry_cells;
-    int64_t extrapolate_velocity_second_order;
-
-    // Local variables
-    double a, b; // Gradient vector used to calculate edge values from centroids
-    int64_t k, k0, k1, k2, k3, k6, coord_index, i;
-    double x, y, x0, y0, x1, y1, x2, y2, xv0, yv0, xv1, yv1, xv2, yv2; // Vertices of the auxiliary triangle
-    double dx1, dx2, dy1, dy2, dxv0, dxv1, dxv2, dyv0, dyv1, dyv2, dq0, dq1, dq2, area2, inv_area2;
-    double dqv[3], qmin, qmax, hmin, hmax;
-    double hc, h0, h1, h2, beta_tmp, hfactor;
-    //double dk, dv0, dv1, dv2, de[3], demin, dcmax, r0scale;
-    double dk, dv0, dv1, dv2;
-
-    double *xmom_centroid_store;
-    double *ymom_centroid_store;
-    //double *stage_centroid_store;
-
-
-    // Associate memory location of Domain varibles with local aliases
-    number_of_elements     = D->number_of_elements;
-    epsilon                = D->epsilon;
-    minimum_allowed_height = D->minimum_allowed_height;
-    beta_w                 = D->beta_w;
-    beta_w_dry             = D->beta_w_dry;
-    beta_uh                = D->beta_uh;
-    beta_uh_dry            = D->beta_uh_dry;
-    beta_vh                = D->beta_vh;
-    beta_vh_dry            = D->beta_vh_dry;
-    optimise_dry_cells     = D->optimise_dry_cells;
-
-    extrapolate_velocity_second_order = D->extrapolate_velocity_second_order;
-
-    surrogate_neighbours      = D->surrogate_neighbours;
-    number_of_boundaries      = D->number_of_boundaries;
-    centroid_coordinates      = D->centroid_coordinates;
-    stage_centroid_values     = D->stage_centroid_values;
-    xmom_centroid_values      = D->xmom_centroid_values;
-    ymom_centroid_values      = D->ymom_centroid_values;
-    bed_centroid_values       = D->bed_centroid_values;
-    edge_coordinates          = D->edge_coordinates;
-    vertex_coordinates        = D->vertex_coordinates;
-    stage_edge_values         = D->stage_edge_values;
-    xmom_edge_values          = D->xmom_edge_values;
-    ymom_edge_values          = D->ymom_edge_values;
-    bed_edge_values           = D->bed_edge_values;
-    stage_vertex_values       = D->stage_vertex_values;
-    xmom_vertex_values        = D->xmom_vertex_values;
-    ymom_vertex_values        = D->ymom_vertex_values;
-    bed_vertex_values         = D->bed_vertex_values;
-
-
-
-
-/*
-int64_t _extrapolate_second_order_sw(int64_t number_of_elements,
-        double epsilon,
-        double minimum_allowed_height,
-        double beta_w,
-        double beta_w_dry,
-        double beta_uh,
-        double beta_uh_dry,
-        double beta_vh,
-        double beta_vh_dry,
-        int64_t* surrogate_neighbours,
-        int64_t* number_of_boundaries,
-        double* centroid_coordinates,
-        double* stage_centroid_values,
-        double* xmom_centroid_values,
-        double* ymom_centroid_values,
-        double* elevation_centroid_values,
-        double* vertex_coordinates,
-        double* stage_vertex_values,
-        double* xmom_vertex_values,
-        double* ymom_vertex_values,
-        double* elevation_vertex_values,
-        int64_t optimise_dry_cells,
-        int64_t extrapolate_velocity_second_order) {
-
-
-
-    // Local variables
-    double a, b; // Gradient vector used to calculate vertex values from centroids
-    int64_t k, k0, k1, k2, k3, k6, coord_index, i;
-    double x, y, x0, y0, x1, y1, x2, y2, xv0, yv0, xv1, yv1, xv2, yv2; // Vertices of the auxiliary triangle
-    double dx1, dx2, dy1, dy2, dxv0, dxv1, dxv2, dyv0, dyv1, dyv2, dq0, dq1, dq2, area2, inv_area2;
-    double dqv[3], qmin, qmax, hmin, hmax;
-    double hc, h0, h1, h2, beta_tmp, hfactor;
-    double xmom_centroid_store[number_of_elements], ymom_centroid_store[number_of_elements], dk, dv0, dv1, dv2;
-*/
-
-   // Use malloc to avoid putting these variables on the stack, which can cause
-   // segfaults in large model runs
-    xmom_centroid_store = malloc(number_of_elements*sizeof(double));
-    ymom_centroid_store = malloc(number_of_elements*sizeof(double));
-    // stage_centroid_store = malloc(number_of_elements*sizeof(double));
-
-    if (extrapolate_velocity_second_order == 1) {
-        // Replace momentum centroid with velocity centroid to allow velocity
-        // extrapolation This will be changed back at the end of the routine
-        for (k = 0; k < number_of_elements; k++) {
-
-            dk = fmax(stage_centroid_values[k] - bed_centroid_values[k], minimum_allowed_height);
-            xmom_centroid_store[k] = xmom_centroid_values[k];
-            xmom_centroid_values[k] = xmom_centroid_values[k] / dk;
-
-            ymom_centroid_store[k] = ymom_centroid_values[k];
-            ymom_centroid_values[k] = ymom_centroid_values[k] / dk;
-        }
-    }
-
-    // Begin extrapolation routine
-    for (k = 0; k < number_of_elements; k++) {
-        k3 = k * 3;
-        k6 = k * 6;
-
-        if (number_of_boundaries[k] == 3) {
-            // No neighbours, set gradient on the triangle to zero
-
-            stage_vertex_values[k3] = stage_centroid_values[k];
-            stage_vertex_values[k3 + 1] = stage_centroid_values[k];
-            stage_vertex_values[k3 + 2] = stage_centroid_values[k];
-            xmom_vertex_values[k3] = xmom_centroid_values[k];
-            xmom_vertex_values[k3 + 1] = xmom_centroid_values[k];
-            xmom_vertex_values[k3 + 2] = xmom_centroid_values[k];
-            ymom_vertex_values[k3] = ymom_centroid_values[k];
-            ymom_vertex_values[k3 + 1] = ymom_centroid_values[k];
-            ymom_vertex_values[k3 + 2] = ymom_centroid_values[k];
-
-            continue;
-        } else {
-            // Triangle k has one or more neighbours.
-            // Get centroid and vertex coordinates of the triangle
-
-            // Get the vertex coordinates
-            xv0 = vertex_coordinates[k6];
-            yv0 = vertex_coordinates[k6 + 1];
-            xv1 = vertex_coordinates[k6 + 2];
-            yv1 = vertex_coordinates[k6 + 3];
-            xv2 = vertex_coordinates[k6 + 4];
-            yv2 = vertex_coordinates[k6 + 5];
-
-            // Get the centroid coordinates
-            coord_index = 2 * k;
-            x = centroid_coordinates[coord_index];
-            y = centroid_coordinates[coord_index + 1];
-
-            // Store x- and y- differentials for the vertices of
-            // triangle k relative to the centroid
-            dxv0 = xv0 - x;
-            dxv1 = xv1 - x;
-            dxv2 = xv2 - x;
-            dyv0 = yv0 - y;
-            dyv1 = yv1 - y;
-            dyv2 = yv2 - y;
-        }
-
-
-
-
-        if (number_of_boundaries[k] <= 1) {
-            //==============================================
-            // Number of boundaries <= 1
-            //==============================================
-
-
-            // If no boundaries, auxiliary triangle is formed
-            // from the centroids of the three neighbours
-            // If one boundary, auxiliary triangle is formed
-            // from this centroid and its two neighbours
-
-            k0 = surrogate_neighbours[k3];
-            k1 = surrogate_neighbours[k3 + 1];
-            k2 = surrogate_neighbours[k3 + 2];
-
-            // Get the auxiliary triangle's vertex coordinates
-            // (really the centroids of neighbouring triangles)
-            coord_index = 2 * k0;
-            x0 = centroid_coordinates[coord_index];
-            y0 = centroid_coordinates[coord_index + 1];
-
-            coord_index = 2 * k1;
-            x1 = centroid_coordinates[coord_index];
-            y1 = centroid_coordinates[coord_index + 1];
-
-            coord_index = 2 * k2;
-            x2 = centroid_coordinates[coord_index];
-            y2 = centroid_coordinates[coord_index + 1];
-
-            // Store x- and y- differentials for the vertices
-            // of the auxiliary triangle
-            dx1 = x1 - x0;
-            dx2 = x2 - x0;
-            dy1 = y1 - y0;
-            dy2 = y2 - y0;
-
-            // Calculate 2*area of the auxiliary triangle
-            // The triangle is guaranteed to be counter-clockwise
-            area2 = dy2 * dx1 - dy1*dx2;
-
-            // If the mesh is 'weird' near the boundary,
-            // the triangle might be flat or clockwise
-            // Default to zero gradient
-            if (area2 <= 0) {
-                //printf("Error negative triangle area \n");
-                //return -1;
-
-                stage_vertex_values[k3] = stage_centroid_values[k];
-                stage_vertex_values[k3 + 1] = stage_centroid_values[k];
-                stage_vertex_values[k3 + 2] = stage_centroid_values[k];
-                xmom_vertex_values[k3] = xmom_centroid_values[k];
-                xmom_vertex_values[k3 + 1] = xmom_centroid_values[k];
-                xmom_vertex_values[k3 + 2] = xmom_centroid_values[k];
-                ymom_vertex_values[k3] = ymom_centroid_values[k];
-                ymom_vertex_values[k3 + 1] = ymom_centroid_values[k];
-                ymom_vertex_values[k3 + 2] = ymom_centroid_values[k];
-
-                continue;
-            }
-
-            // Calculate heights of neighbouring cells
-            hc = stage_centroid_values[k] - bed_centroid_values[k];
-            h0 = stage_centroid_values[k0] - bed_centroid_values[k0];
-            h1 = stage_centroid_values[k1] - bed_centroid_values[k1];
-            h2 = stage_centroid_values[k2] - bed_centroid_values[k2];
-            hmin = fmax(fmax(h0, fmax(h1, h2)), hc);
-            //hfactor = hc/(hc + 1.0);
-
-            hfactor = 0.0;
-            if (hmin > 0.001) {
-                hfactor = (hmin - 0.001) / (hmin + 0.004);
-            }
-
-            if (optimise_dry_cells) {
-                // Check if linear reconstruction is necessary for triangle k
-                // This check will exclude dry cells.
-
-                hmax = fmax(h0, fmax(h1, h2));
-                if (hmax < epsilon) {
-                    continue;
-                }
-            }
-
-            //-----------------------------------
-            // stage
-            //-----------------------------------
-
-            // Calculate the difference between vertex 0 of the auxiliary
-            // triangle and the centroid of triangle k
-            dq0 = stage_centroid_values[k0] - stage_centroid_values[k];
-
-            // Calculate differentials between the vertices
-            // of the auxiliary triangle (centroids of neighbouring triangles)
-            dq1 = stage_centroid_values[k1] - stage_centroid_values[k0];
-            dq2 = stage_centroid_values[k2] - stage_centroid_values[k0];
-
-            inv_area2 = 1.0 / area2;
-            // Calculate the gradient of stage on the auxiliary triangle
-            a = dy2 * dq1 - dy1*dq2;
-            a *= inv_area2;
-            b = dx1 * dq2 - dx2*dq1;
-            b *= inv_area2;
-
-            // Calculate provisional jumps in stage from the centroid
-            // of triangle k to its vertices, to be limited
-            dqv[0] = a * dxv0 + b*dyv0;
-            dqv[1] = a * dxv1 + b*dyv1;
-            dqv[2] = a * dxv2 + b*dyv2;
-
-            // Now we want to find min and max of the centroid and the
-            // vertices of the auxiliary triangle and compute jumps
-            // from the centroid to the min and max
-            find_qmin_and_qmax(dq0, dq1, dq2, &qmin, &qmax);
-
-            // Playing with dry wet interface
-            //hmin = qmin;
-            //beta_tmp = beta_w_dry;
-            //if (hmin>minimum_allowed_height)
-            beta_tmp = beta_w_dry + (beta_w - beta_w_dry) * hfactor;
-
-            //printf("min_alled_height = %f\n",minimum_allowed_height);
-            //printf("hmin = %f\n",hmin);
-            //printf("beta_w = %f\n",beta_w);
-            //printf("beta_tmp = %f\n",beta_tmp);
-            // Limit the gradient
-            limit_gradient(dqv, qmin, qmax, beta_tmp);
-
-            //for (i=0;i<3;i++)
-            stage_vertex_values[k3 + 0] = stage_centroid_values[k] + dqv[0];
-            stage_vertex_values[k3 + 1] = stage_centroid_values[k] + dqv[1];
-            stage_vertex_values[k3 + 2] = stage_centroid_values[k] + dqv[2];
-
-
-            //-----------------------------------
-            // xmomentum
-            //-----------------------------------
-
-            // Calculate the difference between vertex 0 of the auxiliary
-            // triangle and the centroid of triangle k
-            dq0 = xmom_centroid_values[k0] - xmom_centroid_values[k];
-
-            // Calculate differentials between the vertices
-            // of the auxiliary triangle
-            dq1 = xmom_centroid_values[k1] - xmom_centroid_values[k0];
-            dq2 = xmom_centroid_values[k2] - xmom_centroid_values[k0];
-
-            // Calculate the gradient of xmom on the auxiliary triangle
-            a = dy2 * dq1 - dy1*dq2;
-            a *= inv_area2;
-            b = dx1 * dq2 - dx2*dq1;
-            b *= inv_area2;
-
-            // Calculate provisional jumps in stage from the centroid
-            // of triangle k to its vertices, to be limited
-            dqv[0] = a * dxv0 + b*dyv0;
-            dqv[1] = a * dxv1 + b*dyv1;
-            dqv[2] = a * dxv2 + b*dyv2;
-
-            // Now we want to find min and max of the centroid and the
-            // vertices of the auxiliary triangle and compute jumps
-            // from the centroid to the min and max
-            find_qmin_and_qmax(dq0, dq1, dq2, &qmin, &qmax);
-            //beta_tmp = beta_uh;
-            //if (hmin<minimum_allowed_height)
-            //beta_tmp = beta_uh_dry;
-            beta_tmp = beta_uh_dry + (beta_uh - beta_uh_dry) * hfactor;
-
-            // Limit the gradient
-            limit_gradient(dqv, qmin, qmax, beta_tmp);
-
-            for (i = 0; i < 3; i++) {
-                xmom_vertex_values[k3 + i] = xmom_centroid_values[k] + dqv[i];
-            }
-
-            //-----------------------------------
-            // ymomentum
-            //-----------------------------------
-
-            // Calculate the difference between vertex 0 of the auxiliary
-            // triangle and the centroid of triangle k
-            dq0 = ymom_centroid_values[k0] - ymom_centroid_values[k];
-
-            // Calculate differentials between the vertices
-            // of the auxiliary triangle
-            dq1 = ymom_centroid_values[k1] - ymom_centroid_values[k0];
-            dq2 = ymom_centroid_values[k2] - ymom_centroid_values[k0];
-
-            // Calculate the gradient of xmom on the auxiliary triangle
-            a = dy2 * dq1 - dy1*dq2;
-            a *= inv_area2;
-            b = dx1 * dq2 - dx2*dq1;
-            b *= inv_area2;
-
-            // Calculate provisional jumps in stage from the centroid
-            // of triangle k to its vertices, to be limited
-            dqv[0] = a * dxv0 + b*dyv0;
-            dqv[1] = a * dxv1 + b*dyv1;
-            dqv[2] = a * dxv2 + b*dyv2;
-
-            // Now we want to find min and max of the centroid and the
-            // vertices of the auxiliary triangle and compute jumps
-            // from the centroid to the min and max
-            find_qmin_and_qmax(dq0, dq1, dq2, &qmin, &qmax);
-
-            //beta_tmp = beta_vh;
-            //
-            //if (hmin<minimum_allowed_height)
-            //beta_tmp = beta_vh_dry;
-            beta_tmp = beta_vh_dry + (beta_vh - beta_vh_dry) * hfactor;
-
-            // Limit the gradient
-            limit_gradient(dqv, qmin, qmax, beta_tmp);
-
-            for (i = 0; i < 3; i++) {
-                ymom_vertex_values[k3 + i] = ymom_centroid_values[k] + dqv[i];
-            }
-        }// End number_of_boundaries <=1
-        else {
-
-            //==============================================
-            // Number of boundaries == 2
-            //==============================================
-
-            // One internal neighbour and gradient is in direction of the neighbour's centroid
-
-            // Find the only internal neighbour (k1?)
-            for (k2 = k3; k2 < k3 + 3; k2++) {
-                // Find internal neighbour of triangle k
-                // k2 indexes the edges of triangle k
-
-                if (surrogate_neighbours[k2] != k) {
-                    break;
-                }
-            }
-
-            if ((k2 == k3 + 3)) {
-                // If we didn't find an internal neighbour
-                return -1;
-            }
-
-            k1 = surrogate_neighbours[k2];
-
-            // The coordinates of the triangle are already (x,y).
-            // Get centroid of the neighbour (x1,y1)
-            coord_index = 2 * k1;
-            x1 = centroid_coordinates[coord_index];
-            y1 = centroid_coordinates[coord_index + 1];
-
-            // Compute x- and y- distances between the centroid of
-            // triangle k and that of its neighbour
-            dx1 = x1 - x;
-            dy1 = y1 - y;
-
-            // Set area2 as the square of the distance
-            area2 = dx1 * dx1 + dy1*dy1;
-
-            // Set dx2=(x1-x0)/((x1-x0)^2+(y1-y0)^2)
-            // and dy2=(y1-y0)/((x1-x0)^2+(y1-y0)^2) which
-            // respectively correspond to the x- and y- gradients
-            // of the conserved quantities
-            dx2 = 1.0 / area2;
-            dy2 = dx2*dy1;
-            dx2 *= dx1;
-
-
-            //-----------------------------------
-            // stage
-            //-----------------------------------
-
-            // Compute differentials
-            dq1 = stage_centroid_values[k1] - stage_centroid_values[k];
-
-            // Calculate the gradient between the centroid of triangle k
-            // and that of its neighbour
-            a = dq1*dx2;
-            b = dq1*dy2;
-
-            // Calculate provisional vertex jumps, to be limited
-            dqv[0] = a * dxv0 + b*dyv0;
-            dqv[1] = a * dxv1 + b*dyv1;
-            dqv[2] = a * dxv2 + b*dyv2;
-
-            // Now limit the jumps
-            if (dq1 >= 0.0) {
-                qmin = 0.0;
-                qmax = dq1;
-            } else {
-                qmin = dq1;
-                qmax = 0.0;
-            }
-
-            // Limit the gradient
-            limit_gradient(dqv, qmin, qmax, beta_w);
-
-            //for (i=0; i < 3; i++)
-            //{
-            stage_vertex_values[k3] = stage_centroid_values[k] + dqv[0];
-            stage_vertex_values[k3 + 1] = stage_centroid_values[k] + dqv[1];
-            stage_vertex_values[k3 + 2] = stage_centroid_values[k] + dqv[2];
-            //}
-
-            //-----------------------------------
-            // xmomentum
-            //-----------------------------------
-
-            // Compute differentials
-            dq1 = xmom_centroid_values[k1] - xmom_centroid_values[k];
-
-            // Calculate the gradient between the centroid of triangle k
-            // and that of its neighbour
-            a = dq1*dx2;
-            b = dq1*dy2;
-
-            // Calculate provisional vertex jumps, to be limited
-            dqv[0] = a * dxv0 + b*dyv0;
-            dqv[1] = a * dxv1 + b*dyv1;
-            dqv[2] = a * dxv2 + b*dyv2;
-
-            // Now limit the jumps
-            if (dq1 >= 0.0) {
-                qmin = 0.0;
-                qmax = dq1;
-            } else {
-                qmin = dq1;
-                qmax = 0.0;
-            }
-
-            // Limit the gradient
-            limit_gradient(dqv, qmin, qmax, beta_w);
-
-            //for (i=0;i<3;i++)
-            //xmom_vertex_values[k3] = xmom_centroid_values[k] + dqv[0];
-            //xmom_vertex_values[k3 + 1] = xmom_centroid_values[k] + dqv[1];
-            //xmom_vertex_values[k3 + 2] = xmom_centroid_values[k] + dqv[2];
-
-            for (i = 0; i < 3; i++) {
-                xmom_vertex_values[k3 + i] = xmom_centroid_values[k] + dqv[i];
-            }
-
-            //-----------------------------------
-            // ymomentum
-            //-----------------------------------
-
-            // Compute differentials
-            dq1 = ymom_centroid_values[k1] - ymom_centroid_values[k];
-
-            // Calculate the gradient between the centroid of triangle k
-            // and that of its neighbour
-            a = dq1*dx2;
-            b = dq1*dy2;
-
-            // Calculate provisional vertex jumps, to be limited
-            dqv[0] = a * dxv0 + b*dyv0;
-            dqv[1] = a * dxv1 + b*dyv1;
-            dqv[2] = a * dxv2 + b*dyv2;
-
-            // Now limit the jumps
-            if (dq1 >= 0.0) {
-                qmin = 0.0;
-                qmax = dq1;
-            }
-            else {
-                qmin = dq1;
-                qmax = 0.0;
-            }
-
-            // Limit the gradient
-            limit_gradient(dqv, qmin, qmax, beta_w);
-
-            //for (i=0;i<3;i++)
-            //ymom_vertex_values[k3] = ymom_centroid_values[k] + dqv[0];
-            //ymom_vertex_values[k3 + 1] = ymom_centroid_values[k] + dqv[1];
-            //ymom_vertex_values[k3 + 2] = ymom_centroid_values[k] + dqv[2];
-
-            for (i = 0; i < 3; i++) {
-                ymom_vertex_values[k3 + i] = ymom_centroid_values[k] + dqv[i];
-            }
-            //ymom_vertex_values[k3] = ymom_centroid_values[k] + dqv[0];
-            //ymom_vertex_values[k3 + 1] = ymom_centroid_values[k] + dqv[1];
-            //ymom_vertex_values[k3 + 2] = ymom_centroid_values[k] + dqv[2];
-        } // else [number_of_boundaries==2]
-
-
-
-
-    } // for k=0 to number_of_elements-1
-
-    if (extrapolate_velocity_second_order == 1) {
-        // Convert back from velocity to momentum
-        for (k = 0; k < number_of_elements; k++) {
-            k3 = 3 * k;
-            //dv0 = fmax(stage_vertex_values[k3]-bed_vertex_values[k3],minimum_allowed_height);
-            //dv1 = fmax(stage_vertex_values[k3+1]-bed_vertex_values[k3+1],minimum_allowed_height);
-            //dv2 = fmax(stage_vertex_values[k3+2]-bed_vertex_values[k3+2],minimum_allowed_height);
-            dv0 = fmax(stage_vertex_values[k3] - bed_vertex_values[k3], 0.);
-            dv1 = fmax(stage_vertex_values[k3 + 1] - bed_vertex_values[k3 + 1], 0.);
-            dv2 = fmax(stage_vertex_values[k3 + 2] - bed_vertex_values[k3 + 2], 0.);
-
-            //Correct centroid and vertex values
-            xmom_centroid_values[k] = xmom_centroid_store[k];
-            xmom_vertex_values[k3] = xmom_vertex_values[k3] * dv0;
-            xmom_vertex_values[k3 + 1] = xmom_vertex_values[k3 + 1] * dv1;
-            xmom_vertex_values[k3 + 2] = xmom_vertex_values[k3 + 2] * dv2;
-
-            ymom_centroid_values[k] = ymom_centroid_store[k];
-            ymom_vertex_values[k3] = ymom_vertex_values[k3] * dv0;
-            ymom_vertex_values[k3 + 1] = ymom_vertex_values[k3 + 1] * dv1;
-            ymom_vertex_values[k3 + 2] = ymom_vertex_values[k3 + 2] * dv2;
-
-        }
-    }
-
-
-    free(xmom_centroid_store);
-    free(ymom_centroid_store);
-    //free(stage_centroid_store);
-
-
-    return 0;
-}
-
-
-
-// Computational routine
-//int64_t _extrapolate_second_order_edge_sw(int64_t number_of_elements,
-//                                 double epsilon,
-//                                 double minimum_allowed_height,
-//                                 double beta_w,
-//                                 double beta_w_dry,
-//                                 double beta_uh,
-//                                 double beta_uh_dry,
-//                                 double beta_vh,
-//                                 double beta_vh_dry,
-//                                 int64_t* surrogate_neighbours,
-//                                 int64_t* neighbour_edges,
-//                                 int64_t* number_of_boundaries,
-//                                 double* centroid_coordinates,
-//                                 double* stage_centroid_values,
-//                                 double* xmom_centroid_values,
-//                                 double* ymom_centroid_values,
-//                                 double* bed_centroid_values,
-//                                 double* height_centroid_values,
-//                                 double* edge_coordinates,
-//                                 double* stage_edge_values,
-//                                 double* xmom_edge_values,
-//                                 double* ymom_edge_values,
-//                                 double* bed_edge_values,
-//                                 double* height_edge_values,
-//                                 double* stage_vertex_values,
-//                                 double* xmom_vertex_values,
-//                                 double* ymom_vertex_values,
-//                                 double* bed_vertex_values,
-//                                 double* height_vertex_values,
-//                                 int64_t optimise_dry_cells,
-//                                 int64_t extrapolate_velocity_second_order,
-//                                 double* x_centroid_work,
-//                                 double* y_centroid_work,
-//                                 int64_t* update_extrapolation) {
-int64_t _extrapolate_second_order_edge_sw(struct domain *D) {
-
-  // Local variables
-  double a, b; // Gradient vector used to calculate edge values from centroids
-  int64_t k, k0, k1, k2, k3, k6, coord_index, i;
-  double x, y, x0, y0, x1, y1, x2, y2, xv0, yv0, xv1, yv1, xv2, yv2; // Vertices of the auxiliary triangle
-  double dx1, dx2, dy1, dy2, dxv0, dxv1, dxv2, dyv0, dyv1, dyv2, dq0, dq1, dq2, area2, inv_area2;
-  double dqv[3], qmin, qmax, hmin, hmax;
-  double hc, h0, h1, h2, beta_tmp, hfactor;
-  double dk, dk_inv, a_tmp, b_tmp, c_tmp,d_tmp;
-
-
-  memset((char*) D->x_centroid_work, 0, D->number_of_elements * sizeof (double));
-  memset((char*) D->y_centroid_work, 0, D->number_of_elements * sizeof (double));
-
-  // Parameters used to control how the limiter is forced to first-order near
-  // wet-dry regions
-  a_tmp = 0.3; // Highest depth ratio with hfactor=1
-  b_tmp = 0.1; // Highest depth ratio with hfactor=0
-  c_tmp = 1.0/(a_tmp-b_tmp);
-  d_tmp = 1.0-(c_tmp*a_tmp);
-
-  if(D->extrapolate_velocity_second_order==1){
-
-      // Replace momentum centroid with velocity centroid to allow velocity
-      // extrapolation This will be changed back at the end of the routine
-      for (k=0; k< D->number_of_elements; k++){
-
-          D->height_centroid_values[k] = fmax(D->stage_centroid_values[k] - D->bed_centroid_values[k], 0.);
-
-          dk = D->height_centroid_values[k];
-          if(dk> D->minimum_allowed_height){
-              dk_inv=1.0/dk;
-              D->x_centroid_work[k] = D->xmom_centroid_values[k];
-              D->xmom_centroid_values[k] = D->xmom_centroid_values[k]*dk_inv;
-
-              D->y_centroid_work[k] = D->ymom_centroid_values[k];
-              D->ymom_centroid_values[k] = D->ymom_centroid_values[k]*dk_inv;
-          }else{
-              D->x_centroid_work[k] = 0.;
-              D->xmom_centroid_values[k] = 0.;
-              D->y_centroid_work[k] = 0.;
-              D->ymom_centroid_values[k] = 0.;
-
-         }
-      }
-  }
-
-  // If a triangle is surrounded by dry cells (or dry cells + boundary
-  // condition) set its momentum to zero too. This prevents 'pits' of
-  // of water being trapped and unable to lose momentum, which can occur in
-  // some situations
-  for (k=0; k< D->number_of_elements;k++){
-
-      k3=k*3;
-      k0 = D->surrogate_neighbours[k3];
-      k1 = D->surrogate_neighbours[k3 + 1];
-      k2 = D->surrogate_neighbours[k3 + 2];
-
-      if(( (D->height_centroid_values[k0] < D->minimum_allowed_height) | (k0==k) ) &
-         ( (D->height_centroid_values[k1] < D->minimum_allowed_height) | (k1==k) ) &
-         ( (D->height_centroid_values[k2] < D->minimum_allowed_height) | (k2==k) ) ) {
-    	  	  //printf("Surrounded by dry cells\n");
-              D->x_centroid_work[k] = 0.;
-              D->xmom_centroid_values[k] = 0.;
-              D->y_centroid_work[k] = 0.;
-              D->ymom_centroid_values[k] = 0.;
-      }
-
-
-  }
-
-  // Begin extrapolation routine
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-
-    // Don't update the extrapolation if the flux will not be computed on the
-    // next timestep
-    if(D->update_extrapolation[k]==0){
-       continue;
-    }
-
-
-    // Useful indices
-    k3=k*3;
-    k6=k*6;
-
-    if (D->number_of_boundaries[k]==3)
-    {
-      // No neighbours, set gradient on the triangle to zero
-
-      D->stage_edge_values[k3]   = D->stage_centroid_values[k];
-      D->stage_edge_values[k3+1] = D->stage_centroid_values[k];
-      D->stage_edge_values[k3+2] = D->stage_centroid_values[k];
-
-      //xmom_centroid_values[k] = 0.;
-      //ymom_centroid_values[k] = 0.;
-
-      D->xmom_edge_values[k3]    = D->xmom_centroid_values[k];
-      D->xmom_edge_values[k3+1]  = D->xmom_centroid_values[k];
-      D->xmom_edge_values[k3+2]  = D->xmom_centroid_values[k];
-      D->ymom_edge_values[k3]    = D->ymom_centroid_values[k];
-      D->ymom_edge_values[k3+1]  = D->ymom_centroid_values[k];
-      D->ymom_edge_values[k3+2]  = D->ymom_centroid_values[k];
-
-      dk = D->height_centroid_values[k];
-      D->height_edge_values[k3] = dk;
-      D->height_edge_values[k3+1] = dk;
-      D->height_edge_values[k3+2] = dk;
-
-      continue;
-    }
-    else
-    {
-      // Triangle k has one or more neighbours.
-      // Get centroid and edge coordinates of the triangle
-
-      // Get the edge coordinates
-      xv0 = D->edge_coordinates[k6];
-      yv0 = D->edge_coordinates[k6+1];
-      xv1 = D->edge_coordinates[k6+2];
-      yv1 = D->edge_coordinates[k6+3];
-      xv2 = D->edge_coordinates[k6+4];
-      yv2 = D->edge_coordinates[k6+5];
-
-      // Get the centroid coordinates
-      coord_index = 2*k;
-      x = D->centroid_coordinates[coord_index];
-      y = D->centroid_coordinates[coord_index+1];
-
-      // Store x- and y- differentials for the edges of
-      // triangle k relative to the centroid
-      dxv0 = xv0 - x;
-      dxv1 = xv1 - x;
-      dxv2 = xv2 - x;
-      dyv0 = yv0 - y;
-      dyv1 = yv1 - y;
-      dyv2 = yv2 - y;
-
-    }
-
-
-
-    if (D->number_of_boundaries[k]<=1)
-    {
-      //==============================================
-      // Number of boundaries <= 1
-      // 'Typical case'
-      //==============================================
-
-
-      // If no boundaries, auxiliary triangle is formed
-      // from the centroids of the three neighbours
-      // If one boundary, auxiliary triangle is formed
-      // from this centroid and its two neighbours
-
-      k0 = D->surrogate_neighbours[k3];
-      k1 = D->surrogate_neighbours[k3 + 1];
-      k2 = D->surrogate_neighbours[k3 + 2];
-
-
-      // Get the auxiliary triangle's vertex coordinates
-      // (really the centroids of neighbouring triangles)
-      coord_index = 2*k0;
-      x0 = D->centroid_coordinates[coord_index];
-      y0 = D->centroid_coordinates[coord_index+1];
-
-      coord_index = 2*k1;
-      x1 = D->centroid_coordinates[coord_index];
-      y1 = D->centroid_coordinates[coord_index+1];
-
-      coord_index = 2*k2;
-      x2 = D->centroid_coordinates[coord_index];
-      y2 = D->centroid_coordinates[coord_index+1];
-
-      // Store x- and y- differentials for the vertices
-      // of the auxiliary triangle
-      dx1 = x1 - x0;
-      dx2 = x2 - x0;
-      dy1 = y1 - y0;
-      dy2 = y2 - y0;
-
-      // Calculate 2*area of the auxiliary triangle
-      // The triangle is guaranteed to be counter-clockwise
-      area2 = dy2*dx1 - dy1*dx2;
-      //if(k==54) printf("K=54\n");
-
-      //// Treat triangles with no neighbours (area2 <=0.)
-      if ((area2 <= 0.))
-      {
-
-
-          // Isolated wet cell -- constant stage/depth extrapolation
-          D->stage_edge_values[k3]   = D->stage_centroid_values[k];
-          D->stage_edge_values[k3+1] = D->stage_centroid_values[k];
-          D->stage_edge_values[k3+2] = D->stage_centroid_values[k];
-
-          dk= D->height_centroid_values[k]; //fmax(stage_centroid_values[k]-bed_centroid_values[k],0.);
-          D->height_edge_values[k3] = dk;
-          D->height_edge_values[k3+1] = dk;
-          D->height_edge_values[k3+2] = dk;
-
-          D->xmom_edge_values[k3]    = D->xmom_centroid_values[k];
-          D->xmom_edge_values[k3+1]  = D->xmom_centroid_values[k];
-          D->xmom_edge_values[k3+2]  = D->xmom_centroid_values[k];
-          D->ymom_edge_values[k3]    = D->ymom_centroid_values[k];
-          D->ymom_edge_values[k3+1]  = D->ymom_centroid_values[k];
-          D->ymom_edge_values[k3+2]  = D->ymom_centroid_values[k];
-
-          continue;
-      }
-
-      // Calculate heights of neighbouring cells
-      hc = D->height_centroid_values[k];
-      h0 = D->height_centroid_values[k0];
-      h1 = D->height_centroid_values[k1];
-      h2 = D->height_centroid_values[k2];
-
-      hmin = fmin(fmin(h0, fmin(h1, h2)), hc);
-      hmax = fmax(fmax(h0, fmax(h1, h2)), hc);
-
-      // Look for strong changes in cell depth as an indicator of near-wet-dry
-      // Reduce hfactor linearly from 1-0 between depth ratio (hmin/hc) of [a_tmp , b_tmp]
-      // NOTE: If we have a more 'second order' treatment in near dry areas (e.g. with b_tmp being negative), then
-      //       the water tends to dry more rapidly (which is in agreement with analytical results),
-      //       but is also more 'artefacty' in important cases (tendency for high velocities, etc).
-      //
-      // So hfactor = depth_ratio*(c_tmp) + d_tmp, but is clipped between 0 and 1.
-      hfactor= fmax(0., fmin(c_tmp*fmax(hmin,0.0)/fmax(hc,1.0e-06)+d_tmp,
-                           fmin(c_tmp*fmax(hc,0.)/fmax(hmax,1.0e-06)+d_tmp, 1.0))
-                  );
-      // Set hfactor to zero smothly as hmin--> minimum_allowed_height. This
-      // avoids some 'chatter' for very shallow flows
-      hfactor=fmin( 1.2*fmax(hmin- D->minimum_allowed_height,0.)/(fmax(hmin,0.)+1.* D->minimum_allowed_height), hfactor);
-
-      inv_area2 = 1.0/area2;
-      //-----------------------------------
-      // stage
-      //-----------------------------------
-
-      beta_tmp = D->beta_w_dry + (D->beta_w - D->beta_w_dry) * hfactor;
-
-      if(beta_tmp>0.){
-          // Calculate the difference between vertex 0 of the auxiliary
-          // triangle and the centroid of triangle k
-          dq0 = D->stage_centroid_values[k0] - D->stage_centroid_values[k];
-
-          // Calculate differentials between the vertices
-          // of the auxiliary triangle (centroids of neighbouring triangles)
-          dq1 = D->stage_centroid_values[k1] - D->stage_centroid_values[k0];
-          dq2 = D->stage_centroid_values[k2] - D->stage_centroid_values[k0];
-
-          // Calculate the gradient of stage on the auxiliary triangle
-          a = dy2*dq1 - dy1*dq2;
-          a *= inv_area2;
-          b = dx1*dq2 - dx2*dq1;
-          b *= inv_area2;
-          // Calculate provisional jumps in stage from the centroid
-          // of triangle k to its vertices, to be limited
-          dqv[0] = a*dxv0 + b*dyv0;
-          dqv[1] = a*dxv1 + b*dyv1;
-          dqv[2] = a*dxv2 + b*dyv2;
-
-          // Now we want to find min and max of the centroid and the
-          // vertices of the auxiliary triangle and compute jumps
-          // from the centroid to the min and max
-          find_qmin_and_qmax(dq0, dq1, dq2, &qmin, &qmax);
-
-          // Limit the gradient
-          limit_gradient(dqv, qmin, qmax, beta_tmp);
-
-          D->stage_edge_values[k3+0] = D->stage_centroid_values[k] + dqv[0];
-          D->stage_edge_values[k3+1] = D->stage_centroid_values[k] + dqv[1];
-          D->stage_edge_values[k3+2] = D->stage_centroid_values[k] + dqv[2];
-      }else{
-          // Fast alternative when beta_tmp==0
-          D->stage_edge_values[k3+0] = D->stage_centroid_values[k];
-          D->stage_edge_values[k3+1] = D->stage_centroid_values[k];
-          D->stage_edge_values[k3+2] = D->stage_centroid_values[k];
-      }
-
-
-      //-----------------------------------
-      // height
-      //-----------------------------------
-
-      if(beta_tmp>0.){
-          // Calculate the difference between vertex 0 of the auxiliary
-          // triangle and the centroid of triangle k
-          dq0 = D->height_centroid_values[k0] - D->height_centroid_values[k];
-
-          // Calculate differentials between the vertices
-          // of the auxiliary triangle (centroids of neighbouring triangles)
-          dq1 = D->height_centroid_values[k1] - D->height_centroid_values[k0];
-          dq2 = D->height_centroid_values[k2] - D->height_centroid_values[k0];
-
-          // Calculate the gradient of height on the auxiliary triangle
-          a = dy2*dq1 - dy1*dq2;
-          a *= inv_area2;
-          b = dx1*dq2 - dx2*dq1;
-          b *= inv_area2;
-          // Calculate provisional jumps in height from the centroid
-          // of triangle k to its vertices, to be limited
-          dqv[0] = a*dxv0 + b*dyv0;
-          dqv[1] = a*dxv1 + b*dyv1;
-          dqv[2] = a*dxv2 + b*dyv2;
-
-          // Now we want to find min and max of the centroid and the
-          // vertices of the auxiliary triangle and compute jumps
-          // from the centroid to the min and max
-          find_qmin_and_qmax(dq0, dq1, dq2, &qmin, &qmax);
-
-          // Limit the gradient
-          // Same beta_tmp as for stage
-          //beta_tmp = beta_uh_dry + (beta_uh - beta_uh_dry) * hfactor;
-          limit_gradient(dqv, qmin, qmax, beta_tmp);
-
-          //beta_tmp = 0. + (beta_w - 0.) * hfactor;
-
-          D->height_edge_values[k3+0] = D->height_centroid_values[k] + dqv[0];
-          D->height_edge_values[k3+1] = D->height_centroid_values[k] + dqv[1];
-          D->height_edge_values[k3+2] = D->height_centroid_values[k] + dqv[2];
-      }else{
-          // Fast alternative when beta_tmp==0
-          D->height_edge_values[k3+0] = D->height_centroid_values[k];
-          D->height_edge_values[k3+1] = D->height_centroid_values[k];
-          D->height_edge_values[k3+2] = D->height_centroid_values[k];
-      }
-      //-----------------------------------
-      // xmomentum
-      //-----------------------------------
-
-      beta_tmp = D->beta_uh_dry + (D->beta_uh - D->beta_uh_dry) * hfactor;
-      if(beta_tmp>0.){
-          // Calculate the difference between vertex 0 of the auxiliary
-          // triangle and the centroid of triangle k
-          dq0 = D->xmom_centroid_values[k0] - D->xmom_centroid_values[k];
-
-          // Calculate differentials between the vertices
-          // of the auxiliary triangle
-          dq1 = D->xmom_centroid_values[k1] - D->xmom_centroid_values[k0];
-          dq2 = D->xmom_centroid_values[k2] - D->xmom_centroid_values[k0];
-
-          // Calculate the gradient of xmom on the auxiliary triangle
-          a = dy2*dq1 - dy1*dq2;
-          a *= inv_area2;
-          b = dx1*dq2 - dx2*dq1;
-          b *= inv_area2;
-
-          // Calculate provisional jumps in xmom from the centroid
-          // of triangle k to its vertices, to be limited
-          dqv[0] = a*dxv0+b*dyv0;
-          dqv[1] = a*dxv1+b*dyv1;
-          dqv[2] = a*dxv2+b*dyv2;
-
-          // Now we want to find min and max of the centroid and the
-          // vertices of the auxiliary triangle and compute jumps
-          // from the centroid to the min and max
-          //
-          find_qmin_and_qmax(dq0, dq1, dq2, &qmin, &qmax);
-
-
-          // Limit the gradient
-          limit_gradient(dqv, qmin, qmax, beta_tmp);
-
-          for (i=0; i < 3; i++)
-          {
-            D->xmom_edge_values[k3+i] = D->xmom_centroid_values[k] + dqv[i];
-          }
-      }else{
-          // Fast alternative when beta_tmp==0
-          for (i=0; i < 3; i++)
-          {
-            D->xmom_edge_values[k3+i] = D->xmom_centroid_values[k];
-          }
-      }
-
-      //-----------------------------------
-      // ymomentum
-      //-----------------------------------
-
-      beta_tmp = D->beta_vh_dry + (D->beta_vh - D->beta_vh_dry) * hfactor;
-
-      if(beta_tmp>0.){
-          // Calculate the difference between vertex 0 of the auxiliary
-          // triangle and the centroid of triangle k
-          dq0 = D->ymom_centroid_values[k0] - D->ymom_centroid_values[k];
-
-          // Calculate differentials between the vertices
-          // of the auxiliary triangle
-          dq1 = D->ymom_centroid_values[k1] - D->ymom_centroid_values[k0];
-          dq2 = D->ymom_centroid_values[k2] - D->ymom_centroid_values[k0];
-
-          // Calculate the gradient of xmom on the auxiliary triangle
-          a = dy2*dq1 - dy1*dq2;
-          a *= inv_area2;
-          b = dx1*dq2 - dx2*dq1;
-          b *= inv_area2;
-
-          // Calculate provisional jumps in ymom from the centroid
-          // of triangle k to its vertices, to be limited
-          dqv[0] = a*dxv0 + b*dyv0;
-          dqv[1] = a*dxv1 + b*dyv1;
-          dqv[2] = a*dxv2 + b*dyv2;
-
-          // Now we want to find min and max of the centroid and the
-          // vertices of the auxiliary triangle and compute jumps
-          // from the centroid to the min and max
-          //
-          find_qmin_and_qmax(dq0, dq1, dq2, &qmin, &qmax);
-
-
-          // Limit the gradient
-          limit_gradient(dqv, qmin, qmax, beta_tmp);
-
-          for (i=0;i<3;i++)
-          {
-            D->ymom_edge_values[k3 + i] = D->ymom_centroid_values[k] + dqv[i];
-          }
-      }else{
-          // Fast alternative when beta_tmp==0
-          for (i=0;i<3;i++)
-          {
-            D->ymom_edge_values[k3 + i] = D->ymom_centroid_values[k];
-          }
-
-      }
-
-    } // End number_of_boundaries <=1
-    else
-    {
-
-      //==============================================
-      // Number of boundaries == 2
-      //==============================================
-
-      // One internal neighbour and gradient is in direction of the neighbour's centroid
-
-      // Find the only internal neighbour (k1?)
-      for (k2 = k3; k2 < k3 + 3; k2++)
-      {
-      // Find internal neighbour of triangle k
-      // k2 indexes the edges of triangle k
-
-          if (D->surrogate_neighbours[k2] != k)
-          {
-             break;
-          }
-      }
-
-      if ((k2 == k3 + 3))
-      {
-        // If we didn't find an internal neighbour
-        //report_python_error(AT, "Internal neighbour not found");
-        return -1;
-      }
-
-      k1 = D->surrogate_neighbours[k2];
-
-      // The coordinates of the triangle are already (x,y).
-      // Get centroid of the neighbour (x1,y1)
-      coord_index = 2*k1;
-      x1 = D->centroid_coordinates[coord_index];
-      y1 = D->centroid_coordinates[coord_index + 1];
-
-      // Compute x- and y- distances between the centroid of
-      // triangle k and that of its neighbour
-      dx1 = x1 - x;
-      dy1 = y1 - y;
-
-      // Set area2 as the square of the distance
-      area2 = dx1*dx1 + dy1*dy1;
-
-      // Set dx2=(x1-x0)/((x1-x0)^2+(y1-y0)^2)
-      // and dy2=(y1-y0)/((x1-x0)^2+(y1-y0)^2) which
-      // respectively correspond to the x- and y- gradients
-      // of the conserved quantities
-      dx2 = 1.0/area2;
-      dy2 = dx2*dy1;
-      dx2 *= dx1;
-
-
-      //-----------------------------------
-      // stage
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->stage_centroid_values[k1] - D->stage_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1*dx2;
-      b = dq1*dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a*dxv0 + b*dyv0;
-      dqv[1] = a*dxv1 + b*dyv1;
-      dqv[2] = a*dxv2 + b*dyv2;
-
-      // Now limit the jumps
-      if (dq1>=0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      D->stage_edge_values[k3] = D->stage_centroid_values[k] + dqv[0];
-      D->stage_edge_values[k3 + 1] = D->stage_centroid_values[k] + dqv[1];
-      D->stage_edge_values[k3 + 2] = D->stage_centroid_values[k] + dqv[2];
-
-      //-----------------------------------
-      // height
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->height_centroid_values[k1] - D->height_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1*dx2;
-      b = dq1*dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a*dxv0 + b*dyv0;
-      dqv[1] = a*dxv1 + b*dyv1;
-      dqv[2] = a*dxv2 + b*dyv2;
-
-      // Now limit the jumps
-      if (dq1>=0.0)
-      {
-        qmin=0.0;
-        qmax=dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      D->height_edge_values[k3] = D->height_centroid_values[k] + dqv[0];
-      D->height_edge_values[k3 + 1] = D->height_centroid_values[k] + dqv[1];
-      D->height_edge_values[k3 + 2] = D->height_centroid_values[k] + dqv[2];
-
-      //-----------------------------------
-      // xmomentum
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->xmom_centroid_values[k1] - D->xmom_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1*dx2;
-      b = dq1*dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a*dxv0+b*dyv0;
-      dqv[1] = a*dxv1+b*dyv1;
-      dqv[2] = a*dxv2+b*dyv2;
-
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      for (i = 0; i < 3;i++)
-      {
-          D->xmom_edge_values[k3 + i] = D->xmom_centroid_values[k] + dqv[i];
-      }
-
-      //-----------------------------------
-      // ymomentum
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->ymom_centroid_values[k1] - D->ymom_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1*dx2;
-      b = dq1*dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a*dxv0 + b*dyv0;
-      dqv[1] = a*dxv1 + b*dyv1;
-      dqv[2] = a*dxv2 + b*dyv2;
-
-      // Now limit the jumps
-      if (dq1>=0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      for (i=0;i<3;i++)
-              {
-              D->ymom_edge_values[k3 + i] = D->ymom_centroid_values[k] + dqv[i];
-              }
-    } // else [number_of_boundaries==2]
-  } // for k=0 to number_of_elements-1
-
-
-  // Compute vertex values of quantities
-  for (k=0; k< D->number_of_elements; k++){
-      if(D->extrapolate_velocity_second_order==1){
-          //Convert velocity back to momenta at centroids
-          D->xmom_centroid_values[k] = D->x_centroid_work[k];
-          D->ymom_centroid_values[k] = D->y_centroid_work[k];
-      }
-
-      // Don't proceed if we didn't update the edge/vertex values
-      if(D->update_extrapolation[k]==0){
-         continue;
-      }
-
-      k3=3*k;
-
-      // Compute stage vertex values
-      D->stage_vertex_values[k3] = D->stage_edge_values[k3+1] + D->stage_edge_values[k3+2] - D->stage_edge_values[k3] ;
-      D->stage_vertex_values[k3+1] =  D->stage_edge_values[k3] + D->stage_edge_values[k3+2]- D->stage_edge_values[k3+1];
-      D->stage_vertex_values[k3+2] =  D->stage_edge_values[k3] + D->stage_edge_values[k3+1]- D->stage_edge_values[k3+2];
-
-      // Compute height vertex values
-      D->height_vertex_values[k3] = D->height_edge_values[k3+1] + D->height_edge_values[k3+2] - D->height_edge_values[k3] ;
-      D->height_vertex_values[k3+1] =  D->height_edge_values[k3] + D->height_edge_values[k3+2]- D->height_edge_values[k3+1];
-      D->height_vertex_values[k3+2] =  D->height_edge_values[k3] + D->height_edge_values[k3+1]- D->height_edge_values[k3+2];
-
-      // If needed, convert from velocity to momenta
-      if(D->extrapolate_velocity_second_order==1){
-          // Re-compute momenta at edges
-          for (i=0; i<3; i++){
-              dk= D->height_edge_values[k3+i];
-              D->xmom_edge_values[k3+i] = D->xmom_edge_values[k3+i]*dk;
-              D->ymom_edge_values[k3+i] = D->ymom_edge_values[k3+i]*dk;
-          }
-      }
-      // Compute momenta at vertices
-      D->xmom_vertex_values[k3]   =  D->xmom_edge_values[k3+1] + D->xmom_edge_values[k3+2] - D->xmom_edge_values[k3] ;
-      D->xmom_vertex_values[k3+1] =  D->xmom_edge_values[k3] + D->xmom_edge_values[k3+2]- D->xmom_edge_values[k3+1];
-      D->xmom_vertex_values[k3+2] =  D->xmom_edge_values[k3] + D->xmom_edge_values[k3+1]- D->xmom_edge_values[k3+2];
-      D->ymom_vertex_values[k3]   =  D->ymom_edge_values[k3+1] + D->ymom_edge_values[k3+2] - D->ymom_edge_values[k3] ;
-      D->ymom_vertex_values[k3+1] =  D->ymom_edge_values[k3] + D->ymom_edge_values[k3+2]- D->ymom_edge_values[k3+1];
-      D->ymom_vertex_values[k3+2] =  D->ymom_edge_values[k3] + D->ymom_edge_values[k3+1]- D->ymom_edge_values[k3+2];
-
-      // Compute new bed elevation
-      D->bed_edge_values[k3]= D->stage_edge_values[k3]- D->height_edge_values[k3];
-      D->bed_edge_values[k3+1]= D->stage_edge_values[k3+1]- D->height_edge_values[k3+1];
-      D->bed_edge_values[k3+2]= D->stage_edge_values[k3+2]- D->height_edge_values[k3+2];
-      D->bed_vertex_values[k3] = D->bed_edge_values[k3+1] + D->bed_edge_values[k3+2] - D->bed_edge_values[k3] ;
-      D->bed_vertex_values[k3+1] =  D->bed_edge_values[k3] + D->bed_edge_values[k3+2] - D->bed_edge_values[k3+1];
-      D->bed_vertex_values[k3+2] =  D->bed_edge_values[k3] + D->bed_edge_values[k3+1] - D->bed_edge_values[k3+2];
-  }
-
-  return 0;
-}
-
-
-
-int64_t _gravity(struct domain *D) {
-
-    int64_t k, N, k3, k6;
-    double g, avg_h, zx, zy;
-    double x0, y0, x1, y1, x2, y2, z0, z1, z2;
-
-    g = D->g;
-    N = D->number_of_elements;
-
-    for (k = 0; k < N; k++) {
-        k3 = 3 * k; // base index
-
-        // Get bathymetry
-        z0 = (D->bed_vertex_values)[k3 + 0];
-        z1 = (D->bed_vertex_values)[k3 + 1];
-        z2 = (D->bed_vertex_values)[k3 + 2];
-
-        //printf("z0 %g, z1 %g, z2 %g \n",z0,z1,z2);
-
-        // Get average depth from centroid values
-        avg_h = (D->stage_centroid_values)[k] - (D->bed_centroid_values)[k];
-
-        //printf("avg_h  %g \n",avg_h);
-        // Compute bed slope
-        k6 = 6 * k; // base index
-
-        x0 = (D->vertex_coordinates)[k6 + 0];
-        y0 = (D->vertex_coordinates)[k6 + 1];
-        x1 = (D->vertex_coordinates)[k6 + 2];
-        y1 = (D->vertex_coordinates)[k6 + 3];
-        x2 = (D->vertex_coordinates)[k6 + 4];
-        y2 = (D->vertex_coordinates)[k6 + 5];
-
-        //printf("x0 %g, y0 %g, x1 %g, y1 %g, x2 %g, y2 %g \n",x0,y0,x1,y1,x2,y2);
-        _gradient(x0, y0, x1, y1, x2, y2, z0, z1, z2, &zx, &zy);
-
-        //printf("zx %g, zy %g \n",zx,zy);
-
-        // Update momentum
-        (D->xmom_explicit_update)[k] += -g * zx*avg_h;
-        (D->ymom_explicit_update)[k] += -g * zy*avg_h;
-    }
-    return 0;
-}
-
-int64_t _gravity_wb(struct domain *D) {
-
-    int64_t i, k, N, k3, k6;
-    double g, avg_h, wx, wy, fact;
-    double x0, y0, x1, y1, x2, y2;
-    double hh[3];
-    double w0, w1, w2;
-    double sidex, sidey, area;
-    double n0, n1;
-
-    g = D->g;
-
-    N = D->number_of_elements;
-    for (k = 0; k < N; k++) {
-        k3 = 3 * k; // base index
-
-        //------------------------------------
-        // Calculate side terms -ghw_x term
-        //------------------------------------
-
-        // Get vertex stage values for gradient calculation
-        w0 = (D->stage_vertex_values)[k3 + 0];
-        w1 = (D->stage_vertex_values)[k3 + 1];
-        w2 = (D->stage_vertex_values)[k3 + 2];
-
-        // Compute stage slope
-        k6 = 6 * k; // base index
-
-        x0 = (D->vertex_coordinates)[k6 + 0];
-        y0 = (D->vertex_coordinates)[k6 + 1];
-        x1 = (D->vertex_coordinates)[k6 + 2];
-        y1 = (D->vertex_coordinates)[k6 + 3];
-        x2 = (D->vertex_coordinates)[k6 + 4];
-        y2 = (D->vertex_coordinates)[k6 + 5];
-
-        //printf("x0 %g, y0 %g, x1 %g, y1 %g, x2 %g, y2 %g \n",x0,y0,x1,y1,x2,y2);
-        _gradient(x0, y0, x1, y1, x2, y2, w0, w1, w2, &wx, &wy);
-
-        avg_h = (D->stage_centroid_values)[k] - (D->bed_centroid_values)[k];
-
-        // Update using -ghw_x term
-        (D->xmom_explicit_update)[k] += -g * wx*avg_h;
-        (D->ymom_explicit_update)[k] += -g * wy*avg_h;
-
-        //------------------------------------
-        // Calculate side terms \sum_i 0.5 g l_i h_i^2 n_i
-        //------------------------------------
-
-        // Getself.stage_c = self.domain.quantities['stage'].centroid_values edge depths
-        hh[0] = (D->stage_edge_values)[k3 + 0] - (D->bed_edge_values)[k3 + 0];
-        hh[1] = (D->stage_edge_values)[k3 + 1] - (D->bed_edge_values)[k3 + 1];
-        hh[2] = (D->stage_edge_values)[k3 + 2] - (D->bed_edge_values)[k3 + 2];
-
-
-        //printf("h0,1,2 %f %f %f\n",hh[0],hh[1],hh[2]);
-
-        // Calculate the side correction term
-        sidex = 0.0;
-        sidey = 0.0;
-        for (i = 0; i < 3; i++) {
-            n0 = (D->normals)[k6 + 2 * i];
-            n1 = (D->normals)[k6 + 2 * i + 1];
-
-            //printf("n0, n1 %i %g %g\n",i,n0,n1);
-            fact = -0.5 * g * hh[i] * hh[i] * (D->edgelengths)[k3 + i];
-            sidex = sidex + fact*n0;
-            sidey = sidey + fact*n1;
-        }
-
-        // Update momentum with side terms
-        area = (D->areas)[k];
-        (D->xmom_explicit_update)[k] += -sidex / area;
-        (D->ymom_explicit_update)[k] += -sidey / area;
-
-    }
-    return 0;
-}
-
-
-void _manning_friction_flat(double g, double eps, int64_t N,
-        double* w, double* zv,
-        double* uh, double* vh,
-        double* eta, double* xmom_update, double* ymom_update) {
-
-    int64_t k, k3;
-    double S, h, z, z0, z1, z2;
-    const double one_third = 1.0/3.0; 
-    const double seven_thirds = 7.0/3.0;
-
-    for (k = 0; k < N; k++) {
-        if (eta[k] > eps) {
-            k3 = 3 * k;
-            // Get bathymetry
-            // z0 = zv[k3 + 0];
-            // z1 = zv[k3 + 1];
-            // z2 = zv[k3 + 2];
-            // z = (z0 + z1 + z2) * one_third;
-            z = zv[k];
-            h = w[k] - z;
-            if (h >= eps) {
-                S = -g * eta[k] * eta[k] * sqrt((uh[k] * uh[k] + vh[k] * vh[k]));
-                S /= pow(h, seven_thirds); //Expensive (on Ole's home computer)
-                //S /= exp((7.0/3.0)*log(h));      //seems to save about 15% over manning_friction
-                //S /= h*h*(1 + h/3.0 - h*h/9.0); //FIXME: Could use a Taylor expansion
-
-
-                //Update momentum
-                xmom_update[k] += S * uh[k];
-                ymom_update[k] += S * vh[k];
-            }
-        }
-    }
-}
-
-void _manning_friction_sloped(double g, double eps, int64_t N,
-        double* x, double* w, double* zv,
-        double* uh, double* vh,
-        double* eta, double* xmom_update, double* ymom_update) {
-
-    int64_t k, k3, k6;
-    double S, h, z, z0, z1, z2, zs, zx, zy;
-    double x0, y0, x1, y1, x2, y2;
-    const double one_third = 1.0/3.0; 
-    const double seven_thirds = 7.0/3.0;
-
-    for (k = 0; k < N; k++) {
-        if (eta[k] > eps) {
-            k3 = 3 * k;
-            // Get bathymetry
-            z0 = zv[k3 + 0];
-            z1 = zv[k3 + 1];
-            z2 = zv[k3 + 2];
-
-            // Compute bed slope
-            k6 = 6 * k; // base index
-
-            x0 = x[k6 + 0];
-            y0 = x[k6 + 1];
-            x1 = x[k6 + 2];
-            y1 = x[k6 + 3];
-            x2 = x[k6 + 4];
-            y2 = x[k6 + 5];
-
-            _gradient(x0, y0, x1, y1, x2, y2, z0, z1, z2, &zx, &zy);
-
-            zs = sqrt(1.0 + zx * zx + zy * zy);
-            z = (z0 + z1 + z2) * one_third;
-            h = w[k] - z;
-            if (h >= eps) {
-                S = -g * eta[k] * eta[k] * zs * sqrt((uh[k] * uh[k] + vh[k] * vh[k]));
-                S /= pow(h, seven_thirds); //Expensive (on Ole's home computer)
-                //S /= exp((7.0/3.0)*log(h));      //seems to save about 15% over manning_friction
-                //S /= h*h*(1 + h/3.0 - h*h/9.0); //FIXME: Could use a Taylor expansion
-
-
-                //Update momentum
-                xmom_update[k] += S * uh[k];
-                ymom_update[k] += S * vh[k];
-            }
-        }
-    }
-}
-
-// Computational function for flux computation
-int64_t _orig_fix_negative_cells(struct domain *D)
-{
-  int64_t k;
-  int64_t tff;
-  int64_t num_negative_cells = 0;
-
-  // #pragma omp parallel for private(k, tff) reduction(+:num_negative_cells)
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    tff = D->tri_full_flag[k];
-    if ((D->stage_centroid_values[k] - D->bed_centroid_values[k] < 0.0) & (tff > 0)) 
-    {
-      num_negative_cells = num_negative_cells + 1;
-      D->stage_centroid_values[k] = D->bed_centroid_values[k];
-      D->xmom_centroid_values[k] = 0.0;
-      D->ymom_centroid_values[k] = 0.0;
-    }
-  }
-  return num_negative_cells;
-}
\ No newline at end of file
diff --git a/anuga/shallow_water/sw_domain_orig_ext.pyx b/anuga/shallow_water/sw_domain_orig_ext.pyx
deleted file mode 100644
index 0d09a60dc..000000000
--- a/anuga/shallow_water/sw_domain_orig_ext.pyx
+++ /dev/null
@@ -1,553 +0,0 @@
-#cython: wraparound=False, boundscheck=False, cdivision=True, profile=False, nonecheck=False, overflowcheck=False, cdivision_warnings=False, unraisable_tracebacks=False
-import cython
-from libc.stdint cimport int64_t
-
-# import both numpy and the Cython declarations for numpy
-import numpy as np
-cimport numpy as np
-
-cdef extern from "sw_domain_orig.c" nogil:
-        struct domain:
-                int64_t number_of_elements
-                int64_t boundary_length
-                int64_t number_of_riverwall_edges
-                double epsilon
-                double H0
-                double g
-                int64_t optimise_dry_cells
-                double evolve_max_timestep
-                int64_t extrapolate_velocity_second_order
-                double minimum_allowed_height
-                double maximum_allowed_speed
-                int64_t low_froude
-                int64_t timestep_fluxcalls
-                double beta_w
-                double beta_w_dry
-                double beta_uh
-                double beta_uh_dry
-                double beta_vh
-                double beta_vh_dry
-                int64_t max_flux_update_frequency
-                int64_t ncol_riverwall_hydraulic_properties
-                int64_t* neighbours
-                int64_t* neighbour_edges
-                int64_t* surrogate_neighbours
-                double* normals
-                double* edgelengths
-                double* radii
-                double* areas
-                int64_t* edge_flux_type
-                int64_t* tri_full_flag
-                int64_t* already_computed_flux
-                double* max_speed
-                double* vertex_coordinates
-                double* edge_coordinates
-                double* centroid_coordinates
-                int64_t* number_of_boundaries
-                double* stage_edge_values
-                double* xmom_edge_values
-                double* ymom_edge_values
-                double* bed_edge_values
-                double* height_edge_values
-                double* stage_centroid_values
-                double* xmom_centroid_values
-                double* ymom_centroid_values
-                double* bed_centroid_values
-                double* height_centroid_values
-                double* stage_vertex_values
-                double* xmom_vertex_values
-                double* ymom_vertex_values
-                double* bed_vertex_values
-                double* height_vertex_values
-                double* stage_boundary_values
-                double* xmom_boundary_values
-                double* ymom_boundary_values
-                double* bed_boundary_values
-                double* stage_explicit_update
-                double* xmom_explicit_update
-                double* ymom_explicit_update
-                int64_t* flux_update_frequency
-                int64_t* update_next_flux
-                int64_t* update_extrapolation
-                double* edge_timestep
-                double* edge_flux_work
-                double* pressuregrad_work
-                double* x_centroid_work
-                double* y_centroid_work
-                double* boundary_flux_sum
-                int64_t* allow_timestep_increase
-                double* riverwall_elevation
-                int64_t* riverwall_rowIndex
-                double* riverwall_hydraulic_properties
-                double* stage_semi_implicit_update
-                double* xmom_semi_implicit_update
-                double* ymom_semi_implicit_update
-
-        struct edge:
-                pass
-
-        int64_t _compute_flux_update_frequency(domain* D, double timestep)
-        double _compute_fluxes_central(domain* D, double timestep)
-        double _protect_new(domain* D)
-        int64_t _extrapolate_second_order_edge_sw(domain* D)
-        int64_t _extrapolate_second_order_sw(domain* D)
-        int64_t _rotate(double *q, double n1, double n2)
-        int64_t _gravity(domain* D)
-        int64_t _gravity_wb(domain* D)
-        void _manning_friction_flat(double g, double eps, int64_t N, double* w, double* zv, double* uh, double* vh, double* eta, double* xmom, double* ymom)
-        void _manning_friction_sloped(double g, double eps, int64_t N, double* x, double* w, double* zv, double* uh, double* vh, double* eta, double* xmom_update, double* ymom_update)
-
-        int64_t _flux_function_central(double *q_left, double *q_right,
-                                   double h_left, double h_right,
-                           	   double hle, double hre,
-                           	   double n1, double n2,
-                           	   double epsilon,
-                           	   double ze,
-                           	   double limiting_threshold,
-                           	   double g,
-                           	   double *edgeflux, double *max_speed,
-                           	   double *pressure_flux, double hc,
-                           	   double hc_n,
-                           	   int64_t low_froude)
-
-        int64_t _orig_fix_negative_cells(domain* D)
-
-
-cdef int64_t pointer_flag = 0
-cdef int64_t parameter_flag = 0
-
-cdef inline get_python_domain_parameters(domain *D, object domain_object):
-        
-        D.number_of_elements = domain_object.number_of_elements
-        D.boundary_length = domain_object.boundary_length 
-        D.number_of_riverwall_edges = domain_object.number_of_riverwall_edges
-        D.epsilon = domain_object.epsilon
-        D.H0 = domain_object.H0
-        D.g = domain_object.g
-        D.optimise_dry_cells = domain_object.optimise_dry_cells
-        D.evolve_max_timestep = domain_object.evolve_max_timestep
-        D.minimum_allowed_height = domain_object.minimum_allowed_height
-        D.maximum_allowed_speed = domain_object.maximum_allowed_speed
-        D.timestep_fluxcalls = domain_object.timestep_fluxcalls
-        D.low_froude = domain_object.low_froude
-        D.extrapolate_velocity_second_order = domain_object.extrapolate_velocity_second_order
-        D.beta_w = domain_object.beta_w
-        D.beta_w_dry = domain_object.beta_w_dry
-        D.beta_uh = domain_object.beta_uh
-        D.beta_uh_dry = domain_object.beta_uh_dry
-        D.beta_vh = domain_object.beta_vh
-        D.beta_vh_dry = domain_object.beta_vh_dry
-        D.max_flux_update_frequency = domain_object.max_flux_update_frequency
-
-
-cdef inline get_python_domain_pointers(domain *D, object domain_object):
-
-        cdef int64_t[:,::1] neighbours
-        cdef int64_t[:,::1] neighbour_edges
-        cdef double[:,::1] normals
-        cdef double[:,::1] edgelengths
-        cdef double[::1]   radii
-        cdef double[::1]   areas
-        cdef int64_t[::1]  edge_flux_type
-        cdef int64_t[::1]  tri_full_flag
-        cdef int64_t[:,::1] already_computed_flux
-        cdef double[:,::1] vertex_coordinates
-        cdef double[:,::1] edge_coordinates
-        cdef double[:,::1] centroid_coordinates
-        cdef int64_t[::1]  number_of_boundaries
-        cdef int64_t[:,::1] surrogate_neighbours
-        cdef double[::1]   max_speed
-        cdef int64_t[::1]  flux_update_frequency
-        cdef int64_t[::1]  update_next_flux
-        cdef int64_t[::1]  update_extrapolation
-        cdef int64_t[::1]  allow_timestep_increase
-        cdef double[::1]   edge_timestep
-        cdef double[::1]   edge_flux_work
-        cdef double[::1]   pressuregrad_work
-        cdef double[::1]   x_centroid_work
-        cdef double[::1]   y_centroid_work
-        cdef double[::1]   boundary_flux_sum
-        cdef double[::1]   riverwall_elevation
-        cdef int64_t[::1]  riverwall_rowIndex
-        cdef double[:,::1] riverwall_hydraulic_properties
-        cdef double[:,::1] edge_values
-        cdef double[::1]   centroid_values
-        cdef double[:,::1] vertex_values
-        cdef double[::1]   boundary_values
-        cdef double[::1]   explicit_update
-        cdef double[::1]   semi_implicit_update
-
-        cdef object quantities
-        cdef object riverwallData
-
-        #------------------------------------------------------
-        # Domain structures
-        #------------------------------------------------------
-        neighbours = domain_object.neighbours
-        D.neighbours = &neighbours[0,0]
-
-        surrogate_neighbours = domain_object.surrogate_neighbours
-        D.surrogate_neighbours = &surrogate_neighbours[0,0]
-
-        neighbour_edges = domain_object.neighbour_edges
-        D.neighbour_edges = &neighbour_edges[0,0]
-
-        normals = domain_object.normals
-        D.normals = &normals[0,0]
-
-        edgelengths = domain_object.edgelengths
-        D.edgelengths = &edgelengths[0,0]
-
-        radii = domain_object.radii
-        D.radii = &radii[0]
-
-        areas = domain_object.areas
-        D.areas = &areas[0]
-
-        edge_flux_type = domain_object.edge_flux_type
-        D.edge_flux_type = &edge_flux_type[0]
-
-        tri_full_flag = domain_object.tri_full_flag
-        D.tri_full_flag = &tri_full_flag[0]
-
-        already_computed_flux = domain_object.already_computed_flux
-        D.already_computed_flux = &already_computed_flux[0,0]
-
-        vertex_coordinates = domain_object.vertex_coordinates
-        D.vertex_coordinates = &vertex_coordinates[0,0]
-
-        edge_coordinates = domain_object.edge_coordinates
-        D.edge_coordinates = &edge_coordinates[0,0]
-
-        centroid_coordinates = domain_object.centroid_coordinates
-        D.centroid_coordinates = &centroid_coordinates[0,0]
-
-        max_speed = domain_object.max_speed
-        D.max_speed = &max_speed[0]
-
-        number_of_boundaries = domain_object.number_of_boundaries
-        D.number_of_boundaries = &number_of_boundaries[0]
-
-        flux_update_frequency = domain_object.flux_update_frequency
-        D.flux_update_frequency = &flux_update_frequency[0]
-
-        update_next_flux = domain_object.update_next_flux
-        D.update_next_flux = &update_next_flux[0]
-
-        update_extrapolation = domain_object.update_extrapolation
-        D.update_extrapolation = &update_extrapolation[0]
-
-        allow_timestep_increase = domain_object.allow_timestep_increase
-        D.allow_timestep_increase = &allow_timestep_increase[0]
-
-        edge_timestep = domain_object.edge_timestep
-        D.edge_timestep = &edge_timestep[0]
-
-        edge_flux_work = domain_object.edge_flux_work
-        D.edge_flux_work = &edge_flux_work[0]
-
-        pressuregrad_work = domain_object.pressuregrad_work
-        D.pressuregrad_work = &pressuregrad_work[0]
-
-        x_centroid_work = domain_object.x_centroid_work
-        D.x_centroid_work = &x_centroid_work[0]
-
-        y_centroid_work = domain_object.y_centroid_work
-        D.y_centroid_work = &y_centroid_work[0]
-
-        boundary_flux_sum = domain_object.boundary_flux_sum
-        D.boundary_flux_sum = &boundary_flux_sum[0]
-
-        #------------------------------------------------------
-        # Quantity structures
-        #------------------------------------------------------
-        quantities = domain_object.quantities
-        stage = quantities["stage"]
-        xmomentum = quantities["xmomentum"]
-        ymomentum = quantities["ymomentum"]
-        elevation = quantities["elevation"]
-        height = quantities["height"]
-
-        edge_values = stage.edge_values
-        D.stage_edge_values = &edge_values[0,0]
-
-        edge_values = xmomentum.edge_values
-        D.xmom_edge_values = &edge_values[0,0]
-
-        edge_values = ymomentum.edge_values
-        D.ymom_edge_values = &edge_values[0,0]
-
-        edge_values = elevation.edge_values
-        D.bed_edge_values = &edge_values[0,0]
-
-        edge_values = height.edge_values
-        D.height_edge_values = &edge_values[0,0]
-
-        centroid_values = stage.centroid_values
-        D.stage_centroid_values = &centroid_values[0]
-
-        centroid_values = xmomentum.centroid_values
-        D.xmom_centroid_values = &centroid_values[0]
-
-        centroid_values = ymomentum.centroid_values
-        D.ymom_centroid_values = &centroid_values[0]
-
-        centroid_values = elevation.centroid_values
-        D.bed_centroid_values = &centroid_values[0]
-
-        centroid_values = height.centroid_values
-        D.height_centroid_values = &centroid_values[0]
-
-        vertex_values = stage.vertex_values
-        D.stage_vertex_values = &vertex_values[0,0]
-
-        vertex_values = xmomentum.vertex_values
-        D.xmom_vertex_values = &vertex_values[0,0]
-
-        vertex_values = ymomentum.vertex_values
-        D.ymom_vertex_values = &vertex_values[0,0]
-
-        vertex_values = elevation.vertex_values
-        D.bed_vertex_values = &vertex_values[0,0]
-
-        vertex_values = height.vertex_values
-        D.height_vertex_values = &vertex_values[0,0]
-
-        boundary_values = stage.boundary_values
-        D.stage_boundary_values = &boundary_values[0]
-
-        boundary_values = xmomentum.boundary_values
-        D.xmom_boundary_values = &boundary_values[0]
-
-        boundary_values = ymomentum.boundary_values
-        D.ymom_boundary_values = &boundary_values[0]
-
-        boundary_values = elevation.boundary_values
-        D.bed_boundary_values = &boundary_values[0]
-
-        explicit_update = stage.explicit_update
-        D.stage_explicit_update = &explicit_update[0]
-
-        explicit_update = xmomentum.explicit_update
-        D.xmom_explicit_update = &explicit_update[0]
-
-        explicit_update = ymomentum.explicit_update
-        D.ymom_explicit_update = &explicit_update[0]
-
-        semi_implicit_update = stage.semi_implicit_update
-        D.stage_semi_implicit_update = &semi_implicit_update[0]
-
-        semi_implicit_update = xmomentum.semi_implicit_update
-        D.xmom_semi_implicit_update = &semi_implicit_update[0]
-
-        semi_implicit_update = ymomentum.semi_implicit_update
-        D.ymom_semi_implicit_update = &semi_implicit_update[0]
-
-        #------------------------------------------------------
-        # Riverwall structures
-        #------------------------------------------------------
-        riverwallData = domain_object.riverwallData
-
-        riverwall_elevation = riverwallData.riverwall_elevation
-        D.riverwall_elevation = &riverwall_elevation[0]
-
-        riverwall_rowIndex = riverwallData.hydraulic_properties_rowIndex
-        D.riverwall_rowIndex = &riverwall_rowIndex[0]
-
-        D.ncol_riverwall_hydraulic_properties = riverwallData.ncol_hydraulic_properties
-
-        riverwall_hydraulic_properties = riverwallData.hydraulic_properties
-        D.riverwall_hydraulic_properties = &riverwall_hydraulic_properties[0,0]
-
-
-#===============================================================================
-
-def rotate(np.ndarray[double, ndim=1, mode="c"] q not None, np.ndarray[double, ndim=1, mode="c"] normal not None, int64_t direction):
-
-        assert normal.shape[0] == 2, "Normal vector must have 2 components"
-
-        cdef np.ndarray[double, ndim=1, mode="c"] r
-        cdef double n1, n2
-
-        n1 = normal[0]
-        n2 = normal[1]
-
-        if direction == -1:
-                n2 = -n2
-
-        r = np.ascontiguousarray(np.copy(q))
-
-        _rotate(&r[0], n1, n2)
-
-        return r
-
-
-def flux_function_central(np.ndarray[double, ndim=1, mode="c"] normal not None,\
-                          np.ndarray[double, ndim=1, mode="c"] ql not None,\
-                          np.ndarray[double, ndim=1, mode="c"] qr not None,\
-                          double h_left,\
-                          double h_right,\
-			  double hle,\
-			  double hre,\
-                          np.ndarray[double, ndim=1, mode="c"] edgeflux not None,\
-                          double epsilon,\
-			  double ze,\
-                          double g,\
-                          double H0,\
-			  double hc,\
-			  double hc_n,\
-			  int64_t low_froude):
-
-        cdef double h0, limiting_threshold, max_speed, pressure_flux
-        cdef int64_t err
-
-        h0 = H0*H0
-        limiting_threshold = 10*H0
-
-        err = _flux_function_central(&ql[0], &qr[0],
-	      			     h_left, h_right, hle, hre, normal[0], normal[1],
-				     epsilon, ze, limiting_threshold, g,
-				     &edgeflux[0], &max_speed, &pressure_flux,
-				     hc, hc_n, low_froude)
-
-        assert err >= 0, "Discontinuous Elevation"
-
-        return max_speed, pressure_flux
-
-
-
-def compute_fluxes_ext_central(object domain_object, double timestep):
-
-        cdef domain D
-
-        # FIXME SR: These should presumably only be called at the start of evolve loop
-        # FIXME SR: How do we store D in the domain object?
-        get_python_domain_parameters(&D, domain_object)
-        get_python_domain_pointers(&D, domain_object)
-
-        with nogil:
-                timestep = _compute_fluxes_central(&D, timestep)
-
-        return timestep
-
-
-def extrapolate_second_order_sw(object domain_object):
-
-        cdef domain D
-        cdef int64_t e
-
-        get_python_domain_parameters(&D, domain_object)
-        get_python_domain_pointers(&D, domain_object)
-
-        with nogil:
-             e = _extrapolate_second_order_sw(&D)
-
-        if e == -1:
-                return None
-
-# Existing code
-def extrapolate_second_order_edge_sw(object domain_object):
-
-        cdef domain D
-        cdef int64_t e
-
-        get_python_domain_parameters(&D, domain_object)
-        get_python_domain_pointers(&D, domain_object)
-
-        with nogil:
-                e = _extrapolate_second_order_edge_sw(&D)
-
-        if e == -1:
-                return None
-
-def protect_new(object domain_object):
-
-        cdef domain D
-
-        cdef double mass_error
-
-        get_python_domain_parameters(&D, domain_object)
-        get_python_domain_pointers(&D, domain_object)
-
-        with nogil:
-                mass_error = _protect_new(&D)
-
-        return mass_error
-
-def compute_flux_update_frequency(object domain_object, double timestep):
-
-        cdef domain D
-
-        get_python_domain_parameters(&D, domain_object)
-        get_python_domain_pointers(&D, domain_object)
-
-        with nogil:
-                _compute_flux_update_frequency(&D, timestep)
-
-
-def gravity(object domain_object):
-
-        cdef domain D
-
-        get_python_domain_parameters(&D, domain_object)
-        get_python_domain_pointers(&D, domain_object)
-
-        err = _gravity(&D)
-
-        if err == -1:
-                return None
-
-def gravity_wb(object domain_object):
-
-        cdef domain D
-
-        get_python_domain_parameters(&D, domain_object)
-        get_python_domain_pointers(&D, domain_object)
-
-        err = _gravity_wb(&D)
-
-        if err == -1:
-                return None
-
-def manning_friction_flat(double g,\
-                        double eps,\
-                        np.ndarray[double, ndim=1, mode="c"] w not None,\
-                        np.ndarray[double, ndim=1, mode="c"] uh not None,\
-                        np.ndarray[double, ndim=1, mode="c"] vh not None,\
-                        np.ndarray[double, ndim=1, mode="c"] z not None,\
-                        np.ndarray[double, ndim=1, mode="c"] eta not None,\
-                        np.ndarray[double, ndim=1, mode="c"] xmom not None,\
-                        np.ndarray[double, ndim=1, mode="c"] ymom not None):
-
-        cdef int64_t N
-
-        N = w.shape[0]
-        _manning_friction_flat(g, eps, N, &w[0], &z[0], &uh[0], &vh[0], &eta[0], &xmom[0], &ymom[0])
-
-def manning_friction_sloped(double g, double eps,\
-                        np.ndarray[double, ndim=2, mode="c"] x not None,\
-                        np.ndarray[double, ndim=1, mode="c"] w not None,\
-                        np.ndarray[double, ndim=1, mode="c"] uh not None,\
-                        np.ndarray[double, ndim=1, mode="c"] vh not None,\
-                        np.ndarray[double, ndim=2, mode="c"] z not None,\
-                        np.ndarray[double, ndim=1, mode="c"] eta not None,\
-                        np.ndarray[double, ndim=1, mode="c"] xmom not None,\
-                        np.ndarray[double, ndim=1, mode="c"] ymom not None):
-
-        cdef int64_t N
-
-        N = w.shape[0]
-        _manning_friction_sloped(g, eps, N, &x[0,0], &w[0], &z[0,0], &uh[0], &vh[0], &eta[0], &xmom[0], &ymom[0])
-
-def fix_negative_cells(object domain_object):
-
-        cdef domain D
-        cdef int64_t num_negative_cells
-
-        get_python_domain_parameters(&D, domain_object)
-        get_python_domain_pointers(&D, domain_object)
-
-        with nogil:
-                num_negative_cells = _orig_fix_negative_cells(&D)
-
-        return num_negative_cells
diff --git a/anuga/shallow_water/sw_domain_simd.c b/anuga/shallow_water/sw_domain_simd.c
deleted file mode 100644
index aea9b2362..000000000
--- a/anuga/shallow_water/sw_domain_simd.c
+++ /dev/null
@@ -1,1958 +0,0 @@
-// Python - C extension module for shallow_water.py
-//
-// To compile (Python2.6):
-//  gcc -c swb2_domain_ext.c -I/usr/include/python2.6 -o domain_ext.o -Wall -O
-//  gcc -shared swb2_domain_ext.o  -o swb2_domain_ext.so
-//
-// or use python compile.py
-//
-// See the module swb_domain.py for more documentation on
-// how to use this module
-//
-//
-// Stephen Roberts, ANU 2009
-// Ole Nielsen, GA 2004
-// Gareth Davies, GA 2011
-
-#include "math.h"
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <stdint.h>
-
-
-#include "sw_domain.h"
-
-const double pi = 3.14159265358979;
-
-// Trick to compute n modulo d (n%d in python) when d is a power of 2
-uint64_t __mod_of_power_2(uint64_t n, uint64_t d)
-{
-  return (n & (d - 1));
-}
-
-// Computational function for rotation
-int64_t __rotate(double *q, double n1, double n2)
-{
-  /*Rotate the last  2 coordinates of q (q[1], q[2])
-    from x,y coordinates to coordinates based on normal vector (n1, n2).
-
-    Result is returned in array 2x1 r
-    To rotate in opposite direction, call rotate with (q, n1, -n2)
-
-    Contents of q are changed by this function */
-
-  double q1, q2;
-
-  // Shorthands
-  q1 = q[1]; // x coordinate
-  q2 = q[2]; // y coordinate
-
-  // Rotate
-  q[1] = n1 * q1 + n2 * q2;
-  q[2] = -n2 * q1 + n1 * q2;
-
-  return 0;
-}
-
-// Innermost flux function (using stage w=z+h)
-int64_t __flux_function_central(double *q_left, double *q_right,
-                            double h_left, double h_right,
-                            double hle, double hre,
-                            double n1, double n2,
-                            double epsilon,
-                            double ze,
-                            double limiting_threshold,
-                            double g,
-                            double *edgeflux, double *max_speed,
-                            double *pressure_flux, double hc,
-                            double hc_n,
-                            int64_t low_froude)
-{
-
-  /*Compute fluxes between volumes for the shallow water wave equation
-    cast in terms of the 'stage', w = h+z using
-    the 'central scheme' as described in
-
-    Kurganov, Noelle, Petrova. 'Semidiscrete Central-Upwind Schemes For
-    Hyperbolic Conservation Laws and Hamilton-Jacobi Equations'.
-    Siam J. Sci. Comput. Vol. 23, No. 3, pp. 707-740.
-
-    The implemented formula is given in equation (3.15) on page 714
-
-    FIXME: Several variables in this interface are no longer used, clean up
-  */
-
-  int64_t i;
-
-  double uh_left, vh_left, u_left;
-  double uh_right, vh_right, u_right;
-  double s_min, s_max, soundspeed_left, soundspeed_right;
-  double denom, inverse_denominator;
-  double tmp, local_fr, v_right, v_left;
-  double q_left_rotated[3], q_right_rotated[3], flux_right[3], flux_left[3];
-
-  if (h_left == 0. && h_right == 0.)
-  {
-    // Quick exit
-    memset(edgeflux, 0, 3 * sizeof(double));
-    *max_speed = 0.0;
-    *pressure_flux = 0.;
-    return 0;
-  }
-  // Copy conserved quantities to protect from modification
-  q_left_rotated[0] = q_left[0];
-  q_right_rotated[0] = q_right[0];
-  q_left_rotated[1] = q_left[1];
-  q_right_rotated[1] = q_right[1];
-  q_left_rotated[2] = q_left[2];
-  q_right_rotated[2] = q_right[2];
-
-  // Align x- and y-momentum with x-axis
-  __rotate(q_left_rotated, n1, n2);
-  __rotate(q_right_rotated, n1, n2);
-
-  // Compute speeds in x-direction
-  // w_left = q_left_rotated[0];
-  uh_left = q_left_rotated[1];
-  vh_left = q_left_rotated[2];
-  if (hle > 0.0)
-  {
-    tmp = 1.0 / hle;
-    u_left = uh_left * tmp; // max(h_left, 1.0e-06);
-    uh_left = h_left * u_left;
-    v_left = vh_left * tmp; // Only used to define local_fr
-    vh_left = h_left * tmp * vh_left;
-  }
-  else
-  {
-    u_left = 0.;
-    uh_left = 0.;
-    vh_left = 0.;
-    v_left = 0.;
-  }
-
-  // u_left = _compute_speed(&uh_left, &hle,
-  //             epsilon, h0, limiting_threshold);
-
-  // w_right = q_right_rotated[0];
-  uh_right = q_right_rotated[1];
-  vh_right = q_right_rotated[2];
-  if (hre > 0.0)
-  {
-    tmp = 1.0 / hre;
-    u_right = uh_right * tmp; // max(h_right, 1.0e-06);
-    uh_right = h_right * u_right;
-    v_right = vh_right * tmp; // Only used to define local_fr
-    vh_right = h_right * tmp * vh_right;
-  }
-  else
-  {
-    u_right = 0.;
-    uh_right = 0.;
-    vh_right = 0.;
-    v_right = 0.;
-  }
-  // u_right = _compute_speed(&uh_right, &hre,
-  //               epsilon, h0, limiting_threshold);
-
-  // Maximal and minimal wave speeds
-  soundspeed_left = sqrt(g * h_left);
-  soundspeed_right = sqrt(g * h_right);
-  // soundspeed_left  = sqrt(g*hle);
-  // soundspeed_right = sqrt(g*hre);
-
-  // Something that scales like the Froude number
-  // We will use this to scale the diffusive component of the UH/VH fluxes.
-
-  // local_fr = sqrt(
-  //     max(0.001, min(1.0,
-  //         (u_right*u_right + u_left*u_left + v_right*v_right + v_left*v_left)/
-  //         (soundspeed_left*soundspeed_left + soundspeed_right*soundspeed_right + 1.0e-10))));
-  if (low_froude == 1)
-  {
-    local_fr = sqrt(
-        fmax(0.001, fmin(1.0,
-                         (u_right * u_right + u_left * u_left + v_right * v_right + v_left * v_left) /
-                             (soundspeed_left * soundspeed_left + soundspeed_right * soundspeed_right + 1.0e-10))));
-  }
-  else if (low_froude == 2)
-  {
-    local_fr = sqrt((u_right * u_right + u_left * u_left + v_right * v_right + v_left * v_left) /
-                    (soundspeed_left * soundspeed_left + soundspeed_right * soundspeed_right + 1.0e-10));
-    local_fr = sqrt(fmin(1.0, 0.01 + fmax(local_fr - 0.01, 0.0)));
-  }
-  else
-  {
-    local_fr = 1.0;
-  }
-  // printf("local_fr %e \n:", local_fr);
-
-  s_max = fmax(u_left + soundspeed_left, u_right + soundspeed_right);
-  if (s_max < 0.0)
-  {
-    s_max = 0.0;
-  }
-
-  // if( hc < 1.0e-03){
-  //   s_max = 0.0;
-  // }
-
-  s_min = fmin(u_left - soundspeed_left, u_right - soundspeed_right);
-  if (s_min > 0.0)
-  {
-    s_min = 0.0;
-  }
-
-  // if( hc_n < 1.0e-03){
-  //   s_min = 0.0;
-  // }
-
-  // Flux formulas
-  flux_left[0] = u_left * h_left;
-  flux_left[1] = u_left * uh_left; //+ 0.5*g*h_left*h_left;
-  flux_left[2] = u_left * vh_left;
-
-  flux_right[0] = u_right * h_right;
-  flux_right[1] = u_right * uh_right; //+ 0.5*g*h_right*h_right;
-  flux_right[2] = u_right * vh_right;
-
-  // Flux computation
-  denom = s_max - s_min;
-  if (denom < epsilon)
-  {
-    // Both wave speeds are very small
-    memset(edgeflux, 0, 3 * sizeof(double));
-
-    *max_speed = 0.0;
-    //*pressure_flux = 0.0;
-    *pressure_flux = 0.5 * g * 0.5 * (h_left * h_left + h_right * h_right);
-  }
-  else
-  {
-    // Maximal wavespeed
-    *max_speed = fmax(s_max, -s_min);
-
-    inverse_denominator = 1.0 / fmax(denom, 1.0e-100);
-    for (i = 0; i < 3; i++)
-    {
-      edgeflux[i] = s_max * flux_left[i] - s_min * flux_right[i];
-
-      // Standard smoothing term
-      // edgeflux[i] += 1.0*(s_max*s_min)*(q_right_rotated[i] - q_left_rotated[i]);
-      // Smoothing by stage alone can cause high velocities / slow draining for nearly dry cells
-      if (i == 0)
-        edgeflux[i] += (s_max * s_min) * (fmax(q_right_rotated[i], ze) - fmax(q_left_rotated[i], ze));
-      // if(i==0) edgeflux[i] += (s_max*s_min)*(h_right - h_left);
-      if (i == 1)
-        edgeflux[i] += local_fr * (s_max * s_min) * (uh_right - uh_left);
-      if (i == 2)
-        edgeflux[i] += local_fr * (s_max * s_min) * (vh_right - vh_left);
-
-      edgeflux[i] *= inverse_denominator;
-    }
-    // Separate pressure flux, so we can apply different wet-dry hacks to it
-    *pressure_flux = 0.5 * g * (s_max * h_left * h_left - s_min * h_right * h_right) * inverse_denominator;
-
-    // Rotate back
-    __rotate(edgeflux, n1, -n2);
-  }
-
-  return 0;
-}
-
-int64_t __simd__flux_function_central(double q_left0, double q_left1, double q_left2,
-                                   double q_right0, double q_right1, double q_right2,
-                                   double h_left, double h_right,
-                                   double hle, double hre,
-                                   double n1, double n2,
-                                   double epsilon,
-                                   double ze,
-                                   double limiting_threshold,
-                                   double g,
-                                   double *edgeflux0, double *edgeflux1, double *edgeflux2,
-                                   double *max_speed,
-                                   double *pressure_flux, double hc,
-                                   double hc_n,
-                                   int64_t low_froude)
-{
-
-  double edgeflux[3];
-  double q_left[3];
-  double q_right[3];
-
-  int64_t ierr;
-
-  edgeflux[0] = *edgeflux0;
-  edgeflux[1] = *edgeflux1;
-  edgeflux[2] = *edgeflux2;
-
-  q_left[0] = q_left0;
-  q_left[1] = q_left1;
-  q_left[2] = q_left2;
-
-  q_right[0] = q_right0;
-  q_right[1] = q_right1;
-  q_right[2] = q_right2;
-
-  ierr = __flux_function_central(q_left, q_right,
-                                 h_left, h_right,
-                                 hle, hre,
-                                 n1, n2,
-                                 epsilon,
-                                 ze,
-                                 limiting_threshold,
-                                 g,
-                                 edgeflux, max_speed,
-                                 pressure_flux, hc,
-                                 hc_n,
-                                 low_froude);
-
-  *edgeflux0 = edgeflux[0];
-  *edgeflux1 = edgeflux[1];
-  *edgeflux2 = edgeflux[2];
-
-  return ierr;
-}
-
-double __adjust_edgeflux_with_weir(double *edgeflux,
-                                   double h_left, double h_right,
-                                   double g, double weir_height,
-                                   double Qfactor,
-                                   double s1, double s2,
-                                   double h1, double h2,
-                                   double *max_speed_local)
-{
-  // Adjust the edgeflux to agree with a weir relation [including
-  // subergence], but smoothly vary to shallow water solution when
-  // the flow over the weir is much deeper than the weir, or the
-  // upstream/downstream water elevations are too similar
-  double rw, rw2; // 'Raw' weir fluxes
-  double rwRat, hdRat, hdWrRat, scaleFlux, minhd, maxhd;
-  double w1, w2; // Weights for averaging
-  double newFlux;
-  double twothirds = (2.0 / 3.0);
-  // Following constants control the 'blending' with the shallow water solution
-  // They are now user-defined
-  // double s1=0.9; // At this submergence ratio, begin blending with shallow water solution
-  // double s2=0.95; // At this submergence ratio, completely use shallow water solution
-  // double h1=1.0; // At this (tailwater height above weir) / (weir height) ratio, begin blending with shallow water solution
-  // double h2=1.5; // At this (tailwater height above weir) / (weir height) ratio, completely use the shallow water solution
-
-  if ((h_left <= 0.0) && (h_right <= 0.0))
-  {
-    return 0;
-  }
-
-  minhd = fmin(h_left, h_right);
-  maxhd = fmax(h_left, h_right);
-  // 'Raw' weir discharge = Qfactor*2/3*H*(2/3*g*H)**0.5
-  rw = Qfactor * twothirds * maxhd * sqrt(twothirds * g * maxhd);
-  // Factor for villemonte correction
-  rw2 = Qfactor * twothirds * minhd * sqrt(twothirds * g * minhd);
-  // Useful ratios
-  rwRat = rw2 / fmax(rw, 1.0e-100);
-  hdRat = minhd / fmax(maxhd, 1.0e-100);
-
-  // (tailwater height above weir)/weir_height ratio
-  hdWrRat = minhd / fmax(weir_height, 1.0e-100);
-
-  // Villemonte (1947) corrected weir flow with submergence
-  // Q = Q1*(1-Q2/Q1)**0.385
-  rw = rw * pow(1.0 - rwRat, 0.385);
-
-  if (h_right > h_left)
-  {
-    rw *= -1.0;
-  }
-
-  if ((hdRat < s2) & (hdWrRat < h2))
-  {
-    // Rescale the edge fluxes so that the mass flux = desired flux
-    // Linearly shift to shallow water solution between hdRat = s1 and s2
-    // and between hdWrRat = h1 and h2
-
-    //
-    // WEIGHT WITH RAW SHALLOW WATER FLUX BELOW
-    // This ensures that as the weir gets very submerged, the
-    // standard shallow water equations smoothly take over
-    //
-
-    // Weighted average constants to transition to shallow water eqn flow
-    w1 = fmin(fmax(hdRat - s1, 0.) / (s2 - s1), 1.0);
-
-    // Adjust again when the head is too deep relative to the weir height
-    w2 = fmin(fmax(hdWrRat - h1, 0.) / (h2 - h1), 1.0);
-
-    newFlux = (rw * (1.0 - w1) + w1 * edgeflux[0]) * (1.0 - w2) + w2 * edgeflux[0];
-
-    if (fabs(edgeflux[0]) > 1.0e-100)
-    {
-      scaleFlux = newFlux / edgeflux[0];
-    }
-    else
-    {
-      scaleFlux = 0.;
-    }
-
-    scaleFlux = fmax(scaleFlux, 0.);
-
-    edgeflux[0] = newFlux;
-
-    // FIXME: Do this in a cleaner way
-    // IDEA: Compute momentum flux implied by weir relations, and use
-    //       those in a weighted average (rather than the rescaling trick here)
-    // If we allow the scaling to momentum to be unbounded,
-    // velocity spikes can arise for very-shallow-flooded walls
-    edgeflux[1] *= fmin(scaleFlux, 10.);
-    edgeflux[2] *= fmin(scaleFlux, 10.);
-  }
-
-  // Adjust the max speed
-  if (fabs(edgeflux[0]) > 0.)
-  {
-    *max_speed_local = sqrt(g * (maxhd + weir_height)) + fabs(edgeflux[0] / (maxhd + 1.0e-12));
-  }
-  //*max_speed_local += fabs(edgeflux[0])/(maxhd+1.0e-100);
-  //*max_speed_local *= fmax(scaleFlux, 1.0);
-
-  return 0;
-}
-
-double __simd__adjust_edgeflux_with_weir(double *edgeflux0, double *edgeflux1, double *edgeflux2,
-                                          double h_left, double h_right,
-                                          double g, double weir_height,
-                                          double Qfactor,
-                                          double s1, double s2,
-                                          double h1, double h2,
-                                          double *max_speed_local)
-{
-
-  double edgeflux[3];
-  int64_t ierr;
-
-  edgeflux[0] = *edgeflux0;
-  edgeflux[1] = *edgeflux1;
-  edgeflux[2] = *edgeflux2;
-
-  ierr = __adjust_edgeflux_with_weir(edgeflux0, h_left, h_right,
-                                     g, weir_height,
-                                     Qfactor, s1, s2, h1, h2,
-                                     max_speed_local);
-  *edgeflux0 = edgeflux[0];
-  *edgeflux1 = edgeflux[1];
-  *edgeflux2 = edgeflux[2];
-
-  return ierr;
-}
-
-// Computational function for flux computation
-double _simd_compute_fluxes_central(struct domain *D,
-                                      double timestep)
-{
-  // Local variables
-  int64_t K = D->number_of_elements;
-  // int64_t KI, KI2, KI3, B, RW, RW5, SubSteps;
-  int64_t substep_count;
-
-  double max_speed_local, length, inv_area, zl, zr;
-  double h_left, h_right, z_half; // For andusse scheme
-  // FIXME: limiting_threshold is not used for DE1
-  double limiting_threshold = 10 * D->H0;
-  int64_t low_froude = D->low_froude;
-  double g = D->g;
-  double epsilon = D->epsilon;
-  int64_t ncol_riverwall_hydraulic_properties = D->ncol_riverwall_hydraulic_properties;
-
-  // Workspace (making them static actually made function slightly slower (Ole))
-  double ql[3];
-  double qr[3];
-  double edgeflux[3]; // Work array for summing up fluxes
-  double pressuregrad_work;
-  double edge_timestep;
-  double normal_x, normal_y;
-  // static double local_timestep;
-
-  double hle, hre, zc, zc_n, Qfactor, s1, s2, h1, h2;
-  double pressure_flux, hc, hc_n;
-  double h_left_tmp, h_right_tmp;
-  double speed_max_last, weir_height;
-  int64_t RiverWall_count;
-
-  //
-  int64_t k, i, m, n, ii;
-  int64_t ki, nm = 0, ki2; // Index shorthands
-
-  static int64_t call = 0; // Static local variable flagging already computed flux
-  static int64_t timestep_fluxcalls = 1;
-  static int64_t base_call = 1;
-
-  call++; // Flag 'id' of flux calculation for this timestep
-
-  if (D->timestep_fluxcalls != timestep_fluxcalls)
-  {
-    timestep_fluxcalls = D->timestep_fluxcalls;
-    base_call = call;
-  }
-
-  // Which substep of the timestepping method are we on?
-  substep_count = (call - base_call) % D->timestep_fluxcalls;
-
-  double local_timestep = 1.0e+100;
-  double boundary_flux_sum_substep = 0.0; 
-
-// For all triangles
-// #pragma omp parallel for simd default(none) shared(D, substep_count, K) \
-//                                      firstprivate(ncol_riverwall_hydraulic_properties, epsilon, g, low_froude, limiting_threshold) \
-//                                      private(i, ki, ki2, n, m, nm, ii,                                                  \
-//                                      max_speed_local, length, inv_area, zl, zr,                                         \
-//                                      h_left, h_right,                                                                   \
-//                                      z_half, ql,  pressuregrad_work,                                                    \
-//                                      qr, edgeflux, edge_timestep, normal_x, normal_y,                                   \
-//                                      hle, hre, zc, zc_n, Qfactor, s1, s2, h1, h2, pressure_flux, hc, hc_n,              \
-//                                      h_left_tmp, h_right_tmp, speed_max_last, weir_height, RiverWall_count)             \
-//                                      reduction(min : local_timestep) reduction(+:boundary_flux_sum_substep)
-  for (k = 0; k < K; k++)
-  {
-    speed_max_last = 0.0;
-    // Set explicit_update to zero for all conserved_quantities.
-    // This assumes compute_fluxes called before forcing terms
-    D->stage_explicit_update[k] = 0.0;
-    D->xmom_explicit_update[k] = 0.0;
-    D->ymom_explicit_update[k] = 0.0;
-
-    // Loop through neighbours and compute edge flux for each
-    for (i = 0; i < 3; i++)
-    {
-      ki = 3 * k + i; // Linear index to edge i of triangle k
-      ki2 = 2 * ki;   // k*6 + i*2
-
-      // Get left hand side values from triangle k, edge i
-      ql[0] = D->stage_edge_values[ki];
-      ql[1] = D->xmom_edge_values[ki];
-      ql[2] = D->ymom_edge_values[ki];
-      zl    = D->bed_edge_values[ki];
-      hle   = D->height_edge_values[ki];
-
-      hc = D->height_centroid_values[k];
-      zc = D->bed_centroid_values[k];
-
-      // Get right hand side values either from neighbouring triangle
-      // or from boundary array (Quantities at neighbour on nearest face).
-      n = D->neighbours[ki];
-      hc_n = hc;
-      zc_n = D->bed_centroid_values[k];
-      if (n < 0)
-      {
-        // Neighbour is a boundary condition
-        m = -n - 1; // Convert negative flag to boundary index
-
-        qr[0] = D->stage_boundary_values[m];
-        qr[1] = D->xmom_boundary_values[m];
-        qr[2] = D->ymom_boundary_values[m];
-        zr = zl;                   // Extend bed elevation to boundary
-        hre = fmax(qr[0] - zr, 0.0); // hle;
-      }
-      else
-      {
-        // Neighbour is a real triangle
-        hc_n = D->height_centroid_values[n];
-        zc_n = D->bed_centroid_values[n];
-
-        m = D->neighbour_edges[ki];
-        nm = n * 3 + m; // Linear index (triangle n, edge m)
-
-        qr[0] = D->stage_edge_values[nm];
-        qr[1] = D->xmom_edge_values[nm];
-        qr[2] = D->ymom_edge_values[nm];
-        zr = D->bed_edge_values[nm];
-        hre = D->height_edge_values[nm];
-      }
-
-      // Audusse magic for well balancing
-      z_half = fmax(zl, zr);
-
-      // Account for riverwalls
-      if (D->edge_flux_type[ki] == 1)
-      {
-        RiverWall_count = D->edge_river_wall_counter[ki];
-
-        // Set central bed to riverwall elevation
-        z_half = fmax(D->riverwall_elevation[RiverWall_count - 1], z_half);
-      }
-
-      // Define h left/right for Audusse flux method
-      h_left = fmax(hle + zl - z_half, 0.);
-      h_right = fmax(hre + zr - z_half, 0.);
-
-      normal_x = D->normals[ki2];
-      normal_y = D->normals[ki2 + 1];
-
-      // Edge flux computation (triangle k, edge i)
-      __flux_function_central(ql, qr,
-                              h_left, h_right,
-                              hle, hre,
-                              normal_x, normal_y,
-                              epsilon, z_half, limiting_threshold, g,
-                              edgeflux, &max_speed_local, &pressure_flux,
-                              hc, hc_n, low_froude);
-
-      // Force weir discharge to match weir theory
-      if (D->edge_flux_type[ki] == 1)
-      {
-
-        RiverWall_count = D->edge_river_wall_counter[ki];
-
-        // printf("RiverWall_count %ld\n", RiverWall_count);
-
-        ii = D->riverwall_rowIndex[RiverWall_count - 1] * ncol_riverwall_hydraulic_properties;
-
-        // Get Qfactor index - multiply the idealised weir discharge by this constant factor
-        // Get s1, submergence ratio at which we start blending with the shallow water solution
-        // Get s2, submergence ratio at which we entirely use the shallow water solution
-        // Get h1, tailwater head / weir height at which we start blending with the shallow water solution
-        // Get h2, tailwater head / weir height at which we entirely use the shallow water solution
-        Qfactor = D->riverwall_hydraulic_properties[ii];
-        s1 = D->riverwall_hydraulic_properties[ii + 1];
-        s2 = D->riverwall_hydraulic_properties[ii + 2];
-        h1 = D->riverwall_hydraulic_properties[ii + 3];
-        h2 = D->riverwall_hydraulic_properties[ii + 4];
-
-        weir_height = fmax(D->riverwall_elevation[RiverWall_count - 1] - fmin(zl, zr), 0.); // Reference weir height
-
-        // Use first-order h's for weir -- as the 'upstream/downstream' heads are
-        //  measured away from the weir itself
-        h_left_tmp = fmax(D->stage_centroid_values[k] - z_half, 0.);
-
-        if (n >= 0)
-        {
-          h_right_tmp = fmax(D->stage_centroid_values[n] - z_half, 0.);
-        }
-        else
-        {
-          h_right_tmp = fmax(hc_n + zr - z_half, 0.);
-        }
-
-        // If the weir is not higher than both neighbouring cells, then
-        // do not try to match the weir equation. If we do, it seems we
-        // can get mass conservation issues (caused by large weir
-        // fluxes in such situations)
-        if (D->riverwall_elevation[RiverWall_count - 1] > fmax(zc, zc_n))
-        {
-          // Weir flux adjustment
-          __adjust_edgeflux_with_weir(edgeflux, h_left_tmp, h_right_tmp, g,
-                                             weir_height, Qfactor,
-                                             s1, s2, h1, h2, &max_speed_local);
-        }
-      }
-
-      // Multiply edgeflux by edgelength
-      length = D->edgelengths[ki];
-      edgeflux[0] = -edgeflux[0]*length;
-      edgeflux[1] = -edgeflux[1]*length;
-      edgeflux[2] = -edgeflux[2]*length;
-
-      // bedslope_work contains all gravity related terms
-      pressuregrad_work = length * (-g * 0.5 * (h_left * h_left - hle * hle - (hle + hc) * (zl - zc)) + pressure_flux);
-
-      // Update timestep based on edge i and possibly neighbour n
-      // NOTE: We should only change the timestep on the 'first substep'
-      // of the timestepping method [substep_count==0]
-      if (substep_count == 0)
-      {
-
-        // Compute the 'edge-timesteps' (useful for setting flux_update_frequency)
-        edge_timestep = D->radii[k] *1.0 / fmax(max_speed_local, epsilon);
-
-        // Update the timestep
-        if ((D->tri_full_flag[k] == 1))
-        {
-          if (max_speed_local > epsilon)
-          {
-            // Apply CFL condition for triangles joining this edge (triangle k and triangle n)
-
-            // CFL for triangle k
-            local_timestep = fmin(local_timestep, edge_timestep);
-
-            speed_max_last = fmax(speed_max_last, max_speed_local);
-          }
-        }
-      }
-
-
-      D->stage_explicit_update[k] += edgeflux[0];
-      D->xmom_explicit_update[k]  += edgeflux[1];
-      D->ymom_explicit_update[k]  += edgeflux[2];
-
-      // If this cell is not a ghost, and the neighbour is a
-      // boundary condition OR a ghost cell, then add the flux to the
-      // boundary_flux_integral
-      if (((n < 0) & (D->tri_full_flag[k] == 1)) | ((n >= 0) && ((D->tri_full_flag[k] == 1) & (D->tri_full_flag[n] == 0))))
-      {
-        // boundary_flux_sum is an array with length = timestep_fluxcalls
-        // For each sub-step, we put the boundary flux sum in.
-        boundary_flux_sum_substep += edgeflux[0];
-      }
-
-      D->xmom_explicit_update[k] -= D->normals[ki2] * pressuregrad_work;
-      D->ymom_explicit_update[k] -= D->normals[ki2 + 1] * pressuregrad_work;
-
-    } // End edge i (and neighbour n)
-
-    // Keep track of maximal speeds
-    if (substep_count == 0)
-      D->max_speed[k] = speed_max_last; // max_speed;
-
-    // Normalise triangle k by area and store for when all conserved
-    // quantities get updated
-    inv_area = 1.0 / D->areas[k];
-    D->stage_explicit_update[k] *= inv_area;
-    D->xmom_explicit_update[k] *= inv_area;
-    D->ymom_explicit_update[k] *= inv_area;  
-
-  } // End triangle k
-
-
-
-//   // Now add up stage, xmom, ymom explicit updates
-
-// #pragma omp parallel for private(k, i, ki, ki2, ki3, n, inv_area) reduction(+:boundary_flux_sum_substep)
-//   for (k = 0; k < K; k++)
-//   {
-//     for (i = 0; i < 3; i++)
-//     {
-//       // FIXME: Make use of neighbours to efficiently set things
-//       ki = 3 * k + i;
-//       ki2 = ki * 2;
-//       ki3 = ki * 3;
-//       n = D->neighbours[ki];
-
-//       D->stage_explicit_update[k] += D->edge_flux_work[ki3 + 0];
-//       D->xmom_explicit_update[k] += D->edge_flux_work[ki3 + 1];
-//       D->ymom_explicit_update[k] += D->edge_flux_work[ki3 + 2];
-
-//       // If this cell is not a ghost, and the neighbour is a
-//       // boundary condition OR a ghost cell, then add the flux to the
-//       // boundary_flux_integral
-//       if (((n < 0) & (D->tri_full_flag[k] == 1)) | ((n >= 0) && ((D->tri_full_flag[k] == 1) & (D->tri_full_flag[n] == 0))))
-//       {
-//         // boundary_flux_sum is an array with length = timestep_fluxcalls
-//         // For each sub-step, we put the boundary flux sum in.
-//         boundary_flux_sum_substep += D->edge_flux_work[ki3];
-//       }
-
-//       D->xmom_explicit_update[k] -= D->normals[ki2] * D->pressuregrad_work[ki];
-//       D->ymom_explicit_update[k] -= D->normals[ki2 + 1] * D->pressuregrad_work[ki];
-
-//     } // end edge i
-
-//     // Normalise triangle k by area and store for when all conserved
-//     // quantities get updated
-//     inv_area = 1.0 / D->areas[k];
-//     D->stage_explicit_update[k] *= inv_area;
-//     D->xmom_explicit_update[k] *= inv_area;
-//     D->ymom_explicit_update[k] *= inv_area;
-
-//   } // end cell k
-
-  // variable to accumulate D->boundary_flux_sum[substep_count]
-  D->boundary_flux_sum[substep_count] = boundary_flux_sum_substep;  
-
-  // Ensure we only update the timestep on the first call within each rk2/rk3 step
-  if (substep_count == 0)
-    timestep = local_timestep;
-
-  return timestep;
-}
-
-// Computational function for flux computation
-// with riverWall_count pulled out of triangle loop
-double _compute_fluxes_central_parallel_data_flow(struct domain *D, double timestep)
-{
-
-  // Local variables
-  double max_speed_local, length, inv_area, zl, zr;
-  double h_left, h_right, z_half; // For andusse scheme
-  // FIXME: limiting_threshold is not used for DE1
-  double limiting_threshold = 10 * D->H0;
-  int64_t low_froude = D->low_froude;
-  //
-  int64_t k, i, m, n, ii;
-  int64_t ki, nm = 0, ki2, ki3; // Index shorthands
-  // Workspace (making them static actually made function slightly slower (Ole))
-  double ql[3], qr[3], edgeflux[3]; // Work array for summing up fluxes
-  double bedslope_work;
-  static double local_timestep;
-  int64_t RiverWall_count, substep_count;
-  double hle, hre, zc, zc_n, Qfactor, s1, s2, h1, h2;
-  double pressure_flux, hc, hc_n, tmp;
-  double h_left_tmp, h_right_tmp;
-  static int64_t call = 0; // Static local variable flagging already computed flux
-  static int64_t timestep_fluxcalls = 1;
-  static int64_t base_call = 1;
-  double speed_max_last, weir_height;
-
-  call++; // Flag 'id' of flux calculation for this timestep
-
-  if (D->timestep_fluxcalls != timestep_fluxcalls)
-  {
-    timestep_fluxcalls = D->timestep_fluxcalls;
-    base_call = call;
-  }
-
-  // Set explicit_update to zero for all conserved_quantities.
-  // This assumes compute_fluxes called before forcing terms
-
-  // #pragma omp parallel for private(k)
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    D->stage_explicit_update[k] = 0.0;
-    D->xmom_explicit_update[k] = 0.0;
-    D->ymom_explicit_update[k] = 0.0;
-  }
-  // memset((char*) D->stage_explicit_update, 0, D->number_of_elements * sizeof (double));
-  // memset((char*) D->xmom_explicit_update, 0, D->number_of_elements * sizeof (double));
-  // memset((char*) D->ymom_explicit_update, 0, D->number_of_elements * sizeof (double));
-
-  // Counter for riverwall edges
-  RiverWall_count = 0;
-  // Which substep of the timestepping method are we on?
-  substep_count = (call - base_call) % D->timestep_fluxcalls;
-
-  // printf("call = %d substep_count = %d base_call = %d \n",call,substep_count, base_call);
-
-  // Fluxes are not updated every timestep,
-  // but all fluxes ARE updated when the following condition holds
-  if (D->allow_timestep_increase[0] == 1)
-  {
-    // We can only increase the timestep if all fluxes are allowed to be updated
-    // If this is not done the timestep can't increase (since local_timestep is static)
-    local_timestep = 1.0e+100;
-  }
-
-  // For all triangles
-  // Pull the edge_river_wall count outside parallel loop as in needs to be done sequentially
-  // move it to the initiation of the riverwall so only calculated once
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    for (i = 0; i < 3; i++)
-    {
-      ki = 3 * k + i;
-      D->edge_river_wall_counter[ki] = 0;
-      if (D->edge_flux_type[ki] == 1)
-      {
-        // Update counter of riverwall edges
-        RiverWall_count += 1;
-        D->edge_river_wall_counter[ki] = RiverWall_count;
-
-        // printf("RiverWall_count %d   edge_counter %d \n", RiverWall_count, D->edge_river_wall_counter[ki]);
-      }
-    }
-  }
-
-  RiverWall_count = 0;
-
-  // For all triangles
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    speed_max_last = 0.0;
-
-    // Loop through neighbours and compute edge flux for each
-    for (i = 0; i < 3; i++)
-    {
-      ki = 3 * k + i; // Linear index to edge i of triangle k
-      ki2 = 2 * ki;   // k*6 + i*2
-      ki3 = 3 * ki;
-
-      // Get left hand side values from triangle k, edge i
-      ql[0] = D->stage_edge_values[ki];
-      ql[1] = D->xmom_edge_values[ki];
-      ql[2] = D->ymom_edge_values[ki];
-      zl = D->bed_edge_values[ki];
-      hc = D->height_centroid_values[k];
-      zc = D->bed_centroid_values[k];
-      hle = D->height_edge_values[ki];
-
-      // Get right hand side values either from neighbouring triangle
-      // or from boundary array (Quantities at neighbour on nearest face).
-      n = D->neighbours[ki];
-      hc_n = hc;
-      zc_n = D->bed_centroid_values[k];
-      if (n < 0)
-      {
-        // Neighbour is a boundary condition
-        m = -n - 1; // Convert negative flag to boundary index
-
-        qr[0] = D->stage_boundary_values[m];
-        qr[1] = D->xmom_boundary_values[m];
-        qr[2] = D->ymom_boundary_values[m];
-        zr = zl;                    // Extend bed elevation to boundary
-        hre = fmax(qr[0] - zr, 0.); // hle;
-      }
-      else
-      {
-        // Neighbour is a real triangle
-        hc_n = D->height_centroid_values[n];
-        zc_n = D->bed_centroid_values[n];
-        m = D->neighbour_edges[ki];
-        nm = n * 3 + m; // Linear index (triangle n, edge m)
-
-        qr[0] = D->stage_edge_values[nm];
-        qr[1] = D->xmom_edge_values[nm];
-        qr[2] = D->ymom_edge_values[nm];
-        zr = D->bed_edge_values[nm];
-        hre = D->height_edge_values[nm];
-      }
-
-      // Audusse magic
-      z_half = fmax(zl, zr);
-
-      //// Account for riverwalls
-      if (D->edge_flux_type[ki] == 1)
-      {
-        if (n >= 0 && D->edge_flux_type[nm] != 1)
-        {
-          printf("Riverwall Error\n");
-        }
-        // Update counter of riverwall edges == index of
-        // riverwall_elevation + riverwall_rowIndex
-
-        // RiverWall_count += 1;
-        RiverWall_count = D->edge_river_wall_counter[ki];
-
-        // Set central bed to riverwall elevation
-        z_half = fmax(D->riverwall_elevation[RiverWall_count - 1], z_half);
-      }
-
-      // Define h left/right for Audusse flux method
-      h_left = fmax(hle + zl - z_half, 0.);
-      h_right = fmax(hre + zr - z_half, 0.);
-
-      // Edge flux computation (triangle k, edge i)
-      __flux_function_central(ql, qr,
-                              h_left, h_right,
-                              hle, hre,
-                              D->normals[ki2], D->normals[ki2 + 1],
-                              D->epsilon, z_half, limiting_threshold, D->g,
-                              edgeflux, &max_speed_local, &pressure_flux, hc, hc_n, low_froude);
-
-      // Force weir discharge to match weir theory
-      if (D->edge_flux_type[ki] == 1)
-      {
-        ii = D->riverwall_rowIndex[RiverWall_count - 1] * D->ncol_riverwall_hydraulic_properties;
-
-        // Get Qfactor index - multiply the idealised weir discharge by this constant factor
-        // Get s1, submergence ratio at which we start blending with the shallow water solution
-        // Get s2, submergence ratio at which we entirely use the shallow water solution
-        // Get h1, tailwater head / weir height at which we start blending with the shallow water solution
-        // Get h2, tailwater head / weir height at which we entirely use the shallow water solution
-        Qfactor = D->riverwall_hydraulic_properties[ii];
-        s1 = D->riverwall_hydraulic_properties[ii + 1];
-        s2 = D->riverwall_hydraulic_properties[ii + 2];
-        h1 = D->riverwall_hydraulic_properties[ii + 3];
-        h2 = D->riverwall_hydraulic_properties[ii + 4];
-
-        weir_height = fmax(D->riverwall_elevation[RiverWall_count - 1] - fmin(zl, zr), 0.); // Reference weir height
-
-        // Use first-order h's for weir -- as the 'upstream/downstream' heads are
-        //  measured away from the weir itself
-        h_left_tmp = fmax(D->stage_centroid_values[k] - z_half, 0.);
-
-        if (n >= 0)
-        {
-          h_right_tmp = fmax(D->stage_centroid_values[n] - z_half, 0.);
-        }
-        else
-        {
-          h_right_tmp = fmax(hc_n + zr - z_half, 0.);
-        }
-
-        // If the weir is not higher than both neighbouring cells, then
-        // do not try to match the weir equation. If we do, it seems we
-        // can get mass conservation issues (caused by large weir
-        // fluxes in such situations)
-        if (D->riverwall_elevation[RiverWall_count - 1] > fmax(zc, zc_n))
-        {
-          // Weir flux adjustment
-          __adjust_edgeflux_with_weir(edgeflux, h_left_tmp, h_right_tmp, D->g,
-                                      weir_height, Qfactor,
-                                      s1, s2, h1, h2, &max_speed_local);
-        }
-      }
-
-      // Multiply edgeflux by edgelength
-      length = D->edgelengths[ki];
-      edgeflux[0] *= length;
-      edgeflux[1] *= length;
-      edgeflux[2] *= length;
-
-      D->edge_flux_work[ki3 + 0] = -edgeflux[0];
-      D->edge_flux_work[ki3 + 1] = -edgeflux[1];
-      D->edge_flux_work[ki3 + 2] = -edgeflux[2];
-
-      // bedslope_work contains all gravity related terms
-      bedslope_work = length * (-D->g * 0.5 * (h_left * h_left - hle * hle - (hle + hc) * (zl - zc)) + pressure_flux);
-
-      D->pressuregrad_work[ki] = bedslope_work;
-
-      // Update timestep based on edge i and possibly neighbour n
-      // NOTE: We should only change the timestep on the 'first substep'
-      //  of the timestepping method [substep_count==0]
-      if (substep_count == 0)
-      {
-
-        // Compute the 'edge-timesteps' (useful for setting flux_update_frequency)
-        tmp = 1.0 / fmax(max_speed_local, D->epsilon);
-        D->edge_timestep[ki] = D->radii[k] * tmp;
-
-        // Update the timestep
-        if ((D->tri_full_flag[k] == 1))
-        {
-
-          speed_max_last = fmax(speed_max_last, max_speed_local);
-
-          if (max_speed_local > D->epsilon)
-          {
-            // Apply CFL condition for triangles joining this edge (triangle k and triangle n)
-
-            // CFL for triangle k
-            local_timestep = fmin(local_timestep, D->edge_timestep[ki]);
-
-            // if (n >= 0) {
-            //     // Apply CFL condition for neigbour n (which is on the ith edge of triangle k)
-            //    local_timestep = fmin(local_timestep, D->edge_timestep[nm]);
-            // }
-          }
-        }
-      }
-
-    } // End edge i (and neighbour n)
-
-    // Keep track of maximal speeds
-    if (substep_count == 0)
-      D->max_speed[k] = speed_max_last; // max_speed;
-
-  } // End triangle k
-
-  // Now add up stage, xmom, ymom explicit updates
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    hc = fmax(D->stage_centroid_values[k] - D->bed_centroid_values[k], 0.);
-
-    for (i = 0; i < 3; i++)
-    {
-      // FIXME: Make use of neighbours to efficiently set things
-      ki = 3 * k + i;
-      ki2 = ki * 2;
-      ki3 = ki * 3;
-      n = D->neighbours[ki];
-
-      D->stage_explicit_update[k] += D->edge_flux_work[ki3 + 0];
-      D->xmom_explicit_update[k] += D->edge_flux_work[ki3 + 1];
-      D->ymom_explicit_update[k] += D->edge_flux_work[ki3 + 2];
-
-      // If this cell is not a ghost, and the neighbour is a
-      // boundary condition OR a ghost cell, then add the flux to the
-      // boundary_flux_integral
-      if (((n < 0) & (D->tri_full_flag[k] == 1)) | ((n >= 0) && ((D->tri_full_flag[k] == 1) & (D->tri_full_flag[n] == 0))))
-      {
-        // boundary_flux_sum is an array with length = timestep_fluxcalls
-        // For each sub-step, we put the boundary flux sum in.
-        D->boundary_flux_sum[substep_count] += D->edge_flux_work[ki3];
-      }
-
-      D->xmom_explicit_update[k] -= D->normals[ki2] * D->pressuregrad_work[ki];
-      D->ymom_explicit_update[k] -= D->normals[ki2 + 1] * D->pressuregrad_work[ki];
-
-    } // end edge i
-
-    // Normalise triangle k by area and store for when all conserved
-    // quantities get updated
-    inv_area = 1.0 / D->areas[k];
-    D->stage_explicit_update[k] *= inv_area;
-    D->xmom_explicit_update[k] *= inv_area;
-    D->ymom_explicit_update[k] *= inv_area;
-
-  } // end cell k
-
-  // Ensure we only update the timestep on the first call within each rk2/rk3 step
-  if (substep_count == 0)
-    timestep = local_timestep;
-
-  return timestep;
-}
-
-
-
-
-
-// Protect against the water elevation falling below the triangle bed.
-//
-// For every triangle k the centroid depth hc = stage - bed is computed.
-//   * If hc is below D->minimum_allowed_height, BOTH centroid momentum
-//     components (xmom and ymom) are set to zero.
-//   * If, in addition, the centroid stage lies strictly below the centroid
-//     bed, the centroid stage and the three stage vertex values of the
-//     triangle are raised to the bed elevation. This adds water, and the
-//     added volume ((bed - stage) * area) is accumulated.
-//
-// Returns the accumulated volume added over all triangles (0 if none).
-double _simd_protect(struct domain *D)
-{
-
-  int64_t k, k3;
-  double hc, bmin;
-  double mass_error = 0.;
-
-  const double minimum_allowed_height = D->minimum_allowed_height;
-  const int64_t K = D->number_of_elements;
-
-  // Protect against infinitesimal and negative heights
-  // #pragma omp parallel for private(k, k3, hc, bmin ) schedule(static) reduction(+ : mass_error) firstprivate (minimum_allowed_height)
-  for (k = 0; k < K; k++)
-  {
-    k3 = 3 * k;
-    hc = D->stage_centroid_values[k] - D->bed_centroid_values[k];
-
-    if (hc < minimum_allowed_height)
-    {
-      // Set momentum to zero. The y component must be cleared as well as
-      // the x component; previously xmom was zeroed twice and ymom was
-      // left untouched.
-      D->xmom_centroid_values[k] = 0.;
-      D->ymom_centroid_values[k] = 0.;
-
-      if (hc <= 0.0)
-      {
-        // Minimum allowed stage = bmin
-        bmin = D->bed_centroid_values[k];
-
-        // WARNING: ADDING MASS if the centroid stage is below bmin
-        if (D->stage_centroid_values[k] < bmin)
-        {
-          mass_error += (bmin - D->stage_centroid_values[k]) * D->areas[k];
-
-          D->stage_centroid_values[k] = bmin;
-
-          // FIXME: Set vertex values as well. Seems that this shouldn't be
-          // needed. However, from memory this is important at the first
-          // time step, for 'dry' areas where the designated stage is
-          // less than the bed centroid value
-          D->stage_vertex_values[k3] = bmin;
-          D->stage_vertex_values[k3 + 1] = bmin;
-          D->stage_vertex_values[k3 + 2] = bmin;
-        }
-      }
-    }
-  }
-
-  return mass_error;
-}
-
-
-static inline int64_t __find_qmin_and_qmax(double dq0, double dq1, double dq2,
-                         double *qmin, double *qmax)
-{
-  // Inputs are differences of a quantity q:
-  //   dq0 = q(vertex0) - q(centroid of FV triangle)
-  //   dq1 = q(vertex1) - q(vertex0)
-  //   dq2 = q(vertex2) - q(vertex0)
-  // where vertex0..2 are the vertices of the auxiliary triangle.
-  //
-  // Outputs:
-  //   *qmax = max(q) - qc  and  *qmin = min(q) - qc,
-  // with max/min taken over the centroid value qc and the three auxiliary
-  // vertex values. The centroid's own jump is 0, hence the clamp against 0.
-  // Always returns 0.
-
-  // Jumps from the centroid to each auxiliary vertex
-  const double jump_v0 = dq0;
-  const double jump_v1 = dq0 + dq1;
-  const double jump_v2 = dq0 + dq2;
-
-  double largest = fmax(jump_v1, jump_v2);
-  double smallest = fmin(jump_v1, jump_v2);
-
-  largest = fmax(jump_v0, largest);
-  smallest = fmin(jump_v0, smallest);
-
-  *qmax = fmax(largest, 0.0);
-  *qmin = fmin(smallest, 0.0);
-
-  return 0;
-}
-
-static inline int64_t __limit_gradient(double *dqv, double qmin, double qmax, double beta_w)
-{
-  // dqv holds three provisional jumps from the FV triangle centroid to its
-  // vertices/edges. qmin (<= 0) and qmax (>= 0) are the jumps from the
-  // centroid to the minimum/maximum of the surrounding reference values.
-  // A single multiplicative factor is computed and applied in place to all
-  // three entries of dqv so that the limited jumps respect those bounds,
-  // scaled by beta_w and never amplified (factor <= 1). Always returns 0.
-
-  // Jumps with magnitude <= tiny are ignored when forming a new ratio.
-  // FIXME: Perhaps use the epsilon used elsewhere.
-  static const double tiny = 1.0e-100;
-
-  double ratio = 1.0;       // most recently computed admissible ratio
-  double min_ratio = 1000.0; // smallest ratio seen so far
-  double scale;
-  int64_t j;
-
-  for (j = 0; j < 3; j++)
-  {
-    const double jump = dqv[j];
-
-    if (jump < -tiny)
-    {
-      ratio = qmin / jump;
-    }
-    else if (jump > tiny)
-    {
-      ratio = qmax / jump;
-    }
-    // NOTE: for a near-zero jump, ratio deliberately keeps its previous
-    // value (1.0 initially, otherwise the last computed ratio).
-
-    min_ratio = fmin(ratio, min_ratio);
-  }
-
-  scale = fmin(min_ratio * beta_w, 1.0);
-
-  for (j = 0; j < 3; j++)
-  {
-    dqv[j] = dqv[j] * scale;
-  }
-
-  return 0;
-}
-
-// Compute the three limited edge values of one quantity on triangle k.
-//
-//   beta_tmp          limiter parameter; if it is not > 0 the edge values
-//                     are simply the centroid value (first order)
-//   cv_k              centroid value on triangle k
-//   cv_k0..cv_k2      values at the three auxiliary-triangle vertices
-//   dxv*, dyv*        x/y offsets of the three edge points of triangle k
-//                     from its centroid
-//   dx1, dx2, dy1, dy2  x/y differences between auxiliary vertices 1,2 and
-//                     auxiliary vertex 0
-//   inv_area2         factor applied to the gradient components
-//                     (presumably 1 / (2 * auxiliary triangle area) --
-//                     confirm against the caller)
-//   edge_values       output array of length 3
-static inline void __calc_edge_values(double beta_tmp, double cv_k, double cv_k0, double cv_k1, double cv_k2,
-                        double dxv0, double dxv1, double dxv2, double dyv0, double dyv1, double dyv2,
-                        double dx1, double dx2, double dy1, double dy2, double inv_area2,
-                        double *edge_values)
-{
-  double jumps[3];
-  double d_c0, d_01, d_02;
-  double grad_x, grad_y;
-  double lower, upper;
-
-  // Fast alternative when beta_tmp==0 (or otherwise not positive)
-  if (!(beta_tmp > 0.))
-  {
-    edge_values[0] = cv_k;
-    edge_values[1] = cv_k;
-    edge_values[2] = cv_k;
-    return;
-  }
-
-  // Difference between vertex 0 of the auxiliary triangle and the
-  // centroid of triangle k
-  d_c0 = cv_k0 - cv_k;
-
-  // Differences between the vertices of the auxiliary triangle
-  d_01 = cv_k1 - cv_k0;
-  d_02 = cv_k2 - cv_k0;
-
-  // Gradient of the quantity on the auxiliary triangle
-  grad_x = dy2 * d_01 - dy1 * d_02;
-  grad_x *= inv_area2;
-  grad_y = dx1 * d_02 - dx2 * d_01;
-  grad_y *= inv_area2;
-
-  // Provisional jumps from the centroid of triangle k to its edge
-  // points, to be limited
-  jumps[0] = grad_x * dxv0 + grad_y * dyv0;
-  jumps[1] = grad_x * dxv1 + grad_y * dyv1;
-  jumps[2] = grad_x * dxv2 + grad_y * dyv2;
-
-  // Jumps from the centroid to the min and max over the centroid and the
-  // auxiliary triangle vertices
-  __find_qmin_and_qmax(d_c0, d_01, d_02, &lower, &upper);
-
-  // Limit the gradient
-  __limit_gradient(jumps, lower, upper, beta_tmp);
-
-  edge_values[0] = cv_k + jumps[0];
-  edge_values[1] = cv_k + jumps[1];
-  edge_values[2] = cv_k + jumps[2];
-}
-
-// Compute the three limited edge values of one quantity from the centroid
-// value cv_k and a single reference value cv_k0.
-//
-// The gradient components are (cv_k0 - cv_k) * dx2 and (cv_k0 - cv_k) * dy2,
-// so dx2/dy2 are expected to be already scaled by the caller (presumably
-// the centroid offset divided by the squared distance -- confirm against
-// the caller). dx1, dy1 and inv_area2 are accepted for signature parity but
-// are not used here. Results are written to edge_values[0..2].
-static inline void __calc_edge_values_2_bdy(double beta, double cv_k, double cv_k0, 
-                        double dxv0, double dxv1, double dxv2, double dyv0, double dyv1, double dyv2,
-                        double dx1, double dx2, double dy1, double dy2, double inv_area2,
-                        double *edge_values)
-{
-  const double off_x[3] = {dxv0, dxv1, dxv2};
-  const double off_y[3] = {dyv0, dyv1, dyv2};
-  double jumps[3];
-  double lower, upper;
-  int idx;
-
-  // Difference between the reference value and the centroid value
-  const double delta = cv_k0 - cv_k;
-
-  // Gradient between the centroid of triangle k and the reference point
-  const double grad_x = delta * dx2;
-  const double grad_y = delta * dy2;
-
-  // Bounds for the limiter: the jump must lie between 0 and delta
-  const int non_negative = (delta >= 0.0);
-  lower = non_negative ? 0.0 : delta;
-  upper = non_negative ? delta : 0.0;
-
-  // Provisional edge jumps, to be limited
-  for (idx = 0; idx < 3; idx++)
-  {
-    jumps[idx] = grad_x * off_x[idx] + grad_y * off_y[idx];
-  }
-
-  // Limit the gradient
-  __limit_gradient(jumps, lower, upper, beta);
-
-  for (idx = 0; idx < 3; idx++)
-  {
-    edge_values[idx] = cv_k + jumps[idx];
-  }
-
-}
-
-
-
-
-
-// Computational routine
-int64_t _simd_extrapolate_second_order_edge_sw(struct domain *D, int64_t verbose)
-{
-
-  // Local variables
-  double a, b; // Gradient vector used to calculate edge values from centroids
-  int64_t k, k0, k1, k2, k3, k6, coord_index, i;
-  double x, y, x0, y0, x1, y1, x2, y2, xv0, yv0, xv1, yv1, xv2, yv2; // Vertices of the auxiliary triangle
-  double dx1, dx2, dy1, dy2, dxv0, dxv1, dxv2, dyv0, dyv1, dyv2, dq1, area2, inv_area2;
-  double dqv[3], qmin, qmax, hmin, hmax;
-  double hc, h0, h1, h2, beta_tmp, hfactor;
-  double dk, dk_inv, a_tmp, b_tmp, c_tmp, d_tmp;
-  double edge_values[3];
-  double cv_k, cv_k0, cv_k1, cv_k2;
-
-  double x_centroid_work;
-  double xmom_centroid_values;
-  double y_centroid_work;
-  double ymom_centroid_values;
-
-  double minimum_allowed_height = D->minimum_allowed_height;
-  int64_t number_of_elements = D->number_of_elements;
-  int64_t extrapolate_velocity_second_order = D->extrapolate_velocity_second_order;
-
-  //int64_t verbose = 1;
-
-
-  // Parameters used to control how the limiter is forced to first-order near
-  // wet-dry regions
-  a_tmp = 0.3; // Highest depth ratio with hfactor=1
-  b_tmp = 0.1; // Highest depth ratio with hfactor=0
-  c_tmp = 1.0 / (a_tmp - b_tmp);
-  d_tmp = 1.0 - (c_tmp * a_tmp);
-
-  // Replace momentum centroid with velocity centroid to allow velocity
-  // extrapolation This will be changed back at the end of the routine
-
-  // Need to calculate height xmom and ymom centroid values for all triangles 
-  // before extrapolation and limiting
-
-  // #pragma omp parallel for simd shared(D) default(none) private(dk, dk_inv) firstprivate(number_of_elements, minimum_allowed_height, extrapolate_velocity_second_order)
-    for (k = 0; k < number_of_elements; k++)
-    {
-    dk = fmax(D->stage_centroid_values[k] - D->bed_centroid_values[k], 0.0);
-
-    D->height_centroid_values[k] = dk;
-    D->x_centroid_work[k] = 0.0;
-    D->y_centroid_work[k] = 0.0;
-
-    if (dk <= minimum_allowed_height)
-      {
-        D->x_centroid_work[k] = 0.0;
-        D->xmom_centroid_values[k] = 0.0;
-        D->y_centroid_work[k] = 0.0;
-        D->ymom_centroid_values[k] = 0.0;
-      }
-
-    if (extrapolate_velocity_second_order == 1)
-    {
-      if (dk > minimum_allowed_height)
-      {
-        dk_inv = 1.0 / dk;
-        D->x_centroid_work[k] = D->xmom_centroid_values[k];
-        D->xmom_centroid_values[k] = D->xmom_centroid_values[k] * dk_inv;
-
-        D->y_centroid_work[k] = D->ymom_centroid_values[k];
-        D->ymom_centroid_values[k] = D->ymom_centroid_values[k] * dk_inv;
-      }
-    }
-    } // end of for
-
-  if (verbose == 1)
-  {
-    printf("x_centroid_work after loop 1\n");
-    for (k=0; k < number_of_elements; k++)
-    {
-     printf("%ld %f \n",k, D->x_centroid_work[k]); 
-    }
-    
-    printf("xmom_centroid_values after loop 1\n");
-    for (k=0; k < number_of_elements; k++)
-    {
-     printf("%ld %f \n",k, D->xmom_centroid_values[k]); 
-    }
-  }
-
-  // Begin extrapolation routine
-
-// #pragma omp parallel for simd private(k0, k1, k2, k3, k6, coord_index, i, \
-//                           dx1, dx2, dy1, dy2, dxv0, dxv1, dxv2, dyv0, dyv1, dyv2, \
-//                           x_centroid_work, xmom_centroid_values, y_centroid_work, ymom_centroid_values, \
-//                           dq1, area2, inv_area2, \
-//                           cv_k, cv_k0, cv_k1, cv_k2, edge_values, \
-//                           x, y, x0, y0, x1, y1, x2, y2, xv0, yv0, xv1, yv1, xv2, yv2, \
-//                           dqv, qmin, qmax, hmin, hmax, \
-//                           hc, h0, h1, h2, beta_tmp, hfactor, \
-//                           dk, dk_inv, a, b) default(none) shared(D) \
-//                           firstprivate(number_of_elements, minimum_allowed_height, extrapolate_velocity_second_order, c_tmp, d_tmp)
-  for (k = 0; k < number_of_elements; k++)
-  {
-
-    //printf("%ld, %e \n",k, D->height_centroid_values[k]);
-    //printf("%ld,  %e, %e, %e, %e \n",k, x_centroid_work,xmom_centroid_values,y_centroid_work,ymom_centroid_values);
-    //printf("%ld,  %e, %e, %e, %e \n",k, D->x_centroid_work[k],D->xmom_centroid_values[k],D->y_centroid_work[k],D->ymom_centroid_values[k]);
-
-
-    // Useful indices
-    k2 = k * 2;
-    k3 = k * 3;
-    k6 = k * 6;
-
-    // Get the edge coordinates
-    xv0 = D->edge_coordinates[k6 + 0];
-    yv0 = D->edge_coordinates[k6 + 1];
-    xv1 = D->edge_coordinates[k6 + 2];
-    yv1 = D->edge_coordinates[k6 + 3];
-    xv2 = D->edge_coordinates[k6 + 4];
-    yv2 = D->edge_coordinates[k6 + 5];
-
-    // Get the centroid coordinates
-    x = D->centroid_coordinates[k2 + 0];
-    y = D->centroid_coordinates[k2 + 1];
-
-    // Store x- and y- differentials for the edges of
-    // triangle k relative to the centroid
-    dxv0 = xv0 - x;
-    dxv1 = xv1 - x;
-    dxv2 = xv2 - x;
-    dyv0 = yv0 - y;
-    dyv1 = yv1 - y;
-    dyv2 = yv2 - y;
-
-    // If no boundaries, auxiliary triangle is formed
-    // from the centroids of the three neighbours
-    // If one boundary, auxiliary triangle is formed
-    // from this centroid and its two neighbours
-
-    k0 = D->surrogate_neighbours[k3 + 0];
-    k1 = D->surrogate_neighbours[k3 + 1];
-    k2 = D->surrogate_neighbours[k3 + 2];
-
-    // Get the auxiliary triangle's vertex coordinates
-    // (normally the centroids of neighbouring triangles)
-    coord_index = 2 * k0;
-    x0 = D->centroid_coordinates[coord_index + 0];
-    y0 = D->centroid_coordinates[coord_index + 1];
-
-    coord_index = 2 * k1;
-    x1 = D->centroid_coordinates[coord_index + 0];
-    y1 = D->centroid_coordinates[coord_index + 1];
-
-    coord_index = 2 * k2;
-    x2 = D->centroid_coordinates[coord_index + 0];
-    y2 = D->centroid_coordinates[coord_index + 1];
-
-    // Store x- and y- differentials for the vertices
-    // of the auxiliary triangle
-    dx1 = x1 - x0;
-    dx2 = x2 - x0;
-    dy1 = y1 - y0;
-    dy2 = y2 - y0;
-
-    // Calculate 2*area of the auxiliary triangle
-    // The triangle is guaranteed to be counter-clockwise
-    area2 = dy2 * dx1 - dy1 * dx2;
-
-    if (((D->height_centroid_values[k0] < minimum_allowed_height) | (k0 == k)) &
-        ((D->height_centroid_values[k1] < minimum_allowed_height) | (k1 == k)) &
-        ((D->height_centroid_values[k2] < minimum_allowed_height) | (k2 == k)))
-    {
-      // printf("Surrounded by dry cells\n");
-      D->x_centroid_work[k] = 0.;
-      D->xmom_centroid_values[k] = 0.;
-      D->y_centroid_work[k] = 0.;
-      D->ymom_centroid_values[k] = 0.;
-    }
-
-    // Limit the edge values
-    if (D->number_of_boundaries[k] == 3)
-    {
-      // Very unlikely
-      // No neighbours, set gradient on the triangle to zero
-
-      //printf("%ld 3 boundaries\n",k);
-
-      D->stage_edge_values[k3 + 0] = D->stage_centroid_values[k];
-      D->stage_edge_values[k3 + 1] = D->stage_centroid_values[k];
-      D->stage_edge_values[k3 + 2] = D->stage_centroid_values[k];
-
-      D->xmom_edge_values[k3 + 0] = D->xmom_centroid_values[k];
-      D->xmom_edge_values[k3 + 1] = D->xmom_centroid_values[k];
-      D->xmom_edge_values[k3 + 2] = D->xmom_centroid_values[k];
-
-      D->ymom_edge_values[k3 + 0] = D->ymom_centroid_values[k];
-      D->ymom_edge_values[k3 + 1] = D->ymom_centroid_values[k];
-      D->ymom_edge_values[k3 + 2] = D->ymom_centroid_values[k];
-
-      dk = D->height_centroid_values[k];
-      D->height_edge_values[k3 + 0] = dk;
-      D->height_edge_values[k3 + 1] = dk;
-      D->height_edge_values[k3 + 2] = dk;
-
-    }
-    else if (D->number_of_boundaries[k] <= 1)
-    {
-      //==============================================
-      // Number of boundaries <= 1
-      // 'Typical case'
-      //==============================================
-      //printf("%ld boundaries <= 1\n",k);
-
-      // Calculate heights of neighbouring cells
-      hc = D->height_centroid_values[k];
-      h0 = D->height_centroid_values[k0];
-      h1 = D->height_centroid_values[k1];
-      h2 = D->height_centroid_values[k2];
-
-      hmin = fmin(fmin(h0, fmin(h1, h2)), hc);
-      hmax = fmax(fmax(h0, fmax(h1, h2)), hc);
-
-      // Look for strong changes in cell depth as an indicator of near-wet-dry
-      // Reduce hfactor linearly from 1-0 between depth ratio (hmin/hc) of [a_tmp , b_tmp]
-      // NOTE: If we have a more 'second order' treatment in near dry areas (e.g. with b_tmp being negative), then
-      //       the water tends to dry more rapidly (which is in agreement with analytical results),
-      //       but is also more 'artefacty' in important cases (tendency for high velocities, etc).
-      //
-      // So hfactor = depth_ratio*(c_tmp) + d_tmp, but is clipped between 0 and 1.
-      hfactor = fmax(0., fmin(c_tmp * fmax(hmin, 0.0) / fmax(hc, 1.0e-06) + d_tmp,
-                              fmin(c_tmp * fmax(hc, 0.) / fmax(hmax, 1.0e-06) + d_tmp, 1.0)));
-      // Set hfactor to zero smothly as hmin--> minimum_allowed_height. This
-      // avoids some 'chatter' for very shallow flows
-      hfactor = fmin(1.2 * fmax(hmin - D->minimum_allowed_height, 0.) / (fmax(hmin, 0.) + 1. * D->minimum_allowed_height), hfactor);
-
-      inv_area2 = 1.0 / area2;
-
-      //-----------------------------------
-      // stage
-      //-----------------------------------
-      beta_tmp = D->beta_w_dry + (D->beta_w - D->beta_w_dry) * hfactor;
-
-      cv_k  = D->stage_centroid_values[k];
-      cv_k0 = D->stage_centroid_values[k0];
-      cv_k1 = D->stage_centroid_values[k1];
-      cv_k2 = D->stage_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->stage_edge_values[k3 + 0] = edge_values[0];
-      D->stage_edge_values[k3 + 1] = edge_values[1];
-      D->stage_edge_values[k3 + 2] = edge_values[2];  
-
-      //-----------------------------------
-      // height
-      //-----------------------------------
-
-      cv_k  = D->height_centroid_values[k];
-      cv_k0 = D->height_centroid_values[k0];
-      cv_k1 = D->height_centroid_values[k1];
-      cv_k2 = D->height_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->height_edge_values[k3 + 0] = edge_values[0];
-      D->height_edge_values[k3 + 1] = edge_values[1];
-      D->height_edge_values[k3 + 2] = edge_values[2]; 
-
-    
-      //-----------------------------------
-      // xmomentum
-      //-----------------------------------
-
-      beta_tmp = D->beta_uh_dry + (D->beta_uh - D->beta_uh_dry) * hfactor;
-
-      cv_k  = D->xmom_centroid_values[k];
-      cv_k0 = D->xmom_centroid_values[k0];
-      cv_k1 = D->xmom_centroid_values[k1];
-      cv_k2 = D->xmom_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->xmom_edge_values[k3 + 0] = edge_values[0];
-      D->xmom_edge_values[k3 + 1] = edge_values[1];
-      D->xmom_edge_values[k3 + 2] = edge_values[2]; 
-
-      //-----------------------------------
-      // ymomentum
-      //-----------------------------------
-
-      beta_tmp = D->beta_vh_dry + (D->beta_vh - D->beta_vh_dry) * hfactor;
-
-      cv_k  = D->ymom_centroid_values[k];
-      cv_k0 = D->ymom_centroid_values[k0];
-      cv_k1 = D->ymom_centroid_values[k1];
-      cv_k2 = D->ymom_centroid_values[k2];
-
-      __calc_edge_values(beta_tmp, 
-                         cv_k, 
-                         cv_k0,
-                         cv_k1,
-                         cv_k2,
-                         dxv0, dxv1, dxv2, dyv0, dyv1, dyv2,
-                         dx1, dx2, dy1, dy2, inv_area2, edge_values);
-
-      D->ymom_edge_values[k3 + 0] = edge_values[0];
-      D->ymom_edge_values[k3 + 1] = edge_values[1];
-      D->ymom_edge_values[k3 + 2] = edge_values[2]; 
-
-    } // End number_of_boundaries <=1
-    else
-    {
-      //printf("%ld 2 boundaries\n",k);
-      //==============================================
-      // Number of boundaries == 2
-      //==============================================
-
-      // One internal neighbour and gradient is in direction of the neighbour's centroid
-
-      // Find the only internal neighbour (k1?)
-      for (k2 = k3; k2 < k3 + 3; k2++)
-      {
-        // Find internal neighbour of triangle k
-        // k2 indexes the edges of triangle k
-
-        if (D->surrogate_neighbours[k2] != k)
-        {
-          break;
-        }
-      }
-
-      // if ((k2 == k3 + 3))
-      // {
-      //   // If we didn't find an internal neighbour
-      //   // report_python_error(AT, "Internal neighbour not found");
-      //   return -1;
-      // }
-
-      k1 = D->surrogate_neighbours[k2];
-
-      // The coordinates of the triangle are already (x,y).
-      // Get centroid of the neighbour (x1,y1)
-      coord_index = 2 * k1;
-      x1 = D->centroid_coordinates[coord_index + 0];
-      y1 = D->centroid_coordinates[coord_index + 1];
-
-      // Compute x- and y- distances between the centroid of
-      // triangle k and that of its neighbour
-      dx1 = x1 - x;
-      dy1 = y1 - y;
-
-      // Set area2 as the square of the distance
-      area2 = dx1 * dx1 + dy1 * dy1;
-
-      // Set dx2=(x1-x0)/((x1-x0)^2+(y1-y0)^2)
-      // and dy2=(y1-y0)/((x1-x0)^2+(y1-y0)^2) which
-      // respectively correspond to the x- and y- gradients
-      // of the conserved quantities
-      dx2 = 1.0 / area2;
-      dy2 = dx2 * dy1;
-      dx2 *= dx1;
-
-      //-----------------------------------
-      // stage
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->stage_centroid_values[k1] - D->stage_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
-
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      D->stage_edge_values[k3 + 0] = D->stage_centroid_values[k] + dqv[0];
-      D->stage_edge_values[k3 + 1] = D->stage_centroid_values[k] + dqv[1];
-      D->stage_edge_values[k3 + 2] = D->stage_centroid_values[k] + dqv[2];
-
-      //-----------------------------------
-      // height
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->height_centroid_values[k1] - D->height_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
-
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      D->height_edge_values[k3 + 0] = D->height_centroid_values[k] + dqv[0];
-      D->height_edge_values[k3 + 1] = D->height_centroid_values[k] + dqv[1];
-      D->height_edge_values[k3 + 2] = D->height_centroid_values[k] + dqv[2];
-
-      //-----------------------------------
-      // xmomentum
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->xmom_centroid_values[k1] - D->xmom_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
-
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      D->xmom_edge_values[k3 + 0] = D->xmom_centroid_values[k] + dqv[0];
-      D->xmom_edge_values[k3 + 1] = D->xmom_centroid_values[k] + dqv[1];
-      D->xmom_edge_values[k3 + 2] = D->xmom_centroid_values[k] + dqv[2];
-
-      //-----------------------------------
-      // ymomentum
-      //-----------------------------------
-
-      // Compute differentials
-      dq1 = D->ymom_centroid_values[k1] - D->ymom_centroid_values[k];
-
-      // Calculate the gradient between the centroid of triangle k
-      // and that of its neighbour
-      a = dq1 * dx2;
-      b = dq1 * dy2;
-
-      // Calculate provisional edge jumps, to be limited
-      dqv[0] = a * dxv0 + b * dyv0;
-      dqv[1] = a * dxv1 + b * dyv1;
-      dqv[2] = a * dxv2 + b * dyv2;
-
-      // Now limit the jumps
-      if (dq1 >= 0.0)
-      {
-        qmin = 0.0;
-        qmax = dq1;
-      }
-      else
-      {
-        qmin = dq1;
-        qmax = 0.0;
-      }
-
-      // Limit the gradient
-      __limit_gradient(dqv, qmin, qmax, D->beta_w);
-
-      D->ymom_edge_values[k3 + 0] = D->ymom_centroid_values[k] + dqv[0];
-      D->ymom_edge_values[k3 + 1] = D->ymom_centroid_values[k] + dqv[1];
-      D->ymom_edge_values[k3 + 2] = D->ymom_centroid_values[k] + dqv[2];
-
-    } // else [number_of_boundaries]
-
-  // printf("%ld, bed    %e, %e, %e\n",k, D->bed_edge_values[k3],D->bed_edge_values[k3 + 1],D->bed_edge_values[k3 + 2] );
-  // printf("%ld, stage  %e, %e, %e\n",k, D->stage_edge_values[k3],D->stage_edge_values[k3 + 1],D->stage_edge_values[k3 + 2] );
-  // printf("%ld, height %e, %e, %e\n",k, D->height_edge_values[k3],D->height_edge_values[k3 + 1],D->height_edge_values[k3 + 2] );
-  // printf("%ld, xmom   %e, %e, %e\n",k, D->xmom_edge_values[k3],D->xmom_edge_values[k3 + 1],D->xmom_edge_values[k3 + 2] );
-  // printf("%ld, ymom   %e, %e, %e\n",k, D->ymom_edge_values[k3],D->ymom_edge_values[k3 + 1],D->ymom_edge_values[k3 + 2] );
-
-    // If needed, convert from velocity to momenta
-    if (D->extrapolate_velocity_second_order == 1)
-    {
-      // Re-compute momenta at edges
-      for (i = 0; i < 3; i++)
-      {
-        dk = D->height_edge_values[k3 + i];
-        D->xmom_edge_values[k3 + i] = D->xmom_edge_values[k3 + i] * dk;
-        D->ymom_edge_values[k3 + i] = D->ymom_edge_values[k3 + i] * dk;
-      }
-    }
-
-    // Compute new bed elevation
-    D->bed_edge_values[k3 + 0] = D->stage_edge_values[k3 + 0] - D->height_edge_values[k3 + 0];
-    D->bed_edge_values[k3 + 1] = D->stage_edge_values[k3 + 1] - D->height_edge_values[k3 + 1];
-    D->bed_edge_values[k3 + 2] = D->stage_edge_values[k3 + 2] - D->height_edge_values[k3 + 2];
-
-
-    // FIXME SR: Do we need vertex values every inner timestep?
-
-    // Compute stage vertex values
-    D->stage_vertex_values[k3 + 0] = D->stage_edge_values[k3 + 1] + D->stage_edge_values[k3 + 2] - D->stage_edge_values[k3 + 0];
-    D->stage_vertex_values[k3 + 1] = D->stage_edge_values[k3 + 0] + D->stage_edge_values[k3 + 2] - D->stage_edge_values[k3 + 1];
-    D->stage_vertex_values[k3 + 2] = D->stage_edge_values[k3 + 0] + D->stage_edge_values[k3 + 1] - D->stage_edge_values[k3 + 2];
-
-    // Compute height vertex values
-    D->height_vertex_values[k3 + 0] = D->height_edge_values[k3 + 1] + D->height_edge_values[k3 + 2] - D->height_edge_values[k3 + 0];
-    D->height_vertex_values[k3 + 1] = D->height_edge_values[k3 + 0] + D->height_edge_values[k3 + 2] - D->height_edge_values[k3 + 1];
-    D->height_vertex_values[k3 + 2] = D->height_edge_values[k3 + 0] + D->height_edge_values[k3 + 1] - D->height_edge_values[k3 + 2];
-
-    // Compute momenta at vertices
-    D->xmom_vertex_values[k3 + 0] = D->xmom_edge_values[k3 + 1] + D->xmom_edge_values[k3 + 2] - D->xmom_edge_values[k3 + 0];
-    D->xmom_vertex_values[k3 + 1] = D->xmom_edge_values[k3 + 0] + D->xmom_edge_values[k3 + 2] - D->xmom_edge_values[k3 + 1];
-    D->xmom_vertex_values[k3 + 2] = D->xmom_edge_values[k3 + 0] + D->xmom_edge_values[k3 + 1] - D->xmom_edge_values[k3 + 2];
-
-    D->ymom_vertex_values[k3 + 0] = D->ymom_edge_values[k3 + 1] + D->ymom_edge_values[k3 + 2] - D->ymom_edge_values[k3 + 0];
-    D->ymom_vertex_values[k3 + 1] = D->ymom_edge_values[k3 + 0] + D->ymom_edge_values[k3 + 2] - D->ymom_edge_values[k3 + 1];
-    D->ymom_vertex_values[k3 + 2] = D->ymom_edge_values[k3 + 0] + D->ymom_edge_values[k3 + 1] - D->ymom_edge_values[k3 + 2];
-
-    
-    D->bed_vertex_values[k3 + 0] = D->bed_edge_values[k3 + 1] + D->bed_edge_values[k3 + 2] - D->bed_edge_values[k3 + 0];
-    D->bed_vertex_values[k3 + 1] = D->bed_edge_values[k3 + 0] + D->bed_edge_values[k3 + 2] - D->bed_edge_values[k3 + 1];
-    D->bed_vertex_values[k3 + 2] = D->bed_edge_values[k3 + 0] + D->bed_edge_values[k3 + 1] - D->bed_edge_values[k3 + 2];
-
-
-
-  }   // for k=0 to number_of_elements-1
-
-
-  if (verbose == 1)
-  {
-    printf("stage_edge_values after loop 2\n");
-    for (k=0; k < number_of_elements; k++)
-    {
-     for (i=0; i<3; i++)
-     {
-      int64_t ki = 3*k+i;
-      printf("%ld %ld %f \n",k, i, D->stage_edge_values[ki]); 
-     }
-    }
-  }
-  
-// Fix xmom and ymom centroid values
-// #pragma omp parallel for simd private(k3, i, dk) firstprivate(extrapolate_velocity_second_order)
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    if (extrapolate_velocity_second_order == 1)
-    {
-      // Convert velocity back to momenta at centroids
-      D->xmom_centroid_values[k] = D->x_centroid_work[k];
-      D->ymom_centroid_values[k] = D->y_centroid_work[k];
-    }
-
-  }
-
-  return 0;
-}
-
-
-// Computational function for flux computation
-int64_t _simd_fix_negative_cells(struct domain *D)
-{
-  int64_t k;
-  int64_t tff;
-  int64_t num_negative_cells = 0;
-
-  // #pragma omp parallel for private(k, tff) reduction(+:num_negative_cells)
-  for (k = 0; k < D->number_of_elements; k++)
-  {
-    tff = D->tri_full_flag[k];
-    if ((D->stage_centroid_values[k] - D->bed_centroid_values[k] < 0.0) & (tff > 0)) 
-    {
-      num_negative_cells = num_negative_cells + 1;
-      D->stage_centroid_values[k] = D->bed_centroid_values[k];
-      D->xmom_centroid_values[k] = 0.0;
-      D->ymom_centroid_values[k] = 0.0;
-    }
-  }
-  return num_negative_cells;
-}
diff --git a/anuga/shallow_water/sw_domain_simd_ext.pyx b/anuga/shallow_water/sw_domain_simd_ext.pyx
deleted file mode 100644
index d865473f8..000000000
--- a/anuga/shallow_water/sw_domain_simd_ext.pyx
+++ /dev/null
@@ -1,408 +0,0 @@
-#cython: wraparound=False, boundscheck=True, cdivision=True, profile=False, nonecheck=False, overflowcheck=False, cdivision_warnings=False, unraisable_tracebacks=False
-
-#wraparound=False, boundscheck=False, cdivision=True, profile=False, nonecheck=False, overflowcheck=False, cdivision_warnings=False, unraisable_tracebacks=False
-import cython
-from libc.stdint cimport int64_t
-
-# import both numpy and the Cython declarations for numpy
-import numpy as np
-cimport numpy as np
-
-cdef extern from "sw_domain_simd.c" nogil:
-	struct domain:
-		int64_t number_of_elements
-		int64_t boundary_length
-		int64_t number_of_riverwall_edges
-		double epsilon
-		double H0
-		double g
-		int64_t optimise_dry_cells
-		double evolve_max_timestep
-		int64_t extrapolate_velocity_second_order
-		double minimum_allowed_height
-		double maximum_allowed_speed
-		int64_t low_froude
-		int64_t timestep_fluxcalls
-		double beta_w
-		double beta_w_dry
-		double beta_uh
-		double beta_uh_dry
-		double beta_vh
-		double beta_vh_dry
-		int64_t max_flux_update_frequency
-		int64_t ncol_riverwall_hydraulic_properties
-		int64_t* neighbours
-		int64_t* neighbour_edges
-		int64_t* surrogate_neighbours
-		double* normals
-		double* edgelengths
-		double* radii
-		double* areas
-		int64_t* edge_flux_type
-		int64_t* tri_full_flag
-		int64_t* already_computed_flux
-		double* max_speed
-		double* vertex_coordinates
-		double* edge_coordinates
-		double* centroid_coordinates
-		int64_t* number_of_boundaries
-		double* stage_edge_values
-		double* xmom_edge_values
-		double* ymom_edge_values
-		double* bed_edge_values
-		double* height_edge_values
-		double* stage_centroid_values
-		double* xmom_centroid_values
-		double* ymom_centroid_values
-		double* bed_centroid_values
-		double* height_centroid_values
-		double* stage_vertex_values
-		double* xmom_vertex_values
-		double* ymom_vertex_values
-		double* bed_vertex_values
-		double* height_vertex_values
-		double* stage_boundary_values
-		double* xmom_boundary_values
-		double* ymom_boundary_values
-		double* bed_boundary_values
-		double* stage_explicit_update
-		double* xmom_explicit_update
-		double* ymom_explicit_update
-		int64_t* flux_update_frequency
-		int64_t* update_next_flux
-		int64_t* update_extrapolation
-		double* edge_timestep
-		double* edge_flux_work
-		double* neigh_work
-		double* pressuregrad_work
-		double* x_centroid_work
-		double* y_centroid_work
-		double* boundary_flux_sum
-		int64_t* allow_timestep_increase
-		double* riverwall_elevation
-		int64_t* riverwall_rowIndex
-		double* riverwall_hydraulic_properties
-		int64_t* edge_river_wall_counter
-		double* stage_semi_implicit_update
-		double* xmom_semi_implicit_update
-		double* ymom_semi_implicit_update		
-
-
-	struct edge:
-		pass
-
-	double _simd_compute_fluxes_central(domain* D, double timestep)
-	double _simd_protect(domain* D)
-	int64_t _simd_extrapolate_second_order_edge_sw(domain* D, int64_t verbose)
-	int64_t _simd_fix_negative_cells(domain* D)
-
-
-
-cdef int64_t pointer_flag = 0
-cdef int64_t parameter_flag = 0
-
-cdef inline get_python_domain_parameters(domain *D, object domain_object):
-
-	D.number_of_elements = domain_object.number_of_elements
-	D.boundary_length = domain_object.boundary_length 
-	D.number_of_riverwall_edges = domain_object.number_of_riverwall_edges
-	D.epsilon = domain_object.epsilon
-	D.H0 = domain_object.H0
-	D.g = domain_object.g
-	D.optimise_dry_cells = domain_object.optimise_dry_cells
-	D.evolve_max_timestep = domain_object.evolve_max_timestep
-	D.minimum_allowed_height = domain_object.minimum_allowed_height
-	D.maximum_allowed_speed = domain_object.maximum_allowed_speed
-	D.timestep_fluxcalls = domain_object.timestep_fluxcalls
-	D.low_froude = domain_object.low_froude
-	D.extrapolate_velocity_second_order = domain_object.extrapolate_velocity_second_order
-	D.beta_w = domain_object.beta_w
-	D.beta_w_dry = domain_object.beta_w_dry
-	D.beta_uh = domain_object.beta_uh
-	D.beta_uh_dry = domain_object.beta_uh_dry
-	D.beta_vh = domain_object.beta_vh
-	D.beta_vh_dry = domain_object.beta_vh_dry
-	D.max_flux_update_frequency = domain_object.max_flux_update_frequency
-		
-
-cdef inline get_python_domain_pointers(domain *D, object domain_object):
-
-	cdef int64_t[:,::1]   neighbours
-	cdef int64_t[:,::1]   neighbour_edges
-	cdef double[:,::1] normals
-	cdef double[:,::1] edgelengths
-	cdef double[::1]   radii
-	cdef double[::1]   areas
-	cdef int64_t[::1]     edge_flux_type
-	cdef int64_t[::1]     tri_full_flag
-	cdef int64_t[:,::1]   already_computed_flux
-	cdef double[:,::1] vertex_coordinates
-	cdef double[:,::1] edge_coordinates
-	cdef double[:,::1] centroid_coordinates
-	cdef int64_t[::1]     number_of_boundaries
-	cdef int64_t[:,::1]   surrogate_neighbours
-	cdef double[::1]   max_speed
-	cdef int64_t[::1]     flux_update_frequency
-	cdef int64_t[::1]     update_next_flux
-	cdef int64_t[::1]     update_extrapolation
-	cdef int64_t[::1]     allow_timestep_increase
-	cdef double[::1]   edge_timestep
-	cdef double[::1]   edge_flux_work
-	cdef double[::1]   neigh_work
-	cdef double[::1]   pressuregrad_work
-	cdef double[::1]   x_centroid_work
-	cdef double[::1]   y_centroid_work
-	cdef double[::1]   boundary_flux_sum
-	cdef double[::1]   riverwall_elevation
-	cdef int64_t[::1]     riverwall_rowIndex
-	cdef double[:,::1] riverwall_hydraulic_properties
-	cdef int64_t[::1]     edge_river_wall_counter
-	cdef double[:,::1] edge_values
-	cdef double[::1]   centroid_values
-	cdef double[:,::1] vertex_values
-	cdef double[::1]   boundary_values
-	cdef double[::1]   explicit_update
-	
-	cdef object quantities
-	cdef object riverwallData
-
-	#------------------------------------------------------
-	# Domain structures
-	#------------------------------------------------------
-	neighbours = domain_object.neighbours
-	D.neighbours = &neighbours[0,0]
-	
-	surrogate_neighbours = domain_object.surrogate_neighbours
-	D.surrogate_neighbours = &surrogate_neighbours[0,0]
-
-	neighbour_edges = domain_object.neighbour_edges
-	D.neighbour_edges = &neighbour_edges[0,0]
-
-	normals = domain_object.normals
-	D.normals = &normals[0,0]
-
-	edgelengths = domain_object.edgelengths
-	D.edgelengths = &edgelengths[0,0]
-
-	radii = domain_object.radii
-	D.radii = &radii[0]
-
-	areas = domain_object.areas
-	D.areas = &areas[0]
-
-	edge_flux_type = domain_object.edge_flux_type
-	D.edge_flux_type = &edge_flux_type[0]
-
-	tri_full_flag = domain_object.tri_full_flag
-	D.tri_full_flag = &tri_full_flag[0]
-
-	already_computed_flux = domain_object.already_computed_flux
-	D.already_computed_flux = &already_computed_flux[0,0]
-
-	vertex_coordinates = domain_object.vertex_coordinates
-	D.vertex_coordinates = &vertex_coordinates[0,0]
-
-	edge_coordinates = domain_object.edge_coordinates
-	D.edge_coordinates = &edge_coordinates[0,0]
-
-	centroid_coordinates = domain_object.centroid_coordinates
-	D.centroid_coordinates = &centroid_coordinates[0,0]
-
-	max_speed = domain_object.max_speed
-	D.max_speed = &max_speed[0]
-
-	number_of_boundaries = domain_object.number_of_boundaries
-	D.number_of_boundaries = &number_of_boundaries[0]
-
-	flux_update_frequency = domain_object.flux_update_frequency
-	D.flux_update_frequency = &flux_update_frequency[0]
-
-	update_next_flux = domain_object.update_next_flux
-	D.update_next_flux = &update_next_flux[0]
-
-	update_extrapolation = domain_object.update_extrapolation
-	D.update_extrapolation = &update_extrapolation[0]
-
-	allow_timestep_increase = domain_object.allow_timestep_increase
-	D.allow_timestep_increase = &allow_timestep_increase[0]
-
-	edge_timestep = domain_object.edge_timestep
-	D.edge_timestep = &edge_timestep[0]
-
-	edge_flux_work = domain_object.edge_flux_work
-	D.edge_flux_work = &edge_flux_work[0]
-
-	neigh_work = domain_object.neigh_work
-	D.neigh_work = &neigh_work[0]
-
-	pressuregrad_work = domain_object.pressuregrad_work
-	D.pressuregrad_work = &pressuregrad_work[0]
-
-	x_centroid_work = domain_object.x_centroid_work
-	D.x_centroid_work = &x_centroid_work[0]
-
-	y_centroid_work = domain_object.y_centroid_work
-	D.y_centroid_work = &y_centroid_work[0]
-
-	boundary_flux_sum = domain_object.boundary_flux_sum
-	D.boundary_flux_sum = &boundary_flux_sum[0]
-
-	edge_river_wall_counter = domain_object.edge_river_wall_counter
-	D.edge_river_wall_counter  = &edge_river_wall_counter[0]
-
-	#------------------------------------------------------
-	# Quantity structures
-	#------------------------------------------------------
-	quantities = domain_object.quantities
-	stage = quantities["stage"]
-	xmomentum = quantities["xmomentum"]
-	ymomentum = quantities["ymomentum"]
-	elevation = quantities["elevation"]
-	height = quantities["height"]
-
-	edge_values = stage.edge_values
-	D.stage_edge_values = &edge_values[0,0]
-
-	edge_values = xmomentum.edge_values
-	D.xmom_edge_values = &edge_values[0,0]
-
-	edge_values = ymomentum.edge_values
-	D.ymom_edge_values = &edge_values[0,0]
-
-	edge_values = elevation.edge_values
-	D.bed_edge_values = &edge_values[0,0]
-
-	edge_values = height.edge_values
-	D.height_edge_values = &edge_values[0,0]
-
-	centroid_values = stage.centroid_values
-	D.stage_centroid_values = &centroid_values[0]
-
-	centroid_values = xmomentum.centroid_values
-	D.xmom_centroid_values = &centroid_values[0]
-
-	centroid_values = ymomentum.centroid_values
-	D.ymom_centroid_values = &centroid_values[0]
-
-	centroid_values = elevation.centroid_values
-	D.bed_centroid_values = &centroid_values[0]
-
-	centroid_values = height.centroid_values
-	D.height_centroid_values = &centroid_values[0]
-
-	vertex_values = stage.vertex_values
-	D.stage_vertex_values = &vertex_values[0,0]
-
-	vertex_values = xmomentum.vertex_values
-	D.xmom_vertex_values = &vertex_values[0,0]
-
-	vertex_values = ymomentum.vertex_values
-	D.ymom_vertex_values = &vertex_values[0,0]
-
-	vertex_values = elevation.vertex_values
-	D.bed_vertex_values = &vertex_values[0,0]
-
-	vertex_values = height.vertex_values
-	D.height_vertex_values = &vertex_values[0,0]
-
-	boundary_values = stage.boundary_values
-	D.stage_boundary_values = &boundary_values[0]
-
-	boundary_values = xmomentum.boundary_values
-	D.xmom_boundary_values = &boundary_values[0]
-
-	boundary_values = ymomentum.boundary_values
-	D.ymom_boundary_values = &boundary_values[0]
-
-	boundary_values = elevation.boundary_values
-	D.bed_boundary_values = &boundary_values[0]
-
-	explicit_update = stage.explicit_update
-	D.stage_explicit_update = &explicit_update[0]
-
-	explicit_update = xmomentum.explicit_update
-	D.xmom_explicit_update = &explicit_update[0]
-
-	explicit_update = ymomentum.explicit_update
-	D.ymom_explicit_update = &explicit_update[0]
-
-	#------------------------------------------------------
-	# Riverwall structures
-	#------------------------------------------------------
-	riverwallData = domain_object.riverwallData
-
-	riverwall_elevation = riverwallData.riverwall_elevation
-	D.riverwall_elevation = &riverwall_elevation[0]
-
-	riverwall_rowIndex = riverwallData.hydraulic_properties_rowIndex
-	D.riverwall_rowIndex = &riverwall_rowIndex[0]
-
-	D.ncol_riverwall_hydraulic_properties = riverwallData.ncol_hydraulic_properties
-
-	riverwall_hydraulic_properties = riverwallData.hydraulic_properties
-	D.riverwall_hydraulic_properties = &riverwall_hydraulic_properties[0,0]
-
-
-
-#===============================================================================
-
-def compute_fluxes_ext_central(object domain_object, double timestep):
-
-	cdef domain D
-
-	get_python_domain_parameters(&D, domain_object)
-	get_python_domain_pointers(&D, domain_object)
-
-	with nogil:
-		timestep =  _simd_compute_fluxes_central(&D, timestep)
-
-	return timestep
-
-def extrapolate_second_order_edge_sw(object domain_object, int64_t verbose = 0):
-
-	cdef domain D
-	cdef int64_t e
-
-	get_python_domain_parameters(&D, domain_object)
-	get_python_domain_pointers(&D, domain_object)
-
-	with nogil:
-		e = _simd_extrapolate_second_order_edge_sw(&D, verbose)
-
-	if e == -1:
-		return None
-
-def protect_new(object domain_object):
-
-	cdef domain D
-
-	cdef double mass_error
-
-	get_python_domain_parameters(&D, domain_object)
-	get_python_domain_pointers(&D, domain_object)
-
-	with nogil:
-		mass_error = _simd_protect(&D)
-
-
-	return mass_error
-
-def compute_flux_update_frequency(object domain_object, double timestep):
-
-	pass
-
-def fix_negative_cells(object domain_object):
-
-	cdef domain D
-	cdef int64_t num_negative_cells
-
-	get_python_domain_parameters(&D, domain_object)
-	get_python_domain_pointers(&D, domain_object)
-
-	with nogil:
-		num_negative_cells = _simd_fix_negative_cells(&D)
-
-	return num_negative_cells
-
-
-
diff --git a/anuga/shallow_water/tests/meson.build b/anuga/shallow_water/tests/meson.build
index 22b8ccb59..f2e78dc24 100644
--- a/anuga/shallow_water/tests/meson.build
+++ b/anuga/shallow_water/tests/meson.build
@@ -3,9 +3,7 @@
 python_sources = [
 '__init__.py',
 'test_data_manager.py',
-'test_DE_orig.py',
 'test_DE_openmp.py',
-'test_DE_simd.py',
 'test_DE_cuda.py',
 'test_forcing.py',
 'test_friction.py',
@@ -24,4 +22,4 @@ py3.install_sources(
 )
 
 subdir('data')
-subdir('urs_test_data')
\ No newline at end of file
+subdir('urs_test_data')
diff --git a/anuga/shallow_water/tests/run_CUDA_cft.py b/anuga/shallow_water/tests/run_CUDA_cft.py
index 3b25ee39a..68f46ad0d 100644
--- a/anuga/shallow_water/tests/run_CUDA_cft.py
+++ b/anuga/shallow_water/tests/run_CUDA_cft.py
@@ -176,7 +176,7 @@ def stagefun(x,y):
 
 nvtxRangePush('compute forcing terms on gpu for domain2')
 from anuga.shallow_water.shallow_water_domain import manning_friction_implicit
-domain2.set_multiprocessor_mode(4)
+domain2.set_multiprocessor_mode(2)
 manning_friction_implicit(domain2)
 nvtxRangePop()
 
diff --git a/anuga/shallow_water/tests/run_CUDA_extrapolate.py b/anuga/shallow_water/tests/run_CUDA_extrapolate.py
index a91b60e51..023398f38 100644
--- a/anuga/shallow_water/tests/run_CUDA_extrapolate.py
+++ b/anuga/shallow_water/tests/run_CUDA_extrapolate.py
@@ -153,7 +153,7 @@ def stagefun(x,y):
 
 nvtxRangePush('distribute domain2')
 # Now run the distribute procedure on the GPU
-domain2.set_multiprocessor_mode(4)
+domain2.set_multiprocessor_mode(2)
 domain2.distribute_to_vertices_and_edges()
 nvtxRangePop()
 
diff --git a/anuga/shallow_water/tests/run_CUDA_protect_negative.py b/anuga/shallow_water/tests/run_CUDA_protect_negative.py
index 57e736bcc..71f89fea5 100644
--- a/anuga/shallow_water/tests/run_CUDA_protect_negative.py
+++ b/anuga/shallow_water/tests/run_CUDA_protect_negative.py
@@ -143,7 +143,7 @@ def stagefun(x,y):
 
 
 nvtxRangePush('initialise gpu_interface : domain2')
-domain2.set_multiprocessor_mode(4)
+domain2.set_multiprocessor_mode(2)
 nvtxRangePop()
 
 # import pdb; pdb.set_trace()
diff --git a/anuga/shallow_water/tests/run_CUDA_update_conserved_quantities.py b/anuga/shallow_water/tests/run_CUDA_update_conserved_quantities.py
index f18c60115..7e4de0bc9 100644
--- a/anuga/shallow_water/tests/run_CUDA_update_conserved_quantities.py
+++ b/anuga/shallow_water/tests/run_CUDA_update_conserved_quantities.py
@@ -179,7 +179,7 @@ def stagefun(x,y):
 ymom2_centroid_values_before = num.copy(ymom2.centroid_values)
 
 
-nvtxRangePush('update conserved quantities kernal : domain2')
+nvtxRangePush('update conserved quantities kernel : domain2')
 num_negative_ids = gpu_domain2.update_conserved_quantities_kernal()
 
 print('num_negative_ids => ', num_negative_ids)
diff --git a/anuga/shallow_water/tests/run_DE_cuda.py b/anuga/shallow_water/tests/run_DE_cuda.py
index e4177e908..9b48e7093 100644
--- a/anuga/shallow_water/tests/run_DE_cuda.py
+++ b/anuga/shallow_water/tests/run_DE_cuda.py
@@ -142,7 +142,7 @@ def stagefun(x,y):
 nvtxRangePop()
 
 # nvtxRangePush('initialise gpu_interface domain2')
-# domain2.set_multiprocessor_mode(4)
+# domain2.set_multiprocessor_mode(2)
 # nvtxRangePop()
 
 # nvtxRangePush('compute fluxes domain2')
diff --git a/anuga/shallow_water/tests/run_cupy_flux_distribute.py b/anuga/shallow_water/tests/run_cupy_flux_distribute.py
index 523a4948b..5e0e1134f 100644
--- a/anuga/shallow_water/tests/run_cupy_flux_distribute.py
+++ b/anuga/shallow_water/tests/run_cupy_flux_distribute.py
@@ -86,7 +86,7 @@ def stagefun(x,y):
 
 nvtxRangePush('create domain2')
 domain2 = create_domain('domain_cuda')
-domain2.set_multiprocessor_mode(4)
+domain2.set_multiprocessor_mode(2)
 
 
 quantities2 = domain2.quantities
diff --git a/anuga/shallow_water/tests/test_DE_cuda.py b/anuga/shallow_water/tests/test_DE_cuda.py
index b7faa1b92..6c9e72adc 100644
--- a/anuga/shallow_water/tests/test_DE_cuda.py
+++ b/anuga/shallow_water/tests/test_DE_cuda.py
@@ -80,10 +80,10 @@ def stagefun(x,y):
 
 
         domain1 = create_domain('domain_original')
-        domain1.set_multiprocessor_mode(0)
+        domain1.set_multiprocessor_mode(1)
 
         domain2 = create_domain('domain_cuda')
-        domain2.set_multiprocessor_mode(0) # will change to 2 once burn in
+        domain2.set_multiprocessor_mode(1) # will change to 2 once burn in
 
         #------------------------------
         #Evolve the system through time
@@ -99,7 +99,7 @@ def stagefun(x,y):
         #----------------------------------------
         # Now just run the cuda code on domain2
         #----------------------------------------
-        domain2.set_multiprocessor_mode(4)
+        domain2.set_multiprocessor_mode(2)
         timestep = 0.1
 
         domain1.distribute_to_vertices_and_edges()
@@ -234,10 +234,10 @@ def create_domain(name='domain'):
 
 
         domain1 = create_domain('domain_original')
-        domain1.set_multiprocessor_mode(0)
+        domain1.set_multiprocessor_mode(1)
 
         domain2 = create_domain('domain_cuda')
-        domain2.set_multiprocessor_mode(0) # will change to 4 once burn in
+        domain2.set_multiprocessor_mode(1) # will change to 2 once burn in
 
         #------------------------------
         #Evolve the system through time
@@ -254,7 +254,7 @@ def create_domain(name='domain'):
        #----------------------------------------
         # Now just run the cuda code on domain2
         #----------------------------------------
-        domain2.set_multiprocessor_mode(4)
+        domain2.set_multiprocessor_mode(2)
         timestep = 0.1
 
         domain1.distribute_to_vertices_and_edges()
@@ -315,6 +315,6 @@ def create_domain(name='domain'):
         #pprint.pprint(domain2.edge_timestep)    
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_DE_cuda, 'test_')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_DE_cuda)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/shallow_water/tests/test_DE_openmp.py b/anuga/shallow_water/tests/test_DE_openmp.py
index fd2464991..b11245420 100644
--- a/anuga/shallow_water/tests/test_DE_openmp.py
+++ b/anuga/shallow_water/tests/test_DE_openmp.py
@@ -99,7 +99,7 @@ def stagefun(x,y):
         #----------------------------------------
         # Now just run the openmp code on domain2
         #----------------------------------------
-        domain2.set_multiprocessor_mode(2)
+        domain2.set_multiprocessor_mode(1)
         timestep = 0.1
 
         domain1.distribute_to_vertices_and_edges()
@@ -231,10 +231,10 @@ def create_domain(name='domain'):
 
 
         domain1 = create_domain('domain_base')
-        domain1.set_multiprocessor_mode(0)
+        domain1.set_multiprocessor_mode(1)
 
         domain2 = create_domain('domain_openmp')
-        domain2.set_multiprocessor_mode(0) # will change to 2 once burn in
+        domain2.set_multiprocessor_mode(1) # stays at 1 (openmp) after burn in
 
         #------------------------------
         #Evolve the system through time
@@ -250,7 +250,7 @@ def create_domain(name='domain'):
         #----------------------------------------
         # Now just run the openmp code on domain2
         #----------------------------------------
-        domain2.set_multiprocessor_mode(2)
+        domain2.set_multiprocessor_mode(1)
         timestep = 0.1
 
         domain1.distribute_to_vertices_and_edges()
@@ -330,6 +330,6 @@ def create_domain(name='domain'):
   
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_DE_openmp, 'test_runup')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_DE_openmp)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/shallow_water/tests/test_DE_orig.py b/anuga/shallow_water/tests/test_DE_orig.py
deleted file mode 100644
index c1e78e037..000000000
--- a/anuga/shallow_water/tests/test_DE_orig.py
+++ /dev/null
@@ -1,110 +0,0 @@
-"""  Test environmental forcing - rain, wind, etc.
-"""
-
-import unittest, os
-
-import anuga
-
-from anuga import Reflective_boundary
-from anuga import rectangular_cross_domain
-
-from anuga import Domain
-
-#from anuga_tsunami import Domain
-
-import numpy as num
-import warnings
-import time
-
-
-
-class Test_DE_domain(unittest.TestCase):
-    def setUp(self):
-        pass
-
-    def tearDown(self):
-        try:
-            os.remove('runup_sinusoid_de1.sww')
-        except:
-            pass
-
-
-    def test_runup_sinusoid(self):
-        """ Run a version of the validation test runup_sinusoid
-        to ensure limiting solution has small velocity
-        """
-
-        points, vertices, boundary = anuga.rectangular_cross(20,20, len1=1., len2=1.)
-
-
-        domain=Domain(points,vertices,boundary)    # Create Domain
-        domain.set_flow_algorithm('DE1')
-
-        domain.set_name('runup_sinusoid_de1')                         # Output to file runup.sww
-        domain.set_datadir('.')                          # Use current folder
-        domain.set_quantities_to_be_stored({'stage': 2, 'xmomentum': 2, 'ymomentum': 2, 'elevation': 2})
-        #domain.set_store_vertices_uniquely(True)
-        
-        #------------------
-        # Define topography
-        #------------------
-        scale_me=1.0
-
-        def topography(x,y):
-            return (-x/2.0 +0.05*num.sin((x+y)*50.0))*scale_me
-
-        def stagefun(x,y):
-            stge=-0.2*scale_me #+0.01*(x>0.9)
-            return stge
-
-        domain.set_quantity('elevation',topography) 
-        domain.get_quantity('elevation').smooth_vertex_values()
-        domain.set_quantity('friction',0.03) 
-
-
-        domain.set_quantity('stage', stagefun) 
-        domain.get_quantity('stage').smooth_vertex_values()
-
-
-        #--------------------------
-        # Setup boundary conditions
-        #--------------------------
-        Br=anuga.Reflective_boundary(domain) # Solid reflective wall
-
-        #----------------------------------------------
-        # Associate boundary tags with boundary objects
-        #----------------------------------------------
-        domain.set_boundary({'left': Br, 'right': Br, 'top': Br, 'bottom':Br})
-
-        #------------------------------
-        #Evolve the system through time
-        #------------------------------
-
-        for t in domain.evolve(yieldstep=7.0,finaltime=7.0):
-            #print domain.timestepping_statistics()
-            #xx = domain.quantities['xmomentum'].centroid_values
-            #yy = domain.quantities['ymomentum'].centroid_values
-            #dd = domain.quantities['stage'].centroid_values - domain.quantities['elevation'].centroid_values
-
-            #dd = (dd)*(dd>1.0e-03)+1.0e-03
-            #vv = ( (xx/dd)**2 + (yy/dd)**2)**0.5
-            #vv = vv*(dd>1.0e-03)
-            #print 'Peak velocity is: ', vv.max(), vv.argmax()
-            #print 'Volume is', sum(dd_raw*domain.areas)
-            pass
-
-        xx = domain.quantities['xmomentum'].centroid_values
-        yy = domain.quantities['ymomentum'].centroid_values
-        dd = domain.quantities['stage'].centroid_values - domain.quantities['elevation'].centroid_values
-        #dd_raw=1.0*dd
-        dd = (dd)*(dd>1.0e-03)+1.0e-03
-        vv = ((xx/dd)**2 + (yy/dd)**2)**0.5
-
-        assert num.all(vv<2.0e-02)
-
-
-            
-if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_DE1_domain, 'test')
-    runner = unittest.TextTestRunner(verbosity=1)
-    runner.run(suite)
diff --git a/anuga/shallow_water/tests/test_DE_simd.py b/anuga/shallow_water/tests/test_DE_simd.py
deleted file mode 100644
index ac2e22034..000000000
--- a/anuga/shallow_water/tests/test_DE_simd.py
+++ /dev/null
@@ -1,335 +0,0 @@
-"""  Test environmental forcing - rain, wind, etc.
-"""
-
-import unittest, os
-
-import anuga
-
-from anuga import Reflective_boundary
-from anuga import rectangular_cross_domain
-
-from anuga import Domain
-
-import numpy as num
-import warnings
-import time
-import math
-
-
-
-class Test_DE_simd(unittest.TestCase):
-    def setUp(self):
-        pass
-
-    def tearDown(self):
-        for file in ['domain_base.sww', 'domain_openmp.sww', 'domain_simd_depth_yeah.asc', 'domain_openmp_depth_yeah.prj']:
-            try:
-                os.remove(file)
-            except:
-                pass 
-
-
-    def test_runup_simd(self):
-        """ Run a version of the validation test runup_sinusoid
-        to ensure limiting solution has small velocity
-        """
-
-        def create_domain(name='domain'):
-            domain = anuga.rectangular_cross_domain(2,2, len1=1., len2=1.)
-
-            domain.set_flow_algorithm('DE0')
-            domain.set_low_froude(0)
-            domain.set_multiprocessor_mode(0)
-        
-            domain.set_name(name)  
-            domain.set_datadir('.')
-        
-            #------------------
-            # Define topography
-            #------------------
-            scale_me=1.0
-
-            def topography(x,y):
-                return (-x/2.0 +0.05*num.sin((x+y)*50.0))*scale_me
-
-            def stagefun(x,y):
-                stage=-0.2*scale_me #+0.01*(x>0.9)
-                return stage
-
-            domain.set_quantity('elevation',topography)     # Use function for elevation
-            domain.set_quantity('friction',0.03)            # Constant friction
-            domain.set_quantity('stage', stagefun)          # Constant negative initial stage
-
-            #--------------------------
-            # Setup boundary conditions
-            #--------------------------
-            Br=anuga.Reflective_boundary(domain)                 # Solid reflective wall
-            Bd=anuga.Dirichlet_boundary([-0.1*scale_me,0.,0.])   # Constant boundary values -- not used in this example
-
-            #----------------------------------------------
-            # Associate boundary tags with boundary objects
-            #----------------------------------------------
-            domain.set_boundary({'left': Br, 'right': Bd, 'top': Br, 'bottom':Br})
-
-            return domain
-
-        print('')
-        print(70*'=')
-        print('Test Runup')
-        print(70*'=')
-
-
-        domain1 = create_domain('domain_base')
-        domain1.set_multiprocessor_mode(0)
-
-        domain2 = create_domain('domain_openmp')
-        domain2.set_multiprocessor_mode(0) # will change to 2 once burn in
-
-        #------------------------------
-        #Evolve the system through time
-        #------------------------------
-        print('Evolve domain1')
-        for t in domain1.evolve(yieldstep=0.1,finaltime=0.1):
-            domain1.print_timestepping_statistics()
-
-        print('Evolve domain2')
-        for t in domain2.evolve(yieldstep=0.1,finaltime=0.1):
-            domain2.print_timestepping_statistics()
-
-        #----------------------------------------
-        # Now just run the openmp code on domain2
-        #----------------------------------------
-        domain2.set_multiprocessor_mode(1)
-        timestep = 0.1
-
-        domain1.distribute_to_vertices_and_edges()
-        domain1.compute_fluxes()
-        timestep1 = domain1.flux_timestep
-
-        domain2.distribute_to_vertices_and_edges()
-        domain2.compute_fluxes()
-        timestep2 = domain2.flux_timestep
-
-        # Compare update arrays and timestep
-
-
-        print('domain1 timestep ', timestep1)
-        print('domain2 timestep ', timestep2)
-
-        quantities1 = domain1.quantities
-        stage1 = quantities1["stage"]
-        xmom1 = quantities1["xmomentum"]
-        ymom1 = quantities1["ymomentum"]
-
-        max_speed_1 = domain1.max_speed
-
-
-        quantities2 = domain2.quantities
-        stage2 = quantities2["stage"]
-        xmom2 = quantities2["xmomentum"]
-        ymom2 = quantities2["ymomentum"]
-
-        max_speed_2 = domain2.max_speed
-
-
-        print('timestep error              ', abs(timestep1-timestep2))
-        print('stage explicit update error ', num.linalg.norm(stage1.explicit_update-stage2.explicit_update))
-        print('xmom  explicit update error ', num.linalg.norm(xmom1.explicit_update-xmom2.explicit_update))
-        print('ymom  explicit update error ', num.linalg.norm(ymom1.explicit_update-ymom2.explicit_update))
-        print('max_speed error             ', num.linalg.norm(max_speed_1-max_speed_2))
-        #print('edge timestep error         ', num.linalg.norm(domain1.edge_timestep-domain2.edge_timestep))
-        #print('pressure work error         ', num.linalg.norm(domain1.pressuregrad_work-domain2.pressuregrad_work))
-        #print('edge flux work error        ', num.linalg.norm(domain1.edge_flux_work-domain2.edge_flux_work))
-
-
-
-        assert num.allclose(timestep1,timestep2)
-        assert num.allclose(stage1.explicit_update,stage2.explicit_update)
-        assert num.allclose(xmom1.explicit_update,xmom2.explicit_update)
-        assert num.allclose(ymom1.explicit_update,ymom2.explicit_update)
-        assert num.allclose(max_speed_1,max_speed_2)
-        #assert num.allclose(domain1.edge_timestep,domain2.edge_timestep)
-        #assert num.allclose(domain1.pressuregrad_work,domain2.pressuregrad_work)
-        #assert num.allclose(domain1.edge_flux_work,domain2.edge_flux_work)
-
-        # ki3 = num.argmax(num.abs(domain1.edge_flux_work-domain2.edge_flux_work))
-
-        # ki = ki3//3
-        # q = ki3%3
-        # k = ki//3
-        # e = ki%3
-
-        # print('edge_flux_work ki,q,k,e ', ki, q, k, e)
-
-        # import pprint
-
-        # #pprint.pprint(domain1.edge_flux_work)
-        # edge_flux_diff = domain2.edge_flux_work- domain1.edge_flux_work
-        # edge_timestep_diff =  domain2.edge_timestep- domain1.edge_timestep
-        # #pprint.pprint(domain2.edge_flux_work- domain1.edge_flux_work)
-
-        # for k in range(domain2.number_of_elements):
-        #     for i in range(3):
-        #         ki = 3*k+i
-        #         ki3 = 3*ki
-        #         print(k,i, domain2.neighbours[k,i], edge_timestep_diff[ki], edge_flux_diff[ki3],edge_flux_diff[ki3+1],edge_flux_diff[ki3+2])
-
-
-
-    def test_riverwall_simd(self):
-
-        def create_domain(name='domain'):
-
-            bounding_polygon = [[0.0, 0.0],
-                    [20.0, 0.0],
-                    [20.0, 10.0],
-                    [0.0, 10.0]]
-
-            boundary_tags={'bottom': [0],
-               'right': [1],
-               'top': [2],
-               'left': [3]
-              }
-
-
-            riverWalls = { 'wall1': [[5.0,0.0,   0.5], [5.0,4.0,  0.5]],
-               'wall2': [[15.0,0.0, -0.5], [15.0,4.0,-0.5]],
-               'wall3': [[10.0,10.0, 0.0], [10.0,6.0, 0.0]]
-             }
-
-
-              
-            domain = anuga.create_domain_from_regions(bounding_polygon, 
-                                           boundary_tags,
-                                           maximum_triangle_area = 0.4,
-                                           breaklines = riverWalls.values())
-
-            domain.set_name(name)
-            domain.set_multiprocessor_mode(1)
-
-            #Initial Conditions
-            domain.set_quantity('elevation', lambda x,y : -x/10, location='centroids') # Use function for elevation
-            domain.set_quantity('friction', 0.01, location='centroids')                # Constant friction 
-            domain.set_quantity('stage', expression='elevation', location='centroids') # Dry Bed 
-
-            # Boundary Conditions
-            Bi = anuga.Dirichlet_boundary([0.4, 0, 0])         # Inflow
-            Bo = anuga.Dirichlet_boundary([-2, 0, 0])          # Inflow
-            Br = anuga.Reflective_boundary(domain)            # Solid reflective wall
-
-            domain.set_boundary({'left': Bi, 'right': Bo, 'top': Br, 'bottom': Br})
-
-            # Setup RiverWall
-            domain.riverwallData.create_riverwalls(riverWalls, verbose=False)
-
-            return domain
-
-        print('')
-        print(70*'=')
-        print('Test Riverwall')
-        print(70*'=')
-
-
-        domain1 = create_domain('domain_base')
-        domain1.set_multiprocessor_mode(0)
-
-        domain2 = create_domain('domain_openmp')
-        domain2.set_multiprocessor_mode(0) # will change to 2 once burn in
-
-        #------------------------------
-        #Evolve the system through time
-        #------------------------------
-        print('Evolve domain1')
-        for t in domain1.evolve(yieldstep=0.1,finaltime=0.1):
-            domain1.print_timestepping_statistics()
-
-        print('Evolve domain2')
-        for t in domain2.evolve(yieldstep=0.1,finaltime=0.1):
-            domain2.print_timestepping_statistics()
-
-        #----------------------------------------
-        # Now just run the openmp code on domain2
-        #----------------------------------------
-        domain2.set_multiprocessor_mode(2)
-        timestep = 0.1
-
-        domain1.distribute_to_vertices_and_edges()
-        domain1.compute_fluxes()
-        timestep1 = domain1.flux_timestep
-
-        domain2.distribute_to_vertices_and_edges()
-        domain2.compute_fluxes()
-        timestep2 = domain2.flux_timestep
-
-        # Compare update arrays and timestep
-
-
-        print('domain1 timestep ', timestep1)
-        print('domain2 timestep ', timestep2)
-
-        quantities1 = domain1.quantities
-        stage1 = quantities1["stage"]
-        xmom1 = quantities1["xmomentum"]
-        ymom1 = quantities1["ymomentum"]
-
-        max_speed_1 = domain1.max_speed
-
-
-        quantities2 = domain2.quantities
-        stage2 = quantities2["stage"]
-        xmom2 = quantities2["xmomentum"]
-        ymom2 = quantities2["ymomentum"]
-
-        max_speed_2 = domain2.max_speed
-
-        print(f'domain1 max_speed {num.max(max_speed_1)} min_speed  {num.min(max_speed_1)}')
-        print(f'domain2 max_speed {num.max(max_speed_2)} min_speed  {num.min(max_speed_2)}')
-
-
-        print('timestep error              ', abs(timestep1-timestep2))
-        print('stage explicit update error ', num.linalg.norm(stage1.explicit_update-stage2.explicit_update))
-        print('xmom  explicit update error ', num.linalg.norm(xmom1.explicit_update-xmom2.explicit_update))
-        print('ymom  explicit update error ', num.linalg.norm(ymom1.explicit_update-ymom2.explicit_update))
-        print('max_speed error             ', num.linalg.norm(max_speed_1-max_speed_2))
-        #print('edge timestep error         ', num.linalg.norm(domain1.edge_timestep-domain2.edge_timestep))
-        #print('pressure work error         ', num.linalg.norm(domain1.pressuregrad_work-domain2.pressuregrad_work))
-        #print('edge flux work error        ', num.linalg.norm(domain1.edge_flux_work-domain2.edge_flux_work))
-
-
-
-        assert num.allclose(timestep1,timestep2)
-        assert num.allclose(stage1.explicit_update,stage2.explicit_update)
-        assert num.allclose(xmom1.explicit_update,xmom2.explicit_update)
-        assert num.allclose(ymom1.explicit_update,ymom2.explicit_update)
-        assert num.allclose(max_speed_1,max_speed_2)
-        #assert num.allclose(domain1.edge_timestep,domain2.edge_timestep)
-        #assert num.allclose(domain1.pressuregrad_work,domain2.pressuregrad_work)
-        #assert num.allclose(domain1.edge_flux_work,domain2.edge_flux_work)
-
-        # ki3 = num.argmax(num.abs(domain1.edge_flux_work-domain2.edge_flux_work))
-
-        # ki = ki3//3
-        # q = ki3%3
-        # k = ki//3
-        # e = ki%3
-
-        # print('edge_flux_work ki,q,k,e ', ki, q, k, e)
-
-        # import pprint
-
-        # #pprint.pprint(domain1.edge_flux_work)
-        # edge_flux_diff = domain2.edge_flux_work- domain1.edge_flux_work
-        # edge_timestep_diff =  domain2.edge_timestep- domain1.edge_timestep
-        # #pprint.pprint(domain2.edge_flux_work- domain1.edge_flux_work)
-
-        # for k in range(domain2.number_of_elements):
-        #     for i in range(3):
-        #         ki = 3*k+i
-        #         ki3 = 3*ki
-        #         print(k,i, domain2.neighbours[k,i], edge_timestep_diff[ki], edge_flux_diff[ki3],edge_flux_diff[ki3+1],edge_flux_diff[ki3+2])
-  
-
-if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_DE_openmp, 'test_runup')
-    runner = unittest.TextTestRunner(verbosity=1)
-    runner.run(suite)
diff --git a/anuga/shallow_water/tests/test_data_manager.py b/anuga/shallow_water/tests/test_data_manager.py
index d2b6b0825..a7f80b25a 100644
--- a/anuga/shallow_water/tests/test_data_manager.py
+++ b/anuga/shallow_water/tests/test_data_manager.py
@@ -846,8 +846,7 @@ def test_points2polygon(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    #suite = unittest.makeSuite(Test_Data_Manager, 'test_sww2domain2')
-    suite = unittest.makeSuite(Test_Data_Manager, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Data_Manager)
     
     
     
diff --git a/anuga/shallow_water/tests/test_forcing.py b/anuga/shallow_water/tests/test_forcing.py
index de07c6eca..44135d9d1 100644
--- a/anuga/shallow_water/tests/test_forcing.py
+++ b/anuga/shallow_water/tests/test_forcing.py
@@ -2380,6 +2380,6 @@ def test_inflow_catch_too_few_triangles(self):
 
             
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Forcing, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Forcing)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/shallow_water/tests/test_friction.py b/anuga/shallow_water/tests/test_friction.py
index 27dbb9554..5b2dc320b 100644
--- a/anuga/shallow_water/tests/test_friction.py
+++ b/anuga/shallow_water/tests/test_friction.py
@@ -27,7 +27,7 @@ def tearDown(self):
         
 
 
-    def test_manning_friction_implicit(self):
+    def test_manning_friction_flat_implicit(self):
         """Test the manning friction implicit forcing term
         """
 
@@ -76,8 +76,70 @@ def test_manning_friction_implicit(self):
         assert num.allclose(domain.quantities['ymomentum'].semi_implicit_update, ymon_semi_implicit_update)
 
 
+    def test_manning_friction_sloped_implicit(self):
+        """Test the manning friction sloped implicit forcing term
+
+        Builds a four-triangle domain whose bed is the plane
+        z = 0.5*x + 0.5*y with the stage 1m above it, switches on
+        domain.use_sloped_mannings and checks that compute_forcing_terms()
+        leaves every explicit update at zero while producing the expected
+        semi-implicit momentum updates.
+        """
+
+        a = [0.0, 0.0]
+        b = [0.0, 2.0]
+        c = [2.0, 0.0]
+        d = [0.0, 4.0]
+        e = [2.0, 2.0]
+        f = [4.0, 0.0]
+
+        points = [a, b, c, d, e, f]
+        #             bac,     bce,     ecf,     dbe
+        vertices = [[1,0,2], [1,2,4], [4,2,5], [3,1,4]]
+
+        domain = Domain(points, vertices)
+
+        def slope(x, y):
+            """Define a slope for the surface"""
+            return 0.5 * x + 0.5 * y
+
+        def stage(x, y):
+            """Define a stage that is 1m above the slope"""
+            return slope(x, y) + 1.0
+
+        # Sloped surface with 1m of water
+        domain.set_quantity('elevation', slope)
+        domain.set_quantity('stage', stage)
+        domain.set_quantity('friction', 1.00)
+        domain.set_quantity('xmomentum', 1.00)
+        domain.set_quantity('ymomentum', 2.00)
+
+        # Switch on the sloped Manning friction formulation under test
+        domain.use_sloped_mannings = True
+
+        Br = Reflective_boundary(domain)
+        domain.set_boundary({'exterior': Br})
+
+        # Test friction forcing term
+        domain.compute_forcing_terms()
+
+        # Kept for reference: the active values below equal these times sqrt(1 + 0.5**2 + 0.5**2)
+        #xmon_semi_implicit_update = np.array([-21.91346618, -21.91346618, -21.91346618, -21.91346618])
+        #ymon_semi_implicit_update = np.array([-43.82693236, -43.82693236, -43.82693236, -43.82693236])
+
+        xmon_semi_implicit_update = np.array([-26.83840532, -26.83840532, -26.83840532, -26.83840532])
+        ymon_semi_implicit_update = np.array([-53.67681064, -53.67681064, -53.67681064, -53.67681064])
+
+        assert num.allclose(domain.quantities['stage'].explicit_update, 0.0)
+        assert num.allclose(domain.quantities['xmomentum'].explicit_update, 0.0)
+        assert num.allclose(domain.quantities['ymomentum'].explicit_update, 0.0)
+
+        assert num.allclose(domain.quantities['xmomentum'].semi_implicit_update, xmon_semi_implicit_update)
+        assert num.allclose(domain.quantities['ymomentum'].semi_implicit_update, ymon_semi_implicit_update)
+
+
             
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Friction, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Friction)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/shallow_water/tests/test_loadsave.py b/anuga/shallow_water/tests/test_loadsave.py
index ef3172ab7..d7043faea 100644
--- a/anuga/shallow_water/tests/test_loadsave.py
+++ b/anuga/shallow_water/tests/test_loadsave.py
@@ -207,6 +207,6 @@ def test_get_flow_through_cross_section_with_geo(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_LoadSave, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_LoadSave)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)    
diff --git a/anuga/shallow_water/tests/test_local_extrapolation_and_flux_updating.py b/anuga/shallow_water/tests/test_local_extrapolation_and_flux_updating.py
index eb318a568..2618df0fc 100644
--- a/anuga/shallow_water/tests/test_local_extrapolation_and_flux_updating.py
+++ b/anuga/shallow_water/tests/test_local_extrapolation_and_flux_updating.py
@@ -108,7 +108,9 @@ def test_local_extrapolation_and_flux_updating_DE1(self):
         return
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_local_extrapolation_and_flux_updating, 'test')
+    suite = unittest.TestSuite([
+        Test_local_extrapolation_and_flux_updating('test_local_extrapolation_and_flux_updating_DE0'),
+        Test_local_extrapolation_and_flux_updating('test_local_extrapolation_and_flux_updating_DE1')])
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
 
diff --git a/anuga/shallow_water/tests/test_most2nc.py b/anuga/shallow_water/tests/test_most2nc.py
index 9e1f0af58..b9d0dd4ab 100644
--- a/anuga/shallow_water/tests/test_most2nc.py
+++ b/anuga/shallow_water/tests/test_most2nc.py
@@ -47,6 +47,6 @@ def test_small_nxn(self):
         os.remove('test.nc')
         
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_most2nc,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_most2nc)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/shallow_water/tests/test_shallow_water_domain.py b/anuga/shallow_water/tests/test_shallow_water_domain.py
index e5d64a1e6..03049434a 100644
--- a/anuga/shallow_water/tests/test_shallow_water_domain.py
+++ b/anuga/shallow_water/tests/test_shallow_water_domain.py
@@ -50,8 +50,8 @@
 from pprint import pprint
 
 # Get gateway to C implementation of flux function for direct testing
-from anuga.shallow_water.sw_domain_orig_ext import flux_function_central as flux_function
-from anuga.shallow_water.sw_domain_orig_ext import rotate
+from anuga.shallow_water.sw_domain_openmp_ext import flux_function_central as flux_function
+from anuga.shallow_water.sw_domain_openmp_ext import rotate
 
 
 def set_bottom_friction(tag, elements, domain):
@@ -3036,7 +3036,7 @@ def stage(x, y):
             assert num.allclose(domain.quantities[name].semi_implicit_update, 0)
 
         #domain.compute_forcing_terms()
-        from anuga.shallow_water.sw_domain_orig_ext import gravity
+        from anuga.shallow_water.sw_domain_openmp_ext import gravity
         gravity(domain)
 
         #print domain.quantities['xmomentum'].explicit_update
@@ -3083,7 +3083,7 @@ def stage(x, y):
             assert num.allclose(domain.quantities[name].semi_implicit_update, 0)
 
         #domain.compute_forcing_terms()
-        from anuga.shallow_water.sw_domain_orig_ext import gravity
+        from anuga.shallow_water.sw_domain_openmp_ext import gravity
         gravity(domain)
 
         #print domain.quantities['xmomentum'].explicit_update
@@ -3129,7 +3129,7 @@ def stage(x, y):
             assert num.allclose(domain.quantities[name].explicit_update, 0)
             assert num.allclose(domain.quantities[name].semi_implicit_update, 0)
 
-        from anuga.shallow_water.sw_domain_orig_ext import gravity_wb
+        from anuga.shallow_water.sw_domain_openmp_ext import gravity_wb
         gravity_wb(domain)
 
 
@@ -3176,7 +3176,7 @@ def stage(x, y):
             assert num.allclose(domain.quantities[name].explicit_update, 0)
             assert num.allclose(domain.quantities[name].semi_implicit_update, 0)
 
-        from anuga.shallow_water.sw_domain_orig_ext import gravity_wb
+        from anuga.shallow_water.sw_domain_openmp_ext import gravity_wb
         gravity_wb(domain)
 
 
@@ -4171,7 +4171,7 @@ def slope(x, y):
         # Import underlying routine locally.
         # FIXME (Ole): This routine might have to be removed entirely as likely
         # superseded by extrapolate_second_order_edge_sw
-        from anuga.shallow_water.sw_domain_orig_ext import extrapolate_second_order_sw
+        from anuga.shallow_water.sw_domain_openmp_ext import extrapolate_second_order_sw
 
         extrapolate_second_order_sw(domain)
 
@@ -9122,7 +9122,8 @@ def test_that_mesh_methods_exist(self):
 #################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Shallow_Water, 'test_another_runup_example')
-    #suite = unittest.makeSuite(Test_Shallow_Water, 'test')
+    suite = unittest.TestSuite([
+        Test_Shallow_Water('test_another_runup_example'),
+    ])
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/shallow_water/tests/test_sww_interrogate.py b/anuga/shallow_water/tests/test_sww_interrogate.py
index 08d2fa291..e7a2715f6 100644
--- a/anuga/shallow_water/tests/test_sww_interrogate.py
+++ b/anuga/shallow_water/tests/test_sww_interrogate.py
@@ -1059,7 +1059,7 @@ def topography(x, y):
  
  
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_sww_Interrogate, 'test_')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_sww_Interrogate)
     runner = unittest.TextTestRunner() #verbosity=2)
     runner.run(suite)
                
diff --git a/anuga/shallow_water/tests/test_system.py b/anuga/shallow_water/tests/test_system.py
index 633fa8ac1..a7bf27cab 100644
--- a/anuga/shallow_water/tests/test_system.py
+++ b/anuga/shallow_water/tests/test_system.py
@@ -190,6 +190,6 @@ def test_boundary_timeII(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_system,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_system)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/shallow_water/tests/test_timezone.py b/anuga/shallow_water/tests/test_timezone.py
index 906c0c7ea..ae7f47918 100644
--- a/anuga/shallow_water/tests/test_timezone.py
+++ b/anuga/shallow_water/tests/test_timezone.py
@@ -138,6 +138,6 @@ def test_domainTZ_starttime_naive_datetime(self):
         assert str(domain.get_datetime()) == '2021-03-21 18:31:00+11:00'
             
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Timzone, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Timezone)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/structures/boyd_pipe_operator.py b/anuga/structures/boyd_pipe_operator.py
index b83f24383..a30b10c1a 100644
--- a/anuga/structures/boyd_pipe_operator.py
+++ b/anuga/structures/boyd_pipe_operator.py
@@ -20,7 +20,6 @@ class Boyd_pipe_operator(anuga.Structure_operator):
     mannings_rougness,
     """
 
-
     def __init__(self,
                  domain,
                  losses,
@@ -242,7 +241,6 @@ def boyd_pipe_function(depth,
     For these conditions we also would like to assess the pipe flow characteristics as it leaves the pipe
     """
 
-
     # Note this errors if blockage is set to 1.0 (ie 100% blockaage) and i have no idea how to fix it
     if blockage >= 1.0:
         Q = barrel_velocity = outlet_culvert_depth = 0.0
diff --git a/anuga/structures/meson.build b/anuga/structures/meson.build
index 35127cde9..19af69068 100644
--- a/anuga/structures/meson.build
+++ b/anuga/structures/meson.build
@@ -10,7 +10,6 @@ python_sources = [
 'internal_boundary_functions.py',
 'internal_boundary_operator.py',
 'riverwall.py',
-'setup.py',
 'structure_operator.py',
 'weir_orifice_trapezoid_operator.py',
   ]
@@ -20,4 +19,4 @@ py3.install_sources(
   subdir: 'anuga/structures'
 )
 
-subdir('tests')
\ No newline at end of file
+subdir('tests')
diff --git a/anuga/structures/riverwall.py b/anuga/structures/riverwall.py
index b3421e164..caf2499f9 100644
--- a/anuga/structures/riverwall.py
+++ b/anuga/structures/riverwall.py
@@ -230,7 +230,7 @@ def create_riverwalls(self, riverwalls, riverwallPar={ },
         #    print '         the shallow water flux solution as the ratio (min_head)/(weir_height) becomes '
         #    print '         large, or the ratio (downstream_head)/(upstream_head) becomes large'
         #    print ' '
-        #    print '  It works in parallel, but you must use domain.riverwallData.create_riverwall AFTER distributing the mesh'
+        #    print '  It works in parallel, but you must use domain.create_riverwalls AFTER distributing the mesh'
         #    print ' '
 
         # NOTE: domain.riverwallData is initialised in shallow_water_domain.py for DE algorithms
diff --git a/anuga/structures/setup.py b/anuga/structures/setup.py
deleted file mode 100644
index 7dd47445d..000000000
--- a/anuga/structures/setup.py
+++ /dev/null
@@ -1,24 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('structures', parent_package, top_path)
-
-    config.add_data_dir('tests')
-    config.add_data_dir(join('tests','data'))
-
-
-    
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/structures/tests/test_boyd_box_operator.py b/anuga/structures/tests/test_boyd_box_operator.py
index bb2ba2b01..5296baa2a 100644
--- a/anuga/structures/tests/test_boyd_box_operator.py
+++ b/anuga/structures/tests/test_boyd_box_operator.py
@@ -3252,6 +3252,6 @@ def test_boyd_14_operator_invert(self):
         assert numpy.allclose(d, d_expected, rtol=2.0e-2) #depth at outlet used to calc v          
 # =========================================================================
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_boyd_box_operator, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_boyd_box_operator)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/structures/tests/test_boyd_pipe_operator.py b/anuga/structures/tests/test_boyd_pipe_operator.py
index 429f12099..293202c6b 100644
--- a/anuga/structures/tests/test_boyd_pipe_operator.py
+++ b/anuga/structures/tests/test_boyd_pipe_operator.py
@@ -657,6 +657,6 @@ def test_boyd_non_skew7(self):
 
 # =========================================================================
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_boyd_pipe_operator, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_boyd_pipe_operator)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/structures/tests/test_inlet_operator.py b/anuga/structures/tests/test_inlet_operator.py
index 66a590bdc..d1d2db97c 100644
--- a/anuga/structures/tests/test_inlet_operator.py
+++ b/anuga/structures/tests/test_inlet_operator.py
@@ -369,6 +369,6 @@ def test_inlet_variable_Q_default(self):
 
 # =========================================================================
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_inlet_operator, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_inlet_operator)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/structures/tests/test_internal_boundary_functions.py b/anuga/structures/tests/test_internal_boundary_functions.py
index acd40a872..eefb4bb77 100644
--- a/anuga/structures/tests/test_internal_boundary_functions.py
+++ b/anuga/structures/tests/test_internal_boundary_functions.py
@@ -213,6 +213,6 @@ def test_pumping_station_function(self):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_internal_boundary_functions, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_internal_boundary_functions)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/structures/tests/test_riverwall_structure.py b/anuga/structures/tests/test_riverwall_structure.py
old mode 100755
new mode 100644
index 55d4b0719..70e5ed549
--- a/anuga/structures/tests/test_riverwall_structure.py
+++ b/anuga/structures/tests/test_riverwall_structure.py
@@ -614,6 +614,6 @@ def test_multiple_riverwalls(self):
 
 # =========================================================================
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_riverwall_structure, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_riverwall_structure)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/structures/tests/test_weir_orifice_trapezoid_operator.py b/anuga/structures/tests/test_weir_orifice_trapezoid_operator.py
index c8ef9b22d..54f7bf9c4 100644
--- a/anuga/structures/tests/test_weir_orifice_trapezoid_operator.py
+++ b/anuga/structures/tests/test_weir_orifice_trapezoid_operator.py
@@ -673,6 +673,6 @@ def test_weir_orifice_6(self):
 
 # =========================================================================
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_weir_orifice_trapezoid_operator, 'test_')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_weir_orifice_trapezoid_operator)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/tsunami_source/setup.py b/anuga/tsunami_source/setup.py
deleted file mode 100644
index 3a73a991f..000000000
--- a/anuga/tsunami_source/setup.py
+++ /dev/null
@@ -1,27 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('tsunami_source', parent_package, top_path)
-
-    config.add_data_dir('tests')
-    config.add_data_dir(join('tests','data'))
-    
-    
-    #config.add_extension('okada_tsunami_fortran',
-	#					sources=['okada_tsunami_fortran.f'])
-
-    
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/tsunami_source/tests/test_eq.py b/anuga/tsunami_source/tests/test_eq.py
index caebdea71..930441e09 100644
--- a/anuga/tsunami_source/tests/test_eq.py
+++ b/anuga/tsunami_source/tests/test_eq.py
@@ -61,6 +61,6 @@ def test_earthquake_tsunami(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_eq,'test_')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_eq)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/tsunami_source/tests/test_smf.py b/anuga/tsunami_source/tests/test_smf.py
index b7ed89f0b..e788ecfa5 100644
--- a/anuga/tsunami_source/tests/test_smf.py
+++ b/anuga/tsunami_source/tests/test_smf.py
@@ -137,7 +137,7 @@ def test_slide_tsunami_domain(self):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_smf,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_smf)
     runner = unittest.TextTestRunner()
     runner.run(suite)
 
diff --git a/anuga/tsunami_source/tests/test_tsunami_okada.py b/anuga/tsunami_source/tests/test_tsunami_okada.py
index 2461ecd3b..ee0c7e7ba 100644
--- a/anuga/tsunami_source/tests/test_tsunami_okada.py
+++ b/anuga/tsunami_source/tests/test_tsunami_okada.py
@@ -298,7 +298,7 @@ def topography(x,y):
 #-------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_eq,'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_eq)
     runner = unittest.TextTestRunner()
     runner.run(suite)
 
diff --git a/anuga/utilities/anuga_constants.h b/anuga/utilities/anuga_constants.h
new file mode 100644
index 000000000..74a319743
--- /dev/null
+++ b/anuga/utilities/anuga_constants.h
@@ -0,0 +1,9 @@
+#ifndef ANUGA_CONSTANTS_H
+#define ANUGA_CONSTANTS_H
+// Shared numeric constants. Each is `static const`, so every .c file that includes this header gets a private copy.
+static const double TINY = 1.0e-100;
+static const double EPSILON = 1.0e-12;
+static const double single_precision_epsilon = 1.0e-6;
+static const double pi = 3.14159265358979;  // static like the others: a plain file-scope `const` has external linkage in C
+
+#endif
\ No newline at end of file
diff --git a/anuga/utilities/anuga_runtime.h b/anuga/utilities/anuga_runtime.h
new file mode 100644
index 000000000..21f279b48
--- /dev/null
+++ b/anuga/utilities/anuga_runtime.h
@@ -0,0 +1,18 @@
+#ifndef ANUGA_RUNTIME_H
+#define ANUGA_RUNTIME_H
+
+// Runtime support definitions for ANUGA; currently this only pulls in OpenMP.
+//
+// TODO JLGV: there must be a better way to do this.
+//
+// macOS: OpenMP can be obtained using clang with llvm-openmp.
+//
+// Windows: OpenMP can be obtained using clang or gcc_win-64 gxx_win-64,
+// or using msvc with msvc_win-64.
+//
+// Linux: OpenMP can be obtained using gcc with gcc_linux-64 gxx_linux-64.
+
+// NOTE(review): omp.h is included unconditionally, so it must be on the include path.
+#include "omp.h"
+
+#endif
\ No newline at end of file
diff --git a/anuga/utilities/anuga_typedefs.h b/anuga/utilities/anuga_typedefs.h
new file mode 100644
index 000000000..04b27d3d9
--- /dev/null
+++ b/anuga/utilities/anuga_typedefs.h
@@ -0,0 +1,14 @@
+#ifndef ANUGA_TYPEDEFS_H
+#define ANUGA_TYPEDEFS_H
+
+// Integer type aliases shared by the ANUGA C sources.
+#include <stdint.h>
+
+// anuga_int: signed 64-bit integer; anuga_uint: unsigned 64-bit integer.
+typedef int64_t anuga_int;
+typedef uint64_t anuga_uint;
+
+
+
+
+#endif
\ No newline at end of file
diff --git a/anuga/utilities/cg.c b/anuga/utilities/cg.c
index c879f405a..5f3861b84 100644
--- a/anuga/utilities/cg.c
+++ b/anuga/utilities/cg.c
@@ -11,7 +11,7 @@
 //
 // Note Padarn 5/12/12: I have tried a few optimization modifications which
 // didn't seem to save any time:
-// -- conversion of the int64_t arrays to int64_t arrays (to save memory passing time)
+// -- conversion of the int arrays to long arrays (to save memory passing time)
 // -- taking advantage of the symmetric quality of the matrix A to reduce the zAx loop
 // -- specifying different 'chunk' sizes for the openmp loops
 // -- using blas instead of own openmp loops
@@ -19,6 +19,7 @@
 #include "math.h"
 #include "stdio.h"
 #include <stdint.h>
+#include "anuga_typedefs.h"
 
 #if defined(__APPLE__)
    // clang doesn't have openmp
@@ -29,14 +30,14 @@
 
 
 // Dot product of two double vectors: a.b
-// @input N: int64_t length of vectors a and b
+// @input N: anuga_int length of vectors a and b
 //        a: first vector of doubles
 //        b: second vector of double
 // @return: double result of a.b 
-double cg_ddot( int64_t N, double *a, double *b)
+double cg_ddot( anuga_int N, double *a, double *b)
 {
   double ret = 0;
-  int64_t i;
+  anuga_int i;
   #pragma omp parallel for private(i) reduction(+:ret)
   for(i=0;i<N;i++)
   {
@@ -47,12 +48,12 @@ double cg_ddot( int64_t N, double *a, double *b)
 }
 
 // In place multiplication of a double vector x by constant a: a*x
-// @input N: int64_t length of vector x
+// @input N: anuga_int length of vector x
 //        a: double scalar to multiply by
 //        x: double vector to scale
-void cg_dscal(int64_t N, double a, double *x)
+void cg_dscal(anuga_int N, double a, double *x)
 {
-  int64_t i;
+  anuga_int i;
   #pragma omp parallel for private(i)
   for(i=0;i<N;i++)
   {
@@ -62,12 +63,12 @@ void cg_dscal(int64_t N, double a, double *x)
 }
 
 // Copy of one vector to another - memory already allocated: y=x
-// @input N: int64_t length of vectors x and y
+// @input N: anuga_int length of vectors x and y
 //        x: double vector to make copy of
 //        y: double vector to copy into
-void cg_dcopy( int64_t N, double *x, double *y)
+void cg_dcopy( anuga_int N, double *x, double *y)
 {
-  int64_t i;
+  anuga_int i;
   #pragma omp parallel for private(i)
   for(i=0;i<N;i++)
   {
@@ -76,13 +77,13 @@ void cg_dcopy( int64_t N, double *x, double *y)
 }
 
 // In place axpy operation: y = a*x + y
-// @input N: int64_t length of vectors x and y
+// @input N: anuga_int length of vectors x and y
 //        a: double to multiply x by
 //        x: first double vector
 //        y: second double vector, stores result
-void cg_daxpy(int64_t N, double a, double *x, double *y)
+void cg_daxpy(anuga_int N, double a, double *x, double *y)
 {
-  int64_t i;
+  anuga_int i;
   #pragma omp parallel for private(i)
   for(i=0;i<N;i++)
   {
@@ -93,15 +94,15 @@ void cg_daxpy(int64_t N, double a, double *x, double *y)
 // Sparse CSR matrix-vector product: z = A*x
 // @input z: double vector to store the result
 //        data: double vector with non-zero entries of A
-//        colind: int64_t vector of column indicies of non-zero entries of A
-//        row_ptr: int64_t vector giving index of rows for non-zero entires of A
+//        colind: anuga_int vector of column indices of non-zero entries of A
+//        row_ptr: anuga_int vector giving index of rows for non-zero entries of A
 //        x: double vector to be multiplied
 //        M: length of vector x
-void cg_zAx(double * z, double * data, int64_t * colind, int64_t * row_ptr, double * x, int64_t M){
+void cg_zAx(double * z, double * data, anuga_int * colind, anuga_int * row_ptr, double * x, anuga_int M){
 
   
   
-  int64_t i, j, ckey;
+  anuga_int i, j, ckey;
 
 
      
@@ -122,12 +123,12 @@ void cg_zAx(double * z, double * data, int64_t * colind, int64_t * row_ptr, doub
 //        D: double vector of diagonal matrix
 //        x: double vector to be multiplied
 //        M: length of vector x
-void cg_zDx(double * z, double * D, double * x, int64_t M){
+void cg_zDx(double * z, double * D, double * x, anuga_int M){
 
   
-  int64_t i, j, ckey;
+  anuga_int i;
    
-    #pragma omp parallel for private(ckey,j,i)
+    #pragma omp parallel for private(i)
     for (i=0; i<M; i++){
       z[i]=D[i]*x[i];              
     }
@@ -140,12 +141,12 @@ void cg_zDx(double * z, double * D, double * x, int64_t M){
 //        D: double vector of diagonal matrix
 //        x: double vector to be multiplied
 //        M: length of vector x
-void cg_zDinx(double * z, double * D, double * x, int64_t M){
+void cg_zDinx(double * z, double * D, double * x, anuga_int M){
 
   
-  int64_t i, j, ckey;
+  anuga_int i;
    
-    #pragma omp parallel for private(ckey,j,i)
+    #pragma omp parallel for private(i)
     for (i=0; i<M; i++){
       z[i]=1.0/D[i]*x[i];              
     }
@@ -159,14 +160,14 @@ void cg_zDinx(double * z, double * D, double * x, int64_t M){
 // @input z: double vector to store the result
 //        a: double to scale matrix-vector product by
 //        data: double vector with non-zero entries of A
-//        colind: int64_t vector of column indicies of non-zero entries of A
-//        row_ptr: int64_t vector giving index of rows for non-zero entires of A
+//        colind: anuga_int vector of column indices of non-zero entries of A
+//        row_ptr: anuga_int vector giving index of rows for non-zero entries of A
 //        x: double vector to be multiplied
 //        y: double vector to add
 //        M: length of vector x
-void cg_zaAxpy(double * z, double a, double * data, int64_t * colind, int64_t * row_ptr, double * x,
-      double * y,int64_t M){
-  int64_t i, j, ckey;
+void cg_zaAxpy(double * z, double a, double * data, anuga_int * colind, anuga_int * row_ptr, double * x,
+      double * y,anuga_int M){
+  anuga_int i, j, ckey;
   #pragma omp parallel for private(ckey,j,i)
     for (i=0; i<M; i++ ){
       z[i]=y[i];
@@ -183,19 +184,19 @@ void cg_zaAxpy(double * z, double a, double * data, int64_t * colind, int64_t *
 // Jacobi preconditioner for matrix, A, and right hand side, b. Mutiplies each row
 // by one divided by the diagonal element of the matrix. If the diagonal 
 // element is zero, does nothing (should nnot occur)
-//        colind: int64_t vector of column indicies of non-zero entries of A
-//        row_ptr: int64_t vector giving index of rows for non-zero entires of A
+//        colind: anuga_int vector of column indices of non-zero entries of A
+//        row_ptr: anuga_int vector giving index of rows for non-zero entries of A
 //        b: double vector specifying right hand side of equation to solve
 //        M: length of vector b
 
-int64_t _jacobi_precon_c(double* data, 
-                int64_t* colind,
-                int64_t* row_ptr,
+anuga_int _jacobi_precon_c(double* data, 
+                anuga_int* colind,
+                anuga_int* row_ptr,
                 double * precon,
-                int64_t M){
+                anuga_int M){
 
 
-  int64_t i, j, k, ckey;
+  anuga_int i, j, ckey;
   double diag;
 
 
@@ -221,25 +222,25 @@ int64_t _jacobi_precon_c(double* data,
 
 // Conjugate gradient solve Ax = b for x, A given in Sparse CSR format
 // @input data: double vector with non-zero entries of A
-//        colind: int64_t vector of column indicies of non-zero entries of A
-//        row_ptr: int64_t vector giving index of rows for non-zero entires of A
+//        colind: anuga_int vector of column indices of non-zero entries of A
+//        row_ptr: anuga_int vector giving index of rows for non-zero entries of A
 //        b: double vector specifying right hand side of equation to solve
 //        x: double vector with initial guess and to store result
 //        imax: maximum number of iterations
 //        tol: error tollerance for stopping criteria
 //        M: length of vectors x and b
 // @return: 0 on success  
-int64_t _cg_solve_c(double* data, 
-                int64_t* colind,
-                int64_t* row_ptr,
+anuga_int _cg_solve_c(double* data, 
+                anuga_int* colind,
+                anuga_int* row_ptr,
                 double * b,
                 double * x,
-                int64_t imax,
+                anuga_int imax,
                 double tol,
                 double a_tol,
-                int64_t M){
+                anuga_int M){
 
-  int64_t i = 1;
+  anuga_int i = 1;
   double alpha,rTr,rTrOld,bt,rTr0;
 
   double * d = malloc(sizeof(double)*M);
@@ -291,8 +292,8 @@ int64_t _cg_solve_c(double* data,
 // Conjugate gradient solve Ax = b for x, A given in Sparse CSR format,
 // using a diagonal preconditioner M. 
 // @input data: double vector with non-zero entries of A
-//        colind: int64_t vector of column indicies of non-zero entries of A
-//        row_ptr: int64_t vector giving index of rows for non-zero entires of A
+//        colind: anuga_int vector of column indices of non-zero entries of A
+//        row_ptr: anuga_int vector giving index of rows for non-zero entries of A
 //        b: double vector specifying right hand side of equation to solve
 //        x: double vector with initial guess and to store result
 //        imax: maximum number of iterations
@@ -300,18 +301,18 @@ int64_t _cg_solve_c(double* data,
 //        M: length of vectors x and b
 //        precon: diagonal preconditioner given as vector
 // @return: 0 on success  
-int64_t _cg_solve_c_precon(double* data, 
-                int64_t* colind,
-                int64_t* row_ptr,
+anuga_int _cg_solve_c_precon(double* data, 
+                anuga_int* colind,
+                anuga_int* row_ptr,
                 double * b,
                 double * x,
-                int64_t imax,
+                anuga_int imax,
                 double tol,
                 double a_tol,
-                int64_t M,
+                anuga_int M,
                 double * precon){
 
-  int64_t i = 1;
+  anuga_int i = 1;
   double alpha,rTr,rTrOld,bt,rTr0;
 
   double * d = malloc(sizeof(double)*M);
diff --git a/anuga/utilities/create_benchmark_csvfile.py b/anuga/utilities/create_benchmark_csvfile.py
new file mode 100755
index 000000000..3ed7ae0e6
--- /dev/null
+++ b/anuga/utilities/create_benchmark_csvfile.py
@@ -0,0 +1,134 @@
+
+
+import pstats
+import os
+import csv
+import numpy as np
+
+
+#=================================
+# Collect timings
+#=================================
+
+def create_benchmark_csvfile(pstat_basename, openmp_threads, verbose=True):
+    """
+    Create a CSV file summarizing the benchmark timings from pstat files.
+
+    For every thread count ``t`` in ``openmp_threads`` the profile
+    ``<pstat_basename>_omp_<t>.pstat`` is read and one CSV row is written
+    with the thread count, the total profiled time and the cumulative time
+    of each tracked function (0 if the function is not in the profile).
+
+    Args:
+        pstat_basename (str): The base name for the pstat files.
+        openmp_threads (list): List of OpenMP thread counts to analyze.
+        verbose (bool): If True, prints the DataFrame to the console
+            (requires pandas).
+
+    Returns:
+        None. The results are written to '<pstat_basename>.csv'.
+
+    Example usage:
+        create_benchmark_csvfile('profile_example', [1, 2, 4, 8], verbose=True)
+        This will create a CSV file named 'profile_example.csv' with the benchmark results.
+
+    Note: pstat files are opened relative to the current working directory
+    unless pstat_basename contains a path.
+    """
+
+    output_csv = pstat_basename+'.csv'
+
+    # Profiled function names to report, in CSV column order.
+    # FIXME SR: Need to update script to cover cases with evolve_one_euler_step,
+    # evolve_one_rk2_step, etc.
+    myfuncs = ['OMP_NUM_THREADS',
+        'total_time',
+        'evolve',
+        'evolve_one_rk2_step',
+        'compute_fluxes',
+        'distribute_to_vertices_and_edges',
+        'update_conserved_quantities',
+        'compute_forcing_terms',
+        'update_boundary',
+        'backup_conserved_quantities',
+        'saxpy_conserved_quantities',
+        'apply_fractional_steps'
+    ]
+
+    column_header = ['THREADS',
+        'total_time',
+        'evolve',
+        'rk2_step',
+        'fluxes',
+        'distribute',
+        'update',
+        'forcing',
+        'boundary',
+        'backup',
+        'saxpy',
+        'operators'
+    ]
+
+    table_contents = [column_header]
+
+    for threads in openmp_threads:
+
+        pstat_file = pstat_basename+f'_omp_{threads}.pstat'
+
+        print(f'Analysing file {pstat_file}')
+
+        stats = pstats.Stats(pstat_file)
+
+        # Start every tracked function at 0 so that one missing from this
+        # profile (e.g. evolve_one_rk2_step with another timestepping method)
+        # yields a 0 entry instead of a KeyError when the row is built.
+        benchmark_dict = dict.fromkeys(myfuncs, 0.0)
+
+        for func_key, stat in stats.stats.items():
+            funcname = func_key[2]  # func_key is (filename, line, funcname)
+            if funcname in benchmark_dict:
+                # stat[3] is the cumulative time; if several modules define a
+                # function with this name, keep the largest value.
+                benchmark_dict[funcname] = max(benchmark_dict[funcname], stat[3])
+
+        benchmark_dict['total_time'] = stats.total_tt
+
+        # myfuncs[0] is the thread-count column: written as given, timings to 3 significant digits.
+        table_line = [str(threads)]
+        table_line.extend(f'{benchmark_dict[key]:.3g}' for key in myfuncs[1:])
+
+        table_contents.append(table_line)
+
+    # Write the cumulative times to a CSV file
+    with open(output_csv, 'w', newline='') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerows(table_contents)  # header, then one row per thread count
+
+    if verbose:
+        print(f'Wrote benchmark results to {output_csv}')
+        import pandas as pd
+
+        df = pd.read_csv(output_csv)
+
+        print('')
+        print(80 * '=')
+        print(f'Benchmark results for {pstat_basename}')
+        print(80 * '=')
+        print('')
+        print(df.to_string(index=False))  # Prints the entire DataFrame without row numbers
+
+if __name__ == '__main__':
+    # Command-line entry point.
+    # Example usage:
+    #   python create_benchmark_csvfile.py pstat_basename 1,2,4,8
+    # This will create a CSV file named 'pstat_basename.csv' with the benchmark results.
+    import sys
+
+    if len(sys.argv) != 3:
+        print("Usage: python create_benchmark_csvfile.py <pstat_basename> <openmp_threads>")
+        sys.exit(1)
+
+    # <openmp_threads> is a comma-separated list of integers.
+    basename_arg, threads_arg = sys.argv[1:]
+    create_benchmark_csvfile(basename_arg, [int(t) for t in threads_arg.split(',')], verbose=True)
+
diff --git a/anuga/utilities/meson.build b/anuga/utilities/meson.build
index a83c62d85..c4ce4d2a1 100644
--- a/anuga/utilities/meson.build
+++ b/anuga/utilities/meson.build
@@ -1,11 +1,13 @@
 
 inc_dir = include_directories('../utilities', incdir_numpy)
 
+
 # Compile the Cython-generated C code and additional C code
 py3.extension_module('cg_ext',
   sources: ['cg_ext.pyx'],
   include_directories: inc_dir,
-  dependencies: dependencies,
+  c_args : openmp_c_args,
+  dependencies: openmp_deps,
   subdir: 'anuga/utilities',
   install: true,
 )
@@ -48,6 +50,7 @@ python_sources = [
 'animate.py',
 'argparsing.py',
 'cg_solve.py',
+'create_benchmark_csvfile.py',
 'csv_tools.py',
 'data_audit.py',
 'data_audit_wrapper.py',
diff --git a/anuga/utilities/quad_tree.c b/anuga/utilities/quad_tree.c
index 9cb67d7b2..e3af8c76a 100644
--- a/anuga/utilities/quad_tree.c
+++ b/anuga/utilities/quad_tree.c
@@ -10,20 +10,20 @@ static void *emalloc(size_t amt,char * location)
         exit(EXIT_FAILURE);
     }
     return v;
-};
+}
 
 double dot_points(double p1x,double p1y,double p2x,double p2y)
 {
     
     return p1x * p2x + p1y * p2y;
     
-};
+}
 
 // ***************************************************
 
 // ******************* TRIANGLE **********************
 
-triangle * new_triangle(int64_t index,double x1,double y1,double x2,double y2,double x3,double y3)
+triangle * new_triangle(anuga_int index,double x1,double y1,double x2,double y2,double x3,double y3)
 {
 
 	triangle * T = emalloc(sizeof(triangle),"new_triangle"); 
@@ -67,7 +67,7 @@ triangle * new_triangle(int64_t index,double x1,double y1,double x2,double y2,do
     }
 
     return T;
-};
+}
 
 void delete_triangle_list(triangle * T)
 {
@@ -77,7 +77,7 @@ void delete_triangle_list(triangle * T)
 	   T = NULL;
      T = next;
   }
-};
+}
 
 double * calculate_sigma(triangle * T,double x,double y)
 {
@@ -92,7 +92,7 @@ double * calculate_sigma(triangle * T,double x,double y)
 	ret_sigma[2] = dot_points(x - T->x1, y - T->y1, T->nx3, T->ny3)/
 					dot_points(T->x3 - T->x1, T->y3 - T->y1, T->nx3, T->ny3);
 	return ret_sigma;				
-};
+}
 
 double dist(double x,
 	    double y) {
@@ -100,7 +100,7 @@ double dist(double x,
   return sqrt(x*x + y*y);
 }
 
-int64_t __point_on_line(double x, double y,
+anuga_int __point_on_line(double x, double y,
                     double x0, double y0,
                     double x1, double y1,
                     double rtol,
@@ -114,9 +114,9 @@ int64_t __point_on_line(double x, double y,
 
   */
 
-  double a0, a1, a_normal0, a_normal1, b0, b1, len_a, len_b;
+  double a0, a1, a_normal0, a_normal1, b0, b1;
   double nominator, denominator;
-  int64_t is_parallel;
+  anuga_int is_parallel;
 
   a0 = x - x0;
   a1 = y - y0;
@@ -168,11 +168,11 @@ int64_t __point_on_line(double x, double y,
     }
   }
   return 0;
-};
+}
 
-int64_t __is_inside_triangle(double* point,
+anuga_int __is_inside_triangle(double* point,
 			 double* triangle,
-			 int64_t closed,
+			 anuga_int closed,
 			 double rtol,
 			 double a_tol) {
 			 
@@ -181,7 +181,7 @@ int64_t __is_inside_triangle(double* point,
   double denom, alpha, beta;
   
   double x, y; // Point coordinates
-  int64_t i, j, res;
+  anuga_int i, j, res;
 
   x = point[0];
   y = point[1];
@@ -251,7 +251,7 @@ int64_t __is_inside_triangle(double* point,
   return 0;			 			 
 }			  			       	
 
-int64_t triangle_contains_point(triangle * T,double pointx,double pointy)
+anuga_int triangle_contains_point(triangle * T,double pointx,double pointy)
 {
     
 //    double v0x,v0y,v1x,v1y,v2x,v2y,dot00,dot01,dot02,dot11,dot12,invDenom,u,v;
@@ -282,7 +282,7 @@ int64_t triangle_contains_point(triangle * T,double pointx,double pointy)
       point[0] = pointx; point[1] = pointy;
       double rtol=1.0e-12;
       double a_tol=1.0e-12;
-      int64_t closed = 1;
+      anuga_int closed = 1;
 
       return __is_inside_triangle(point,
 			 tri,
@@ -291,7 +291,7 @@ int64_t triangle_contains_point(triangle * T,double pointx,double pointy)
 			 a_tol); 
 
 
-};
+}
 
 
 
@@ -312,7 +312,7 @@ quad_tree * new_quad_tree(double xmin, double xmax, double ymin, double ymax)
   ret -> count = 0;
 	return ret; 
 
-};
+}
 
 void delete_quad_tree(quad_tree * quadtree)
 {
@@ -320,7 +320,7 @@ void delete_quad_tree(quad_tree * quadtree)
   quad_tree_ll * nodelist = new_quad_tree_ll(quadtree,0);
   quad_tree_ll * last = nodelist;
   quad_tree_ll * temp;
-  int64_t i;
+  anuga_int i;
 
   while(nodelist !=NULL){
       
@@ -347,7 +347,7 @@ void delete_quad_tree(quad_tree * quadtree)
       free(temp);
   }
 
-};
+}
 
 void quad_tree_make_children(quad_tree *node){
 
@@ -391,7 +391,7 @@ void quad_tree_insert_triangle(quad_tree *node,triangle *T)
 	// find the quadrant of the current node's extents in which the
 	// point lies (zero if intersects center of extents axes).
 
-	int64_t quad = trivial_contain_split(node,T);
+	anuga_int quad = trivial_contain_split(node,T);
 
   // always increase point count, as storing the total in tree below
   node->count+=1;
@@ -412,27 +412,27 @@ void quad_tree_insert_triangle(quad_tree *node,triangle *T)
 	// the triangle here
 	quad_tree_add_triangle_to_list(node,T);
 
-};
+}
 
 
-int64_t trivial_contain_split(quad_tree *node, triangle *T){
+anuga_int trivial_contain_split(quad_tree *node, triangle *T){
 
-	int64_t p1 = trivial_contain_split_point(node,T->x1,T->y1);
-	int64_t p2 = trivial_contain_split_point(node,T->x2,T->y2);
-	int64_t p3 = trivial_contain_split_point(node,T->x3,T->y3);
+	anuga_int p1 = trivial_contain_split_point(node,T->x1,T->y1);
+	anuga_int p2 = trivial_contain_split_point(node,T->x2,T->y2);
+	anuga_int p3 = trivial_contain_split_point(node,T->x3,T->y3);
 	if(p1 == p2 && p2 == p3){
 	 	return p1;
 	}
 	return 0;
-};
+}
 
-int64_t trivial_contain_split_point(quad_tree *node, double xp,double yp)
+anuga_int trivial_contain_split_point(quad_tree *node, double xp,double yp)
 {
 
 	double midx = (node->xmin+node->xmax)/2;
 	double midy = (node->ymin+node->ymax)/2;
 
-	int64_t ret=0;
+	anuga_int ret=0;
 	
 	if (midx < xp){
 		// quad 1 or 4
@@ -450,7 +450,7 @@ int64_t trivial_contain_split_point(quad_tree *node, double xp,double yp)
 		}
 	}
 	return ret;
-};
+}
 
 triangle * search_triangles_of_quad_tree(quad_tree * node,double xp,double yp){
 	
@@ -463,7 +463,7 @@ triangle * search_triangles_of_quad_tree(quad_tree * node,double xp,double yp){
         T = T->next;
     }
 	return T; // should be NULL if this is reached
-};
+}
 
 // Searches the quad tree starting at 'cur_quad_tree' for the 
 // point, returning the triangle that contains it, or NULL
@@ -483,7 +483,7 @@ triangle * search(quad_tree * node, double xp, double yp){
         if(node->q[0]!=NULL) // look for child to search
         {
             //find correct quadrant to search
-            int64_t quad = trivial_contain_split_point(node,xp,yp);
+            anuga_int quad = trivial_contain_split_point(node,xp,yp);
             
             if (quad!=0)
             {
@@ -494,25 +494,25 @@ triangle * search(quad_tree * node, double xp, double yp){
         }
 	}
 	return return_T; // should not be reached
-};
+}
 
-int64_t quad_tree_node_count(quad_tree * tree)
+anuga_int quad_tree_node_count(quad_tree * tree)
 {
-  int64_t node_count = 1;
+  anuga_int node_count = 1;
   if (tree->q[0]!=NULL){
-      int64_t i;
+      anuga_int i;
       for(i=0;i<4;i++){
         node_count+=quad_tree_node_count(tree->q[i]);
       }
   }
   return node_count;
-};
+}
 
 // ***************************************************
 
 // ***************** quad_tree_ll *******************
 
-quad_tree_ll * new_quad_tree_ll(quad_tree * start,int64_t index){
+quad_tree_ll * new_quad_tree_ll(quad_tree * start,anuga_int index){
     quad_tree_ll * list = malloc(sizeof(quad_tree_ll));
     list->tree = start;
     list->next = NULL;
@@ -524,7 +524,7 @@ quad_tree_ll * new_quad_tree_ll(quad_tree * start,int64_t index){
 
 // ***************** queue_ll *******************
 
-queue_ll * new_queue_ll(int64_t node){
+queue_ll * new_queue_ll(anuga_int node){
     queue_ll * list = malloc(sizeof(queue_ll));
     list->node=node;
     list->next = NULL;
diff --git a/anuga/utilities/quad_tree.h b/anuga/utilities/quad_tree.h
index 5baf66fad..5b72cc49a 100644
--- a/anuga/utilities/quad_tree.h
+++ b/anuga/utilities/quad_tree.h
@@ -10,8 +10,9 @@
 #include <stdio.h>   /* gets */
 #include <stdlib.h>  /* atoi, malloc */
 #include <string.h>  /* strcpy */
-#include <stdint.h>  /* int64_t uint64_t */
+#include <stdint.h>  /* int64_t uint64_t */
 #include <math.h>
+#include "anuga_typedefs.h" /* in utilities */
 
 #ifndef quad_tree_H
 #define quad_tree_H
@@ -30,7 +31,7 @@ typedef struct triangle{
 	double x3,y3;
 
 	// index stores the triangles unique id.
-	int64_t index;
+	anuga_int index;
 
 	// outward normal vectors of triangles sides.
 	double nx1,ny1;
@@ -43,7 +44,7 @@ typedef struct triangle{
 } triangle;
 
 // creates a new triangle and returns a pointer to the malloc'ed memory. 
-triangle * new_triangle(int64_t index, double x1, double x2, double x3,
-						double y1, double y2, double y3);
+triangle * new_triangle(anuga_int index, double x1, double y1, double x2,
+						double y2, double x3, double y3); // parameter order matches the definition in quad_tree.c
 
 // deletes entire list of triangles
@@ -54,8 +55,8 @@ void delete_triangle_list(triangle * T);
 double * calculate_sigma(triangle * T,double x,double y);
 
 // Tests to see if a triangle contains a given point,
-// returns a int64_t value 0 false, 1 true.
-int64_t triangle_contains_point(triangle * T,double pointx,double pointy);
+// returns an anuga_int value: 0 false, 1 true.
+anuga_int triangle_contains_point(triangle * T,double pointx,double pointy);
 
 //**************************************************************************
 
@@ -68,7 +69,7 @@ typedef struct quad_tree{
 
 	// rectangular extents of current node
 	double xmin,xmax,ymin,ymax;
-	int64_t count;
+	anuga_int count;
 	// parent and children of quad_tree
 	struct quad_tree *parent;
 	struct quad_tree *q[4];
@@ -92,11 +93,11 @@ void quad_tree_insert_triangle(quad_tree *node,triangle *T);
 
 // returns the quadrant of the quad_tree containing the point, or 0 if intersects
 // center axes
-int64_t trivial_contain_split_point(quad_tree *node, double xp,double yp);
+anuga_int trivial_contain_split_point(quad_tree *node, double xp,double yp);
 
 // returns the quadrant of the quad_tree containing the triangle, or 0 if intersects
 // center axes
-int64_t trivial_contain_split(quad_tree *node, triangle *T);
+anuga_int trivial_contain_split(quad_tree *node, triangle *T);
 
 // returns the triangle in the quad_tree's leaves containing the point or NULL
 // if none of the triangles on the current quad_tree contain it.
@@ -107,7 +108,7 @@ triangle * search_triangles_of_quad_tree(quad_tree * node,double xp,double yp);
 triangle * search(quad_tree * node ,double xp, double yp);
 
 // return number of noes in tree
-int64_t quad_tree_node_count(quad_tree * tree);
+anuga_int quad_tree_node_count(quad_tree * tree);
 
 // split the node to make 4 children
 void quad_tree_make_children(quad_tree *node);
@@ -126,11 +127,11 @@ typedef struct quad_tree_ll {
 
     void * tree;
     struct quad_tree_ll * next;
-    int64_t index;
+    anuga_int index;
 
 } quad_tree_ll;
 
-quad_tree_ll * new_quad_tree_ll(quad_tree * start, int64_t index);
+quad_tree_ll * new_quad_tree_ll(quad_tree * start, anuga_int index);
 
 //**************************************************************************
 
@@ -140,13 +141,13 @@ quad_tree_ll * new_quad_tree_ll(quad_tree * start, int64_t index);
 
 typedef struct queue_ll {
 
-    int64_t node;
+    anuga_int node;
     struct queue_ll * next;
 
 
 } queue_ll;
 
-queue_ll * new_queue_ll(int64_t node);
+queue_ll * new_queue_ll(anuga_int node);
 
 //**************************************************************************
 
diff --git a/anuga/utilities/quad_tree_ext.pyx b/anuga/utilities/quad_tree_ext.pyx
index ba2ca2ae9..307c605fa 100644
--- a/anuga/utilities/quad_tree_ext.pyx
+++ b/anuga/utilities/quad_tree_ext.pyx
@@ -11,7 +11,7 @@ cdef extern from "quad_tree.c":
 		pass
 	void delete_quad_tree(quad_tree* quadtree)
 
-cdef delete_quad_tree_cap(object cap):
-	kill = <quad_tree* > PyCapsule_GetPointer(cap, "quad tree")
-	if kill != NULL:
-		delete_quad_tree(kill)
\ No newline at end of file
+#cdef delete_quad_tree_cap(object cap):
+#	kill = <quad_tree* > PyCapsule_GetPointer(cap, "quad tree")
+#	if kill != NULL:
+#		delete_quad_tree(kill)
\ No newline at end of file
diff --git a/anuga/utilities/setup.py b/anuga/utilities/setup.py
deleted file mode 100644
index 41d6fec03..000000000
--- a/anuga/utilities/setup.py
+++ /dev/null
@@ -1,48 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-from Cython.Build import cythonize
-import Cython.Compiler.Options
-Cython.Compiler.Options.annotate = True
-
-def configuration(parent_package='',top_path=None):
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    config = Configuration('utilities', parent_package, top_path)
-
-    config.add_data_dir('tests')
-    config.add_data_dir(join('tests','data'))
-
-    config.add_extension('sparse_ext',
-                         sources='sparse_ext.pyx')
-
-    config.add_extension('sparse_matrix_ext',
-                         sources=['sparse_matrix_ext.pyx'])
-
-
-    config.add_extension('util_ext',
-                         sources='util_ext_c.pyx')
-
-    if sys.platform == 'darwin':
-        extra_args = None
-    else:
-        extra_args = ['-fopenmp']
-
-    config.add_extension('cg_ext',
-                         sources='cg_ext.pyx',
-                         extra_compile_args=extra_args,
-                         extra_link_args=extra_args)
-
-    config.add_extension('quad_tree_ext',
-                         sources=['quad_tree_ext.pyx'])
-    
-    config.ext_modules = cythonize(config.ext_modules,annotate=True)
-
-    return config
-
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/utilities/sparse.c b/anuga/utilities/sparse.c
index 00b02b650..80ba318b8 100644
--- a/anuga/utilities/sparse.c
+++ b/anuga/utilities/sparse.c
@@ -12,16 +12,17 @@
 #include "math.h"
 #include "stdio.h"
 #include <stdint.h>
+#include "anuga_typedefs.h"
 
 //Matrix-vector routine
-int64_t _csr_mv(int64_t M,
+anuga_int _csr_mv(anuga_int M,
 	    double* data, 
-	    int64_t* colind,
-	    int64_t* row_ptr,
+	    anuga_int* colind,
+	    anuga_int* row_ptr,
 	    double* x,
 	    double* y) {
   		
-  int64_t i, j, ckey;
+  anuga_int i, j, ckey;
 
   for (i=0; i<M; i++ ) 
     for (ckey=row_ptr[i]; ckey<row_ptr[i+1]; ckey++) {
@@ -33,15 +34,15 @@ int64_t _csr_mv(int64_t M,
 }            
 
 //Matrix-matrix routine
-int64_t _csr_mm(int64_t M,
-	    int64_t columns, 
+anuga_int _csr_mm(anuga_int M,
+	    anuga_int columns, 
 	    double* data, 
-	    int64_t* colind,
-	    int64_t* row_ptr,
+	    anuga_int* colind,
+	    anuga_int* row_ptr,
 	    double* x,
 	    double* y) {
   		
-  int64_t i, j, ckey, c, rowind_i, rowind_j;
+  anuga_int i, j, ckey, c, rowind_i, rowind_j;
 
   for (i=0; i<M; i++ ) {
     rowind_i = i*columns;
diff --git a/anuga/utilities/sparse_csr.h b/anuga/utilities/sparse_csr.h
index 7a3737d0c..d9b0b9db3 100644
--- a/anuga/utilities/sparse_csr.h
+++ b/anuga/utilities/sparse_csr.h
@@ -15,8 +15,9 @@
 #include <stdio.h>   /* gets */
 #include <stdlib.h>  /* atoi, malloc */
 #include <string.h>  /* strcpy */
-#include <stdint.h>  /* int64_t uint64_t */
+#include <stdint.h>  /* int64_t, uint64_t */
 #include "math.h"
+#include "anuga_typedefs.h" /* in utilities */
 
 #ifndef SPARSE_CSR_H
 #define SPARSE_CSR_H
@@ -26,15 +27,15 @@
 // number of rows and the number of entries in the matrix.
 typedef struct {
 	double *data;
-	int64_t *colind;
-	int64_t *row_ptr;
-	int64_t num_rows;
-	int64_t num_entries;
+	anuga_int *colind;
+	anuga_int *row_ptr;
+	anuga_int num_rows;
+	anuga_int num_entries;
 } sparse_csr;
 
 // 'Constructor' function. Returns a pointer to new malloced memory
 // All struct entries are intialised appropriately (mostly to NULL). 
-sparse_csr * make_csr();
+sparse_csr * make_csr(void);
 
 // delete_csr_contents - Free the memory associated with the struct
 // and set the pointer to NULL
diff --git a/anuga/utilities/sparse_dok.c b/anuga/utilities/sparse_dok.c
index 5ea54eee5..674efa329 100644
--- a/anuga/utilities/sparse_dok.c
+++ b/anuga/utilities/sparse_dok.c
@@ -1,3 +1,4 @@
+#include <inttypes.h> /* PRId64 */
 #include "sparse_dok.h"
 
 
@@ -11,13 +12,13 @@ static void *emalloc(size_t amt,char * location)
         exit(EXIT_FAILURE);
     }
     return v;
-};
+}
 
 // ***************************************************
 
 // 'Constructor'
 
-sparse_dok * make_dok(){
+sparse_dok * make_dok(void){
 
     sparse_dok * ret = emalloc(sizeof(sparse_dok),"make_dok");
     ret->edgetable=NULL;
@@ -93,16 +94,16 @@ void print_dok_entries(sparse_dok * hashtable) {
     edge_t *s;
 
     for(s=hashtable->edgetable; s != NULL; s=(edge_t*)(s->hh.next)) {
-        printf("edge key i %ld i %ld entry %f\n",
+        printf("edge key i %" PRId64 " i %" PRId64 " entry %f\n",
                       s->key.i, s->key.j, s->entry);
     }
 }
 
-int64_t key_sort(edge_t *a, edge_t *b) {
+anuga_int key_sort(edge_t *a, edge_t *b) {
     return (a->key.i - b->key.i);
 }
 
-int64_t key_sort_2(edge_t *a, edge_t *b){
+anuga_int key_sort_2(edge_t *a, edge_t *b){
     if(a->key.i - b->key.i==0){
         return (a->key.j-b->key.j);
     } else{
@@ -130,22 +131,22 @@ void convert_to_csr_ptr(sparse_csr * new_csr, sparse_dok * hashtable){
 
     //sort and get number of entries
     sort_by_key(hashtable); 
-    int64_t num_entries = hashtable->num_entries;
-    int64_t num_rows = hashtable->num_rows+1;
+    anuga_int num_entries = hashtable->num_entries;
+    anuga_int num_rows = hashtable->num_rows+1;
 
     //build storage matricies
     ret_csr->data=emalloc(num_entries*sizeof(double),"convert_to_csr_ptr");
-    ret_csr->colind=emalloc(num_entries*sizeof(int64_t),"convert_to_csr_ptr");
-    ret_csr->row_ptr=emalloc((num_rows+1)*sizeof(int64_t),"convert_to_csr_ptr");
+    ret_csr->colind=emalloc(num_entries*sizeof(anuga_int),"convert_to_csr_ptr");
+    ret_csr->row_ptr=emalloc((num_rows+1)*sizeof(anuga_int),"convert_to_csr_ptr");
 
     edge_t * edge = hashtable->edgetable;
 
     //now convert
-    int64_t current_row = -1;
-    int64_t k;
+    anuga_int current_row = -1;
+    anuga_int k;
     for(k=0;k<num_entries;k++){
-        int64_t i = edge->key.i;
-        int64_t j = edge->key.j;
+        anuga_int i = edge->key.i;
+        anuga_int j = edge->key.j;
         double value = edge->entry;
 
         if (i!=current_row){
@@ -170,14 +171,12 @@ void convert_to_csr_ptr(sparse_csr * new_csr, sparse_dok * hashtable){
 void add_sparse_dok(sparse_dok * dok1,double mult1,sparse_dok * dok2,double mult2){
 
     // add both into dok1 - then leave both alone (free outside)
-    int64_t num_entries = dok1->num_entries;
+    anuga_int num_entries = dok1->num_entries;
     edge_t * edge = dok1->edgetable;
     edge_t * edge2;
 
-    int64_t k;
+    anuga_int k;
     for(k=0;k<num_entries;k++){
-        int64_t i = edge->key.i;
-        int64_t j = edge->key.j;
         double value = edge->entry;
         edge->entry=value*mult1;
         edge2=find_dok_entry(dok2,edge->key);
@@ -198,9 +197,9 @@ void add_sparse_dok(sparse_dok * dok1,double mult1,sparse_dok * dok2,double mult
 
 }
 
-int64_t get_dok_rows(sparse_dok * dok){
+anuga_int get_dok_rows(sparse_dok * dok){
 
-    int64_t rows = 0;
+    anuga_int rows = 0;
 
     edge_t *current_edge, *tmp;
 
diff --git a/anuga/utilities/sparse_dok.h b/anuga/utilities/sparse_dok.h
index c63ba6ba9..c215bc478 100644
--- a/anuga/utilities/sparse_dok.h
+++ b/anuga/utilities/sparse_dok.h
@@ -17,10 +17,11 @@
 #include <stdio.h>   /* gets */
 #include <stdlib.h>  /* atoi, malloc */
 #include <string.h>  /* strcpy */
-#include <stdint.h>  /* int64_t uint64_t */
+#include <stdint.h>  /* int64_t, uint64_t */
 #include "math.h"
 #include "uthash.h"     /* in utilities */
 #include "sparse_csr.h"
+#include "anuga_typedefs.h" /* in utilities */
 
 #ifndef SPARSE_DOK_H
 #define SPARSE_DOK_H
@@ -28,8 +29,8 @@
 // Struct edge_key_t to store the i,j position in the matrix within
 // a key of the hashtable
 typedef struct  {
-    int64_t i;
-    int64_t j;
+    anuga_int i;
+    anuga_int j;
 } edge_key_t;
 
 // Struct edge_t is a basic element of the hash table. By including 
@@ -48,13 +49,13 @@ typedef struct {
 // can be made an appropriate size.
 typedef struct {
 	edge_t *edgetable;
-	int64_t num_entries;
-	int64_t num_rows;
+	anuga_int num_entries;
+	anuga_int num_rows;
 } sparse_dok;
 
 // 'Constructor' function. Returns pointer to new malloced memory, with 
 // appropriate initilisation.
-sparse_dok * make_dok();
+sparse_dok * make_dok(void);
 
 // --------------- Hashtable Functions -----------------
 
@@ -85,7 +86,7 @@ void print_dok_entries(sparse_dok * edgetable);
 
 // key_sort - Compare the relative size of two keys, used for sorting
 // PADARN NOTE: Does not need to be in header.
-int64_t key_sort(edge_t *a, edge_t *b);
+anuga_int key_sort(edge_t *a, edge_t *b);
 
 // sort_by_key - Sort the linked list of the hash table by their key
 // values and the key_sort function.
@@ -106,7 +107,7 @@ void convert_to_csr_ptr(sparse_csr * new_csr,sparse_dok * hashtable);
 void add_sparse_dok(sparse_dok * dok1,double mult1,sparse_dok * dok2,double mult2);
 
 // get_dok_rows -- Return the number of rows currently stored in the matrix
-int64_t get_dok_rows(sparse_dok * dok);
+anuga_int get_dok_rows(sparse_dok * dok);
 
 #endif
 
diff --git a/anuga/utilities/sparse_ext.pyx b/anuga/utilities/sparse_ext.pyx
index fc0d5a1d9..8ceaea787 100644
--- a/anuga/utilities/sparse_ext.pyx
+++ b/anuga/utilities/sparse_ext.pyx
@@ -42,7 +42,6 @@ def csr_mv(object csr_sparse, np.ndarray x not None):
 	else:
 
 		raise ValueError("Allowed dimensions in sparse_ext restricted to 1 or 2")
-		return None
 
 	return y
 
diff --git a/anuga/utilities/tests/test_cg_solve.py b/anuga/utilities/tests/test_cg_solve.py
index 405a06ea8..064095dad 100644
--- a/anuga/utilities/tests/test_cg_solve.py
+++ b/anuga/utilities/tests/test_cg_solve.py
@@ -552,7 +552,6 @@ def test_sparse_solve_matrix_using_c_ext_with_jacobi(self):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_CG_Solve, 'test')
-    #runner = unittest.TextTestRunner(verbosity=2)
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_CG_Solve)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_csv_tools.py b/anuga/utilities/tests/test_csv_tools.py
index 23c67b85e..ece7153a2 100644
--- a/anuga/utilities/tests/test_csv_tools.py
+++ b/anuga/utilities/tests/test_csv_tools.py
@@ -421,6 +421,6 @@ def get_file_contents(self, filename):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_CSV_utils, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_CSV_utils)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_data_audit.py b/anuga/utilities/tests/test_data_audit.py
index ca7f4261f..ccbe72078 100644
--- a/anuga/utilities/tests/test_data_audit.py
+++ b/anuga/utilities/tests/test_data_audit.py
@@ -334,6 +334,6 @@ def test_valid_license_file_with_multiple_files(self):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_data_audit, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_data_audit)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_file_utils.py b/anuga/utilities/tests/test_file_utils.py
index a0d2a3d1b..ec028f39b 100644
--- a/anuga/utilities/tests/test_file_utils.py
+++ b/anuga/utilities/tests/test_file_utils.py
@@ -150,6 +150,6 @@ def test_merge_swwfiles(self):
 
 # -------------------------------------------------------------
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_FileUtils, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_FileUtils)
     runner = unittest.TextTestRunner()  # verbosity=2)
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_function_utils.py b/anuga/utilities/tests/test_function_utils.py
index f1c9f173f..98a542d62 100644
--- a/anuga/utilities/tests/test_function_utils.py
+++ b/anuga/utilities/tests/test_function_utils.py
@@ -53,6 +53,6 @@ def myfunc(x, y, t):
 # -------------------------------------------------------------
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Function_Utils, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Function_Utils)
     runner = unittest.TextTestRunner()  # verbosity=2)
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_log_analyser.py b/anuga/utilities/tests/test_log_analyser.py
index bb888c4d5..86763c4b5 100644
--- a/anuga/utilities/tests/test_log_analyser.py
+++ b/anuga/utilities/tests/test_log_analyser.py
@@ -81,6 +81,6 @@ def test_log(self):
 
 ################################################################################
 if __name__ == "__main__":
-    suite = unittest.makeSuite(logTestCase, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(logTestCase)
     runner = unittest.TextTestRunner()  # verbosity=2)
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_mem_time_equation.py b/anuga/utilities/tests/test_mem_time_equation.py
index f242ae90e..37023413b 100644
--- a/anuga/utilities/tests/test_mem_time_equation.py
+++ b/anuga/utilities/tests/test_mem_time_equation.py
@@ -42,6 +42,6 @@ def test_estimate_time_mem(self):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_mem_time_equation, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_mem_time_equation)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_model_tools.py b/anuga/utilities/tests/test_model_tools.py
index fc8b78e3d..53f071a02 100644
--- a/anuga/utilities/tests/test_model_tools.py
+++ b/anuga/utilities/tests/test_model_tools.py
@@ -91,6 +91,6 @@ def test_WCC_2016_blockage_factor(self):
 ################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Model_Tools, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Model_Tools)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_numerical_tools.py b/anuga/utilities/tests/test_numerical_tools.py
index 580bebf26..d81144c8b 100644
--- a/anuga/utilities/tests/test_numerical_tools.py
+++ b/anuga/utilities/tests/test_numerical_tools.py
@@ -634,6 +634,6 @@ def test_ensure_numeric_copy(self):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Numerical_Tools, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Numerical_Tools)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_plot_utils.py b/anuga/utilities/tests/test_plot_utils.py
index 2c2934df3..72a800a8e 100644
--- a/anuga/utilities/tests/test_plot_utils.py
+++ b/anuga/utilities/tests/test_plot_utils.py
@@ -495,6 +495,6 @@ def test_triangle_containing_point(self):
 ################################################################################
 if __name__ == "__main__":
     # _triangle_containing_point')
-    suite = unittest.makeSuite(Test_plot_utils, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_plot_utils)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_quantity_setting_functions.py b/anuga/utilities/tests/test_quantity_setting_functions.py
index 2fab1947c..0cfda5579 100644
--- a/anuga/utilities/tests/test_quantity_setting_functions.py
+++ b/anuga/utilities/tests/test_quantity_setting_functions.py
@@ -419,6 +419,6 @@ def test_quantity_from_Pt_Pol_Data_and_Raster(self):
 
 # =========================================================================
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_quantity_setting_functions, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_quantity_setting_functions)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_sparse.py b/anuga/utilities/tests/test_sparse.py
index b8e5be55f..08475ab5b 100644
--- a/anuga/utilities/tests/test_sparse.py
+++ b/anuga/utilities/tests/test_sparse.py
@@ -209,6 +209,6 @@ def test_sparse_csr_init(self):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_Sparse, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_Sparse)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_spatialInputUtil.py b/anuga/utilities/tests/test_spatialInputUtil.py
old mode 100755
new mode 100644
index 07e5d0aca..0011582e5
--- a/anuga/utilities/tests/test_spatialInputUtil.py
+++ b/anuga/utilities/tests/test_spatialInputUtil.py
@@ -435,6 +435,6 @@ def should_fail():
 
 # =========================================================================
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_spatialInputUtil, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_spatialInputUtil)
     runner = unittest.TextTestRunner(verbosity=1)
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_system_tools.py b/anuga/utilities/tests/test_system_tools.py
index 261482164..7db8b954b 100644
--- a/anuga/utilities/tests/test_system_tools.py
+++ b/anuga/utilities/tests/test_system_tools.py
@@ -490,6 +490,6 @@ def test_get_revision_date(self):
 ################################################################################
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_system_tools, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_system_tools)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/utilities/tests/test_xml_tools.py b/anuga/utilities/tests/test_xml_tools.py
index d04897b26..ece2d8c8b 100644
--- a/anuga/utilities/tests/test_xml_tools.py
+++ b/anuga/utilities/tests/test_xml_tools.py
@@ -269,6 +269,6 @@ def test_duplicate_tags(self):
 
 
 if __name__ == "__main__":
-    suite = unittest.makeSuite(Test_xml_tools, 'test')
+    suite = unittest.TestLoader().loadTestsFromTestCase(Test_xml_tools)
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/anuga/utilities/util_ext.h b/anuga/utilities/util_ext.h
index 975caf7bf..2e564ebb8 100644
--- a/anuga/utilities/util_ext.h
+++ b/anuga/utilities/util_ext.h
@@ -13,6 +13,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <inttypes.h>
+#include "anuga_typedefs.h"
+#include "anuga_runtime.h"
 
 
 #ifndef ANUGA_UTIL_EXT_H
@@ -49,11 +52,11 @@ double sign(double x) {
   else return 0.0;
 }
 
-int64_t _gradient(double x0, double y0, 
-	      double x1, double y1, 
-	      double x2, double y2, 
-	      double q0, double q1, double q2, 
-	      double *a, double *b) {
+anuga_int _gradient(const double x0, const double y0, 
+	      const double x1, const double y1, 
+	      const double x2, const double y2, 
+	      const double q0, const double q1, const double q2, 
+	      double * __restrict a, double * __restrict b) {
 	      
   /*Compute gradient (a,b) based on three points (x0,y0), (x1,y1) and (x2,y2) 
   with values q0, q1 and q2.
@@ -78,8 +81,6 @@ int64_t _gradient(double x0, double y0,
   which is solved using the standard determinant technique    
       
   */
-	      
-
   double det;
   
   det = (y2-y0)*(x1-x0) - (y1-y0)*(x2-x0);
@@ -94,7 +95,7 @@ int64_t _gradient(double x0, double y0,
 }
 
 
-int64_t _gradient2(double x0, double y0, 
+anuga_int _gradient2(double x0, double y0, 
 	       double x1, double y1, 
 	       double q0, double q1, 
 	       double *a, double *b) {
@@ -150,43 +151,40 @@ int64_t _gradient2(double x0, double y0,
 }
 
 
-void _limit_old(int64_t N, double beta, double* qc, double* qv, 
-	    double* qmin, double* qmax) { 
 
-  //N are the number of elements
-  int64_t k, i, k3;
-  double dq, dqa[3], phi, r;
-  
-  //printf("INSIDE\n");
-  for (k=0; k<N; k++) {
-    k3 = k*3;
-    
-    //Find the gradient limiter (phi) across vertices  
-    phi = 1.0;
-    for (i=0; i<3; i++) {    
-      r = 1.0;
-      
-      dq = qv[k3+i] - qc[k];    //Delta between vertex and centroid values
-      dqa[i] = dq;              //Save dq for use in the next loop
-      
-      if (dq > 0.0) r = (qmax[k] - qc[k])/dq;
-      if (dq < 0.0) r = (qmin[k] - qc[k])/dq;      
-  
-  
-      phi = anuga_min( anuga_min(r*beta, 1.0), phi);    
-    }
-    
-    //Then update using phi limiter
-    for (i=0; i<3; i++) {    
-      qv[k3+i] = qc[k] + phi*dqa[i];
-    }
+void _limit_old(anuga_int N, double beta,  // N: number of elements looped over; beta: factor applied to each ratio
+                double* __restrict qc,     // read as qc[k]: centroid value of element k
+                double* __restrict qv,     // read and overwritten as qv[3*k + i], i = 0..2: vertex values of element k
+                double* __restrict qmin,   // read as qmin[k]: value used in the ratio when a vertex delta is negative
+                double* __restrict qmax)   // read as qmax[k]: value used in the ratio when a vertex delta is positive
+{ // For every element, scale the vertex-minus-centroid deltas by one common limiter phi <= 1 and store them back in qv.
+  for (anuga_int k = 0; k < N; ++k) {
+    anuga_int k3 = k * 3;  // offset of element k's first vertex value in qv
+    // Deltas between each of the three vertex values and the centroid value
+    double dq0 = qv[k3 + 0] - qc[k];
+    double dq1 = qv[k3 + 1] - qc[k];
+    double dq2 = qv[k3 + 2] - qc[k];
+    // Per-vertex ratio: (qmax - qc)/dq for dq > 0, (qmin - qc)/dq for dq < 0, and 1.0 when dq == 0 (no division)
+    double r0 = (dq0 > 0.0) ? (qmax[k] - qc[k]) / dq0 :
+                (dq0 < 0.0) ? (qmin[k] - qc[k]) / dq0 : 1.0;
+    double r1 = (dq1 > 0.0) ? (qmax[k] - qc[k]) / dq1 :
+                (dq1 < 0.0) ? (qmin[k] - qc[k]) / dq1 : 1.0;
+    double r2 = (dq2 > 0.0) ? (qmax[k] - qc[k]) / dq2 :
+                (dq2 < 0.0) ? (qmin[k] - qc[k]) / dq2 : 1.0;
+    // Gradient limiter phi: smallest of the three beta-scaled ratios, capped at 1.0
+    double phi = fmin(fmin(fmin(r0 * beta, r1 * beta), r2 * beta), 1.0);
+    // Update the vertex values in place using the phi limiter
+    qv[k3 + 0] = qc[k] + phi * dq0;
+    qv[k3 + 1] = qc[k] + phi * dq1;
+    qv[k3 + 2] = qc[k] + phi * dq2;
   }
 }
 
 
-void  print_double_array(char* name, double* array, int64_t n, int64_t m){
 
-    int64_t k,i,km;
+void  print_double_array(char* name, double* array, anuga_int n, anuga_int m){
+
+    anuga_int k,i,km;
 
     printf("%s = [",name);
     for (k=0; k<n; k++){
@@ -203,9 +201,9 @@ void  print_double_array(char* name, double* array, int64_t n, int64_t m){
     printf("]\n");
 }
 
-void  print_int_array(char* name, int32_t* array, int64_t n, int64_t m){
+void  print_int_array(char* name, int32_t* array, anuga_int n, anuga_int m){
 
-    int64_t k,i,km;
+    anuga_int k,i,km;
 
     printf("%s = [",name);
     for (k=0; k<n; k++){
@@ -223,16 +221,16 @@ void  print_int_array(char* name, int32_t* array, int64_t n, int64_t m){
 }
 
 
-void  print_long_array(char* name, int64_t * array, int64_t n, int64_t m){
+void  print_long_array(char* name, anuga_int * array, anuga_int n, anuga_int m){
 
-    int64_t k,i,km;
+    anuga_int k,i,km;
 
     printf("%s = [",name);
     for (k=0; k<n; k++){
 	km = m*k;
 	printf("[");
 	for (i=0; i<m ; i++){
-	  printf("%li ",array[km+i]);
+	  printf("%" PRId64 " ",array[km+i]);
 	}
 	if (k==(n-1))
 	    printf("]");
diff --git a/anuga/validation_utilities/setup.py b/anuga/validation_utilities/setup.py
deleted file mode 100644
index 31b93675e..000000000
--- a/anuga/validation_utilities/setup.py
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('validation_utilities', parent_package, top_path)
-
-    #config.add_data_dir('tests')
-    #config.add_data_dir(join('tests','data'))
-    
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/anuga/visualiser/numerical_dam_break_dry.py b/anuga/visualiser/numerical_dam_break_dry.py
index 149e1ace8..6524f12f9 100644
--- a/anuga/visualiser/numerical_dam_break_dry.py
+++ b/anuga/visualiser/numerical_dam_break_dry.py
@@ -12,7 +12,7 @@
 import anuga
 from anuga import Domain as Domain
 from math import cos
-from numpy import zeros, float
+from numpy import zeros
 from time import localtime, strftime, gmtime
 #from balanced_dev import *
 
diff --git a/anuga/visualiser/setup.py b/anuga/visualiser/setup.py
deleted file mode 100644
index 8ef0d3d6d..000000000
--- a/anuga/visualiser/setup.py
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-import os
-import sys
-
-from os.path import join
-
-def configuration(parent_package='',top_path=None):
-    
-    from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.system_info import get_info
-    
-    config = Configuration('visualiser', parent_package, top_path)
-
-    #config.add_data_dir('tests')
-    #config.add_data_dir(join('tests','data'))
-    
-    return config
-    
-if __name__ == '__main__':
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/create_revision_file.py b/create_revision_file.py
index 27a3014d1..755616c29 100644
--- a/create_revision_file.py
+++ b/create_revision_file.py
@@ -18,9 +18,9 @@
 # ===================================================
-# Read VERSION from setup.py file
+# Read VERSION from the pyproject.toml file
 # ===================================================
-with open('setup.py') as infile:
+with open('pyproject.toml') as infile:
     for line in infile:
-        match = re.match(r'VERSION =', line)
+        match = re.match(r'version = ', line)
         if match != None:
-            VERSION = re.findall('\d.\d.\ddev|\d.\d.\d',line)[0]
+            VERSION = re.findall(r'\d.\d.\ddev|\d.\d.\d',line)[0]  # raw string: a plain '\d' is an invalid escape sequence
 
diff --git a/docs/requirements.txt b/docs/requirements.txt
index c8c523199..b6532171a 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -6,7 +6,6 @@ pybind11
 meson
 meson-python
 ninja
-pkg-config
 
 pymetis
 meshpy
@@ -17,7 +16,7 @@ dill
 future
 Pmw
 
-gdal
+
 utm
 pyproj
 affine
diff --git a/docs/source/installation/install_anuga.rst b/docs/source/installation/install_anuga.rst
index 4471ecc24..cd5ca88ea 100644
--- a/docs/source/installation/install_anuga.rst
+++ b/docs/source/installation/install_anuga.rst
@@ -54,7 +54,7 @@ You can test your ANUGA installation by running the unit tests via:
 
 .. note::
 
-    You will nedd to `activate` the `anuga_env` environment each time you want to use ANUGA.
+    You will need to `activate` the `anuga_env` environment each time you want to use ANUGA.
     
     If you are using standard python you use the `source anuga_env/bin/activate` command to 
     activate the environment.
diff --git a/docs/source/installation/install_anuga_developers.rst b/docs/source/installation/install_anuga_developers.rst
index b5a3f25a0..728b3a0a1 100644
--- a/docs/source/installation/install_anuga_developers.rst
+++ b/docs/source/installation/install_anuga_developers.rst
@@ -65,7 +65,7 @@ and its dependencies.
     .. code-block:: bash
 
         cd anuga_core
-        conda env create -n anuga_env_3.12 -f environments/environment.3.12.yml
+        conda env create -n anuga_env_3.12 -f environments/environment_3.12.yml
         conda activate anuga_env_3.12
 
     and finally installs ANUGA in editable mode via: 
@@ -132,6 +132,14 @@ Test the installation.
 
    pytest --pyargs anuga
 
+ANUGA also comes with a validation test suite which verifies the correctness of 
+real life hydraulic scenarios. You can run them as follows:
+
+.. code-block:: bash
+
+    cd validation_tests 
+    python run_auto_validation_tests.py
+
 
 Updating
 ~~~~~~~~
diff --git a/examples/cairns/export_results.py b/examples/cairns/export_results.py
index 27d722826..1b24b5e38 100644
--- a/examples/cairns/export_results.py
+++ b/examples/cairns/export_results.py
@@ -7,7 +7,7 @@
 scenario = 'fixed_wave'
 name = 'cairns_' + scenario
 
-print 'output dir:', name
+print('output dir:', name)
 which_var = 0
 
 if which_var == 0:    # Stage
@@ -30,7 +30,7 @@
     outname = name + '_elevation'
     quantityname = 'elevation'  #Elevation
 
-print 'start sww2dem'
+print('start sww2dem')
 
 anuga.sww2dem(name+'.sww',
         outname+'.asc',
diff --git a/examples/cairns/export_results_to_qgis.py b/examples/cairns/export_results_to_qgis.py
index 472526daf..2b71fa329 100644
--- a/examples/cairns/export_results_to_qgis.py
+++ b/examples/cairns/export_results_to_qgis.py
@@ -8,7 +8,7 @@
 #scenario = 'slide'
 name = 'cairns_' + scenario
 
-print 'output dir:', name
+print('output dir:', name)
 
 """
 Produce .tif files extracting results of cairns simulation.
diff --git a/examples/cairns/get_timeseries.py b/examples/cairns/get_timeseries.py
index 285b05778..70b11a86c 100644
--- a/examples/cairns/get_timeseries.py
+++ b/examples/cairns/get_timeseries.py
@@ -18,7 +18,7 @@
                 quantities=['stage','speed','depth','elevation'],
                 verbose=True)
 except:
-    print 'Failed to process cairns_slide'
+    print('Failed to process cairns_slide')
 
 try:                
     anuga.sww2csv_gauges('cairns_fixed_wave.sww',
@@ -26,7 +26,7 @@
                quantities=['stage', 'speed','depth','elevation'],
                verbose=True)
 except:
-    print 'Failed to process cairns_fixed_wave'
+    print('Failed to process cairns_fixed_wave')
 
 try: 
     import pylab
@@ -41,7 +41,7 @@
                           verbose=True)
 except ImportError:
     #ANUGA does not rely on pylab to work 
-    print 'must have pylab installed to generate plots'
+    print('must have pylab installed to generate plots')
 
 
 try: 
@@ -57,4 +57,4 @@
                           verbose=True)
 except ImportError:
     #ANUGA does not rely on pylab to work 
-    print 'must have pylab installed to generate plots'
+    print('must have pylab installed to generate plots')
diff --git a/examples/cairns/runcairns.py b/examples/cairns/runcairns.py
index 2d76aa020..8e4f73d4b 100644
--- a/examples/cairns/runcairns.py
+++ b/examples/cairns/runcairns.py
@@ -32,7 +32,7 @@
 #------------------------------------------------------------------------------
 # Unzip asc from zip file
 import zipfile as zf
-if project.verbose: print 'Reading ASC from cairns.zip'
+if project.verbose: print('Reading ASC from cairns.zip')
 zf.ZipFile(project.name_stem+'.zip').extract(project.name_stem+'.asc')
 
 # Create DEM from asc data
@@ -60,9 +60,9 @@
                                     verbose=project.verbose)
 
 # Print some stats about mesh and domain
-print 'Number of triangles = ', len(domain)
-print 'The extent is ', domain.get_extent()
-print domain.statistics()
+print('Number of triangles = ', len(domain))
+print('The extent is ', domain.get_extent())
+print(domain.statistics())
                                     
 #------------------------------------------------------------------------------
 # Setup parameters of computational domain
@@ -90,7 +90,7 @@
 
 
 time01 = time.time()
-print 'That took %.2f seconds to fit data' %(time01-time00)
+print('That took %.2f seconds to fit data' %(time01-time00))
 
 if project.just_fitting:
     import sys
@@ -114,7 +114,7 @@
 #------------------------------------------------------------------------------
 # Setup boundary conditions
 #------------------------------------------------------------------------------
-print 'Available boundary tags', domain.get_boundary_tags()
+print('Available boundary tags', domain.get_boundary_tags())
 
 Bd = anuga.Dirichlet_boundary([tide, 0, 0]) # Mean water level
 Bs = anuga.Transmissive_stage_zero_momentum_boundary(domain) # Neutral boundary
@@ -148,8 +148,8 @@
 if project.scenario == 'slide':
     # Initial run without any event
     for t in domain.evolve(yieldstep=10, finaltime=60): 
-        print domain.timestepping_statistics()
-        print domain.boundary_statistics(tags='ocean_east')        
+        print(domain.timestepping_statistics())
+        print(domain.boundary_statistics(tags='ocean_east'))        
         
     # Add slide to water surface
     if allclose(t, 60):
@@ -158,19 +158,19 @@
     # Continue propagating wave
     for t in domain.evolve(yieldstep=10, finaltime=5000, 
                            skip_initial_step=True):
-        print domain.timestepping_statistics()
-        print domain.boundary_statistics(tags='ocean_east')    
+        print(domain.timestepping_statistics())
+        print(domain.boundary_statistics(tags='ocean_east'))    
 
 if project.scenario == 'fixed_wave':
     # Save every two mins leading up to wave approaching land
     for t in domain.evolve(yieldstep=2*60, finaltime=5000): 
-        print domain.timestepping_statistics()
-        print domain.boundary_statistics(tags='ocean_east')    
+        print(domain.timestepping_statistics())
+        print(domain.boundary_statistics(tags='ocean_east'))    
 
     # Save every 30 secs as wave starts inundating ashore
     for t in domain.evolve(yieldstep=60*0.5, finaltime=10000, 
                            skip_initial_step=True):
-        print domain.timestepping_statistics()
-        print domain.boundary_statistics(tags='ocean_east')
+        print(domain.timestepping_statistics())
+        print(domain.boundary_statistics(tags='ocean_east'))
             
-print 'That took %.2f seconds' %(time.time()-t0)
+print('That took %.2f seconds' %(time.time()-t0))
diff --git a/examples/cairns_excel/flow_through_cross_sections.py b/examples/cairns_excel/flow_through_cross_sections.py
index 105546966..a5b4dae75 100644
--- a/examples/cairns_excel/flow_through_cross_sections.py
+++ b/examples/cairns_excel/flow_through_cross_sections.py
@@ -138,7 +138,7 @@ def get_approximate_discharge_timeseries(sww_filename,
 
     for pk in polylines.keys():
 
-        if verbose: print pk
+        if verbose: print(pk)
 
         pl_full = polylines[pk]
 
@@ -163,7 +163,7 @@ def get_approximate_discharge_timeseries(sww_filename,
             ds = (numpy.diff(gridXY[:,0])**2 + numpy.diff(gridXY[:,1])**2)**0.5
             ds_trapz = numpy.hstack([ ds[0], (ds[0:-1] + ds[1:]), ds[-1]])*0.5
 
-            if verbose: print 'Finding triangles containing point'
+            if verbose: print('Finding triangles containing point')
 
             if use_knn:
                 point_distance, point_indices = point_index_kdtree.query(gridXY, 
@@ -195,7 +195,7 @@ def get_approximate_discharge_timeseries(sww_filename,
                         util.get_triangle_containing_point(p, gridXY_offset[i,:],
                             search_order = search_order) 
 
-            if verbose: print 'Computing the flux'
+            if verbose: print('Computing the flux')
 
             if k_nearest_neighbours == 1:
                 point_uh = ud[:][:, point_indices]    
@@ -297,10 +297,10 @@ def discharge_series_subset(discharge_series, river_name_pattern):
         search_mesh = False
 
     assert os.path.exists(sww_filename), 'sww_filename not found'
-    print 'sww_filename: ' + sww_filename
-    print 'knn: ' + str(knn)
-    print 'desired_ds: ' + str(desired_ds)
-    print ''
+    print('sww_filename: ' + sww_filename)
+    print('knn: ' + str(knn))
+    print('desired_ds: ' + str(desired_ds))
+    print('')
 
     output_times, discharge_series = get_approximate_discharge_timeseries(
         sww_filename, polylines, desired_ds=desired_ds, 
diff --git a/examples/cairns_excel/make_anugaviewer_movie.py b/examples/cairns_excel/make_anugaviewer_movie.py
index 8231af224..86755f2a0 100644
--- a/examples/cairns_excel/make_anugaviewer_movie.py
+++ b/examples/cairns_excel/make_anugaviewer_movie.py
@@ -116,7 +116,7 @@
 # NOTE: There are other programs which can make more efficient movies
 movie_command = 'convert -delay ' + str(delay) + ' TMP/*' \
     + file_wildcard + ' ' + animation_name
-print movie_command
+print(movie_command)
 os.system(movie_command)
 
 # Clean up directory
diff --git a/examples/cairns_excel/run_model.py b/examples/cairns_excel/run_model.py
index 76798c56e..5d7f18ba9 100644
--- a/examples/cairns_excel/run_model.py
+++ b/examples/cairns_excel/run_model.py
@@ -71,18 +71,18 @@
 set_initial_conditions_in_parallel = False
 
 if not set_initial_conditions_in_parallel:
-    print 'Making domain and initial conditions in serial'
+    print('Making domain and initial conditions in serial')
     domain = setup_mesh.setup_mesh(project, 
         setup_initial_conditions=setup_initial_conditions)
 else:
-    print 'Making domain in serial'
+    print('Making domain in serial')
     domain = setup_mesh.setup_mesh(project)
-    print 'Making initial conditions in parallel'
+    print('Making initial conditions in parallel')
     setup_initial_conditions.setup_initial_conditions(domain, project)
     
 
 # Riverwalls must be added AFTER any distribute step
-print 'Adding riverwalls'
+print('Adding riverwalls')
 setup_riverwalls.setup_riverwalls(domain, project)
 
 ##########################################################################
@@ -91,16 +91,16 @@
 #
 ##########################################################################
 
-print 'Making rainfall '
+print('Making rainfall ')
 setup_rainfall.setup_rainfall(domain, project)
 
-print 'Making inlets '
+print('Making inlets ')
 setup_inlets.setup_inlets(domain, project)
 
-print 'Making bridges '
+print('Making bridges ')
 setup_bridges.setup_bridges(domain, project)
 
-print 'Making pumping stations '
+print('Making pumping stations ')
 setup_pumping_stations.setup_pumping_stations(domain, project)
 
 ##########################################################################
@@ -109,7 +109,7 @@
 #
 ##########################################################################
 
-print 'Making boundary conditions '
+print('Making boundary conditions ')
 setup_boundary_conditions.setup_boundary_conditions(domain, project)
 
 ##########################################################################
@@ -131,7 +131,7 @@
 #
 ##########################################################################
 
-print 'Evolving'
+print('Evolving')
 
 barrier()
 for t in domain.evolve(yieldstep=project.yieldstep,
@@ -168,13 +168,13 @@
 
 os.chdir(project.output_dir)
 if myid == 0 and numprocs > 1:
-    print 'Number of processors %g ' % numprocs
-    print 'That took %.2f seconds' % (time.time() - t0)
-    print 'Communication time %.2f seconds' % domain.communication_time
-    print 'Reduction Communication time %.2f seconds' \
-        % domain.communication_reduce_time
-    print 'Broadcast time %.2f seconds' \
-        % domain.communication_broadcast_time
+    print('Number of processors %g ' % numprocs)
+    print('That took %.2f seconds' % (time.time() - t0))
+    print('Communication time %.2f seconds' % domain.communication_time)
+    print('Reduction Communication time %.2f seconds' \
+        % domain.communication_reduce_time)
+    print('Broadcast time %.2f seconds' \
+        % domain.communication_broadcast_time)
 
     anuga.utilities.sww_merge.sww_merge_parallel(
         project.scenario,
@@ -190,8 +190,8 @@
             proj4string=project.proj4string,
             cell_size=project.output_tif_cellsize)
     except:
-        print 'GeoTif creation failed, you can try manually using' + \
-              ' raster_outputs.py or anuga.utilities.plot_utils.Make_Geotif'
+        print('GeoTif creation failed, you can try manually using' + \
+              ' raster_outputs.py or anuga.utilities.plot_utils.Make_Geotif')
 
 barrier()
 finalize()
diff --git a/examples/cairns_excel/setup/make_spatially_averaged_function.py b/examples/cairns_excel/setup/make_spatially_averaged_function.py
index 98172fe06..1b6704826 100644
--- a/examples/cairns_excel/setup/make_spatially_averaged_function.py
+++ b/examples/cairns_excel/setup/make_spatially_averaged_function.py
@@ -125,7 +125,7 @@ def elevation_setter(xc, yc):
             ub = min((i + 1) * chunk_size, lx)
 
             if verbose:
-                print 'Averaging in triangles ', lb, '-', ub - 1
+                print('Averaging in triangles ', lb, '-', ub - 1)
 
             # Store x,y,triangleIndex
             px = scipy.array([])
@@ -157,7 +157,7 @@ def elevation_setter(xc, yc):
 
             # Get function values at all px,py
             if verbose:
-                print '  Evaluating function at ', len(px), ' points'
+                print('  Evaluating function at ', len(px), ' points')
 
             allTopo = q_function(px, py)
 
@@ -211,16 +211,16 @@ def topography(x, y):
     # Check that the elevation in the 'averaging' band is very small
     # (since we used 'min' averaging)
     if elv[inpol].mean() < 1.0e-06:
-        print 'PASS'
+        print('PASS')
     else:
-        print 'FAIL'
+        print('FAIL')
 
     # Check that no 'averaging' occurred outside the polygon
     x = domain.centroid_coordinates[:,0]
     if all(elv[outpol] - x[outpol]%0.5 == 0.0):
-        print 'PASS'
+        print('PASS')
     else:
-        print 'FAIL'
+        print('FAIL')
 
     # Another test which can catch index errors   
  
@@ -231,7 +231,7 @@ def topography(x, y):
     domain.set_quantity('elevation', topography_smooth2, location='centroids')
 
     # If we get to here, then the above function did not hit an index error.
-    print 'PASS' 
+    print('PASS') 
     
 
     # Another test which can catch index errors   
@@ -244,5 +244,5 @@ def topography(x, y):
     domain.set_quantity('elevation', topography_smooth3, location='centroids')
     
     # If we get to here, then the above function did not hit an index error.
-    print 'PASS' 
+    print('PASS') 
     
diff --git a/examples/cairns_excel/setup/prepare_data.py b/examples/cairns_excel/setup/prepare_data.py
index 00291dddf..a8afc1466 100644
--- a/examples/cairns_excel/setup/prepare_data.py
+++ b/examples/cairns_excel/setup/prepare_data.py
@@ -20,7 +20,7 @@
 from anuga.parallel import myid, barrier, send, receive, numprocs
 
 # Local modules
-from read_boundary_tags_line_shapefile import \
+from .read_boundary_tags_line_shapefile import \
     read_boundary_tags_line_shapefile
 from setup.parse_input_data import ProjectData
 
@@ -108,7 +108,7 @@ def define_output_directory_and_redirect_stdout(self,
             except:
                 pass
 
-            print 'OUTPUT_DIRECTORY: ' + str(self.output_dir)
+            print('OUTPUT_DIRECTORY: ' + str(self.output_dir))
 
         # Send stdout to a file inside the output directory
         if output_log is not None:
@@ -118,7 +118,7 @@ def define_output_directory_and_redirect_stdout(self,
                 stdout_file = output_log
 
             if myid == 0:
-                print 'Redirecting output now to ' + stdout_file
+                print('Redirecting output now to ' + stdout_file)
                 sys.stdout = Logger(stdout_file)
             barrier()
 
@@ -137,12 +137,12 @@ def process_project_data(self):
         # (Consider refactoring though)
         if myid == 0:
             for p in self.print_info:
-                print p
-            print ''
-            print '---------------------'
-            print 'PROCESS_PROJECT_DATA'
-            print '---------------------'
-            print ''
+                print(p)
+            print('')
+            print('---------------------')
+            print('PROCESS_PROJECT_DATA')
+            print('---------------------')
+            print('')
             # Record the time and broadcast to other processers
             time_number = time.time()
             if numprocs > 1:
diff --git a/examples/cairns_excel/setup/raster_outputs.py b/examples/cairns_excel/setup/raster_outputs.py
index 63a62f9c1..42bc155c7 100644
--- a/examples/cairns_excel/setup/raster_outputs.py
+++ b/examples/cairns_excel/setup/raster_outputs.py
@@ -84,7 +84,7 @@ def make_resampled_elevation(
     gdal_rasterize_command = gdal_rasterize + ' -a_srs ' + all_srs \
         + ' -burn 1 -l ' + clip_polygon_layer + ' ' + extent_info + ' ' \
         + res_info + ' ' + clip_polygon + ' ' + new_mask
-    print gdal_rasterize_command
+    print(gdal_rasterize_command)
     os.system(gdal_rasterize_command)
 
     return
@@ -117,7 +117,7 @@ def gdal_calc_command(
 
     gdalcalc_command = gdal_calc + gdal_input + calc_command \
         + '--outfile ' + rast_out
-    print gdalcalc_command
+    print(gdalcalc_command)
     os.system(gdalcalc_command)
 
     return
@@ -239,7 +239,7 @@ def make_me_some_tifs(
         for (i, quant) in enumerate(['elevation_c', 'friction_c']):
 
             # Get the quantity if it exists
-            if fid.variables.has_key(quant):
+            if quant in fid.variables:
                 quant_values = fid.variables[quant]
                 # If multi time-steps, only get first timestep
                 if(len(quant_values.shape) > 1):
diff --git a/examples/cairns_excel/setup/read_boundary_tags_line_shapefile.py b/examples/cairns_excel/setup/read_boundary_tags_line_shapefile.py
index 22cef6c8c..56c7f88d0 100644
--- a/examples/cairns_excel/setup/read_boundary_tags_line_shapefile.py
+++ b/examples/cairns_excel/setup/read_boundary_tags_line_shapefile.py
@@ -17,7 +17,7 @@ def check_output(list_of_commands):
     process = subprocess.Popen(list_of_commands, stdout=subprocess.PIPE)
 
     output = process.communicate()[0]
-    print output
+    print(output)
 
     #output, unused_err = process.communicate()
     #retcode = process.poll()
@@ -54,7 +54,7 @@ def parse_ogr_info_text(ogr_info, tag_attribute):
                 counter = counter + 1
 
                 if (i + counter) == len(ogr_info):
-                    print ogr_info
+                    print(ogr_info)
                     msg = 'Failed to parse the above output from ogr_info' + \
                           '\n Check that your boundary tag attribute name' + \
                           ' is correctly specified in the input file'
@@ -74,7 +74,7 @@ def parse_ogr_info_text(ogr_info, tag_attribute):
                 counter = counter + 1
 
                 if (i + counter) == len(ogr_info):
-                    print ogr_info
+                    print(ogr_info)
                     msg = 'Failed to parse the above output from ogr_info' + \
                           '\n Could not find the linestring of every ' + \
                           'Feature. \n Check that all geometries are part ' + \
diff --git a/examples/cairns_excel/setup/setup_initial_conditions.py b/examples/cairns_excel/setup/setup_initial_conditions.py
index d53abdac0..181a86e19 100644
--- a/examples/cairns_excel/setup/setup_initial_conditions.py
+++ b/examples/cairns_excel/setup/setup_initial_conditions.py
@@ -107,9 +107,9 @@ def quick_set_quantity(quantity_name, quantity_data, domain,
                 quantity_function, domain, approx_grid_spacing=grid_spacing,
                 averaging=mean_type)
 
-        print quantity_name, quantity_data, domain, quantity_clip_range, quantity_mean, quantity_additions
+        print(quantity_name, quantity_data, domain, quantity_clip_range, quantity_mean, quantity_additions)
 
-        print quantity_function(numpy.array([0.0]),numpy.array([0.0]))
+        print(quantity_function(numpy.array([0.0]),numpy.array([0.0])))
         # Set the quantity
         domain.set_quantity(quantity_name, quantity_function,
                             location=location)
diff --git a/examples/cairns_excel/setup/setup_mesh.py b/examples/cairns_excel/setup/setup_mesh.py
index 09e458393..7b4b0b599 100644
--- a/examples/cairns_excel/setup/setup_mesh.py
+++ b/examples/cairns_excel/setup/setup_mesh.py
@@ -60,24 +60,24 @@ def build_mesh(project):
 
     # Print some stats about mesh and domain
 
-    print 'Number of triangles = ', len(domain)
-    print 'The extent is ', domain.get_extent()
-    print domain.statistics()
+    print('Number of triangles = ', len(domain))
+    print('The extent is ', domain.get_extent())
+    print(domain.statistics())
 
     # Print info on the smallest triangles
 
     small_areas = domain.areas.argsort()
-    print ''
-    print 'LOCATIONS OF TRIANGLES WITH SMALLEST AREAS'
+    print('')
+    print('LOCATIONS OF TRIANGLES WITH SMALLEST AREAS')
     for i in range(10):
         j = small_areas[i]
         x = domain.centroid_coordinates[j, 0] \
             + domain.geo_reference.xllcorner
         y = domain.centroid_coordinates[j, 1] \
             + domain.geo_reference.yllcorner
-        print '  Area ' + str(domain.areas[j]) + ' location: ' \
-            + str(round(x, 1)) + ',' + str(round(y, 1))
-    print ''
+        print('  Area ' + str(domain.areas[j]) + ' location: ' \
+            + str(round(x, 1)) + ',' + str(round(y, 1)))
+    print('')
 
     return domain
 
@@ -99,7 +99,7 @@ def setup_mesh(project, setup_initial_conditions=None):
     if myid == 0:
 
         if verbose:
-            print 'Hello from processor ', myid
+            print('Hello from processor ', myid)
 
         #
         # HERE, WE MAKE/PARTITION/READ THE MESH
@@ -114,10 +114,10 @@ def setup_mesh(project, setup_initial_conditions=None):
 
         if os.path.exists(pickle_name):
             if verbose:
-                print 'Saved domain seems to already exist'
+                print('Saved domain seems to already exist')
         else:
             if verbose:
-                print 'CREATING PARTITIONED DOMAIN'
+                print('CREATING PARTITIONED DOMAIN')
             domain = build_mesh(project)
 
             if setup_initial_conditions is not None:
@@ -130,7 +130,7 @@ def setup_mesh(project, setup_initial_conditions=None):
 
             if pypar_available:
                 if verbose:
-                    print 'Saving Domain'
+                    print('Saving Domain')
                 sequential_distribute_dump(domain, 1,
                                            partition_dir=project.partition_dir,
                                            verbose=verbose)
@@ -148,16 +148,16 @@ def setup_mesh(project, setup_initial_conditions=None):
                                    par_pickle_name)
             if os.path.exists(par_pickle_name):
                 if verbose:
-                    print 'Saved partitioned domain seems to already exist'
+                    print('Saved partitioned domain seems to already exist')
             else:
                 if verbose:
-                    print 'Load in saved sequential pickled domain'
+                    print('Load in saved sequential pickled domain')
                 domain = \
                     sequential_distribute_load_pickle_file(
                         pickle_name, np=1, verbose=verbose)
 
                 if verbose:
-                    print 'Dump partitioned domains'
+                    print('Dump partitioned domains')
                 sequential_distribute_dump(
                     domain, numprocs,
                     partition_dir=project.partition_dir, verbose=verbose)
@@ -172,7 +172,7 @@ def setup_mesh(project, setup_initial_conditions=None):
 
         domain = None
         if verbose:
-            print 'Hello from processor ', myid
+            print('Hello from processor ', myid)
 
     barrier()
 
@@ -185,7 +185,7 @@ def setup_mesh(project, setup_initial_conditions=None):
 
     if pypar_available:
         if myid == 0:
-            print 'LOADING PARTITIONED DOMAIN'
+            print('LOADING PARTITIONED DOMAIN')
 
         domain = \
             sequential_distribute_load(
diff --git a/examples/cairns_excel/setup/setup_pumping_stations.py b/examples/cairns_excel/setup/setup_pumping_stations.py
index 1b4087560..204e0bbc2 100644
--- a/examples/cairns_excel/setup/setup_pumping_stations.py
+++ b/examples/cairns_excel/setup/setup_pumping_stations.py
@@ -50,7 +50,7 @@ def setup_pumping_stations(domain, project):
 
         smoothing_timescale = ps[10]
 
-        print 'Need to implement elevation data adjustments'
+        print('Need to implement elevation data adjustments')
 
         # Function which computes Q
         pump_behaviour = pumping_station_function(
diff --git a/examples/cairns_excel/setup/spatially_averaged_function.py b/examples/cairns_excel/setup/spatially_averaged_function.py
index c66132dbb..de295ef98 100644
--- a/examples/cairns_excel/setup/spatially_averaged_function.py
+++ b/examples/cairns_excel/setup/spatially_averaged_function.py
@@ -93,7 +93,7 @@ def elevation_setter(xc, yc):
             ub = min((i + 1) * chunk_size, lx)
 
             if verbose:
-                print 'Averaging in triangles ', lb, '-', ub - 1
+                print('Averaging in triangles ', lb, '-', ub - 1)
 
             # Store x,y,triangleIndex
             px = scipy.array([])
@@ -119,7 +119,7 @@ def elevation_setter(xc, yc):
 
             # Get function values at all px,py
             if verbose:
-                print '  Evaluating function at ', len(px), ' points'
+                print('  Evaluating function at ', len(px), ' points')
 
             allTopo = q_function(px, py)
 
@@ -136,7 +136,7 @@ def elevation_setter(xc, yc):
                 elif(averaging == 'harmonic_mean'):
                     out[j] = 1.0 / (1.0 / allTopo[out_indices]).mean()
                 else:
-                    raise Exception, 'Unknown value of averaging'
+                    raise Exception('Unknown value of averaging')
         return(out)
 
     return elevation_setter
diff --git a/examples/cairns_excel/user_functions.py b/examples/cairns_excel/user_functions.py
index 287274c6d..02e12adcf 100644
--- a/examples/cairns_excel/user_functions.py
+++ b/examples/cairns_excel/user_functions.py
@@ -41,11 +41,11 @@ def print_velocity_statistics(domain, max_quantities):
             dd = dd * (dd > 1.0e-03) + 1.0e-03 * (dd <= 1.0e-03)
             vv = 1 / dd * (xx ** 2 + yy ** 2) ** 0.5
             vv = vv * (dd > 1.0e-03)
-            print '    Processor ', myid
-            print '    @ Peak velocity is: ', vv.max(), vv.argmax()
-            print '     &- MaxSpeedHistory: ', \
-                max_quantities.max_speed.max()
-            print '     %- FUF: ', domain.flux_update_frequency.mean()
+            print('    Processor ', myid)
+            print('    @ Peak velocity is: ', vv.max(), vv.argmax())
+            print('     &- MaxSpeedHistory: ', \
+                max_quantities.max_speed.max())
+            print('     %- FUF: ', domain.flux_update_frequency.mean())
         else:
             pass
         barrier()
@@ -53,7 +53,7 @@ def print_velocity_statistics(domain, max_quantities):
     # Make a newline
 
     if myid == 0:
-        print ''
+        print('')
 
     return
 
@@ -72,19 +72,19 @@ def print_operator_inputs(domain):
     if myid == 0:
         for i in range(len(operators)):
             if hasattr(operators[i], 'rate'):
-                print '    Operator ' + operators[i].label + \
-                      ' rate = ' + str(operators[i].rate(domain.time))
+                print('    Operator ' + operators[i].label + \
+                      ' rate = ' + str(operators[i].rate(domain.time)))
 
     barrier()
 
     # Inlets
     for i in range(len(operators)):
         if hasattr(operators[i], 'applied_Q'):
-            print '    Operator ' + operators[i].label + \
-                  ' Q = ' + str(operators[i].applied_Q)
+            print('    Operator ' + operators[i].label + \
+                  ' Q = ' + str(operators[i].applied_Q))
     barrier()
 
     if myid == 0:
-        print ' '
+        print(' ')
 
     return
diff --git a/examples/cairns_excel/xlrd/book.py b/examples/cairns_excel/xlrd/book.py
index 7bb01b4f4..07877133f 100644
--- a/examples/cairns_excel/xlrd/book.py
+++ b/examples/cairns_excel/xlrd/book.py
@@ -2,7 +2,6 @@
 # This module is part of the xlrd package, which is released under a
 # BSD-style licence.
 
-from __future__ import print_function
 
 from .timemachine import *
 from .biffh import *
diff --git a/examples/cairns_excel/xlrd/compdoc.py b/examples/cairns_excel/xlrd/compdoc.py
index e434e8ec8..27d8f6760 100644
--- a/examples/cairns_excel/xlrd/compdoc.py
+++ b/examples/cairns_excel/xlrd/compdoc.py
@@ -15,7 +15,6 @@
 # 2007-05-07 SJM Meaningful exception instead of IndexError if a SAT (sector allocation table) is corrupted.
 # 2007-04-22 SJM Missing "<" in a struct.unpack call => can't open files on bigendian platforms.
 
-from __future__ import print_function
 import sys
 from struct import unpack
 from .timemachine import *
diff --git a/examples/cairns_excel/xlrd/timemachine.py b/examples/cairns_excel/xlrd/timemachine.py
index a068db3ec..36f42977c 100644
--- a/examples/cairns_excel/xlrd/timemachine.py
+++ b/examples/cairns_excel/xlrd/timemachine.py
@@ -7,7 +7,6 @@
 # Currently supported: 2.6 to 2.7, 3.2+
 # usage: from timemachine import *
 
-from __future__ import print_function
 import sys
 
 python_version = sys.version_info[:2] # e.g. version 2.6 -> (2, 6)
diff --git a/examples/cairns_excel/xlrd/xlsx.py b/examples/cairns_excel/xlrd/xlsx.py
index 53fbb8926..2f3e73dd8 100644
--- a/examples/cairns_excel/xlrd/xlsx.py
+++ b/examples/cairns_excel/xlrd/xlsx.py
@@ -3,7 +3,6 @@
 # This module is part of the xlrd package, which is released under a BSD-style licence.
 ##
 
-from __future__ import print_function, unicode_literals
 
 DEBUG = 0
 
diff --git a/examples/cuda/run_cuda_rectangle.py b/examples/cuda/run_cuda_rectangle.py
index 75471fe6d..8c8599fc9 100644
--- a/examples/cuda/run_cuda_rectangle.py
+++ b/examples/cuda/run_cuda_rectangle.py
@@ -1,4 +1,4 @@
-"""  Test environmental forcing - rain, wind, etc.
+"""  Test of the CUDA implementation on the rectangular cross domain
 """
 
 import unittest, os
diff --git a/examples/parallel/create_pbs_job.py b/examples/parallel/create_pbs_job.py
index b21399e0e..49f1d23a0 100644
--- a/examples/parallel/create_pbs_job.py
+++ b/examples/parallel/create_pbs_job.py
@@ -17,9 +17,9 @@
 ext = '.py'
 
 if len(arg)<3:
-    print __doc__
-    print 'Usage:'
-    print command + ' p <filename>' + ext
+    print(__doc__)
+    print('Usage:')
+    print(command + ' p <filename>' + ext)
     sys.exit()
 
 numproc = int(arg[1])    
diff --git a/meson.build b/meson.build
index e08679836..ef9610903 100644
--- a/meson.build
+++ b/meson.build
@@ -7,7 +7,9 @@ py3 = import('python').find_installation(pure: false)
 
 dep_py = py3.dependency()
 
-# Find NumPy
+#========================================================
+# Setup numpy dependencies
+#========================================================
 dep_numpy = dependency('numpy')
 
 incdir_numpy = run_command(py3,
@@ -18,11 +20,49 @@ incdir_numpy = run_command(py3,
   check: true
 ).stdout().strip()
 
+# FIXME SR: Do we use this now?
 message('NumPy include directory: ' + incdir_numpy)
 
-# important to put the numpy dependency first to avoid picking up the system numpy
 dependencies = [dep_py, dep_numpy]
 
+#========================================================
+# Deal with OpenMP
+#========================================================
+
+# cc = meson.get_compiler('c')
+# if openmp_dep.found()
+#   if cc.get_id() in ['intel', 'intel-cl', 'icx']
+#     openmp_flag = '-qopenmp'
+#   else
+#     openmp_flag = '-fopenmp'
+#   endif
+# endif
+
+openmp = dependency('openmp', required: false)
+if openmp.found()
+  # On Windows, the mingw compiler does not support OpenMP ATOMIC operations
+  # so using gcc_win-64 gxx_win-64 provided by conda-forge
+  # On linux, OpenMP is supported by gcc and g++
+  # On macOS, OpenMP is not supported by the Apple clang compiler so using 
+  # clang and llvm-openmp provided by conda-forge
+ 
+  #if host_machine.system() == 'windows'
+  
+  openmp_c_args = ['-O3', '-march=native', '-fopenmp', '-g']
+  # openmp_c_args = ['-O3', '-march=native','-funroll-loops', '-fvectorize', '-Rpass=loop-vectorize', '-Rpass=loop-unroll', '-g'],
+  # openmp_c_args = ['-O3', '-march=native', openmp_flag, '-g'],
+
+  openmp_deps = dependencies + [openmp]
+
+else
+  openmp_deps = dependencies
+endif
+
+
+
+#========================================================
+# Install the Python scripts
+#========================================================
 conf = configuration_data()
 conf.set('PYTHON', py3.path())
 
@@ -44,6 +84,14 @@ configure_file(
     install_mode: 'rwxr-xr-x'
 )
 
+configure_file(
+    input: 'scripts/anuga_benchmark_omp.py',
+    output: 'anuga_benchmark_omp',
+    configuration: conf,
+    install_dir: get_option('bindir'),
+    install_mode: 'rwxr-xr-x'
+)
+
 # Add subdirectories which contains python sources
 subdir('anuga')
 
diff --git a/pyproject.toml b/pyproject.toml
index e72879b83..4b564b49d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,7 +15,7 @@ name = 'anuga'
 version = '3.2.0dev'
 description = 'A set of python modules for tsunami and flood modelling'
 readme = 'README.rst'
-license = {file = 'LICENSE.txt'}
+license = 'Apache-2.0'
 authors = [
   {name = 'Stephen Roberts', email = 'stephen.roberts@anu.edu.au'},
 ]
@@ -26,7 +26,6 @@ dependencies = [
 classifiers = [
     'Intended Audience :: Science/Research',
     'Intended Audience :: Developers',
-    'License :: OSI Approved',
     'Programming Language :: C',
     'Programming Language :: C++',
     'Programming Language :: Python',
diff --git a/requirements.txt b/requirements.txt
index 2e500e77b..ac64a958d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,6 +25,5 @@ Pmw
 meson
 meson-python
 ninja
-pkg-config
 xarray
 cartopy
diff --git a/run_validations.py b/run_validations.py
deleted file mode 100644
index dc6a85012..000000000
--- a/run_validations.py
+++ /dev/null
@@ -1,10 +0,0 @@
-
-import os
-
-
-os.chdir('validation_tests')
-print()
-print(20*'=' + ' anuga automated validation tests ' + 20*'=')
-print('Changing to', os.getcwd()) # This is now different from buildroot   
-exec(open('run_auto_validation_tests.py').read())
-
diff --git a/scripts/anuga_benchmark_omp.py b/scripts/anuga_benchmark_omp.py
new file mode 100755
index 000000000..5efb411f4
--- /dev/null
+++ b/scripts/anuga_benchmark_omp.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+
+import pstats
+import os
+import csv
+import numpy as np
+
+# This script runs a benchmark for the specified script with different numbers of OpenMP threads
+
+from time import localtime, strftime, gmtime, sleep
+time = strftime('%Y%m%d_%H%M', localtime())
+
+import subprocess
+import sys
+import socket
+
+import argparse
+
+
+parser = argparse.ArgumentParser(
+    description="Run benchmark for given script."
+)
+parser.add_argument(
+    "script_file",
+    type=str,
+    nargs="?",
+    default="run_small_towradgi.py",
+    help="The Python script to run for benchmarking (default: run_small_towradgi.py)"
+)
+
+
+args = parser.parse_args()
+script_file = args.script_file
+print(f"Using script file: {script_file}")
+
+script = script_file.rsplit('.', 1)[0]
+
+
+
+# Copy the current environment and set OMP_NUM_THREADS
+env = os.environ.copy()
+
+hostname = socket.gethostname()
+hostname = hostname.split('.')[0]  # Get the hostname without the domain part
+hostname = hostname.split('-')[0]  # Get the first part of the hostname if it contains a hyphen
+
+print(f"On machine: {hostname}")
+
+# Benchmarks are launched via 'conda run -n <env>', so an active conda env is required
+conda_prefix = os.environ.get("CONDA_PREFIX")
+if conda_prefix:
+    anuga_env = os.path.basename(conda_prefix)
+    print(f"Conda environment name: {anuga_env}")
+else:
+    sys.exit("Not running inside a conda environment.")
+
+
+# Default to a small local sweep; a recognised PBS queue overrides it below
+queue, openmp_threads = 'local', [1, 2, 4]
+PBS_QUEUE = os.environ.get('PBS_QUEUE')
+if PBS_QUEUE is not None:
+    print(f"Using PBS queue: {PBS_QUEUE}")
+    if PBS_QUEUE == 'normalsr-exec':
+        queue, openmp_threads = 'normalsr', [1, 2, 4, 8, 16, 32, 48, 64, 80, 100]
+    elif PBS_QUEUE == 'normal-exec':
+        queue, openmp_threads = 'normal', [1, 2, 4, 6, 8, 12, 16, 24, 32, 48]
+    else:
+        # Unrecognised queue: keep the default sweep, but label output by queue name
+        queue = PBS_QUEUE
+
+print(f"Using queue: {queue}")
+
+for threads in openmp_threads:
+    env["OMP_NUM_THREADS"] = str(threads)  # Set to your desired number of threads
+    pstat_file = f'profile_{script}_{hostname}_{queue}_{anuga_env}_{time}_omp_{threads}.pstat'
+
+    cmd = ['conda', 'run', '--no-capture-output', '-n', anuga_env, 'python', '-u', '-m', 'cProfile', '-o', pstat_file, script_file]
+    
+    print('')
+    print(80 * '=')
+    print(f'Running command: {" ".join(cmd)}')
+    print(80 * '=')
+    print('')
+
+    # Run the subprocess with the modified environment
+    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, text=True, env=env) as process:
+        for line in process.stdout:
+            print(line, end='')  # Print each line as it arrives
+
+
+
+
+#=================================
+# Collect timings
+#=================================
+pstat_basename = f'profile_{script}_{hostname}_{queue}_{anuga_env}_{time}'
+
+from anuga.utilities.create_benchmark_csvfile import create_benchmark_csvfile
+
+create_benchmark_csvfile(pstat_basename, openmp_threads, verbose=True)
+
+
diff --git a/setup.py b/setup.py
deleted file mode 100644
index c3118a8dd..000000000
--- a/setup.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#! /usr/bin/env python
-#
-# Copyright (C) 2007-2009 Cournapeau David <cournape@gmail.com>
-#               2010 Fabian Pedregosa <fabian.pedregosa@inria.fr>
-# License: 3-clause BSD
-#
-# Setup.py taken from scikit learn
-
-descr = """A set of python modules for modelling the effect of tsunamis and flooding"""
-
-import sys
-import os
-import shutil
-from distutils.command.clean import clean as Clean
-os.environ["PROJECT_ROOT"] = os.getcwd()
-
-#os.environ["CC"] = "nvc -O3 -acc=gpu -Minfo=accel -noswitcherror -lm -I$CUDA_HOME/include/ --device-debug --generate-line-info -arch sm_11 "
-#os.environ["CXX"] = "nvc++ -O3 -acc=gpu -Minfo=accel -noswitcherror -lm -I$CUDA_HOME/include/ --device-debug --generate-line-info -arch sm_11 -std=c++17"
-#os.environ["FC"] = "nvfortran -O3 -acc=gpu -Minfo=accel -noswitcherror -lm -I$CUDA_HOME/include/ --device-debug --generate-line-info "
-
-#==============================================================================
-DISTNAME = 'anuga'
-DESCRIPTION = 'A set of python modules for tsunami and flood modelling'
-with open('README.rst') as f:
-    LONG_DESCRIPTION = f.read()
-MAINTAINER = 'Stephen Roberts'
-MAINTAINER_EMAIL = 'stephen.roberts@anu.edu.au'
-URL = "https://github.com/anuga-community/anuga_core"
-LICENSE = 'GPL'
-DOWNLOAD_URL = "http://sourceforge.net/projects/anuga/"
-VERSION = '3.2.0dev'
-#===============================================================================
-
-
-
-
-###############################################################################
-# Optional setuptools features
-# We need to import setuptools early, if we want setuptools features,
-# as it monkey-patches the 'setup' function
-
-# For some commands, use setuptools
-SETUPTOOLS_COMMANDS = set([
-    'develop', 'release', 'bdist_egg', 'bdist_rpm',
-    'bdist_wininst', 'install_egg_info', 'build_sphinx',
-    'egg_info', 'easy_install', 'upload', 'bdist_wheel',
-    '--single-version-externally-managed',
-])
-
-
-if len(SETUPTOOLS_COMMANDS.intersection(sys.argv)) > 0:
-    import setuptools
-    import platform
-    install_requires = ['cython',
-                         'numpy',
-                         'dill',
-                         'future',
-                         'gdal >= 3.0.4',
-                         'gitpython',
-                         'matplotlib',
-                         'meshpy',
-                         'netcdf4',
-                         'Pmw',
-                         'pymetis',
-                         'pytest',
-                         'scipy',
-                         'utm']
-    version_tuple = platform.python_version_tuple()
-    if version_tuple[0] == '3' and int(version_tuple[1]) < 9 :
-        install_requires.append('backports.zoneinfo')
-
-    extra_setuptools_args = dict(
-        zip_safe=False,  # the package can run out of an .egg file
-        include_package_data=True,
-      	install_requires=install_requires
-    )
-else:
-    extra_setuptools_args = dict()
-
-###############################################################################
-
-class CleanCommand(Clean):
-    description = "Remove build artifacts from the source tree"
-
-    def run(self):
-        Clean.run(self)
-        if os.path.exists('build'):
-            shutil.rmtree('build')
-        for dirpath, dirnames, filenames in os.walk('anuga'):
-            for filename in filenames:
-                if (filename.endswith('.so') or filename.endswith('.pyd')
-                        or filename.endswith('.pyc')):
-                    os.unlink(os.path.join(dirpath, filename))
-            for dirname in dirnames:
-                if dirname == '__pycache__':
-                    shutil.rmtree(os.path.join(dirpath, dirname))
-
-
-###############################################################################
-def configuration(parent_package='', top_path=None):
-    if os.path.exists('MANIFEST'):
-        os.remove('MANIFEST')
-
-    from numpy.distutils.misc_util import Configuration
-    config = Configuration(None, parent_package, top_path)
-
-    # Avoid non-useful msg:
-    # "Ignoring attempt to set 'name' (from ... "
-    config.set_options(ignore_setup_xxx_py=True,
-                       assume_default_configuration=True,
-                       delegate_options_to_subpackages=True,
-                       quiet=True)
-
-    config.add_subpackage('anuga')
-
-    return config
-
-
-
-def setup_package():
-
-    metadata = dict(name=DISTNAME,
-                    maintainer=MAINTAINER,
-                    maintainer_email=MAINTAINER_EMAIL,
-                    description=DESCRIPTION,
-                    license=LICENSE,
-                    url=URL,
-                    version=VERSION,
-                    download_url=DOWNLOAD_URL,
-                    long_description=LONG_DESCRIPTION,
-                    classifiers=['Intended Audience :: Science/Research',
-                                 'Intended Audience :: Developers',
-                                 'License :: OSI Approved',
-                                 'Programming Language :: C',
-                                 'Programming Language :: C++',
-                                 'Programming Language :: Python',
-                                 'Topic :: Software Development',
-                                 'Topic :: Scientific/Engineering',
-                                 'Operating System :: Microsoft :: Windows',
-                                 'Operating System :: POSIX',
-                                 'Operating System :: Unix',
-                                 'Operating System :: MacOS',
-                                 'Programming Language :: Python :: 3.7',
-                                 'Programming Language :: Python :: 3.8',
-                                 'Programming Language :: Python :: 3.9',
-                                 'Programming Language :: Python :: 3.10',
-                                 'Programming Language :: Python :: 3.11',
-                                 ],
-                    cmdclass={'clean': CleanCommand},
-                    **extra_setuptools_args)
-
-
-
-    metadata['version'] = VERSION
-    metadata['configuration'] = configuration
-
-    from numpy.distutils.core import setup
-
-    setup(**metadata)
-
-
-if __name__ == "__main__":
-    setup_package()
diff --git a/tools/count_lines.py b/tools/count_lines.py
index fcb5528b7..d0ab67b0d 100644
--- a/tools/count_lines.py
+++ b/tools/count_lines.py
@@ -4,12 +4,12 @@
 import os
 
 cmd_string = 'find . -type f -name "*.py" -print | xargs wc -l'
-print cmd_string
+print(cmd_string)
 os.system(cmd_string)
 
 
 cmd_string = 'find . -type f -name "*.c" -print | xargs wc -l'
-print cmd_string
+print(cmd_string)
 os.system(cmd_string)
 
 
diff --git a/tools/install_miniforge.sh b/tools/install_miniforge.sh
index fab54049c..70ef7bb07 100644
--- a/tools/install_miniforge.sh
+++ b/tools/install_miniforge.sh
@@ -78,7 +78,7 @@ echo "..."
 
 cd ${SCRIPTPATH}
 cd ..
-pip install --no-build-isolation -editable .
+pip install --no-build-isolation --editable .
 echo " "
 
 echo "#==========================="
diff --git a/appveyor.yml b/tools/old.appveyor.yml
similarity index 75%
rename from appveyor.yml
rename to tools/old.appveyor.yml
index 3c67b165f..72c23abe9 100644
--- a/appveyor.yml
+++ b/tools/old.appveyor.yml
@@ -15,6 +15,7 @@ build_script:
   - cmd: start /wait "" Miniforge3.exe /InstallationType=JustMe /RegisterPython=0 /S /D=%UserProfile%\Miniforge3
   - cmd: call %UserProfile%\Miniforge3\Scripts\activate.bat
 
+
   - CMD: SET
   - conda config --set always_yes yes --set changeps1 no
   - conda update -q conda
@@ -29,15 +30,13 @@ build_script:
   # test python
   - python --version
 
-  # If using m2w64 gcc compiler for windows uncomment the following
-  - conda install -c conda-forge libpython m2w64-toolchain
-
-  # there seems to be a bug with distutil by not picking the mingw comiler for anuga
-  #- cp windows_setup.cfg setup.cfg
+  # Using gcc compiler for windows due to openmp support
+  - conda install -c conda-forge libpython gcc_win-64 gxx_win-64
 
 
-  # Use MSVC compiler by default unless m2w64-toolchain is installed
+  - cmd: SET OMP_NUM_THREADS=1
   - pip install -v .
 
 test_script:
-  - pytest -q --pyargs anuga
\ No newline at end of file
+  - cmd: SET OMP_NUM_THREADS=1
+  - pytest -q --pyargs anuga
diff --git a/.travis.yml b/tools/old.travis.yml
similarity index 97%
rename from .travis.yml
rename to tools/old.travis.yml
index 7027b096f..66a8c14a8 100644
--- a/.travis.yml
+++ b/tools/old.travis.yml
@@ -38,6 +38,7 @@ install:
 script:
     - export OMPI_MCA_btl_vader_single_copy_mechanism=none
     - python --version
+    - export OMP_NUM_THREADS=1
     - pytest -q --pyargs anuga
 
 after_success:
diff --git a/validation_tests/analytical_exact/carrier_greenspan_periodic/util.py b/validation_tests/analytical_exact/carrier_greenspan_periodic/util.py
index 517abb0a8..729e6b56b 100644
--- a/validation_tests/analytical_exact/carrier_greenspan_periodic/util.py
+++ b/validation_tests/analytical_exact/carrier_greenspan_periodic/util.py
@@ -115,28 +115,28 @@ def calculate_new_wet_area_analytic(x1,x2,z1,z2,A,t):
     w1,uh1 = analytic_cannal(x1,t)
     w2,uh2 = analytic_cannal(x2,t)
     if (w1 > z1) & (w2 < z2) & (z1 <= z2):
-        print "test1"
+        print("test1")
         x = ((w2-z1)*(x2-x1)+x1*(z2-z1)-x2*(w2-w1))/(z2-z1+w1-w2)
         wet_len = x-x1
     elif (w1 < z1) & (w2 > z2) & (z1 < z2):
-        print "test2"
+        print("test2")
         x = ((w2-z1)*(x2-x1)+x1*(z2-z1)-x2*(w2-w1))/(z2-z1+w1-w2)
         wet_len = x2-x
     elif (w1 < z1) & (w2 > z2) & (z1 >= z2):
-        print "test3"
+        print("test3")
         x = ((w1-z2)*(x2-x1)+x2*(z2-z1)-x1*(w2-w1))/(z2-z1+w1-w2)
         wet_len = x2-x
     elif (w1 > z1) & (w2 < z2) & (z1 > z2):
-        print "test4"
+        print("test4")
         x = ((w1-z2)*(x2-x1)+x2*(z2-z1)-x1*(w2-w1))/(z2-z1+w1-w2)
         wet_len = x-x1
     elif (w1 >= z1) & (w2 >= z2):
-        print "test5"
+        print("test5")
         wet_len = x2-x1 
     else: #(w1 <= z1) & (w2 <= z2)
-        print "test5"
+        print("test5")
         if (w1 > z1) | (w2 > z2):
-            print "ERROR"
+            print("ERROR")
         wet_len = x2-x1        
     return w1,w2,wet_len,uh1,uh2
 
diff --git a/validation_tests/analytical_exact/lake_at_rest_immersed_bump/numerical_immersed_bump.py b/validation_tests/analytical_exact/lake_at_rest_immersed_bump/numerical_immersed_bump.py
index 40173ba67..64369cb56 100644
--- a/validation_tests/analytical_exact/lake_at_rest_immersed_bump/numerical_immersed_bump.py
+++ b/validation_tests/analytical_exact/lake_at_rest_immersed_bump/numerical_immersed_bump.py
@@ -11,7 +11,7 @@
 import anuga
 from anuga import Domain as Domain
 from math import cos
-from numpy import zeros, float
+from numpy import zeros
 from time import localtime, strftime, gmtime
 #from balanced_dev import *
 from anuga import myid, finalize, distribute
diff --git a/validation_tests/analytical_exact/trapezoidal_channel/plot_results.py b/validation_tests/analytical_exact/trapezoidal_channel/plot_results.py
index 540f2117c..b9ff1358c 100644
--- a/validation_tests/analytical_exact/trapezoidal_channel/plot_results.py
+++ b/validation_tests/analytical_exact/trapezoidal_channel/plot_results.py
@@ -13,7 +13,7 @@
 p=util.get_centroids(p2, velocity_extrapolation=True)
 v = (p.x>6.0)*(p.x<8.0)
 
-print numpy.any(v)
+print(numpy.any(v))
 # Numerical results along a central channel 'slice'
 index= -1
 V1 = p.stage[index,v] - p.elev[v]
@@ -50,7 +50,7 @@ def minme(dc):
 import scipy.optimize
 dc_analytical = scipy.optimize.fmin(minme, x0=1.0)[0]
 
-print 'dc_analytic ',dc_analytical
+print('dc_analytic ',dc_analytical)
 
 ##################################
 # Plots
@@ -164,9 +164,9 @@ def minme(dc):
 
 
 
-print '#======================================================================'
-print '# Extract some cross section info'
-print '#======================================================================'
+print('#======================================================================')
+print('# Extract some cross section info')
+print('#======================================================================')
 
 from anuga.shallow_water.sww_interrogate import get_flow_through_multiple_cross_sections
 
diff --git a/validation_tests/behaviour_only/lateral_weir_hecras/plot_results.py b/validation_tests/behaviour_only/lateral_weir_hecras/plot_results.py
index abc715169..e931d6a8d 100644
--- a/validation_tests/behaviour_only/lateral_weir_hecras/plot_results.py
+++ b/validation_tests/behaviour_only/lateral_weir_hecras/plot_results.py
@@ -46,7 +46,7 @@ def get_corresponding_series(reach, station):
 
 
     if(not station%100.==0.):
-        raise Exception, 'Station must be in 0., 100. , 200., .... 900., 1000.'
+        raise Exception('Station must be in 0., 100. , 200., .... 900., 1000.')
 
     # Get station string in hecras gauges
     if(station>0. and station<1000.):
diff --git a/validation_tests/behaviour_only/tides_hecras/plot_results.py b/validation_tests/behaviour_only/tides_hecras/plot_results.py
index d70468b3e..614be65c9 100644
--- a/validation_tests/behaviour_only/tides_hecras/plot_results.py
+++ b/validation_tests/behaviour_only/tides_hecras/plot_results.py
@@ -42,7 +42,7 @@ def get_corresponding_series(reach, station):
     #elif(reach=='RIGHT'):
     #    anuga_x=5.
     else:
-        raise Exception, 'reach not recognized'
+        raise Exception('reach not recognized')
 
 
     # Get station string in hecras gauges
@@ -80,7 +80,7 @@ def compare_reach(reach):
             pyplot.plot(x[1][:,0],x[1][:,1]+station/1000.+1.0, '--', color='black',linewidth=1.0)
         except:
             msg = 'Missing reach/station '+ reach + '/'+str(station)
-            print msg
+            print(msg)
 
         pyplot.xlim((0, 15000.))
         pyplot.xlabel('Time (s)',fontsize=20)
diff --git a/validation_tests/case_studies/merewether/plot_asc.py b/validation_tests/case_studies/merewether/plot_asc.py
index 14ca2719e..08fbd6128 100644
--- a/validation_tests/case_studies/merewether/plot_asc.py
+++ b/validation_tests/case_studies/merewether/plot_asc.py
@@ -3,6 +3,7 @@
 # To change this template, choose Tools | Templates
 # and open the template in the editor.
 
+from future.utils import raise_
 __author__="steve"
 __date__ ="$17/04/2012 11:32:04 AM$"
 
@@ -61,7 +62,7 @@
         xllcorner = float(xref[1].strip())
     else:
         msg = 'Unknown keyword: %s' % xref[0].strip()
-        raise Exception, msg
+        raise_(Exception, msg)
 
     yref = lines[3].split()
     if yref[0].strip() == 'yllcorner':
@@ -70,19 +71,19 @@
         yllcorner = float(yref[1].strip())
     else:
         msg = 'Unknown keyword: %s' % yref[0].strip()
-        raise Exception, msg
+        raise_(Exception, msg)
 
     NODATA_value = float(lines[5].split()[1].strip())
 
     assert len(lines) == nrows + 6
 
 
-    print 'rows ', nrows
-    print 'cols ', ncols
-    print 'cell size ',cellsize
+    print('rows ', nrows)
+    print('cols ', ncols)
+    print('cell size ',cellsize)
 
-    print 'xllcorner ',xllcorner
-    print 'yllcorner ',yllcorner
+    print('xllcorner ',xllcorner)
+    print('yllcorner ',yllcorner)
 
     Z = np.zeros((nrows,ncols),dtype=float)
 
@@ -95,12 +96,12 @@
         if len(fields) != ncols:
             msg = 'Wrong number of columns in file "%s" line %d\n' % (name_in, i)
             msg += 'I got %d elements, but there should have been %d\n' % (len(fields), ncols)
-            raise Exception, msg
+            raise_(Exception, msg)
 
         Z[i, :] = np.array([float(x) for x in fields])
 
 
-    print Z.shape
+    print(Z.shape)
 
 
     
@@ -110,7 +111,7 @@
     z_min = int(np.nanmin(ZZ))
     z_max = int(np.nanmax(ZZ))+1
 
-    print z_min,z_max
+    print(z_min,z_max)
 
     xlen = ncols*cellsize
     ylen = nrows*cellsize
@@ -118,8 +119,8 @@
     y = np.arange(ylen, 0.0, -cellsize)
     X, Y = np.meshgrid(x, y)
 
-    print X.shape
-    print Y.shape
+    print(X.shape)
+    print(Y.shape)
 
     plt.figure()
 
@@ -127,7 +128,7 @@
     levels = z_min + np.arange(11,dtype=float)/10.0*(z_max-z_min)
 
 
-    print levels
+    print(levels)
     
     #CS = plt.contour(X, Y, Z)
 
diff --git a/validation_tests/case_studies/merewether/plot_hydrograph.py b/validation_tests/case_studies/merewether/plot_hydrograph.py
index 9a0c6a593..74192db90 100644
--- a/validation_tests/case_studies/merewether/plot_hydrograph.py
+++ b/validation_tests/case_studies/merewether/plot_hydrograph.py
@@ -30,7 +30,7 @@ def get_flow_through_cross_section(filename, polyline, verbose=False):
     and the polyline would then be a cross section perpendicular to the flow.
     """
 
-    print 'In get_flow_through_cross_section'
+    print('In get_flow_through_cross_section')
     quantity_names =['elevation',
                      'stage',
                      'xmomentum',
@@ -44,7 +44,7 @@ def get_flow_through_cross_section(filename, polyline, verbose=False):
 #                                                           verbose=verbose)
     from anuga.fit_interpolate.interpolate import Interpolation_function
 
-    print 'In get_interpolated_quantities_at_polyline_midpoints'
+    print('In get_interpolated_quantities_at_polyline_midpoints')
     # Get mesh and quantities from sww file
     X = get_mesh_and_quantities_from_file(filename,
                                           quantities=quantity_names,
@@ -59,7 +59,7 @@ def get_flow_through_cross_section(filename, polyline, verbose=False):
 
     # Interpolate
     if verbose:
-        print 'Interpolating - total number of interpolation points = %d' % len(interpolation_points)
+        print('Interpolating - total number of interpolation points = %d' % len(interpolation_points))
 
     interpolation_function = Interpolation_function(time,
                                quantities,
@@ -73,7 +73,7 @@ def get_flow_through_cross_section(filename, polyline, verbose=False):
     time = interpolation_function.time
     interpolation_points = interpolation_function.interpolation_points
 
-    if verbose: print 'Computing hydrograph'
+    if verbose: print('Computing hydrograph')
 
     # Compute hydrograph
     Q = []
diff --git a/validation_tests/case_studies/merewether/plot_results.py b/validation_tests/case_studies/merewether/plot_results.py
index 0fc891bc1..b88985977 100644
--- a/validation_tests/case_studies/merewether/plot_results.py
+++ b/validation_tests/case_studies/merewether/plot_results.py
@@ -49,7 +49,7 @@
 f = open('Stage_point_comparison.csv','w')
 f.writelines( 'Field, ANUGA, TUFLOW, ANUGA minus Field, ANUGA minus TUFLOW \n' )
 
-if verbose: print nearest_points.tolist()
+if verbose: print(nearest_points.tolist())
 
 for i in range(len(nearest_points)):
     po = point_observations[i,-2]
diff --git a/validation_tests/case_studies/patong/extras/build_elevation.py b/validation_tests/case_studies/patong/extras/build_elevation.py
index 23649db5a..1df4ef99f 100644
--- a/validation_tests/case_studies/patong/extras/build_elevation.py
+++ b/validation_tests/case_studies/patong/extras/build_elevation.py
@@ -45,8 +45,8 @@
 # Fine pts file to be clipped to area of interest
 #------------------------------------------------------------------------------
 
-print 'project.bounding_polygon', project.bounding_polygon
-print 'project.combined_elevation_basename', project.combined_elevation_basename
+print('project.bounding_polygon', project.bounding_polygon)
+print('project.combined_elevation_basename', project.combined_elevation_basename)
 
 # Create Geospatial data from ASCII files
 geospatial_data = {}
@@ -61,7 +61,7 @@
 
     G_grid = Geospatial_data(file_name=absolute_filename+'.pts',
                                                 verbose=True)
-    print 'Clip geospatial object'
+    print('Clip geospatial object')
     geospatial_data[filename] = G_grid.clip(project.bounding_polygon)
 
 # Create Geospatial data from TXT files
@@ -70,7 +70,7 @@
     absolute_filename = join(project.topographies_folder, filename)
     G_points = Geospatial_data(file_name=absolute_filename,
                                                 verbose=True)
-    print 'Clip geospatial object'
+    print('Clip geospatial object')
     geospatial_data[filename] = G_points.clip(project.bounding_polygon)
 
 #-------------------------------------------------------------------------------
@@ -81,7 +81,7 @@
     p = read_polygon(join(project.polygons_folder, extent_polygon_filename))
     extent_polygons.append(p)
     
-print 'Add geospatial objects' 
+print('Add geospatial objects') 
 G = None
 for key in geospatial_data:
     if key == project.point_filenames[0] or key == project.point_filenames[1]:
@@ -96,9 +96,9 @@
         D = D.clip_outside(extent_polygons[2])
         G += D
 
-print 'Export combined DEM file'
+print('Export combined DEM file')
 G.export_points_file(project.combined_elevation + '.pts')
-print 'Do txt version too'
+print('Do txt version too')
 # Use for comparision in ARC
 G.export_points_file(project.combined_elevation + '.txt')
 
diff --git a/validation_tests/case_studies/patong/extras/build_urs_boundary.py b/validation_tests/case_studies/patong/extras/build_urs_boundary.py
index 63d1571ba..f84863849 100644
--- a/validation_tests/case_studies/patong/extras/build_urs_boundary.py
+++ b/validation_tests/case_studies/patong/extras/build_urs_boundary.py
@@ -3,6 +3,7 @@
 and passes in the data from the EventSelection <event>.list file.
 """
 
+from future.utils import raise_
 import os
 import os.path
 from time import localtime, strftime, gmtime
@@ -19,7 +20,7 @@
 # Get gauges (timeseries of index points)
 #-------------------------------------------------------------------------------
 def get_sts_gauge_data(filename, verbose=False):
-    print 'get_sts_gauge_data: filename=%s' % filename
+    print('get_sts_gauge_data: filename=%s' % filename)
     fid = NetCDFFile(filename+'.sts', 'r')      #Open existing file for read
     permutation = fid.variables['permutation'][:]
     x = fid.variables['x'][:] + fid.xllcorner   #x-coordinates of vertices
@@ -39,7 +40,7 @@ def get_sts_gauge_data(filename, verbose=False):
     #---------------------------------------------------------------------------
      
     maxname = 'max_sts_stage.csv'
-    print 'get_sts_gauge_data: maxname=%s' % maxname
+    print('get_sts_gauge_data: maxname=%s' % maxname)
     fid_max = open(os.path.join(project.event_folder, maxname), 'w')
     fid_max.write('index, x, y, max_stage \n')    
     for j in range(len(x)):
@@ -96,27 +97,27 @@ def build_urs_boundary(event_file, output_dir):
     if project.multi_mux:
         # get the mux+weight data from the meta-file (in <boundaries>)
         mux_event_file = os.path.join(project.event_folder, event_file)
-        print 'using multi-mux file', mux_event_file
+        print('using multi-mux file', mux_event_file)
         try:
             fd = open(mux_event_file, 'r')
             mux_data = fd.readlines()
             fd.close()
-        except IOError, e:
+        except IOError as e:
             msg = 'File %s cannot be read: %s' % (mux_event_file, str(e))
-            raise Exception, msg
+            raise_(Exception, msg)
         except:
             raise
 
         # first line of file is # filenames+weight in rest of file
         num_lines = int(mux_data[0].strip())
         mux_data = mux_data[1:]
-        print 'number of sources %d' % num_lines
+        print('number of sources %d' % num_lines)
 
         # quick sanity check on input mux meta-file
         if num_lines != len(mux_data):
             msg = ('Bad file %s: %d data lines, but line 1 count is %d'
                    % (event_file, len(mux_data), num_lines))
-            raise Exception, msg
+            raise_(Exception, msg)
 
         # Create filename and weights lists.
         # Must chop GRD filename just after '*.grd'.
@@ -131,7 +132,7 @@ def build_urs_boundary(event_file, output_dir):
         mux_weights = [float(line.strip().split()[1]) for line in mux_data]
 
         # Call legacy function to create STS file.
-        print 'creating sts file'
+        print('creating sts file')
         urs2sts(mux_filenames,
                 basename_out=output_dir,
                 ordering_filename=project.urs_order,
@@ -142,14 +143,14 @@ def build_urs_boundary(event_file, output_dir):
     else:                           # a single mux stem file, assume 1.0 weight
         mux_file = os.path.join(project.event_folder, event_file)
         mux_filenames = [mux_file]
-        print 'using single-mux file', mux_file
+        print('using single-mux file', mux_file)
 
         weight_factor = 1.0
         mux_weights = weight_factor*num.ones(len(mux_filenames), float)
             
         order_filename = project.urs_order
 
-        print 'reading', order_filename
+        print('reading', order_filename)
         # Create ordered sts file
         urs2sts(mux_filenames,
                 basename_out=output_dir,
@@ -161,7 +162,7 @@ def build_urs_boundary(event_file, output_dir):
     # report on progress so far
     sts_file = os.path.join(project.event_folder, project.scenario_name)
     quantities, elevation, time = get_sts_gauge_data(sts_file, verbose=False)
-    print len(elevation), len(quantities['stage'][0,:])
+    print(len(elevation), len(quantities['stage'][0,:]))
 
     
 
diff --git a/validation_tests/case_studies/patong/plot_results.py b/validation_tests/case_studies/patong/plot_results.py
index d08ab9e82..f2dcf6acf 100644
--- a/validation_tests/case_studies/patong/plot_results.py
+++ b/validation_tests/case_studies/patong/plot_results.py
@@ -20,7 +20,7 @@
     import pylab
     pylab.hold(False)  # Check if this command can be issued
 except:
-    print 'Could not import pylab'
+    print('Could not import pylab')
     plotting = False
 else:
     # Create plots as png files
@@ -85,7 +85,7 @@ def plot_timeseries(timevector,
     N = timevector.shape[0]
     assert timeseries.shape[0] == N    
 
-    print 'Plotting gauge "%s"' % name
+    print('Plotting gauge "%s"' % name)
 
     if True:
         # Generate plots
@@ -104,8 +104,8 @@ def plot_timeseries(timevector,
 
 timevector, timeseries = get_timeseries(join('outputs','patong.sww'), gauges )
 
-print timevector.shape
-print timeseries.keys()
+print(timevector.shape)
+print(timeseries.keys())
 
 for gauge, ts in timeseries.iteritems():
     plot_timeseries(timevector, ts, name=join('outputs',gauge+'.png'))
diff --git a/validation_tests/case_studies/towradgi/print_pstats.py b/validation_tests/case_studies/towradgi/print_pstats.py
new file mode 100755
index 000000000..53e919895
--- /dev/null
+++ b/validation_tests/case_studies/towradgi/print_pstats.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""View and print Python cProfile stats from a file."""
+
+import argparse
+import pstats
+import sys
+
+def main():
+    """Print the requested profile stats; return 0 on success, 1 if the file is missing or unreadable."""
+    parser = argparse.ArgumentParser(
+        description="View and print Python cProfile stats from a file."
+    )
+    parser.add_argument(
+        "profile_file",
+        help="The cProfile stats file to read (e.g., output.prof)"
+    )
+    parser.add_argument(
+        "-n", "--num-lines", type=int, default=20,
+        help="Number of lines to print (default: 20)"
+    )
+    parser.add_argument(
+        "-s", "--sort", default="cumulative",
+        choices=["cumulative", "time", "calls", "name", "filename", "module", "pcalls"],
+        help="Sort order for stats (default: cumulative)"
+    )
+    args = parser.parse_args()
+
+    try:
+        stats = pstats.Stats(args.profile_file)
+        stats.strip_dirs().sort_stats(args.sort).print_stats(args.num_lines)
+    except FileNotFoundError:
+        print(f"File '{args.profile_file}' not found.", file=sys.stderr)
+        return 1
+    except Exception as e:  # e.g. the file is not valid profile data
+        print(f"Error reading profile: {e}", file=sys.stderr)
+        return 1
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
+
+
diff --git a/validation_tests/case_studies/towradgi/project.py b/validation_tests/case_studies/towradgi/project.py
index 6a1da3457..0c8a2c4c3 100644
--- a/validation_tests/case_studies/towradgi/project.py
+++ b/validation_tests/case_studies/towradgi/project.py
@@ -4,8 +4,8 @@
 model_output_dir='MODEL_OUTPUTS'
 partition_dir = 'PARTITIONS'
 checkpoint_dir = 'CHECKPOINTS'
-checkpoint_time = 60 # 30*60 # 30 minutes
-checkpoint = True
+checkpoint_time = 30*60 # 30 minutes
+checkpoint = False
 
 finaltime = 400.0
 yieldstep = 10.0
diff --git a/validation_tests/case_studies/towradgi/run_small_towradgi.py b/validation_tests/case_studies/towradgi/run_small_towradgi.py
new file mode 100755
index 000000000..2024ff9ef
--- /dev/null
+++ b/validation_tests/case_studies/towradgi/run_small_towradgi.py
@@ -0,0 +1,962 @@
+#!/usr/bin/env python
+
+""" 
+Towradgi Creek 17 August 1998 Storm Event Calibration
+By Petar Milevski, some revisions by Gareth Davies
+Updated script By Petar Milevski 2022
+"""
+
+# ------------------------------------------------------------------------------
+# IMPORT NECESSARY MODULES
+# ------------------------------------------------------------------------------
+from project import *
+from anuga import distribute, myid, numprocs, finalize, barrier
+from anuga import Rate_operator
+from anuga import Boyd_box_operator
+from anuga import Boyd_pipe_operator
+from anuga import Domain
+from anuga import create_mesh_from_regions
+from anuga import read_polygon
+from anuga import Polygon_function
+from anuga import file_function
+
+import anuga.utilities.spatialInputUtil as su
+
+from os.path import join
+import glob
+import os
+import numpy
+import time
+import anuga
+
+# ------------------------------------------------------------------------------
+# TEST FUNCTION AND DICTIONARY
+# ------------------------------------------------------------------------------
+
+def read_polygon_list(poly_list):
+    """Read each [polygon_file, value] entry of poly_list, preserving order.
+
+    Alternative to read_polygon_dir that lets the caller control polygon order.
+    """
+    return [(read_polygon(entry[0]), entry[1]) for entry in poly_list]
+
+# ------------------------------------------------------------------------------
+# PARALLEL INTERFACE
+# ------------------------------------------------------------------------------
+                            
+if myid == 0:
+    print('ABOUT to Start Simulation:- Importing Modules')
+# Process 0 stops the run early when the DEM_bridges input data is missing.
+if myid == 0 and not os.path.isdir('DEM_bridges'):
+    msg = """
+################################################################################
+#
+# Could not find the data directories
+#
+# You can download these directories using the data_download.py script.
+# This will download over 86 MB of data!
+#
+################################################################################
+"""
+    raise Exception(msg)
+
+# Command-line options: `alg` selects the flow algorithm passed to
+# domain.set_flow_algorithm(); `verbose` is taken from the command line.
+args = anuga.get_args()
+alg = args.alg
+verbose = args.verbose
+
+# --------------------------------------------------------------------------
+# Setup parameters
+# (yieldstep, finaltime, checkpoint_time, checkpoint_dir override project.py)
+# --------------------------------------------------------------------------
+yieldstep = 10.    # yield evolve loop every 10 seconds
+outputstep = 60.   # update sww files every 60 seconds
+finaltime = 140.   # 83700.
+
+# For coarse mesh set to 10 (135237 triangles), fine mesh set to 1 (256688 triangles)
+scale = 1
+maximum_triangle_area = 1000  # This doesn't make much difference for this mesh
+
+# Choices are 1 (openmp) 2 (cupy)
+multiprocessor_mode = 1
+
+# Checkpoint interval in seconds: 600/scale, but never less than 60
+checkpoint_time = max(600/scale, 60)
+checkpoint_dir = 'CHECKPOINTS'
+
+useCulverts = False  # Use this to turn off culverts
+useCheckpointing = False
+
+#minimum_storable_height = 0.05
+base_friction = 0.04
+alpha = 0.99
+
+basename = join('DEM_bridges', 'towradgi')
+domain_name = 'Towradgi_historic_flood'
+meshname = join('DEM_bridges', 'towradgi.tsh')
+func = file_function(join('Forcing', 'Tide', 'Pioneer.tms'), quantities='rainfall')
+
+# ------------------------------------------------------------------------------
+# Use a try statement to read in previous checkpoint file and if not possible
+# just go ahead as normal and produce domain as usual.
+#
+# Though in the part of the code where you create the domain as normal,
+# remember to turn on checkpointing via
+# domain.set_checkpointing(checkpoint_time = checkpoint_time)
+# ------------------------------------------------------------------------------
+try:
+    from anuga import load_checkpoint_file
+
+    if myid == 0:
+        msg = f"""
+=================================================
+Trying to load checkpoint files.
+
+If you don't want to use checkpoint files from a previous 
+run, you need to delete the files from the 
+checkpoint directory {checkpoint_dir}
+=================================================
+"""
+        print(msg)
+
+    domain = load_checkpoint_file(domain_name=domain_name,
+                                  checkpoint_dir=checkpoint_dir)
+
+    if myid == 0:
+        print('Checkpoint File Loaded')
+        domain.write_time()
+
+except:
+
+    if myid == 0:
+        msg = """
+=================================================
+No checkpoint file found.
+
+Creating domain from scratch.
+=================================================
+"""
+        print(msg)
+                         
+    # ------------------------------------------------------------------------------
+    #  ADD CATCHMENT INFORMATION HERE
+    # ------------------------------------------------------------------------------
+    CatchmentList = [
+        [join('Model', 'Bdy', 'Catchment.csv'), scale*100.0],
+        [join('Model', 'Bdy', 'FineCatchment.csv'), scale*36.0],
+        [join('Model', 'Bdy', 'CreekBanks.csv'), 8.0]
+    ]
+    
+    # IMPORTANT -- The ORDER in ManningList matters: When there is overlap,
+    # priority regions at BOTTOM
+    # FIXME: This setup can be done with fewer lines of code!
+    
+    ManningList = [
+       [ join('Model', 'Mannings', '1.csv'),0.04], #park
+       [ join('Model', 'Mannings', '2.csv'),0.15],
+       [ join('Model', 'Mannings', '3.csv'),0.15],
+       [ join('Model', 'Mannings', '4.csv'),0.04],
+       [ join('Model', 'Mannings', '5.csv'),0.15],
+       [ join('Model', 'Mannings', '6.csv'),0.15],
+       [ join('Model', 'Mannings', '7.csv'),0.15],
+       [ join('Model', 'Mannings', '8.csv'),0.15],
+       [ join('Model', 'Mannings', '9.csv'),0.04], #park
+       [ join('Model', 'Mannings', '10.csv'), 0.15],
+       [ join('Model', 'Mannings', '11.csv'), 0.15],
+       [ join('Model', 'Mannings', '12.csv'), 0.15],
+       [ join('Model', 'Mannings', '13.csv'), 0.04],
+       [ join('Model', 'Mannings', '14.csv'), 0.15],
+       [ join('Model', 'Mannings', '15.csv'), 0.15],
+       [ join('Model', 'Mannings', '16.csv'), 0.15],
+       [ join('Model', 'Mannings', '17.csv'), 0.15],
+       [ join('Model', 'Mannings', '18.csv'), 0.045],
+       [ join('Model', 'Mannings', '18a.csv'), 0.15],
+       [ join('Model', 'Mannings', '18b.csv'), 0.15],
+       [ join('Model', 'Mannings', '18c.csv'), 0.15],
+       [ join('Model', 'Mannings', '18d.csv'), 0.15],
+       [ join('Model', 'Mannings', '18e.csv'), 0.08], #cokeworks site
+       [ join('Model', 'Mannings', '19.csv'), 0.15],
+       [ join('Model', 'Mannings', '20.csv'), 0.15],
+       [ join('Model', 'Mannings', '21.csv'), 0.15],
+       [ join('Model', 'Mannings', '22.csv'), 0.15],
+       [ join('Model', 'Mannings', '23.csv'), 0.15],
+       [ join('Model', 'Mannings', '24.csv'), 0.05],
+       [ join('Model', 'Mannings', '25.csv'), 0.15],
+       [ join('Model', 'Mannings', '26.csv'), 0.15],
+       [ join('Model', 'Mannings', '27.csv'), 0.15],
+       [ join('Model', 'Mannings', '28.csv'), 0.15],
+       [ join('Model', 'Mannings', '29.csv'), 0.15],
+       [ join('Model', 'Mannings', '30.csv'), 0.15],
+       [ join('Model', 'Mannings', '31.csv'), 0.15],
+       [ join('Model', 'Mannings', '32.csv'), 0.15],
+       [ join('Model', 'Mannings', '33.csv'), 0.15],
+       [ join('Model', 'Mannings', '34.csv'), 0.15],
+       [ join('Model', 'Mannings', '35.csv'), 0.15],
+       [ join('Model', 'Mannings', '36.csv'), 0.05],
+       [ join('Model', 'Mannings', '37.csv'), 0.15],
+       [ join('Model', 'Mannings', '38.csv'), 0.15],
+       [ join('Model', 'Mannings', '39.csv'), 0.15],
+       [ join('Model', 'Mannings', '40.csv'), 0.15],
+       [ join('Model', 'Mannings', '41.csv'), 0.15],
+       [ join('Model', 'Mannings', '42.csv'), 0.15],
+       [ join('Model', 'Mannings', '43.csv'), 0.15],
+       [ join('Model', 'Mannings', '44.csv'), 0.15],
+       [ join('Model', 'Mannings', '45.csv'), 0.15],
+       [ join('Model', 'Mannings', '46.csv'), 0.15],
+       [ join('Model', 'Mannings', '47.csv'), 0.15],
+       [ join('Model', 'Mannings', '48.csv'), 0.15],
+       [ join('Model', 'Mannings', '49.csv'), 0.15],
+       [ join('Model', 'Mannings', '50.csv'), 0.15],
+       [ join('Model', 'Mannings', '51.csv'), 0.15],
+       [ join('Model', 'Mannings', '52.csv'), 0.15],
+       [ join('Model', 'Mannings', '53.csv'), 0.15],
+       [ join('Model', 'Mannings', '54.csv'), 0.15],
+       [ join('Model', 'Mannings', '55.csv'), 0.15],
+       [ join('Model', 'Mannings', '56.csv'), 0.15],
+       [ join('Model', 'Mannings', '57.csv'), 0.15],
+       [ join('Model', 'Mannings', '58.csv'), 0.15],
+       [ join('Model', 'Mannings', '59.csv'), 0.08],
+       [ join('Model', 'Mannings', '60.csv'), 0.15],
+       [ join('Model', 'Mannings', '61.csv'), 0.08],
+       [ join('Model', 'Mannings', '62.csv'), 0.15],
+       [ join('Model', 'Mannings', '63.csv'), 0.08],
+       [ join('Model', 'Mannings', '64.csv'), 0.15],
+       [ join('Model', 'Mannings', '65.csv'), 0.15],
+       [ join('Model', 'Mannings', '66.csv'), 0.15],
+       [ join('Model', 'Mannings', '67.csv'), 0.15],
+       [ join('Model', 'Mannings', '68.csv'), 0.15],
+       [ join('Model', 'Mannings', '69.csv'), 0.15],
+       [ join('Model', 'Mannings', '70.csv'), 0.15],
+       [ join('Model', 'Mannings', '71.csv'), 0.05],
+       [ join('Model', 'Mannings', '72.csv'), 0.15],
+       [ join('Model', 'Mannings', '73.csv'), 0.15],
+       [ join('Model', 'Mannings', '74.csv'), 0.15],
+       [ join('Model', 'Mannings', '75.csv'), 0.15],
+       [ join('Model', 'Mannings', '76.csv'), 0.15],
+       [ join('Model', 'Mannings', '77.csv'), 0.07],
+       [ join('Model', 'Mannings', '78.csv'), 0.15],
+       [ join('Model', 'Mannings', '79.csv'), 0.15],
+       [ join('Model', 'Mannings', '80.csv'), 0.15],
+       [ join('Model', 'Mannings', '81.csv'), 0.15],
+       [ join('Model', 'Mannings', '82.csv'), 0.15],
+       [ join('Model', 'Mannings', '83.csv'), 0.15],
+       [ join('Model', 'Mannings', '84.csv'), 0.15],
+       [ join('Model', 'Mannings', '85.csv'), 0.15],
+       [ join('Model', 'Mannings', '86.csv'), 0.15],
+       [ join('Model', 'Mannings', 'Escarpement.csv'), 0.15],
+       [ join('Model', 'Mannings', 'Railway.csv'), 0.04],
+       [ join('Model', 'Creeks', 'creeks1.csv'), channel_manning],
+       [ join('Model', 'Creeks', 'creeks2.csv'), channel_manning],
+       [ join('Model', 'Creeks', 'creeks3.csv'), channel_manning],
+       [ join('Model', 'Creeks', 'creeks4.csv'), channel_manning],
+       [ join('Model', 'Creeks', 'creeks5.csv'), channel_manning],
+       [ join('Model', 'Creeks', 'creeks6.csv'), channel_manning],
+    # modelling the impact of important buildings using higher roughness
+       [ join('Model', 'Buildings', 'Building1.csv'),  10.0],
+       [ join('Model', 'Buildings', 'Building4.csv'),  10.0],
+       [ join('Model', 'Buildings', 'Building5.csv'),  10.0],
+       [ join('Model', 'Buildings', 'Building6.csv'),  10.0],
+       [ join('Model', 'Buildings', 'Building7.csv'),  10.0],
+       [ join('Model', 'Buildings', 'Building8.csv'),  10.0],
+       [ join('Model', 'Buildings', 'Building9.csv'),  10.0],
+       [ join('Model', 'Buildings', 'Building10.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building11.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building12.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building13.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building14.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building15.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building16.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building17.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building18.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building19.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building20.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building21.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building22.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building23.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building24.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building25.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building26.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building27.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building28.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building29.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building30.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building31.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building32.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building33.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building34.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building35.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building36.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building37.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building38.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building39.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building40.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building41.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building42.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building43.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building44.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building45.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building46.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building47.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building48.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building49.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building50.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building51.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building52.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building53.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building54.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building55.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building56.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building57.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building62.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building63.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building64.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building65.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building66.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building67.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building68.csv'), 10.0],
+       [ join('Model', 'Buildings', 'Building69.csv'), 10.0]
+       	]
+    
+    W = 303517
+    N = 6195670
+    E = 308570
+    S = 6193140
+    
+
+                                   
+    model_output_dir = 'MODEL_OUTPUTS'
+    # Create the output directory if needed. exist_ok tolerates the directory
+    # already existing (earlier run or another process), while real failures
+    # such as permission errors are no longer silently swallowed.
+    os.makedirs(model_output_dir, exist_ok=True)
+    
+    # --------------------------------------------------------------------------
+    # Setup Domain
+    # --------------------------------------------------------------------------
+    
+    # Make a list of the riverwall csv files in Model/Riverwalls
+    riverWall_csv_files = glob.glob('Model/Riverwalls/*.csv')
+    (riverWalls, riverWall_parameters) = su.readListOfRiverWalls(riverWall_csv_files)
+    	                                      
+    if myid == 0:
+        # ------------------------------------------------------------------------------
+        # CREATING MESH
+        # ------------------------------------------------------------------------------
+    
+        bounding_polygon = [[W, S], [E, S], [E, N], [W, N]]
+    
+        interior_regions = read_polygon_list(CatchmentList)
+    
+        # Make the mesh
+        create_mesh_from_regions(bounding_polygon,
+                                 boundary_tags={'south': [0], 'east': [
+                                     1], 'north': [2], 'west': [3]},
+                                 maximum_triangle_area=maximum_triangle_area,
+                                 interior_regions=interior_regions,
+                                 breaklines=riverWalls.values(),
+                                 filename=meshname,
+                                 use_cache=False,
+                                 verbose=False)
+    
+        # ------------------------------------------------------------------------------
+        # SETUP COMPUTATIONAL DOMAIN
+        # ------------------------------------------------------------------------------
+    
+        domain = Domain(meshname, use_cache=False, verbose=False)
+    
+        domain.set_flow_algorithm(alg)
+    
+        if(not domain.get_using_discontinuous_elevation()):
+            raise Exception(
+                'This model run relies on a discontinuous elevation solver (because of how topography is set up)')
+    
+        domain.set_datadir(model_output_dir)
+        # Make sure the data directory exists before output is written there.
+        # (os.mkdir() without a path argument always raises TypeError, which
+        # the bare except hid, so that form never created anything.)
+        os.makedirs(model_output_dir, exist_ok=True)
+        domain.set_name(domain_name)
+    
+        print(domain.statistics())
+    
+        # ------------------------------------------------------------------------------
+        # APPLY MANNING'S ROUGHNESSES
+        # ------------------------------------------------------------------------------
+    
+        print('FITTING polygon_function for friction')
+        friction_list = read_polygon_list(ManningList)
+    
+        domain.set_quantity('friction', Polygon_function(
+            friction_list, default=base_friction, geo_reference=domain.geo_reference))
+    
+        # Set a Initial Water Level over the Domain
+        domain.set_quantity('stage', 0)
+    
+        
+        print('TRYING TO READ %s' % (basename + '.npy'))
+        try:
+            elev_xyz = numpy.load(basename + '.npy')
+        except Exception:  # cache missing or unreadable; Ctrl-C still propagates
+            print('TRYING TO READ %s' % (basename + '.csv'))
+            elev_xyz = numpy.genfromtxt(fname=basename + '.csv', delimiter=',')
+            print('SAVING %s' % (basename + '.npy'))
+            numpy.save(basename + '.npy', elev_xyz)
+    
+        # Use nearest-neighbour interpolation of elevation
+        print('CREATING nearest neighbour interpolator')
+        from anuga.utilities.quantity_setting_functions import make_nearestNeighbour_quantity_function
+        elev_fun_wrapper = make_nearestNeighbour_quantity_function(
+            elev_xyz, domain)
+    
+        print('FITTING to domain')
+        domain.set_quantity('elevation', elev_fun_wrapper, location='centroids')
+    
+        
+        # -----------------------------------------------------------------------------
+        # Turn on checkpointing every `checkpoint_time` seconds (set in the
+        # parameter section above) when useCheckpointing is enabled.
+        # Only set this on process 0 ie within a if myid == 0: structure
+        # -----------------------------------------------------------------------------
+        if useCheckpointing:
+            domain.set_checkpointing(
+                checkpoint_time=checkpoint_time, checkpoint_dir=checkpoint_dir)
+
+    else:
+        domain = None
+    
+    barrier()
+    
+    if myid == 0 and verbose:
+        print('DISTRIBUTING DOMAIN')
+    
+    domain = distribute(domain, verbose=verbose)
+    
+    barrier()
+
+    # -----------------------------------------------------------------------------
+    # Turn on checkpointing every `checkpoint_time` seconds on the distributed
+    # domain when useCheckpointing is enabled. NOTE(review): this runs on every
+    # rank, unlike the process-0 call made before distribute() -- confirm intent.
+    # -----------------------------------------------------------------------------
+    if useCheckpointing:
+        domain.set_checkpointing(
+                    checkpoint_time=checkpoint_time, checkpoint_dir=checkpoint_dir)
+                    
+    
+    domain.quantities_to_be_stored = {'elevation': 2,
+                                      'friction': 1,
+                                      'stage': 2,
+                                      'xmomentum': 2,
+                                      'ymomentum': 2}
+    
+    if myid == 0:
+        print('CREATING RIVERWALLS')
+    
+    domain.create_riverwalls(riverWalls)
+    
+    
+    barrier()
+
+    if myid == 0:
+        print('CREATING INLETS')
+       
+    #------------------------------------------------------------------------------
+    # ENTER CULVERT DATA
+    #------------------------------------------------------------------------------
+    
+    if useCulverts: # Use this to turn off culverts
+
+        smoothTS=30. # Smoothing timescale for bridges
+
+        if myid == 0: print ('Creating Boyd_pipe_operator at Branch_2_Brooker_St_Culvert') 
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305772.982,6193988.557] , [305772.378,6193987.823]])
+        el1 = numpy.array([[305794.592,6193983.907] , [305793.988,6193983.173]])
+        ## Adjust el0, el1
+        #elOffset=0.
+        #el0M=0.5*(el0[0,:]+el0[1,:]) ; el1M=0.5*(el1[0,:]+el1[1,:]); n0=el0M-el1M; n0=n0/((n0*n0).sum())**0.5;
+        #el0 = el0 
+        culvert = Boyd_pipe_operator(domain,
+                                    losses=losses,
+                                    diameter=0.9, #actual culvert is a 1.8m diameter, was 75% blocked in 1998
+                                    exchange_lines=[el0, el1],
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_2_Brooker_St_Culvert',
+                                    verbose=False)    
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_2_Meadow_St_Culvert')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305886.333,6193929.052] , [305883.172,6193922.986]])
+        el1 = numpy.array([[305906.553,6193910.461] , [305903.393,6193904.395]])  
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=5.4, #3x1.8mx0.6m RCBC (50% blocked in 1998) actual culvert dimensions are 3x1.8x1.8 RCBC
+                                    exchange_lines=[el0, el1],
+                                    height=0.6,
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_2_Meadow_St_Culvert',
+                                    verbose=False)    
+        
+        if myid == 0: print ('Creating Boyd_pipe_operator at Branch_2_Williams_St_Culvert') 
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305945.955,6193836.293] , [305945.125,6193835.387]])
+        el1 = numpy.array([[306040.565,6193827.573] , [306039.735,6193826.667]])
+        culvert = Boyd_pipe_operator(domain,
+                                    losses=losses,
+                                    diameter=1.2, #actual culvert is 2X1.2m diameter, was 50% blocked in 1998
+                                    exchange_lines=[el0, el1],
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_2_Williams_St_Culvert',
+                                    verbose=False)     
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Meadow_St_Culvert')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305812.113,6193591.972] , [305809.390,6193588.820]])
+        el1 = numpy.array([[305834.913,6193588.382] , [305832.190,6193585.230]])  
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=4.0,#2x3.0mx2.2m RCBC (33% blocked in 1998) actual culvert dimensions are 3x3.0mx2.2m RCBC
+                                    exchange_lines=[el0, el1],
+                                    height=2.2,
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_Towradgi_Meadow_St_Culvert',
+                                    verbose=False)  
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_5_Collins_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[306330.608,6194817.116] , [306320.768,6194805.884]])
+        el1 = numpy.array([[306369.483,6194811.616] , [306359.643,6194800.384]])   
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=14.4,#4x3.6mx0.93m RCBC (0% blocked in 1998)
+                                    exchange_lines=[el0, el1],
+                                    height=0.93,
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_5_Collins_St_Culverts',
+                                    verbose=False)                                     
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_5_Northern_Distributor_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[306956.242,6194465.589] , [306950.446,6194457.411]])
+        el1 = numpy.array([[307003.711,6194446.089] , [306997.916,6194437.911]])   
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=9.09,#2x3.03mx0.85m RCBC (50% blocked in 1998) actual culvert dimensions are 3x3.03mx1.7m RCBC
+                                    exchange_lines=[el0, el1],
+                                    height=0.85,
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_5_Northern_Distributor_Culverts',
+                                    verbose=False)                                      
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_5_Coke_Works_Culverts')                                   
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[307142.161,6194181.3065] , [307138.519,6194174.394]])
+        el1 = numpy.array([[307160.521,6194164.8165] , [307156.879,6194157.904]])   
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=4.56,#use a 4.56mx2.9m RCBC (0% blocked in 1998) actual culvert dimensions are 2x2.9m diameter pipes
+                                    exchange_lines=[el0, el1],
+                                    height=2.9,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_5_Coke_Works_Culverts',
+                                    verbose=False)                                      
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_6_Northern_Distributor_Culverts')            
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[306950.758,6193454.717] , [306947.804,6193453.283]])
+        el1 = numpy.array([[306988.633,6193474.217] , [306985.679,6193472.783]])  
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=3.6,#3x1.2mx1.2m RCBC (0% blocked in 1998) actual culvert dimensions are 3x1.2mx1.2m RCBC
+                                    exchange_lines=[el0, el1],
+                                    height=1.2,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_6_Northern_Distributor_Culverts',
+                                    verbose=False)                                      
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_6_Railway_Culverts')                                
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[307139.134,6193474.458] , [307138.492,6193473.542]])
+        el1 = numpy.array([[307150.884,6193469.458] , [307150.242,6193468.542]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=1.0,#1x1.0mx3.5m RCBC (67% blocked in 1998) actual culvert dimensions are 1x3.0mx3.5m RCBC
+                                    exchange_lines=[el0, el1],
+                                    height=3.5,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_6_Railway_Culverts',
+                                    verbose=False) 
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_6_Colgong_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[307200.610,6193476.765] , [307199.140,6193475.235]])
+        el1 = numpy.array([[307224.610,6193475.765] , [307223.140,6193474.235]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=1.65,#use a 1.65mx1.05m RCBC (0% blocked in 1998) actual culvert dimensions are 2x1.05m diameter pipes
+                                    exchange_lines=[el0, el1],
+                                    height=1.05,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_6_Colgong_St_Culverts',
+                                    verbose=False)   
+                                            
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_3_Basin_Outlet_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305629.639,6194408.883] , [305626.521,6194400.457]])
+        el1 = numpy.array([[305665.889,6194347.183] , [305662.771,6194338.757]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=6.0,#2x3.0mx0.86m RCBC (0% blocked in 1998) actual culvert dimensions are 2x3.0mx0.86m RCBC
+                                    exchange_lines=[el0, el1],
+                                    height=0.86,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_3_Basin_Outlet_Culverts',
+                                    verbose=False)                                      
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_3_Bellambi_Rd_Culverts')                                    
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305777.182,6194305.377] , [305776.444,6194304.623]])
+        el1 = numpy.array([[305873.807,6194303.377] , [305873.069,6194302.623]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=1.65,#use a 1.65mx1.05m RCBC (0% blocked in 1998) actual culvert dimensions are 2x1.05m diameter pipes
+                                    exchange_lines=[el0, el1],
+                                    height=1.05,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_3_Bellambi_Rd_Culverts',
+                                    verbose=False)    
+        
+        if myid == 0: print ('Creating Boyd_pipe_operator at Branch_3_Meadow_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305914.649,6194322.375] , [305913.477,6194321.625]])
+        el1 = numpy.array([[305950.711,6194335.375] , [305949.539,6194334.625]])
+            
+        culvert = Boyd_pipe_operator(domain,
+                                    losses=losses,
+                                    diameter=1.5, # actual culvert is a single 1.5m diameter pipe 0% blocked
+                                    exchange_lines=[el0, el1],
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_3_Meadow_St_Culverts',
+                                    verbose=False)     
+        
+        if myid == 0: print ('Creating Boyd_pipe_operator at Branch_3_13_Meadow_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305911.280,6194359.203] , [305910.260,6194358.017]])
+        el1 = numpy.array([[305946.090,6194353.573] , [305945.070,6194352.387]])
+            
+        culvert = Boyd_pipe_operator(domain,
+                                    losses=losses,
+                                    diameter=1.5,# actual culvert is a single 1.5m diameter pipe 0% blocked
+                                    exchange_lines=[el0, el1],
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_3_13_Meadow_St_Culverts', 
+                                    verbose=False)     
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_3_41_Angel_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[306196.779,6194028.193] , [306192.221,6194010.807]])
+        el1 = numpy.array([[306200.154,6194018.693] , [306195.596,6194001.307]])
+            
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=10.0, # actual culvert is 10m X 0.35m H, 0 % blockage
+                                    exchange_lines=[el0, el1],
+                                    height=0.35,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_3_41_Angel_St_Culverts',
+                                    verbose=False)        
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_7_Carroll_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[308002.045,6193820.163] , [308001.215,6193819.197]])
+        el1 = numpy.array([[308021.965,6193816.883] , [308021.135,6193815.917]])
+            
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=1.22, #actual culvert is 2x1.22mx0.3m 50% blocked
+                                    exchange_lines=[el0, el1],
+                                    height=0.3,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_7_Carroll_St_Culverts',
+                                    verbose=False)           
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_7_Parker_Rd_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[308105.832,6193803.622] , [308103.648,6193801.118]])
+        el1 = numpy.array([[308126.782,6193800.552] , [308124.598,6193798.048]])
+            
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=3.18, #actual culvert is 3x1.06mW X 0.3mD 0% blocked
+                                    exchange_lines=[el0, el1],
+                                    height=0.3,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_7_Parker_Rd_Culverts',
+                                    verbose=False)     
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_7_Lake_Pde_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[308251.257,6193614.658] , [308248.343,6193618.]])
+        el1 = numpy.array([[308232.,6193593.] , [308225.,6193596.]])   
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=2.36, #actual culvert is 4 750mm diameter pipes 0% blockage, here models as box culvert
+                                    exchange_lines=[el0, el1],
+                                    height=0.75,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_7_Lake_Pde_Culverts',
+                                    verbose=False)                                                                      
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Princes_Hwy_Bridge')							
+        losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
+        el0 = numpy.array([[306607.274,6193707.421] , [306602.635,6193695.720]]) 
+        el1 = numpy.array([[306626.205,6193694.358] , [306622.068,6193683.138]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=12.0,
+                                    exchange_lines=[el0, el1],
+                                    height=3.0,
+                                    apron=0.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=channel_manning,
+                                    logging=False,
+                                    label='Branch_Towradgi_Princes_Hwy_Bridge',
+                                    verbose=False)  
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Pioneer_Rd_Bridge')
+        losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
+        el0 = numpy.array([[307623.,6193610.] , [307622.,6193607.]])
+        el1 = numpy.array([[307610.,6193619.] , [307609., 6193616.]])  
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=20.0,
+                                    exchange_lines=[el0, el1],
+                                    height=3.5,
+                                    apron=0.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=channel_manning,
+                                    logging=False,
+                                    label='Branch_Towradgi_Pioneer_Rd_Bridge',
+                                    verbose=False)                           
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Northern_Distributor_Bridge')
+        losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
+        el0 = numpy.array([[306985.,6193749.] , [306985.,6193736.]])
+        el1 = numpy.array([[306950.,6193745.] , [306950.,6193732.]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=45.0,
+                                    exchange_lines=[el0, el1],
+                                    height=6.0,
+                                    apron=0.0,
+                                    enquiry_gap=10.,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=channel_manning,
+                                    logging=False,
+                                    label='Branch_Towradgi_Northern_Distributor_Bridge',
+                                    verbose=False)    
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Railway_Bridge') 
+        losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
+        el0 = numpy.array([[307236.,6193737.] , [307235.,6193733.]])
+        el1 = numpy.array([[307223.,6193738.] , [307222.,6193734.]]) 
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=20.0,
+                                    exchange_lines=[el0, el1],
+                                    height=8.0,
+                                    apron=0.0,
+                                    enquiry_gap=20.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=channel_manning,
+                                    logging=False,
+                                    label='Branch_Towradgi_Railway_Bridge',
+                                    verbose=False) 
+    
+    # ----------------------------------------------------------------------------------------------------------------------------------------------------
+    # APPLY RAINFALL
+    # ----------------------------------------------------------------------------------------------------------------------------------------------------
+    if myid == 0 and verbose:
+        print('CREATING RAINFALL POLYGONS')
+    
+    Rainfall_Gauge_directory = join('Forcing', 'Rainfall', 'Gauge')
+    for filename in os.listdir(Rainfall_Gauge_directory):
+        Gaugefile = join(Rainfall_Gauge_directory, filename)
+        Rainfile = join('Forcing', 'Rainfall', 'Hort', filename[0:-4]+'.tms')
+        #print(Gaugefile)
+        #print(Rainfile)
+        polygon = anuga.read_polygon(Gaugefile)
+        rainfall = anuga.file_function(Rainfile, quantities='rate')
+        op1 = Rate_operator(domain, rate=rainfall, factor=1.0e-3,
+                            polygon=polygon, default_rate=0.0)
+    
+    barrier()
+    
+    # ------------------------------------------------------------------------------
+    # BOUNDARY CONDITIONS
+    # ------------------------------------------------------------------------------
+    
+    print(f'Available boundary tags on process {myid} ', domain.get_boundary_tags())
+    
+    Bd = anuga.Dirichlet_boundary([0, 0, 0])
+    Bw = anuga.Time_boundary(domain=domain, function=lambda t: [
+                             func(t)[0], 0.0, 0.0])
+    
+    domain.set_boundary({'west': Bd, 'south': Bd, 'north': Bd, 'east': Bw})
+    
+    if myid == 0:
+        print('Start Evolve')
+
+
+
+domain.set_multiprocessor_mode(multiprocessor_mode)
+
+# ------------------------------------------------------------------------------
+# EVOLVE SYSTEM THROUGH TIME
+# ------------------------------------------------------------------------------
+barrier()
+t0 = time.time()  # wall-clock reference for the timing report printed after the evolve loop
+
+
+for t in domain.evolve(yieldstep=yieldstep, outputstep=outputstep, finaltime=finaltime):
+    if myid == 0:  # only rank 0 reports progress at each yield
+        domain.write_time()
+
+barrier()
+
+for p in range(numprocs):  # one turn per rank: on turn p only the rank with myid == p prints
+    if myid == p:
+        print('Processor %g ' % myid)
+        print('That took %.2f seconds' % (time.time()-t0))  # elapsed since t0, taken just before evolve
+        print('Communication time %.2f seconds' % domain.communication_time)
+        print('Reduction Communication time %.2f seconds'
+              % domain.communication_reduce_time)
+        print('Broadcast time %.2f seconds' %
+              domain.communication_broadcast_time)
+    else:
+        pass  # not this rank's turn; nothing to print
+
+    barrier()  # reached by every rank on every turn; presumably keeps the per-rank output from interleaving (confirm)
+
+# --------------------------------------------------
+# Merge the individual sww files into one file
+# and delete the sub domain sww files afterwards (delete_old=True)
+# --------------------------------------------------
+domain.sww_merge(delete_old=True)
+
+finalize()
diff --git a/validation_tests/case_studies/towradgi/run_towradgi.py b/validation_tests/case_studies/towradgi/run_towradgi.py
index d0c8b9dd6..10f4b2e2c 100644
--- a/validation_tests/case_studies/towradgi/run_towradgi.py
+++ b/validation_tests/case_studies/towradgi/run_towradgi.py
@@ -67,13 +67,31 @@ def read_polygon_list(poly_list):
 # --------------------------------------------------------------------------
 
 verbose = False
-yieldstep=300.
-finaltime=83700.
-scale = 1 # For coarse mesh set to 10, fine mesh set to 1
+# yieldstep=60      # yield evolve loop every 60 seconds
+# outputstep=15*60. # update sww files every 15 minutes
+# finaltime=24*3600. # run for 24 hours
+
+yieldstep=10. 
+outputstep=60. 
+finaltime=140. 
+
+scale = 1 # For coarse mesh set to 10 (135237 triangles), fine mesh set to 1 (256688 triangles)
+maximum_triangle_area = 1000 # This doesn't make much difference for this mesh
+
+# Choose openmp cpu (1) or openmp offloaded gpu (2)
+multiprocessor_mode = 1
 
 checkpoint_time = max(600/scale, 60)
 checkpoint_dir = 'CHECKPOINTS'
-useCheckpointing = True
+
+useCulverts = True # Use this to turn off culverts
+useCheckpointing = False
+
+
+
+#minimum_storable_height = 0.05
+base_friction = 0.04
+alpha = 0.99
 
 
 basename = join('DEM_bridges', 'towradgi')
@@ -307,10 +325,7 @@ def read_polygon_list(poly_list):
     E = 308570
     S = 6193140
     
-    maximum_triangle_area = 1000
-    #minimum_storable_height = 0.05
-    base_friction = 0.04
-    alpha = 0.99
+
                                    
     model_output_dir = 'MODEL_OUTPUTS'
     try:
@@ -440,7 +455,7 @@ def read_polygon_list(poly_list):
     if myid == 0:
         print('CREATING RIVERWALLS')
     
-    domain.riverwallData.create_riverwalls(riverWalls)
+    domain.create_riverwalls(riverWalls)
     
     
     barrier()
@@ -451,426 +466,429 @@ def read_polygon_list(poly_list):
     #------------------------------------------------------------------------------
     # ENTER CULVERT DATA
     #------------------------------------------------------------------------------
-    smoothTS=30. # Smoothing timescale for bridges
-    
-    if myid == 0: print ('Creating Boyd_pipe_operator at Branch_2_Brooker_St_Culvert') 
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[305772.982,6193988.557] , [305772.378,6193987.823]])
-    el1 = numpy.array([[305794.592,6193983.907] , [305793.988,6193983.173]])
-    ## Adjust el0, el1
-    #elOffset=0.
-    #el0M=0.5*(el0[0,:]+el0[1,:]) ; el1M=0.5*(el1[0,:]+el1[1,:]); n0=el0M-el1M; n0=n0/((n0*n0).sum())**0.5;
-    #el0 = el0 
-    culvert = Boyd_pipe_operator(domain,
-                                losses=losses,
-                                diameter=0.9, #actual culvert is a 1.8m diameter, was 75% blocked in 1998
-                                exchange_lines=[el0, el1],
-                                apron=3.0,
-                                enquiry_gap=10.0,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_2_Brooker_St_Culvert',
-                                verbose=False)    
-    
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_2_Meadow_St_Culvert')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[305886.333,6193929.052] , [305883.172,6193922.986]])
-    el1 = numpy.array([[305906.553,6193910.461] , [305903.393,6193904.395]])  
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=5.4, #3x1.8mx0.6m RCBC (50% blocked in 1998) actual culvert dimensions are 3x1.8x1.8 RCBC
-                                exchange_lines=[el0, el1],
-                                height=0.6,
-                                apron=3.0,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_2_Meadow_St_Culvert',
-                                verbose=False)    
-    
-    if myid == 0: print ('Creating Boyd_pipe_operator at Branch_2_Williams_St_Culvert') 
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[305945.955,6193836.293] , [305945.125,6193835.387]])
-    el1 = numpy.array([[306040.565,6193827.573] , [306039.735,6193826.667]])
-    culvert = Boyd_pipe_operator(domain,
-                                losses=losses,
-                                diameter=1.2, #actual culvert is 2X1.2m diameter, was 50% blocked in 1998
-                                exchange_lines=[el0, el1],
-                                apron=3.0,
-                                enquiry_gap=10.0,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_2_Williams_St_Culvert',
-                                verbose=False)     
-    
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Meadow_St_Culvert')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[305812.113,6193591.972] , [305809.390,6193588.820]])
-    el1 = numpy.array([[305834.913,6193588.382] , [305832.190,6193585.230]])  
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=4.0,#2x3.0mx2.2m RCBC (33% blocked in 1998) actual culvert dimensions are 3x3.0mx2.2m RCBC
-                                exchange_lines=[el0, el1],
-                                height=2.2,
-                                apron=3.0,
-                                enquiry_gap=10.0,
-                                smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_Towradgi_Meadow_St_Culvert',
-                                verbose=False)  
-    
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_5_Collins_St_Culverts')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[306330.608,6194817.116] , [306320.768,6194805.884]])
-    el1 = numpy.array([[306369.483,6194811.616] , [306359.643,6194800.384]])   
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=14.4,#4x3.6mx0.93m RCBC (0% blocked in 1998)
-                                exchange_lines=[el0, el1],
-                                height=0.93,
-                                apron=3.0,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_5_Collins_St_Culverts',
-                                verbose=False)                                     
-
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_5_Northern_Distributor_Culverts')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[306956.242,6194465.589] , [306950.446,6194457.411]])
-    el1 = numpy.array([[307003.711,6194446.089] , [306997.916,6194437.911]])   
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=9.09,#2x3.03mx0.85m RCBC (50% blocked in 1998) actual culvert dimensions are 3x3.03mx1.7m RCBC
-                                exchange_lines=[el0, el1],
-                                height=0.85,
-                                apron=3.0,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_5_Northern_Distributor_Culverts',
-                                verbose=False)                                      
-
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_5_Coke_Works_Culverts')                                   
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[307142.161,6194181.3065] , [307138.519,6194174.394]])
-    el1 = numpy.array([[307160.521,6194164.8165] , [307156.879,6194157.904]])   
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=4.56,#use a 4.56mx2.9m RCBC (0% blocked in 1998) actual culvert dimensions are 2x2.9m diameter pipes
-                                exchange_lines=[el0, el1],
-                                height=2.9,
-                                apron=3.1,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_5_Coke_Works_Culverts',
-                                verbose=False)                                      
-
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_6_Northern_Distributor_Culverts')            
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[306950.758,6193454.717] , [306947.804,6193453.283]])
-    el1 = numpy.array([[306988.633,6193474.217] , [306985.679,6193472.783]])  
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=3.6,#3x1.2mx1.2m RCBC (0% blocked in 1998) actual culvert dimensions are 3x1.2mx1.2m RCBC
-                                exchange_lines=[el0, el1],
-                                height=1.2,
-                                apron=3.1,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_6_Northern_Distributor_Culverts',
-                                verbose=False)                                      
-
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_6_Railway_Culverts')                                
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[307139.134,6193474.458] , [307138.492,6193473.542]])
-    el1 = numpy.array([[307150.884,6193469.458] , [307150.242,6193468.542]])
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=1.0,#1x1.0mx3.5m RCBC (67% blocked in 1998) actual culvert dimensions are 1x3.0mx3.5m RCBC
-                                exchange_lines=[el0, el1],
-                                height=3.5,
-                                apron=3.1,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_6_Railway_Culverts',
-                                verbose=False) 
-    
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_6_Colgong_St_Culverts')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[307200.610,6193476.765] , [307199.140,6193475.235]])
-    el1 = numpy.array([[307224.610,6193475.765] , [307223.140,6193474.235]])
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=1.65,#use a 1.65mx1.05m RCBC (0% blocked in 1998) actual culvert dimensions are 2x1.05m diameter pipes
-                                exchange_lines=[el0, el1],
-                                height=1.05,
-                                apron=3.1,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_6_Colgong_St_Culverts',
-                                verbose=False)   
-                                        
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_3_Basin_Outlet_Culverts')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[305629.639,6194408.883] , [305626.521,6194400.457]])
-    el1 = numpy.array([[305665.889,6194347.183] , [305662.771,6194338.757]])
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=6.0,#2x3.0mx0.86m RCBC (0% blocked in 1998) actual culvert dimensions are 2x3.0mx0.86m RCBC
-                                exchange_lines=[el0, el1],
-                                height=0.86,
-                                apron=3.1,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_3_Basin_Outlet_Culverts',
-                                verbose=False)                                      
-
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_3_Bellambi_Rd_Culverts')                                    
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[305777.182,6194305.377] , [305776.444,6194304.623]])
-    el1 = numpy.array([[305873.807,6194303.377] , [305873.069,6194302.623]])
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=1.65,#use a 1.65mx1.05m RCBC (0% blocked in 1998) actual culvert dimensions are 2x1.05m diameter pipes
-                                exchange_lines=[el0, el1],
-                                height=1.05,
-                                apron=3.1,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_3_Bellambi_Rd_Culverts',
-                                verbose=False)    
-    
-    if myid == 0: print ('Creating Boyd_pipe_operator at Branch_3_Meadow_St_Culverts')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[305914.649,6194322.375] , [305913.477,6194321.625]])
-    el1 = numpy.array([[305950.711,6194335.375] , [305949.539,6194334.625]])
+    
+    if useCulverts: # Use this to turn off culverts
+
+        smoothTS=30. # Smoothing timescale for box culverts and bridges (passed as smoothing_timescale below)
+
+        if myid == 0: print ('Creating Boyd_pipe_operator at Branch_2_Brooker_St_Culvert') 
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305772.982,6193988.557] , [305772.378,6193987.823]])
+        el1 = numpy.array([[305794.592,6193983.907] , [305793.988,6193983.173]])
+        ## Adjust el0, el1
+        #elOffset=0.
+        #el0M=0.5*(el0[0,:]+el0[1,:]) ; el1M=0.5*(el1[0,:]+el1[1,:]); n0=el0M-el1M; n0=n0/((n0*n0).sum())**0.5;
+        #el0 = el0 
+        culvert = Boyd_pipe_operator(domain,
+                                    losses=losses,
+                                    diameter=0.9, #actual culvert is a 1.8m diameter, was 75% blocked in 1998
+                                    exchange_lines=[el0, el1],
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_2_Brooker_St_Culvert',
+                                    verbose=False)    
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_2_Meadow_St_Culvert')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305886.333,6193929.052] , [305883.172,6193922.986]])
+        el1 = numpy.array([[305906.553,6193910.461] , [305903.393,6193904.395]])  
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=5.4, #3x1.8mx0.6m RCBC (50% blocked in 1998) actual culvert dimensions are 3x1.8x1.8 RCBC
+                                    exchange_lines=[el0, el1],
+                                    height=0.6,
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_2_Meadow_St_Culvert',
+                                    verbose=False)    
         
-    culvert = Boyd_pipe_operator(domain,
-                                losses=losses,
-                                diameter=1.5, # actual culvert is a single 1.5m diameter pipe 0% blocked
-                                exchange_lines=[el0, el1],
-                                apron=3.1,
-                                enquiry_gap=10.0,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_3_Meadow_St_Culverts',
-                                verbose=False)     
-    
-    if myid == 0: print ('Creating Boyd_pipe_operator at Branch_3_13_Meadow_St_Culverts')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[305911.280,6194359.203] , [305910.260,6194358.017]])
-    el1 = numpy.array([[305946.090,6194353.573] , [305945.070,6194352.387]])
+        if myid == 0: print ('Creating Boyd_pipe_operator at Branch_2_Williams_St_Culvert') 
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305945.955,6193836.293] , [305945.125,6193835.387]])
+        el1 = numpy.array([[306040.565,6193827.573] , [306039.735,6193826.667]])
+        culvert = Boyd_pipe_operator(domain,
+                                    losses=losses,
+                                    diameter=1.2, #actual culvert is 2X1.2m diameter, was 50% blocked in 1998
+                                    exchange_lines=[el0, el1],
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_2_Williams_St_Culvert',
+                                    verbose=False)     
         
-    culvert = Boyd_pipe_operator(domain,
-                                losses=losses,
-                                diameter=1.5,# actual culvert is a single 1.5m diameter pipe 0% blocked
-                                exchange_lines=[el0, el1],
-                                apron=3.1,
-                                enquiry_gap=10.0,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_3_13_Meadow_St_Culverts', 
-                                verbose=False)     
-    
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_3_41_Angel_St_Culverts')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[306196.779,6194028.193] , [306192.221,6194010.807]])
-    el1 = numpy.array([[306200.154,6194018.693] , [306195.596,6194001.307]])
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Meadow_St_Culvert')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305812.113,6193591.972] , [305809.390,6193588.820]])
+        el1 = numpy.array([[305834.913,6193588.382] , [305832.190,6193585.230]])  
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=4.0,#2x3.0mx2.2m RCBC (33% blocked in 1998) actual culvert dimensions are 3x3.0mx2.2m RCBC -- NOTE(review): width=4.0 differs from 2x3.0m=6.0m, confirm which is intended
+                                    exchange_lines=[el0, el1],
+                                    height=2.2,
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_Towradgi_Meadow_St_Culvert',
+                                    verbose=False)  
         
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=10.0, # actual culvert is 10m X 0.35m H, 0 % blockage
-                                exchange_lines=[el0, el1],
-                                height=0.35,
-                                apron=3.1,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_3_41_Angel_St_Culverts',
-                                verbose=False)        
-    
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_7_Carroll_St_Culverts')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[308002.045,6193820.163] , [308001.215,6193819.197]])
-    el1 = numpy.array([[308021.965,6193816.883] , [308021.135,6193815.917]])
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_5_Collins_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[306330.608,6194817.116] , [306320.768,6194805.884]])
+        el1 = numpy.array([[306369.483,6194811.616] , [306359.643,6194800.384]])   
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=14.4,#4x3.6mx0.93m RCBC (0% blocked in 1998)
+                                    exchange_lines=[el0, el1],
+                                    height=0.93,
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_5_Collins_St_Culverts',
+                                    verbose=False)                                     
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_5_Northern_Distributor_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[306956.242,6194465.589] , [306950.446,6194457.411]])
+        el1 = numpy.array([[307003.711,6194446.089] , [306997.916,6194437.911]])   
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=9.09,#3x3.03mx0.85m RCBC (50% blocked in 1998) actual culvert dimensions are 3x3.03mx1.7m RCBC
+                                    exchange_lines=[el0, el1],
+                                    height=0.85,
+                                    apron=3.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_5_Northern_Distributor_Culverts',
+                                    verbose=False)                                      
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_5_Coke_Works_Culverts')                                   
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[307142.161,6194181.3065] , [307138.519,6194174.394]])
+        el1 = numpy.array([[307160.521,6194164.8165] , [307156.879,6194157.904]])   
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=4.56,#use a 4.56mx2.9m RCBC (0% blocked in 1998) actual culvert dimensions are 2x2.9m diameter pipes
+                                    exchange_lines=[el0, el1],
+                                    height=2.9,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_5_Coke_Works_Culverts',
+                                    verbose=False)                                      
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_6_Northern_Distributor_Culverts')            
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[306950.758,6193454.717] , [306947.804,6193453.283]])
+        el1 = numpy.array([[306988.633,6193474.217] , [306985.679,6193472.783]])  
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=3.6,#3x1.2mx1.2m RCBC (0% blocked in 1998) actual culvert dimensions are 3x1.2mx1.2m RCBC
+                                    exchange_lines=[el0, el1],
+                                    height=1.2,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_6_Northern_Distributor_Culverts',
+                                    verbose=False)                                      
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_6_Railway_Culverts')                                
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[307139.134,6193474.458] , [307138.492,6193473.542]])
+        el1 = numpy.array([[307150.884,6193469.458] , [307150.242,6193468.542]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=1.0,#1x1.0mx3.5m RCBC (67% blocked in 1998) actual culvert dimensions are 1x3.0mx3.5m RCBC
+                                    exchange_lines=[el0, el1],
+                                    height=3.5,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_6_Railway_Culverts',
+                                    verbose=False) 
         
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=1.22, #actual culvert is 2x1.22mx0.3m 50% blocked
-                                exchange_lines=[el0, el1],
-                                height=0.3,
-                                apron=3.1,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_7_Carroll_St_Culverts',
-                                verbose=False)           
-    
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_7_Parker_Rd_Culverts')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[308105.832,6193803.622] , [308103.648,6193801.118]])
-    el1 = numpy.array([[308126.782,6193800.552] , [308124.598,6193798.048]])
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_6_Colgong_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[307200.610,6193476.765] , [307199.140,6193475.235]])
+        el1 = numpy.array([[307224.610,6193475.765] , [307223.140,6193474.235]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=1.65,#use a 1.65mx1.05m RCBC (0% blocked in 1998) actual culvert dimensions are 2x1.05m diameter pipes
+                                    exchange_lines=[el0, el1],
+                                    height=1.05,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_6_Colgong_St_Culverts',
+                                    verbose=False)   
+                                            
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_3_Basin_Outlet_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305629.639,6194408.883] , [305626.521,6194400.457]])
+        el1 = numpy.array([[305665.889,6194347.183] , [305662.771,6194338.757]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=6.0,#2x3.0mx0.86m RCBC (0% blocked in 1998) actual culvert dimensions are 2x3.0mx0.86m RCBC
+                                    exchange_lines=[el0, el1],
+                                    height=0.86,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_3_Basin_Outlet_Culverts',
+                                    verbose=False)                                      
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_3_Bellambi_Rd_Culverts')                                    
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305777.182,6194305.377] , [305776.444,6194304.623]])
+        el1 = numpy.array([[305873.807,6194303.377] , [305873.069,6194302.623]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=1.65,#use a 1.65mx1.05m RCBC (0% blocked in 1998) actual culvert dimensions are 2x1.05m diameter pipes
+                                    exchange_lines=[el0, el1],
+                                    height=1.05,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_3_Bellambi_Rd_Culverts',
+                                    verbose=False)    
         
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=3.18, #actual culvert is 3x1.06mW X 0.3mD 0% blocked
-                                exchange_lines=[el0, el1],
-                                height=0.3,
-                                apron=3.1,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_7_Parker_Rd_Culverts',
-                                verbose=False)     
-    
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_7_Lake_Pde_Culverts')
-    losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
-    el0 = numpy.array([[308251.257,6193614.658] , [308248.343,6193618.]])
-    el1 = numpy.array([[308232.,6193593.] , [308225.,6193596.]])   
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=2.36, #actual culvert is 4 750mm diameter pipes 0% blockage, here models as box culvert
-                                exchange_lines=[el0, el1],
-                                height=0.75,
-                                apron=3.1,
-                                enquiry_gap=10.0,
-    							smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=0.013,
-                                logging=False,
-                                label='Branch_7_Lake_Pde_Culverts',
-                                verbose=False)                                                                      
-
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Princes_Hwy_Bridge')							
-    losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
-    el0 = numpy.array([[306607.274,6193707.421] , [306602.635,6193695.720]]) 
-    el1 = numpy.array([[306626.205,6193694.358] , [306622.068,6193683.138]])
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=12.0,
-                                exchange_lines=[el0, el1],
-                                height=3.0,
-                                apron=0.0,
-                                enquiry_gap=10.0,
-                                smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=channel_manning,
-                                logging=False,
-                                label='Branch_Towradgi_Princes_Hwy_Bridge',
-                                verbose=False)  
-    
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Pioneer_Rd_Bridge')
-    losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
-    el0 = numpy.array([[307623.,6193610.] , [307622.,6193607.]])
-    el1 = numpy.array([[307610.,6193619.] , [307609., 6193616.]])  
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=20.0,
-                                exchange_lines=[el0, el1],
-                                height=3.5,
-                                apron=0.0,
-                                enquiry_gap=10.0,
-                                smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=channel_manning,
-                                logging=False,
-                                label='Branch_Towradgi_Pioneer_Rd_Bridge',
-                                verbose=False)                           
-    
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Northern_Distributor_Bridge')
-    losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
-    el0 = numpy.array([[306985.,6193749.] , [306985.,6193736.]])
-    el1 = numpy.array([[306950.,6193745.] , [306950.,6193732.]])
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=45.0,
-                                exchange_lines=[el0, el1],
-                                height=6.0,
-                                apron=0.0,
-                                enquiry_gap=10.,
-                                smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=channel_manning,
-                                logging=False,
-                                label='Branch_Towradgi_Northern_Distributor_Bridge',
-                                verbose=False)    
-    
-    if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Railway_Bridge') 
-    losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
-    el0 = numpy.array([[307236.,6193737.] , [307235.,6193733.]])
-    el1 = numpy.array([[307223.,6193738.] , [307222.,6193734.]]) 
-    culvert = Boyd_box_operator(domain,
-                                losses=losses,
-                                width=20.0,
-                                exchange_lines=[el0, el1],
-                                height=8.0,
-                                apron=0.0,
-                                enquiry_gap=20.0,
-                                smoothing_timescale=smoothTS,
-                                use_momentum_jet=True,
-                                use_velocity_head=True,
-                                manning=channel_manning,
-                                logging=False,
-                                label='Branch_Towradgi_Railway_Bridge',
-                                verbose=False) 
+        if myid == 0: print ('Creating Boyd_pipe_operator at Branch_3_Meadow_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305914.649,6194322.375] , [305913.477,6194321.625]])
+        el1 = numpy.array([[305950.711,6194335.375] , [305949.539,6194334.625]])
+            
+        culvert = Boyd_pipe_operator(domain,
+                                    losses=losses,
+                                    diameter=1.5, # actual culvert is a single 1.5m diameter pipe 0% blocked
+                                    exchange_lines=[el0, el1],
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_3_Meadow_St_Culverts',
+                                    verbose=False)     
+        
+        if myid == 0: print ('Creating Boyd_pipe_operator at Branch_3_13_Meadow_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[305911.280,6194359.203] , [305910.260,6194358.017]])
+        el1 = numpy.array([[305946.090,6194353.573] , [305945.070,6194352.387]])
+            
+        culvert = Boyd_pipe_operator(domain,
+                                    losses=losses,
+                                    diameter=1.5,# actual culvert is a single 1.5m diameter pipe 0% blocked
+                                    exchange_lines=[el0, el1],
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_3_13_Meadow_St_Culverts', 
+                                    verbose=False)     
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_3_41_Angel_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[306196.779,6194028.193] , [306192.221,6194010.807]])
+        el1 = numpy.array([[306200.154,6194018.693] , [306195.596,6194001.307]])
+            
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=10.0, # actual culvert is 10m X 0.35m H, 0 % blockage
+                                    exchange_lines=[el0, el1],
+                                    height=0.35,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_3_41_Angel_St_Culverts',
+                                    verbose=False)        
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_7_Carroll_St_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[308002.045,6193820.163] , [308001.215,6193819.197]])
+        el1 = numpy.array([[308021.965,6193816.883] , [308021.135,6193815.917]])
+            
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=1.22, #actual culvert is 2x1.22mx0.3m 50% blocked
+                                    exchange_lines=[el0, el1],
+                                    height=0.3,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_7_Carroll_St_Culverts',
+                                    verbose=False)           
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_7_Parker_Rd_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[308105.832,6193803.622] , [308103.648,6193801.118]])
+        el1 = numpy.array([[308126.782,6193800.552] , [308124.598,6193798.048]])
+            
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=3.18, #actual culvert is 3x1.06mW X 0.3mD 0% blocked
+                                    exchange_lines=[el0, el1],
+                                    height=0.3,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_7_Parker_Rd_Culverts',
+                                    verbose=False)     
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_7_Lake_Pde_Culverts')
+        losses = {'inlet':0.5, 'outlet':1.0, 'bend':0.0, 'grate':0.0, 'pier': 0.0, 'other': 0.0}
+        el0 = numpy.array([[308251.257,6193614.658] , [308248.343,6193618.]])
+        el1 = numpy.array([[308232.,6193593.] , [308225.,6193596.]])   
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=2.36, # actual culvert is 4 x 750mm diameter pipes, 0% blockage; modelled here as a box culvert
+                                    exchange_lines=[el0, el1],
+                                    height=0.75,
+                                    apron=3.1,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=0.013,
+                                    logging=False,
+                                    label='Branch_7_Lake_Pde_Culverts',
+                                    verbose=False)                                                                      
+
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Princes_Hwy_Bridge')							
+        losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
+        el0 = numpy.array([[306607.274,6193707.421] , [306602.635,6193695.720]]) 
+        el1 = numpy.array([[306626.205,6193694.358] , [306622.068,6193683.138]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=12.0,
+                                    exchange_lines=[el0, el1],
+                                    height=3.0,
+                                    apron=0.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=channel_manning,
+                                    logging=False,
+                                    label='Branch_Towradgi_Princes_Hwy_Bridge',
+                                    verbose=False)  
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Pioneer_Rd_Bridge')
+        losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
+        el0 = numpy.array([[307623.,6193610.] , [307622.,6193607.]])
+        el1 = numpy.array([[307610.,6193619.] , [307609., 6193616.]])  
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=20.0,
+                                    exchange_lines=[el0, el1],
+                                    height=3.5,
+                                    apron=0.0,
+                                    enquiry_gap=10.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=channel_manning,
+                                    logging=False,
+                                    label='Branch_Towradgi_Pioneer_Rd_Bridge',
+                                    verbose=False)                           
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Northern_Distributor_Bridge')
+        losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
+        el0 = numpy.array([[306985.,6193749.] , [306985.,6193736.]])
+        el1 = numpy.array([[306950.,6193745.] , [306950.,6193732.]])
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=45.0,
+                                    exchange_lines=[el0, el1],
+                                    height=6.0,
+                                    apron=0.0,
+                                    enquiry_gap=10.,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=channel_manning,
+                                    logging=False,
+                                    label='Branch_Towradgi_Northern_Distributor_Bridge',
+                                    verbose=False)    
+        
+        if myid == 0: print ('Creating Boyd_box_operator at Branch_Towradgi_Railway_Bridge') 
+        losses = {'inlet':0.0, 'outlet':0.0, 'bend':0.0, 'grate':0.0, 'pier': 1.0, 'other': 0.0}
+        el0 = numpy.array([[307236.,6193737.] , [307235.,6193733.]])
+        el1 = numpy.array([[307223.,6193738.] , [307222.,6193734.]]) 
+        culvert = Boyd_box_operator(domain,
+                                    losses=losses,
+                                    width=20.0,
+                                    exchange_lines=[el0, el1],
+                                    height=8.0,
+                                    apron=0.0,
+                                    enquiry_gap=20.0,
+                                    smoothing_timescale=smoothTS,
+                                    use_momentum_jet=True,
+                                    use_velocity_head=True,
+                                    manning=channel_manning,
+                                    logging=False,
+                                    label='Branch_Towradgi_Railway_Bridge',
+                                    verbose=False) 
     
     # ----------------------------------------------------------------------------------------------------------------------------------------------------
     # APPLY RAINFALL
@@ -908,6 +926,8 @@ def read_polygon_list(poly_list):
 
 
 
+domain.set_multiprocessor_mode(multiprocessor_mode)
+
 # ------------------------------------------------------------------------------
 # EVOLVE SYSTEM THROUGH TIME
 # ------------------------------------------------------------------------------
@@ -915,7 +935,7 @@ def read_polygon_list(poly_list):
 t0 = time.time()
 
 
-for t in domain.evolve(yieldstep=yieldstep, finaltime=finaltime):
+for t in domain.evolve(yieldstep=yieldstep, outputstep=outputstep, finaltime=finaltime):
     if myid == 0:
         domain.write_time()
 
@@ -939,6 +959,6 @@ def read_polygon_list(poly_list):
 # Merge the individual sww files into one file
-# But don't delete the sub domain sww files
+# and delete the sub domain sww files once merged
 # --------------------------------------------------
-domain.sww_merge()
+domain.sww_merge(delete_old=True)
 
 finalize()
diff --git a/validation_tests/experimental_data/okushiri/loading_pts_test.py b/validation_tests/experimental_data/okushiri/loading_pts_test.py
index eea91210d..a3a742497 100644
--- a/validation_tests/experimental_data/okushiri/loading_pts_test.py
+++ b/validation_tests/experimental_data/okushiri/loading_pts_test.py
@@ -35,7 +35,7 @@
 # Create Domain from mesh
 #-------------------------
 domain = Domain(project.mesh_filename, use_cache=True, verbose=True)
-print domain.statistics()
+print(domain.statistics())
 
 
 #-------------------------
@@ -48,7 +48,7 @@
 t0 = time.time()
 bathymetry_filename=project.bathymetry_filename
 bathymetry_filename='Benchmark_2_Bathymetry_very_thin.pts'
-print 'Starting domain.set_quantity.  Loading ', bathymetry_filename
+print('Starting domain.set_quantity.  Loading ', bathymetry_filename)
 s = "domain.set_quantity('elevation',filename=bathymetry_filename,alpha=0.02,verbose=True,use_cache=True)"
 
 
@@ -57,7 +57,7 @@
 
 profile.run(s, FN)
 
-print 'Set_quantity elevation took %.2f seconds' %(time.time()-t0)
+print('Set_quantity elevation took %.2f seconds' %(time.time()-t0))
 
 S = pstats.Stats(FN)
 #S.sort_stats('time').print_stats(20)
diff --git a/validation_tests/experimental_data/okushiri/plot_results.py b/validation_tests/experimental_data/okushiri/plot_results.py
index f51b5cb7a..38b67f049 100644
--- a/validation_tests/experimental_data/okushiri/plot_results.py
+++ b/validation_tests/experimental_data/okushiri/plot_results.py
@@ -40,7 +40,7 @@
     from matplotlib.pyplot import xlabel, ylabel, savefig
     hold(False)  # Check if this command can be issued
 except:
-    print 'Could not import pylab'
+    print('Could not import pylab')
     plotting = False
 else:
     # Create plots as png files
@@ -99,7 +99,7 @@
 # Read validation data
 #-------------------------
 
-if verbose: print 'Reading', project.boundary_filename
+if verbose: print('Reading', project.boundary_filename)
 
 fid = NetCDFFile(project.boundary_filename, 'r')
 input_time = fid.variables['time'][:]
@@ -151,12 +151,12 @@ def report_difference(name, computed_value, reference_value, rtol, atol):
         msg = '%s (expected, computed):\n  (%.18e, %.18e):\n  Relative error=%.18e'\
               %(name, reference_value, computed_value,
                 abs(reference_value-computed_value)/reference_value)
-        print msg
+        print(msg)
         
 
     msg = '  Absolute error=%.18e'\
           %(abs(reference_value-computed_value))        
-    print msg
+    print(msg)
 
     
     #print 'Allclose:', allclose(reference_value, computed_value,
@@ -182,7 +182,7 @@ def report_difference(name, computed_value, reference_value, rtol, atol):
 #rtol = 1.0e-2
 #atol = 1.0e-2
 
-if verbose: print 'Precisions used: rtol=%e, atol=%e' %(rtol, atol)
+if verbose: print('Precisions used: rtol=%e, atol=%e' %(rtol, atol))
 
 
 #print reference_time
@@ -192,8 +192,8 @@ def report_difference(name, computed_value, reference_value, rtol, atol):
     denom = 0
     model = []
     if verbose: 
-        print 
-        print 'Validating ' + name
+        print() 
+        print('Validating ' + name)
     observed_timeseries = validation_data[name]
     for i, t in enumerate(reference_time):
         model.append(f(t, point_id=k)[0])
@@ -256,16 +256,16 @@ def report_difference(name, computed_value, reference_value, rtol, atol):
 q, loc, time = get_maximum_inundation_data(sww_filename, return_time=True)
 
 if verbose:
-    print 'Observed results'
-    print 'Max runup elevation (m): 0.0875, 0.09, 0.08, 0.09, 0.1, 0.09, Average 0.09'
-    print 'Max runup elevation (scaled by 400) (m): Average 36'
-    print 'Max runup location:  [5.1575, 1.88]'
-    print 'Max runup time (s): 16.5'
-    print 'Model Results'
-    print 'Max runup elevation (m): ', q
-    print 'Max runup elevation (scaled by 400) (m): ', q*400
-    print 'Max runup location:  ', loc
-    print 'Max runup time (s): ',time
+    print('Observed results')
+    print('Max runup elevation (m): 0.0875, 0.09, 0.08, 0.09, 0.1, 0.09, Average 0.09')
+    print('Max runup elevation (scaled by 400) (m): Average 36')
+    print('Max runup location:  [5.1575, 1.88]')
+    print('Max runup time (s): 16.5')
+    print('Model Results')
+    print('Max runup elevation (m): ', q)
+    print('Max runup elevation (scaled by 400) (m): ', q*400)
+    print('Max runup location:  ', loc)
+    print('Max runup time (s): ',time)
 
 
 #assert is_inside_polygon(loc, gulleys)
diff --git a/validation_tests/other_references/radial_dam_break_dry/radial_dam_break.py b/validation_tests/other_references/radial_dam_break_dry/radial_dam_break.py
index 34e548d46..a7ae43214 100644
--- a/validation_tests/other_references/radial_dam_break_dry/radial_dam_break.py
+++ b/validation_tests/other_references/radial_dam_break_dry/radial_dam_break.py
@@ -10,7 +10,7 @@
 import sys
 import anuga
 from math import cos
-from numpy import zeros, float
+from numpy import zeros
 
 #------------------------------------------------------------------------------
 # Setup parameters and utilitiy functions
diff --git a/validation_tests/other_references/radial_dam_break_wet/radial_dam_break.py b/validation_tests/other_references/radial_dam_break_wet/radial_dam_break.py
index 52d63cd98..e33f1c62d 100644
--- a/validation_tests/other_references/radial_dam_break_wet/radial_dam_break.py
+++ b/validation_tests/other_references/radial_dam_break_wet/radial_dam_break.py
@@ -10,7 +10,7 @@
 import sys
 import anuga
 from math import cos
-from numpy import zeros, float
+from numpy import zeros
 
 
 #------------------------------------------------------------------------------