Merge pull request #180 from openworm/experimental

pgleeson · web-flow · commit a179581c3bbb · 2026-04-29T12:58:24.000+01:00
Update dependencies and fix issues with pandas and json schema
diff --git a/.github/workflows/ci_python.yml b/.github/workflows/ci_python.yml
@@ -2,9 +2,9 @@ name: Test Python
 
 on:
   push:
-    branches: [ master, development ]
+    branches: [ master, development, experimental, test* ]
   pull_request:
-    branches: [ master, development ]
+    branches: [ master, development, experimental, test* ]
 
 jobs:
   build:
@@ -13,13 +13,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
 
     steps:
 
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v6
     - name: Set up Python  ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v6
       with:
         python-version:  ${{ matrix.python-version }}
 
@@ -31,6 +31,16 @@ jobs:
     - name: Install Python package
       run: |
         cd src/Python
-        python setup.py install
+        pip install .
+
+    - name: Run tests
+      run: |
+        cd src/Python/tests
+        python tests.py 
+        python diagnostic_test.py ../../../tests/minimax.wcon 
+
+    - name: Final version info
+      run: |
+        pip list
         
 
diff --git a/.gitignore b/.gitignore
@@ -15,3 +15,5 @@ src/Python/wcon.egg*
 
 # Scala
 src/scala/target/*
+/src/Python/example_saved_file.WCON
+/src/Python/wcon/wcon_schema.json
diff --git a/src/Python/examples/view_wcon.py b/src/Python/examples/view_wcon.py
@@ -4,5 +4,5 @@
 sys.path.append('..')
 from wcon import WCONWorms, MeasurementUnit
 
-file_name = 'asic-1 (ok415) on food L_2010_07_08__11_46_40___7___5.wcon'
+file_name = '../../../tests/minimax.wcon'
 w = WCONWorms.load_from_file(file_name)
diff --git a/src/Python/setup.py b/src/Python/setup.py
@@ -12,6 +12,7 @@
 from codecs import open
 from os import path
 import os
+import shutil
 exec(open('wcon/version.py').read())
 
 here = path.abspath(path.dirname(__file__))
@@ -24,7 +25,13 @@
     with open(readme_path, encoding='utf-8') as f:
         long_description += f.read()
 
-print(os.listdir('.'))  # DEBUG
+# The canonical wcon_schema.json lives at the repository root so it can be
+# shared by every language implementation. setuptools cannot package files
+# from outside the package directory, so copy it into wcon/ at build time.
+repo_schema = path.join(here, '..', '..', 'wcon_schema.json')
+pkg_schema = path.join(here, 'wcon', 'wcon_schema.json')
+if path.exists(repo_schema):
+    shutil.copyfile(repo_schema, pkg_schema)
 
 setup(
     name='wcon',
@@ -51,8 +58,9 @@
     ],
     keywords='C. elegans worm tracking',
     packages=['wcon'],
-    package_data={'': ['../../wcon_schema.json']},
-    install_requires=['jsonschema', 'six', 'numpy', 'scipy<=0.17.1']
+    package_data={'wcon': ['wcon_schema.json']},
+    include_package_data=True,
+    install_requires=['jsonschema', 'six', 'scipy', 'pandas', 'psutil'],
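-    # Actually also requires numpy, scipy and numpy but I don't want to force
-    # pip to install these since pip is bad at that for those packages.
+    # numpy is also required but is not listed explicitly: scipy and pandas
+    # both depend on it, so pip installs it alongside them.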
 )
diff --git a/src/Python/tests/diagnostic_test.py b/src/Python/tests/diagnostic_test.py
@@ -18,6 +18,7 @@
 idx = pd.IndexSlice
 import numpy as np
 import time
+import pprint as pp
 
 sys.path.append('..')
 from wcon import WCONWorms, MeasurementUnit
@@ -51,6 +52,12 @@ def timing_function():
                                           validate_against_schema=False)
             print("Time to load w1: " + str(timing_function() - start_time))
 
+            print(" ------- W1 has " + str(len(w1.data)) + " rows and " +
+                  str(len(w1.data.columns)) + " columns")
+
+            print(pp.pformat(w1.data_as_odict))
+            print (' -------- ')
+
             # Save these worm tracks to a file, then load that file
             test_path = 'test.wcon'
             start_time = timing_function()
@@ -62,6 +69,12 @@ def timing_function():
                                           validate_against_schema=False)
             print("Time to load w2: " + str(timing_function() - start_time))
 
+            print(" ------- W2 has " + str(len(w2.data)) + " rows and " +
+                  str(len(w2.data.columns)) + " columns")
+
+            print(pp.pformat(w2.data_as_odict))
+            print (' -------- ')
+
             # x1 = w1.data.loc[:, idx[0, 'x', 0]].fillna(0)
             # x2 = w2.data.loc[:, idx[0, 'x', 0]].fillna(0)
             # cmm = np.flatnonzero(x1 != x2)
@@ -76,6 +89,14 @@ def timing_function():
             # "id" first in a data segment, etc.)
             w3 = WCONWorms.load_from_file(test_path,
                                           validate_against_schema=False)
+            
+
+            print(" ------- W3 has " + str(len(w3.data)) + " rows and " +
+                  str(len(w3.data.columns)) + " columns")
+
+            print(pp.pformat(w3.data_as_odict))
+            print (' -------- ')
+
             assert(w2 == w3)
             assert(w1 == w2)
             assert(w1 == w3)
diff --git a/src/Python/tests/tests.py b/src/Python/tests/tests.py
@@ -41,7 +41,7 @@ def flatten(list_of_lists):
     for element in list_of_lists:
         # If it's iterable but not a string or bytes, then recurse, otherwise
         # we are at a "leaf" node of our traversal
-        if(isinstance(element, collections.Iterable) and
+        if(isinstance(element, collections.abc.Iterable) and
            not isinstance(element, (str, bytes))):
             for sub_element in flatten(element):
                 yield sub_element
diff --git a/src/Python/wcon/measurement_unit.py b/src/Python/wcon/measurement_unit.py
@@ -637,12 +637,15 @@ def _create_from_atomic(cls, unit_string):
     @classmethod
     def _create_from_node(cls, node):
         """
-        node: is ast.Num or ast.BinOp or ast.UnaryOp or ast.Str or ast.Name
+        node: is ast.Constant (numeric) or ast.BinOp or ast.UnaryOp or ast.Name
             The expression to be transformed into a MeasurementUnit
 
         """
-        if isinstance(node, ast.Num):  # <number>
-            n = node.n
+        # ast.Num was deprecated in Python 3.8 and removed in 3.14;
+        # ast.Constant now represents all literal values.
+        if isinstance(node, ast.Constant) and isinstance(
+                node.value, (int, float)):  # <number>
+            n = node.value
             assert(n != 0)  # A unit cannot have zero in the expression
 
             u = cls()
diff --git a/src/Python/wcon/version.py b/src/Python/wcon/version.py
@@ -6,4 +6,4 @@
 # 2) we can import it in setup.py for the same reason
 # 3) we can import it into your module module
 # (from http://stackoverflow.com/questions/458550/)
-__version__ = '1.1.0'
+__version__ = '1.2.1'
diff --git a/src/Python/wcon/wcon_data.py b/src/Python/wcon/wcon_data.py
@@ -119,6 +119,13 @@ def df_upsert(src, dest):
             dest_sliced.sort_index(inplace=True)
             src_sliced.sort_index(inplace=True)
 
+            # Align src_sliced's row/column labels to dest_sliced. The two
+            # were built with independent .isin() masks so column order may
+            # differ; pandas >=1.x refuses to compare DataFrames whose
+            # labels are not identical.
+            src_sliced = src_sliced.reindex(index=dest_sliced.index,
+                                            columns=dest_sliced.columns)
+
             # Obtain a mask of the conflicts in the current segment
             # as compared with all previously loaded data.  That is:
             # NaN NaN = False
@@ -189,24 +196,30 @@ def convert_origin(df):
             # `for` loop loops through both `x` and `y`.
 
             if offset in cur_worm.columns.get_level_values(0):
-                # Consider offset as 0 if not available in a certain frame
-                ox_column = cur_worm.loc[:, (offset)].fillna(0)
+                # Consider offset as 0 if not available in a certain frame.
+                # Coerce to numeric: the parser can leave the offset column
+                # with object dtype (mixed str/int entries) when offsets
+                # are present in some segments but not others.
+                ox_column = cur_worm.loc[:, (offset)].apply(
+                    pd.to_numeric, errors='coerce').fillna(0)
 
                 # Shift our 'x' values by offset
-                all_x_columns = cur_worm.loc[:, (coord)]
-                ox_affine_change = (np.array(ox_column) *
+                all_x_columns = cur_worm.loc[:, (coord)].apply(
+                    pd.to_numeric, errors='coerce')
+                ox_affine_change = (np.array(ox_column, dtype=float) *
                                     np.ones(all_x_columns.shape))
                 all_x_columns += ox_affine_change
 
                 if centroid in cur_worm.columns.get_level_values(0):
-                    cx_column = cur_worm.loc[:, (centroid)]
+                    cx_column = cur_worm.loc[:, (centroid)].apply(
+                        pd.to_numeric, errors='coerce')
                     # Shift the centroid by the offset
                     cx_column += ox_column
 
                     # Now make the centroid our new offset, since the rule
                     # is that if the offset exists, the centroid is not
                     # the offset, but we want it to be.
-                    cx_affine_change = (np.array(cx_column) *
+                    cx_affine_change = (np.array(cx_column, dtype=float) *
                                         np.ones(all_x_columns.shape))
                     all_x_columns -= cx_affine_change
 
@@ -224,7 +237,8 @@ def convert_origin(df):
     # This is so DataFrames with and without offsets
     # will show as comparing identically.
     for offset_key in offset_keys:
-        df.drop(offset_key, axis=1, level='key', inplace=True)
+        df.drop(offset_key, axis=1, level='key', inplace=True,
+                errors='ignore')
 
     # Because of a known issue in Pandas
     # (https://github.com/pydata/pandas/issues/2770), the dropped columns
@@ -389,7 +403,7 @@ def _obtain_time_series_data_frame(time_series_data):
             for i in range(len(cur_timeframes)):
                 data_segment[k][i] = (
                     data_segment[k][i] +
-                    [np.NaN] * (max_aspect_size - len(data_segment[k][i])))
+                    [np.nan] * (max_aspect_size - len(data_segment[k][i])))
 
         num_timeframes = len(cur_timeframes)
 
@@ -402,7 +416,7 @@ def _obtain_time_series_data_frame(time_series_data):
         cur_df = pd.DataFrame(cur_data, columns=cur_columns)
 
         cur_df.index = cur_timeframes
-        cur_df.index.names = 't'
+        cur_df.index.names = ['t']
 
         # We want the index (time) to be in order.
         cur_df.sort_index(axis=0, inplace=True)
@@ -466,7 +480,7 @@ def _obtain_time_series_data_frame(time_series_data):
         with warnings.catch_warnings():
             warnings.filterwarnings(action="ignore", category=FutureWarning)
             df_odict[worm_id] = \
-                df_odict[worm_id].convert_objects(convert_numeric=True)
+                df_odict[worm_id].infer_objects()
 
         # If 'head' or 'ventral' is NaN, we must specify '?' since
         # otherwise, when saving this object, to specify "no value" we would
@@ -478,21 +492,27 @@ def _obtain_time_series_data_frame(time_series_data):
 
         # We must replace NaN with None, otherwise the JSON encoder will
         # save 'NaN' as the string and this will get rejected by our schema
-        # on any subsequent loads
-        # Note we can't use .fillna(None) due to this issue:
-        # https://github.com/pydata/pandas/issues/1972
+        # on any subsequent loads.
+        # Pandas 3.0 infers 'str' dtype for these columns, and assigning
+        # NaN on a str-dtype column coerces to the string 'nan'. Force
+        # object dtype and map both real NaN and stringified 'nan' back
+        # to None so downstream JSON serialization writes null.
         df_keys = set(df_odict[worm_id].columns.get_level_values('key'))
         for k in ['head', 'ventral']:
             if k in df_keys:
-                cur_slice = df_odict[worm_id].loc[:, idx[:, k, :]]
-                df_odict[worm_id].loc[:, idx[:, k, :]] = \
-                    cur_slice.fillna(value=np.nan)
-
-        # Make sure aspect_size is a float, since only floats are nullable:
+                df = df_odict[worm_id]
+                for col in [c for c in df.columns if c[1] == k]:
+                    s = df[col].astype(object)
+                    df[col] = s.where(s.notna() & (s != 'nan'), None)
+
+        # Make sure aspect_size is a float, since only floats are nullable.
+        # Replace the column whole rather than assigning via .loc[]; pandas
+        # 2.x preserves the parent column's existing (object/str) dtype on
+        # .loc[] assignment and raises TypeError on non-string values.
         if 'aspect_size' in df_keys:
-            df_odict[worm_id].loc[:, idx[:, 'aspect_size', :]] = \
-                df_odict[worm_id].loc[:, idx[:, 'aspect_size', :]] \
-                .astype(float)
+            df = df_odict[worm_id]
+            for col in [c for c in df.columns if c[1] == 'aspect_size']:
+                df[col] = df[col].astype(float)
 
     return sort_odict(df_odict)
 
diff --git a/src/Python/wcon/wcon_parser.py b/src/Python/wcon/wcon_parser.py
@@ -385,21 +385,30 @@ def to_canon(self):
         for data_key in self.units:
             mu = self.units[data_key]
 
-            # Don't bother to "convert" units that are already in their
-            # canonical form.
-            if mu.unit_string == mu.canonical_unit_string:
-                continue
-
             tmu = self.units['t']
+            already_canonical = (mu.unit_string == mu.canonical_unit_string)
             for worm_id in w.worm_ids:
 
                 try:
-                    # Apply across all worm ids and all aspects
-                    mu_slice = \
-                        w._data[worm_id].loc[:, idx[:, data_key, :]].copy()
-
-                    w._data[worm_id].loc[:, idx[:, data_key, :]] = \
-                        mu_slice.applymap(mu.to_canon)
+                    df = w._data[worm_id]
+                    target_cols = [c for c in df.columns
+                                   if c[1] == data_key]
+                    if not target_cols:
+                        raise KeyError(data_key)
+
+                    # The parser can leave numeric columns with object
+                    # dtype (e.g. mixed int/str entries from how segments
+                    # are merged). Coerce so downstream arithmetic and
+                    # JSON serialization treat them as numbers, even when
+                    # the unit is already canonical and no conversion is
+                    # otherwise required. Replace each column whole rather
+                    # than via .loc[] assignment, which would preserve the
+                    # parent column's existing (object) dtype.
+                    for col in target_cols:
+                        new_col = pd.to_numeric(df[col], errors='coerce')
+                        if not already_canonical:
+                            new_col = new_col.apply(mu.to_canon)
+                        df[col] = new_col
                 except KeyError:
                     # Just ignore cases where there are "units" entries but no
                     # corresponding data
@@ -822,7 +831,8 @@ def pd_equals(df1, df2):
         return False
 
     try:
-        pd.util.testing.assert_frame_equal(df1, df2)
+        # pd.util.testing was removed in pandas 2.0; use pd.testing.
+        pd.testing.assert_frame_equal(df1, df2)
     except AssertionError:
         return False
 
diff --git a/wcon_schema.json b/wcon_schema.json
@@ -1,5 +1,5 @@
 {
-    "$schema": "http://json-schema.org/schema",
+    "$schema": "http://json-schema.org/draft-07/schema",
     "title": "Worm tracker Commons Object Notation (WCON)",
     "description": "A text-based data interchange format for *C. elegans* trackers. It is a constrained subset of JSON. It is designed to be both human and machine readable and to facilitate data exchange and inter-operability for worm tracking data that is independent of platform and the language used for implementation.",
     "type": "object",