Merge pull request #266 from terrafying/master

jeremymanning · web-flow · commit adadb0ed0fbd · 2025-04-24T12:13:58.000-04:00
dependency updates! mainly for numpy v2
diff --git a/hypertools/tools/format_data.py b/hypertools/tools/format_data.py
@@ -75,7 +75,7 @@ def format_data(x, vectorizer='CountVectorizer',
     from ..datageometry import DataGeometry
 
     # if x is not a list, make it one
-    if type(x) is not list:
+    if not isinstance(x, list):
         x = [x]
 
     if all([isinstance(xi, str) for xi in x]):
@@ -98,7 +98,7 @@ def format_data(x, vectorizer='CountVectorizer',
         text_data = []
         for i,j in zip(x, dtypes):
             if j in ['list_str', 'str', 'arr_str']:
-                text_data.append(np.array(i).reshape(-1, 1))
+                text_data.append(np.asarray(i, dtype=object).reshape(-1, 1))
         # convert text to numerical matrices
         text_data = text2mat(text_data, **text_args)
 
@@ -164,14 +164,15 @@ def format_data(x, vectorizer='CountVectorizer',
 
 
 def fill_missing(x):
-
+    """Fill missing values using PPCA"""
     # ppca if missing data
     m = PPCA()
-    m.fit(data=np.vstack(x))
+    x_stacked = np.vstack(x)
+    m.fit(data=x_stacked)
     x_pca = m.transform()
 
     # if the whole row is missing, return nans
-    all_missing = [idx for idx, a in enumerate(np.vstack(x)) if all([type(b)==np.nan for b in a])]
+    all_missing = [idx for idx, a in enumerate(x_stacked) if np.all(np.isnan(a))]
     if len(all_missing)>0:
         for i in all_missing:
             x_pca[i, :] = np.nan
diff --git a/hypertools/tools/normalize.py b/hypertools/tools/normalize.py
@@ -48,14 +48,23 @@ def normalize(x, normalize='across', internal=False, format_data=True):
     if normalize in [False, None]:
         return x
     else:
-
         if format_data:
             x = formatter(x, ppca=True)
 
-        zscore = lambda X, y: (y - np.mean(X)) / np.std(X) if len(set(y)) > 1 else np.zeros(y.shape)
+        def zscore(X, y):
+            # Empty or constant input (all values identical) maps to zeros
+            if len(y) == 0 or len(set(y.ravel())) <= 1:
+                return np.zeros_like(y, dtype=np.float64)
+            
+            mean = np.mean(X)
+            std = np.std(X)
+            # Avoid division by zero
+            if std == 0:
+                return np.zeros_like(y, dtype=np.float64)
+            return (y - mean) / std
 
         if normalize == 'across':
-            x_stacked=np.vstack(x)
+            x_stacked = np.vstack(x)
             normalized_x = [np.array([zscore(x_stacked[:,j], i[:,j]) for j in range(i.shape[1])]).T for i in x]
 
         elif normalize == 'within':
diff --git a/hypertools/tools/reduce.py b/hypertools/tools/reduce.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import warnings
+import numpy as np
 from sklearn.decomposition import PCA, FastICA, IncrementalPCA, KernelPCA, FactorAnalysis, TruncatedSVD, SparsePCA, MiniBatchSparsePCA, DictionaryLearning, MiniBatchDictionaryLearning
 from sklearn.manifold import TSNE, MDS, SpectralEmbedding, LocallyLinearEmbedding, Isomap
 from umap import UMAP
@@ -92,7 +93,7 @@ def reduce(x, reduce='IncrementalPCA', ndims=None, normalize=None, align=None,
     if reduce is None:
         return x
 
-    elif isinstance(reduce, (str, np.string_)):
+    elif isinstance(reduce, str):  # np.string_ check dropped: the alias was removed in NumPy 2.0
         model_name = reduce
         model_params = {
             'n_components': ndims
@@ -112,7 +113,7 @@ def reduce(x, reduce='IncrementalPCA', ndims=None, normalize=None, align=None,
 
     try:
         # if the model passed is a string, make sure it's one of the supported options
-        if isinstance(model_name, (str, np.string_)):
+        if isinstance(model_name, str):  # np.string_ check dropped: the alias was removed in NumPy 2.0
             model = models[model_name]
         # otherwise check any custom object for necessary methods
         else:
@@ -142,16 +143,18 @@ def reduce(x, reduce='IncrementalPCA', ndims=None, normalize=None, align=None,
     if model_params['n_components'] is None or all([i.shape[1] <= model_params['n_components'] for i in x]):
         return x
 
-    stacked_x = np.vstack(x)
+    # Cast every array to float64 before stacking
+    stacked_x = np.vstack([np.asarray(arr, dtype=np.float64) for arr in x])
+    
     if stacked_x.shape[0] == 1:
         warnings.warn('Cannot reduce the dimensionality of a single row of'
                       ' data. Return zeros length of ndims')
-        return [np.zeros((1, model_params['n_components']))]
-
+        return [np.zeros((1, model_params['n_components']), dtype=np.float64)]
 
     elif stacked_x.shape[0] < model_params['n_components']:
             warnings.warn('The number of rows in your data is less than ndims.'
                           ' The data will be reduced to the number of rows.')
+            model_params['n_components'] = stacked_x.shape[0]
 
     # deprecation warnings
     if normalize is not None:
@@ -179,8 +182,17 @@ def reduce(x, reduce='IncrementalPCA', ndims=None, normalize=None, align=None,
 
 # sub functions
 def reduce_list(x, model):
+    """Helper function to reduce a list of arrays"""
+    # Ensure all arrays are float64 for consistent handling
+    x = [np.asarray(arr, dtype=np.float64) for arr in x]
     split = np.cumsum([len(xi) for xi in x])[:-1]
-    x_r = np.vsplit(model.fit_transform(np.vstack(x)), split)
+    stacked = np.vstack(x)
+    
+    # Warn (without altering the data) when NaN values are present
+    if np.any(np.isnan(stacked)):
+        warnings.warn('NaN values detected in input data. These may affect the reduction results.')
+    
+    x_r = np.vsplit(model.fit_transform(stacked), split)
     if len(x) > 1:
         return [xi for xi in x_r]
     else:
diff --git a/requirements.txt b/requirements.txt
@@ -1,10 +1,10 @@
 PPCA>=0.0.2
-scikit-learn>=0.24
-pandas>=0.18.0
-seaborn>=0.8.1
-matplotlib>=1.5.1
-scipy>=1.0.0
-numpy>=1.10.4
-umap-learn>=0.4.6
-requests
-ipympl
+scikit-learn>=1.4.0
+pandas>=2.2.0
+seaborn>=0.13.0
+matplotlib>=3.8.0
+scipy>=1.13.0
+numpy>=2.0.0
+umap-learn>=0.5.5
+requests>=2.31.0
+ipympl>=0.9.3
diff --git a/setup.py b/setup.py
@@ -6,13 +6,13 @@
 os.environ["MPLCONFIGDIR"] = "."
 
 NAME = 'hypertools'
-VERSION = '0.8.0'
+VERSION = '0.8.1'
 AUTHOR = 'Contextual Dynamics Lab'
 AUTHOR_EMAIL = 'contextualdynamics@gmail.com'
 URL = 'https://github.com/ContextLab/hypertools'
 DOWNLOAD_URL = URL
 LICENSE = 'MIT'
-REQUIRES_PYTHON = '>=3.6'
+REQUIRES_PYTHON = '>=3.9'
 PACKAGES = find_packages(exclude=('images', 'examples', 'tests'))
 with open('requirements.txt', 'r') as f:
     REQUIREMENTS = f.read().splitlines()
@@ -35,10 +35,10 @@
 """
 CLASSIFIERS = [
     'Intended Audience :: Science/Research',
-    'Programming Language :: Python :: 3.6',
-    'Programming Language :: Python :: 3.7',
-    'Programming Language :: Python :: 3.8',
     'Programming Language :: Python :: 3.9',
+    'Programming Language :: Python :: 3.10',
+    'Programming Language :: Python :: 3.11',
+    'Programming Language :: Python :: 3.12',
     'Topic :: Scientific/Engineering :: Visualization',
     'Topic :: Multimedia :: Graphics',
     'Operating System :: POSIX',