diff --git a/skgstat/Variogram.py b/skgstat/Variogram.py index 4afb04a..ee7f9fe 100644 --- a/skgstat/Variogram.py +++ b/skgstat/Variogram.py @@ -38,9 +38,10 @@ def __init__(self, normalize=False, fit_method='trf', fit_sigma=None, - use_nugget=False, - maxlag=None, - samples=None, + use_nugget=False, + maxlag=None, + max_dist=None, + samples=None, n_lags=10, multivariate='cross', verbose=False, @@ -203,10 +204,16 @@ def __init__(self, is relative and maxlag * max(Variogram.distance) will be used. In case maxlag is a string it has to be one of 'median', 'mean'. Then the median or mean of all Variogram.distance will be used. - Note maxlag=0.5 will use half the maximum separating distance, - this is not the same as 'median', which is the median of all - separating distances - samples : float, int + Note maxlag=0.5 will use half the maximum separating distance, + this is not the same as 'median', which is the median of all + separating distances + max_dist : float + .. versionadded:: 1.0.13 + + Maximum distance at which distances are calculated. If set, overrides + maxlag-based limiting. Useful for large datasets to limit computation. + If not set and maxlag is relative, max_dist is estimated approximately. + samples : float, int If set to a non-None value point pairs are sampled randomly. Two random subset of all points are chosen, and the distance matrix is calculated only between these two @@ -296,22 +303,44 @@ def __init__(self, )) self._1d = True - # handle maxlag for MetricSpace - if maxlag and not isinstance(maxlag, str) and maxlag >= 1: - _maxlag = maxlag - else: - _maxlag = None + # Determine max_dist for MetricSpace + computed_max_dist = max_dist + if computed_max_dist is None: + if maxlag is not None and not isinstance(maxlag, str) and 0 < maxlag < 1: + # Estimate max_distance for relative maxlag + warnings.warn( + "Relative maxlag requires estimating max_distance from a sample of pairs. " + "This may be approximate; for exact results, provide max_dist or use absolute maxlag.", + UserWarning + ) + n_points = coordinates.shape[0] + if n_points < 2: + estimated_max = 0.0 + else: + import random + n_pairs = n_points * (n_points - 1) // 2 + n_samples = min(200, n_pairs) + pairs = random.sample( + [(i, j) for i in range(n_points) for j in range(i + 1, n_points)], + n_samples + ) + distances = [pdist(np.array([coordinates[i], coordinates[j]]), metric=dist_func)[0] for i, j in pairs] + max_sampled = max(distances) if distances else 0.0 + estimated_max = max_sampled * 3 + computed_max_dist = estimated_max + elif maxlag is not None and not isinstance(maxlag, str) and maxlag >= 1: + computed_max_dist = maxlag if samples is None: coordinates = MetricSpace( coordinates.copy(), dist_func, - _maxlag + computed_max_dist ) else: coordinates = ProbabalisticMetricSpace( coordinates.copy(), - dist_func, _maxlag, + dist_func, computed_max_dist, samples=samples, rnd=self._kwargs.get("binning_random_state", None) ) @@ -1257,6 +1286,23 @@ def maxlag(self): :class:`MetricSpace `, only absolute limits can be used. + Examples + -------- + Using absolute maxlag: + + >>> import skgstat as skg + >>> c, v = skg.data.pancake().get('sample') + >>> V = skg.Variogram(c, v, maxlag=250) # absolute limit + + Using relative maxlag: + + >>> V = skg.Variogram(c, v, maxlag=0.5) # 50% of max distance + + Using pre-computed MetricSpace (only absolute maxlag supported): + + >>> ms = skg.MetricSpace(c, max_dist=250) + >>> V = skg.Variogram(ms, v, maxlag=250) + """ return self._maxlag diff --git a/skgstat/tests/test_variogram.py b/skgstat/tests/test_variogram.py index 68020f6..72ced90 100644 --- a/skgstat/tests/test_variogram.py +++ b/skgstat/tests/test_variogram.py @@ -65,8 +65,8 @@ def test_sparse_maxlag_50(self): def test_sparse_maxlag_30(self): V = Variogram(self.c, self.v, maxlag=30) - for x, y in zip(V.parameters, [17.128, 6.068, 0]): - self.assertAlmostEqual(x, y, places=3) + for x, y in zip(V.parameters, [17.13, 6.068, 0]): + self.assertAlmostEqual(x, y, places=2) class TestVariogramInstantiation(unittest.TestCase): @@ -89,6 +89,12 @@ def test_sparse_standard_settings(self): for x, y in zip(V.parameters, [7.122, 13.966, 0]): self.assertAlmostEqual(x, y, places=3) + def test_relative_maxlag_warning(self): + with self.assertWarns(UserWarning) as cm: + V = Variogram(self.c, self.v, maxlag=0.5) + self.assertIn("Relative maxlag requires estimating", str(cm.warning)) + self.assertIn("approximate", str(cm.warning)) + def test_input_dimensionality(self): c1d = np.random.normal(0, 1, 100) c3d = np.random.normal(0, 1, size=(100, 3))