Fix spelling mistakes in the code
e10e3 committed Jan 31, 2025
1 parent 5531ae5 commit f85f12c
Showing 32 changed files with 46 additions and 46 deletions.
10 changes: 5 additions & 5 deletions river/anomaly/lof.py
@@ -149,17 +149,17 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector):
The algorithm take into account the following elements:
- `NewPoints`: new points;
- - `kNN(p)`: the k-nearest neighboors of `p` (the k-closest points to `p`);
- - `RkNN(p)`: the reverse-k-nearest neighboors of `p` (points that have `p` as one of their neighboors);
+ - `kNN(p)`: the k-nearest neighbors of `p` (the k-closest points to `p`);
+ - `RkNN(p)`: the reverse-k-nearest neighbors of `p` (points that have `p` as one of their neighbors);
- `set_upd_lrd`: Set of points that need to have the local reachability distance updated;
- `set_upd_lof`: Set of points that need to have the local outlier factor updated.
This current implementation within `River`, based on the original one in the paper, follows the following steps:
1) Insert new data points (`NewPoints`) and calculate its distance to existing points;
- 2) Update the nreaest neighboors and reverse nearest neighboors of all the points;
+ 2) Update the nearest neighbors and reverse nearest neighbors of all the points;
3) Define sets of affected points that required updates;
- 4) Calculate the reachability-distance from new point to neighboors (`NewPoints` -> `kNN(NewPoints)`)
- and from rev-neighboors to new point (`RkNN(NewPoints)` -> `NewPoints`);
+ 4) Calculate the reachability-distance from new point to neighbors (`NewPoints` -> `kNN(NewPoints)`)
+ and from rev-neighbors to new point (`RkNN(NewPoints)` -> `NewPoints`);
5) Update the reachability-distance for affected points: `RkNN(RkNN(NewPoints))` -> `RkNN(NewPoints)`
6) Update local reachability distance of affected points: `lrd(set_upd_lrd)`;
7) Update local outlier factor: `lof(set_upd_lof)`.
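For reference, the seven steps above are what each call to `learn_one` carries out on an arriving sample. A minimal usage sketch follows; it is hedged: the `n_neighbors` argument and the `learn_one`/`score_one` pair are assumed from river's anomaly-detector interface, and the data points are purely illustrative.

from river import anomaly

lof = anomaly.LocalOutlierFactor(n_neighbors=10)  # assumed constructor argument

for x in [{"x": 0.5, "y": 0.4}, {"x": 0.6, "y": 0.5}, {"x": 5.0, "y": 5.0}]:
    lof.learn_one(x)  # steps 1-7: insert the point, update kNN/RkNN, lrd and lof

score = lof.score_one({"x": 5.1, "y": 4.9})  # larger scores indicate more outlying points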
2 changes: 1 addition & 1 deletion river/base/base.py
@@ -415,7 +415,7 @@ def log_method_calls(
):
"""A context manager to log method calls.
- All method calls will be logged by default. This behavior can be overriden by passing filtering
+ All method calls will be logged by default. This behavior can be overridden by passing filtering
functions.
Parameters
2 changes: 1 addition & 1 deletion river/cluster/dbstream.py
@@ -13,7 +13,7 @@ class DBSTREAM(base.Clusterer):
DBSTREAM [^1] is a clustering algorithm for evolving data streams.
It is the first micro-cluster-based online clustering component that
- explicitely captures the density between micro-clusters via a shared
+ explicitly captures the density between micro-clusters via a shared
density graph. The density information in the graph is then exploited
for reclustering based on actual density between adjacent micro clusters.
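A minimal usage sketch of the clusterer described in this docstring; hedged: hyperparameters are left at their assumed defaults, `learn_one`/`predict_one` come from river's `Clusterer` interface, and the points are illustrative.

from river import cluster

dbstream = cluster.DBSTREAM()  # density and fading parameters left at assumed defaults

for x in [{"x": 1.0, "y": 1.1}, {"x": 1.2, "y": 0.9}, {"x": 8.0, "y": 8.2}]:
    dbstream.learn_one(x)  # update the micro-clusters and the shared density graph

cluster_id = dbstream.predict_one({"x": 1.05, "y": 1.0})  # index of the closest cluster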
2 changes: 1 addition & 1 deletion river/cluster/odac.py
@@ -73,7 +73,7 @@ class ODAC(base.Clusterer):
├── CH1_LVL_3 d1=0.71 [5, 6]
└── CH2_LVL_3 d1=0.71 [7, 8]
- You can acess some properties of the clustering model directly:
+ You can access some properties of the clustering model directly:
>>> model.n_clusters
11
2 changes: 1 addition & 1 deletion river/cluster/streamkmeans.py
@@ -11,7 +11,7 @@ class STREAMKMeans(base.Clusterer):
However, instead of using the traditional k-means, which requires a total reclustering
each time the temporary chunk of data points is full, the implementation of this algorithm
- uses an increamental k-means.
+ uses an incremental k-means.
At first, the cluster centers are initialized with a `KMeans` instance. For a new point `p`:
2 changes: 1 addition & 1 deletion river/cluster/textclust.py
@@ -583,7 +583,7 @@ def merge(self, microcluster, t, omega, fading_factor, term_fading, realtime):
microcluster.fade(t, omega, fading_factor, term_fading, realtime)

self.time = t
- # here we merge an existing mc wth the current mc. The tf values as well as the ids have to be transferred
+ # here we merge an existing mc with the current mc. The tf values as well as the ids have to be transferred
for k in list(microcluster.tf.keys()):
if k in self.tf:
self.tf[k]["tf"] += microcluster.tf[k]["tf"]
2 changes: 1 addition & 1 deletion river/compose/renamer.py
@@ -11,7 +11,7 @@ class Renamer(base.Transformer):
Parameters
----------
mapping
- Dictionnary describing substitution rules. Keys in `mapping` that are not a feature's name are silently ignored.
+ Dictionary describing substitution rules. Keys in `mapping` that are not a feature's name are silently ignored.
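A small illustration of the rule stated above; hedged: the `transform_one` call comes from river's `Transformer` interface and the feature names are made up.

from river import compose

renamer = compose.Renamer(mapping={"temp": "temperature", "absent": "never_used"})
renamer.transform_one({"temp": 21.0, "humidity": 0.45})
# -> {'temperature': 21.0, 'humidity': 0.45}; the 'absent' rule matches no feature and is silently ignored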
Examples
--------
2 changes: 1 addition & 1 deletion river/datasets/restaurants.py
@@ -10,7 +10,7 @@
class Restaurants(base.RemoteDataset):
"""Data from the Kaggle Recruit Restaurants challenge.
- The goal is to predict the number of visitors in each of 829 Japanese restaurants over a priod
+ The goal is to predict the number of visitors in each of 829 Japanese restaurants over a period
of roughly 16 weeks. The data is ordered by date and then by restaurant ID.
References
2 changes: 1 addition & 1 deletion river/drift/dummy.py
@@ -82,7 +82,7 @@ class DummyDriftDetector(base.DriftDetector):
The 'w' value must be greater than zero when 'trigger_method' is 'random'.
Since we set `dynamic_cloning` to `True`, a clone of the periodic trigger will
- have its internal paramenters changed:
+ have its internal parameters changed:
>>> rtrigger = rtrigger.clone()
>>> for i, v in enumerate(data):
2 changes: 1 addition & 1 deletion river/drift/retrain.py
@@ -7,7 +7,7 @@ class DriftRetrainingClassifier(base.Wrapper, base.Classifier):
"""Drift retraining classifier.
This classifier is a wrapper for any classifier. It monitors the incoming data for concept
- drifts and warnings in the model's accurary. In case a warning is detected, a background model
+ drifts and warnings in the model's accuracy. In case a warning is detected, a background model
starts to train. If a drift is detected, the model will be replaced by the background model,
and the background model will be reset.
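A hedged construction sketch; the argument names `model` and `drift_detector`, and the choice of a warning-capable detector such as `drift.binary.DDM`, are assumptions inferred from the description above rather than confirmed against the signature.

from river import drift, linear_model

clf = drift.DriftRetrainingClassifier(
    model=linear_model.LogisticRegression(),
    drift_detector=drift.binary.DDM(),  # assumption: a detector exposing both warning and drift signals
)
# clf.learn_one(x, y): a warning starts training the background model;
# a confirmed drift promotes it to the main model and resets it.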
2 changes: 1 addition & 1 deletion river/ensemble/boosting.py
@@ -303,7 +303,7 @@ def learn_one(self, x, y, **kwargs):
# the best model's not yet trained will receive lambda values for training from the model's that correctly classified an instance.
# the values of lambda increase in case a mistake is made and decrease in case a right prediction is made.
# the worst models are more likely to make mistakes, increasing the value of lambda.
- # Then, the best's model are likely to receive a high value of lambda and decreasing gradually throughout the remaning models to be trained
+ # Then, the best's model are likely to receive a high value of lambda and decreasing gradually throughout the remaining models to be trained
# It's similar to a system where the rich get richer.
for i in range(self.n_models):
if correct:
2 changes: 1 addition & 1 deletion river/imblearn/random.py
@@ -194,7 +194,7 @@ class RandomSampler(ClassificationSampler):
desired_dist
The desired class distribution. The keys are the classes whilst the values are the desired
class percentages. The values must sum up to 1. If set to `None`, then the observations
- will be sampled uniformly at random, which is stricly equivalent to using
+ will be sampled uniformly at random, which is strictly equivalent to using
`ensemble.BaggingClassifier`.
sampling_rate
The desired ratio of data to sample.
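An illustrative construction; hedged: the argument names follow the parameter list above plus an assumed `classifier` wrapper argument and `seed`, and the values are arbitrary.

from river import imblearn, linear_model

sampler = imblearn.RandomSampler(
    classifier=linear_model.LogisticRegression(),
    desired_dist={True: 0.5, False: 0.5},  # rebalance a binary target to 50/50
    sampling_rate=0.5,                     # keep roughly half of the incoming stream
    seed=42,
)
# sampler.learn_one(x, y) resamples each observation before passing it to the wrapped classifier.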
2 changes: 1 addition & 1 deletion river/linear_model/base.py
@@ -125,7 +125,7 @@ def _fit(self, x, y, w, get_grad):
def _update_weights(self, x):
# L1 cumulative penalty helper

- # Apply penalty to each weight iteratively, with the potential of being parrallelized by using VectorDict
+ # Apply penalty to each weight iteratively, with the potential of being parallelized by using VectorDict
for j, xj in x.items():
wj_temp = self._weights[j]

2 changes: 1 addition & 1 deletion river/linear_model/test_glm.py
@@ -88,7 +88,7 @@ def test_finite_differences(lm, dataset):

# d is a set of weight perturbations
for d in iter_perturbations(weights.keys()):
- # Pertubate the weights and obtain the loss with the new weights
+ # Perturb the weights and obtain the loss with the new weights
lm._weights = utils.VectorDict({i: weights[i] + eps * di for i, di in d.items()})
forward = lm.loss(y_true=y, y_pred=lm._raw_dot_one(x))
lm._weights = utils.VectorDict({i: weights[i] - eps * di for i, di in d.items()})
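The test perturbs the weights in both directions and compares the change in loss with the analytic gradient, i.e. the central finite-difference check (loss(w + εd) − loss(w − εd)) / (2ε) ≈ ∇loss(w)·d. A standalone sketch of the same idea, independent of river's internals:

import numpy as np

def central_difference_grad(loss, w, eps=1e-6):
    """Approximate each partial derivative of `loss` at `w` with a symmetric perturbation."""
    grad = np.zeros_like(w)
    for i in range(len(w)):
        d = np.zeros_like(w)
        d[i] = 1.0
        grad[i] = (loss(w + eps * d) - loss(w - eps * d)) / (2 * eps)
    return grad

# Example: loss(w) = 0.5 * ||w||^2 has exact gradient w.
w = np.array([0.3, -1.2, 2.0])
assert np.allclose(central_difference_grad(lambda v: 0.5 * v @ v, w), w, atol=1e-4)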
2 changes: 1 addition & 1 deletion river/metrics/expected_mutual_info.pyx
@@ -41,7 +41,7 @@ def expected_mutual_info(confusion_matrix):
the AMI will be one order of magnitude slower than most other implemented metrics.
Note that, different form most of the implementations of other mutual information metrics,
- the expected mutual information wil be implemented using numpy arrays. This implementation
+ the expected mutual information will be implemented using numpy arrays. This implementation
inherits from the implementation of the expected mutual information in scikit-learn.
Parameters
2 changes: 1 addition & 1 deletion river/metrics/mutual_info.py
@@ -119,7 +119,7 @@ class NormalizedMutualInfo(metrics.base.MultiClassMetric):
agreement solely due to chance); as a result, the Adjusted Mutual Info Score will mostly be preferred.
However, this metric is still symmetric, which means that switching true and predicted labels will not
alter the score value. This fact can be useful when the metric is used to measure the agreement between
- two indepedent label solutions on the same dataset, when the ground truth remains unknown.
+ two independent label solutions on the same dataset, when the ground truth remains unknown.
Another advantage of the metric is that as it is based on the calculation of entropy-related measures,
it is independent of the permutation of class/cluster labels.
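The symmetry can be checked directly; hedged: `update(y_true, y_pred)` and `get()` follow river's metric interface, and the labels are illustrative.

from river import metrics

pairs = [(0, 1), (0, 1), (1, 0), (1, 0), (2, 2), (2, 2)]  # (true label, predicted cluster)

nmi, nmi_swapped = metrics.NormalizedMutualInfo(), metrics.NormalizedMutualInfo()
for yt, yp in pairs:
    nmi.update(yt, yp)
    nmi_swapped.update(yp, yt)  # switch the roles of true and predicted labels

assert abs(nmi.get() - nmi_swapped.get()) < 1e-12  # switching labels does not change the score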
2 changes: 1 addition & 1 deletion river/metrics/rand.py
@@ -119,7 +119,7 @@ class AdjustedRand(metrics.base.MultiClassMetric):
The Adjusted Rand Index is the corrected-for-chance version of the Rand Index [^1] [^2].
Such a correction for chance establishes a baseline by using the expected similarity
- of all pair-wise comparisions between clusterings specified by a random model.
+ of all pair-wise comparisons between clusterings specified by a random model.
Traditionally, the Rand Index was corrected using the Permutation Model for Clustering.
However, the premises of the permutation model are frequently violated; in many
2 changes: 1 addition & 1 deletion river/metrics/silhouette.py
@@ -51,7 +51,7 @@ class Silhouette(metrics.base.ClusteringMetric):
References
----------
- [^1]: Rousseeuw, P. (1987). Silhouettes: a graphical aid to the intepretation and validation
+ [^1]: Rousseeuw, P. (1987). Silhouettes: a graphical aid to the interpretation and validation
of cluster analysis 20, 53 - 65. DOI: 10.1016/0377-0427(87)90125-7
[^2]: Bifet, A. et al. (2018). "Machine Learning for Data Streams".
DOI: 10.7551/mitpress/10654.001.0001.
4 changes: 2 additions & 2 deletions river/metrics/vbeta.py
@@ -120,7 +120,7 @@ class Completeness(metrics.base.MultiClassMetric):
the proposed cluster distribution given the class of the component data points.
However, in the worst case scenario, each class is represented by every cluster
with a distribution equal to the distribution of cluster sizes. Therefore,
- symmetric to the claculation above, we define completeness as:
+ symmetric to the calculation above, we define completeness as:
$$
c = \begin{cases}
@@ -209,7 +209,7 @@ class VBeta(metrics.base.MultiClassMetric):
It provides an elegant solution to many problems that affect previously defined
cluster evaluation measures including
- * Dependance of clustering algorithm or dataset,
+ * Dependence of clustering algorithm or dataset,
* The "problem of matching", where the clustering of only a portion of data
points are evaluated, and
2 changes: 1 addition & 1 deletion river/misc/sdft.py
@@ -38,7 +38,7 @@ class SDFT(base.Base):
References
----------
- [^1]: [Jacobsen, E. asample_average.pynd Lyons, R., 2003. The sliding DFT. IEEE Signal Processing Magazine, 20(2), pp.74-80.](https://www.comm.utoronto.ca/~dimitris/ece431/slidingdft.pdf)
+ [^1]: [Jacobsen, E. and Lyons, R., 2003. The sliding DFT. IEEE Signal Processing Magazine, 20(2), pp.74-80.](https://www.comm.utoronto.ca/~dimitris/ece431/slidingdft.pdf)
[^2]: [Understanding and Implementing the Sliding DFT](https://www.dsprelated.com/showarticle/776.php)
"""
2 changes: 1 addition & 1 deletion river/naive_bayes/bernoulli.py
@@ -261,7 +261,7 @@ def joint_log_likelihood_many(self, X: pd.DataFrame) -> pd.DataFrame:
X[missing] = 0
if is_sparse:
# The new values need to be converted to preserve the sparseness of the dataframe.
- # Input values can be intergers or floats, converting all to float preserves the behaviour without the need for complex conversion logic.
+ # Input values can be integers or floats, converting all to float preserves the behaviour without the need for complex conversion logic.
X = X.astype(pd.SparseDtype(float, 0.0))

index, columns = X.index, X.columns
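As an aside, the conversion mentioned in the comment relies on pandas' sparse extension dtype; a small standalone illustration, not tied to river, with made-up column names:

import pandas as pd

X = pd.DataFrame({
    "a": pd.arrays.SparseArray([0.0, 0.0, 1.0]),
    "b": pd.arrays.SparseArray([2.0, 0.0, 0.0]),
})
X["missing"] = 0                          # a freshly added dense integer column
X = X.astype(pd.SparseDtype(float, 0.0))  # every column becomes Sparse[float64, 0.0]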
8 changes: 4 additions & 4 deletions river/naive_bayes/test_naive_bayes.py
@@ -113,7 +113,7 @@ def test_learn_one_methods(model):
def test_learn_many_vs_learn_one(model, batch_model):
"""Assert that the Naive Bayes river models provide the same results when learning in
incremental and mini-batch modes. The models tested are MultinomialNB, BernoulliNB and
- ComplementNB with differents alpha parameters..
+ ComplementNB with different alpha parameters..
"""
for x, y in yield_dataset():
model.learn_one(x, y)
@@ -134,11 +134,11 @@ def test_learn_many_vs_learn_one(model, batch_model):
batch_model.predict_proba_many(x_batch)["no"][0]
)

- # Assert class probabilities are the same when trainig Naive Bayes in pure online and in batch.
+ # Assert class probabilities are the same when training Naive Bayes in pure online and in batch.
assert model["model"].p_class("yes") == batch_model["model"].p_class("yes")
assert model["model"].p_class("no") == batch_model["model"].p_class("no")

- # Assert conditionnal probabilities are the same when training Naive Bayes in pure online and
+ # Assert conditional probabilities are the same when training Naive Bayes in pure online and
# in batch.
if isinstance(model["model"], naive_bayes.BernoulliNB) or isinstance(
model["model"], naive_bayes.MultinomialNB
@@ -213,7 +213,7 @@ def test_river_vs_sklearn(model, sk_model, bag):
"""Assert that river Naive Bayes models and sklearn Naive Bayes models provide the same results
when the input data are the same. Also check that the behaviour of Naives Bayes models are the
same with dense and sparse dataframe. Models tested are MultinomialNB, BernoulliNB and
- ComplementNB with differents alpha parameters.
+ ComplementNB with different alpha parameters.
"""
for x, y in yield_batch_dataset():
model.learn_many(x, y)
2 changes: 1 addition & 1 deletion river/neighbors/knn_regressor.py
@@ -92,7 +92,7 @@ def _check_aggregation_method(self, method):
Parameters
----------
method
- The suplied aggregration method.
+ The supplied aggreration method.
"""
if method not in {self._MEAN, self._MEDIAN, self._WEIGHTED_MEAN}:
raise ValueError(
8 changes: 4 additions & 4 deletions river/proba/gaussian.py
@@ -270,11 +270,11 @@ def __repr__(self):
return f"𝒩(\n μ=({mu_str}),\n σ^2=(\n{var_str}\n )\n)"

def update(self, x):
- # TODO: add support for weigthed samples
+ # TODO: add support for weighted samples
self._var.update(x)

def revert(self, x):
- # TODO: add support for weigthed samples
+ # TODO: add support for weighted samples
self._var.revert(x)

def __call__(self, x: dict[str, float]):
@@ -285,11 +285,11 @@ def __call__(self, x: dict[str, float]):
try:
pdf_ = multivariate_normal([*self.mu.values()], var).pdf(x_)
return float(pdf_)
- # TODO: validate occurence of ValueError
+ # TODO: validate occurrence of ValueError
# The input matrix must be symmetric positive semidefinite.
except ValueError: # pragma: no cover
return 0.0
- # TODO: validate occurence of OverflowError
+ # TODO: validate occurrence of OverflowError
except OverflowError: # pragma: no cover
return 0.0
return 0.0 # pragma: no cover
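A hedged usage sketch of the distribution these methods belong to; the class is assumed to be the multivariate Gaussian exposed in `river.proba`, and the values are illustrative.

from river import proba

dist = proba.MultivariateGaussian()
for x in [{"x": 1.0, "y": 2.0}, {"x": 1.2, "y": 1.8}, {"x": 0.9, "y": 2.1}]:
    dist.update(x)  # maintain running means and covariances

density = dist({"x": 1.0, "y": 2.0})  # joint PDF; 0.0 on the error cases handled above
dist.revert({"x": 0.9, "y": 2.1})     # undo a previously seen observation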
2 changes: 1 addition & 1 deletion river/sketch/counter.py
@@ -87,7 +87,7 @@ class Counter(base.Base):
>>> cms[532]
15
- Keep in mind that CMS is an approximate sketch algorithm. Couting estimates for unseen values
+ Keep in mind that CMS is an approximate sketch algorithm. Counting estimates for unseen values
might not be always reliable:
>>> cms[1001]
4 changes: 2 additions & 2 deletions river/stats/kolmogorov_smirnov.py
@@ -168,15 +168,15 @@ class KolmogorovSmirnov(stats.base.Bivariate):
$$
This implementation is the incremental version of the previously mentioned statistics, with the change being in
- the ability to insert and remove an observation thorugh time. This can be done using a randomized tree called
+ the ability to insert and remove an observation through time. This can be done using a randomized tree called
Treap (or Cartesian Tree) [^2] with bulk operation and lazy propagation.
The implemented algorithm is able to perform the insertion and removal operations
in O(logN) with high probability and calculate the Kolmogorov-Smirnov test in O(1),
where N is the number of sample observations. This is a significant improvement compared
to the O(N logN) cost of non-incremental implementation.
- This implementation also supports the calculation of the Kuiper statistics. Different from the orginial
+ This implementation also supports the calculation of the Kuiper statistics. Different from the original
Kolmogorov-Smirnov statistics, Kuiper's test [^3] calculates the sum of the absolute sizes of the most positive and
most negative differences between the two cumulative distribution functions taken into account. As such,
Kuiper's test is very sensitive in the tails as at the median.
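A minimal usage sketch; hedged: the default constructor is assumed to compute the Kolmogorov-Smirnov variant, and `update(x, y)`/`get()` follow river's bivariate-statistic interface.

import random

from river import stats

ks = stats.KolmogorovSmirnov()
rng = random.Random(42)
for _ in range(500):
    ks.update(rng.gauss(0.0, 1.0), rng.gauss(0.5, 1.0))  # one observation from each sample

d_statistic = ks.get()  # current two-sample statistic, maintained incrementally by the treap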
2 changes: 1 addition & 1 deletion river/stats/test_stats.py
@@ -251,7 +251,7 @@ def test_bivariate(stat, func):
],
)
def test_rolling_bivariate(stat, func):
- # Enough alread
+ # Enough already

def tail(iterable, n):
return collections.deque(iterable, maxlen=n)
2 changes: 1 addition & 1 deletion river/stream/iter_vaex.py
@@ -16,7 +16,7 @@ def iter_vaex(
Parameters
----------
X
- A vaex DataFrame housing the training featuers.
+ A vaex DataFrame housing the training features.
y
The column or expression containing the target variable.
features
4 changes: 2 additions & 2 deletions river/tree/mondrian/mondrian_tree_classifier.py
@@ -218,7 +218,7 @@ def _compute_split_time(
if isinstance(node, MondrianLeafClassifier):
return split_time
# Otherwise we apply Mondrian process dark magic :)
- # 1. We get the creation time of the childs (left and right is the same)
+ # 1. We get the creation time of the children (left and right is the same)
left, _ = node.children
child_time = left.time
# 2. We check if splitting time occurs before child creation time
@@ -400,7 +400,7 @@ def _go_downwards(self, x, y):
current_node = left

# This is the leaf containing the sample point (we've just
- # splitted the current node with the data point)
+ # split the current node with the data point)
leaf = current_node
self._update_downwards(x, y, leaf, False)
return leaf
4 changes: 2 additions & 2 deletions river/tree/mondrian/mondrian_tree_regressor.py
@@ -144,7 +144,7 @@ def _compute_split_time(
if isinstance(node, MondrianLeafRegressor):
return split_time
# Otherwise we apply Mondrian process dark magic :)
- # 1. We get the creation time of the childs (left and right is the same)
+ # 1. We get the creation time of the children (left and right is the same)
left, _ = node.children
child_time = left.time
# 2. We check if splitting time occurs before child creation time
@@ -326,7 +326,7 @@ def _go_downwards(self):
current_node = left

# This is the leaf containing the sample point (we've just
- # splitted the current node with the data point)
+ # split the current node with the data point)
leaf = current_node
self._update_downwards(leaf, False)
return leaf