diff --git a/.github/workflows/pr_precommit.yml b/.github/workflows/pr_precommit.yml index 809803fecc..92ed16ee98 100644 --- a/.github/workflows/pr_precommit.yml +++ b/.github/workflows/pr_precommit.yml @@ -30,6 +30,7 @@ jobs: repository: ${{ github.event.pull_request.head.repo.full_name }} ref: ${{ github.head_ref }} token: ${{ steps.app-token.outputs.token }} + fetch-depth: 0 - name: Setup Python 3.10 uses: actions/setup-python@v5 diff --git a/aeon/classification/dictionary_based/_redcomets.py b/aeon/classification/dictionary_based/_redcomets.py index 601f573bdf..6a716acf21 100644 --- a/aeon/classification/dictionary_based/_redcomets.py +++ b/aeon/classification/dictionary_based/_redcomets.py @@ -49,7 +49,7 @@ class REDCOMETS(BaseClassifier): ``-1`` means using all processors. parallel_backend : str, ParallelBackendBase instance or None, default=None Specify the parallelisation backend implementation in joblib, - if ``None`` a 'prefer' value of "threads" is used by default. + if ``None`` a ``prefer`` value of "threads" is used by default. Valid options are "loky", "multiprocessing", "threading" or a custom backend. See the joblib Parallel documentation for more details. diff --git a/aeon/classification/dictionary_based/_tde.py b/aeon/classification/dictionary_based/_tde.py index ba3eae07a5..6ccedd4854 100644 --- a/aeon/classification/dictionary_based/_tde.py +++ b/aeon/classification/dictionary_based/_tde.py @@ -32,9 +32,9 @@ class TemporalDictionaryEnsemble(BaseClassifier): Implementation of the dictionary based Temporal Dictionary Ensemble as described in [1]_. - Overview: Input 'n' series length 'm' with 'd' dimensions - TDE searches 'k' parameter values selected using a Gaussian processes - regressor, evaluating each with a LOOCV. It then retains 's' + Overview: Input ``n`` series length ``m`` with ``d`` dimensions + TDE searches ``k`` parameter values selected using a Gaussian processes + regressor, evaluating each with a LOOCV. 
It then retains ``s`` ensemble members. There are six primary parameters for individual classifiers: - alpha: alphabet size @@ -46,14 +46,14 @@ class TemporalDictionaryEnsemble(BaseClassifier): For any combination, an individual TDE classifier slides a window of length w along the series. The w length window is shortened to an l length word through taking a Fourier transform and keeping the - first l/2 complex coefficients. These lcoefficients are then discretised + first l/2 complex coefficients. These coefficients are then discretised into alpha possible values, to form a word length l using breakpoints found using b. A histogram of words for each series is formed and stored, using a spatial pyramid of h levels. For multivariate series, accuracy from a reduced histogram is used to select dimensions. fit involves finding n histograms. - predict uses 1 nearest neighbour with the histogram intersection + predict uses 1 nearest neighbor with the histogram intersection distance function. Parameters @@ -645,7 +645,7 @@ class IndividualTDE(BaseClassifier): See Also -------- - TemporalDictinaryEnsemble, SFA + TemporalDictionaryEnsemble, SFA TDE extends BOSS and uses SFA. Notes diff --git a/aeon/classification/dictionary_based/_weasel.py b/aeon/classification/dictionary_based/_weasel.py index 03b86a8c17..dceeac9fdb 100644 --- a/aeon/classification/dictionary_based/_weasel.py +++ b/aeon/classification/dictionary_based/_weasel.py @@ -23,7 +23,7 @@ class WEASEL(BaseClassifier): """ Word Extraction for Time Series Classification (WEASEL). - As described in [1]_. Overview: Input 'n' series length 'm' + As described in [1]_. Overview: Input ``n`` series length ``m`` WEASEL is a dictionary classifier that builds a bag-of-patterns using SFA for different window lengths and learns a logistic regression classifier on this bag. @@ -74,10 +74,10 @@ class WEASEL(BaseClassifier): Sets the feature selections strategy to be used. One of {"chi2", "none", "random"}. 
Large amounts of memory may beneeded depending on the setting of bigrams (true is more) or alpha (larger is more). - 'chi2' reduces the number of words, keeping those above the 'p_threshold'. - 'random' reduces the number to at most 'max_feature_count', + ``chi2`` reduces the number of words, keeping those above the ``p_threshold``. + ``random`` reduces the number to at most ``max_feature_count``, by randomly selecting features. - 'none' does not apply any feature selection and yields large bag of words. + ``none`` does not apply any feature selection and yields large bag of words. support_probabilities : bool, default: False If set to False, a RidgeClassifierCV will be trained, which has higher accuracy and is faster, yet does not support predict_proba. diff --git a/aeon/classification/dictionary_based/_weasel_v2.py b/aeon/classification/dictionary_based/_weasel_v2.py index b8d014a089..d6520835d0 100644 --- a/aeon/classification/dictionary_based/_weasel_v2.py +++ b/aeon/classification/dictionary_based/_weasel_v2.py @@ -34,7 +34,7 @@ class WEASEL_V2(BaseClassifier): """ Word Extraction for Time Series Classification (WEASEL) v2.0. - Overview: Input 'n' series length 'm' + Overview: Input ``n`` series length ``m`` WEASEL is a dictionary classifier that builds a bag-of-patterns using SFA for different window lengths and learns a logistic regression classifier on this bag. @@ -72,11 +72,11 @@ class WEASEL_V2(BaseClassifier): Sets the feature selections strategy to be used. Options from {"chi2_top_k", "none", "random"}. Large amounts of memory may be needed depending on the setting of bigrams (true is more) or alpha (larger is more). - 'chi2_top_k' reduces the number of words to at most 'max_feature_count', + ``chi2_top_k`` reduces the number of words to at most ``max_feature_count``, dropping values based on p-value. 
- 'random' reduces the number to at most 'max_feature_count', by randomly + ``random`` reduces the number to at most ``max_feature_count``, by randomly selecting features. - 'none' does not apply any feature selection and yields large bag of words + ``none`` does not apply any feature selection and yields large bag of words max_feature_count : int, default=30_000 size of the dictionary - number of words to use - if feature_selection set to "chi2" or "random". Else ignored. @@ -290,11 +290,11 @@ class WEASELTransformerV2: Sets the feature selections strategy to be used. Large amounts of memory may be needed depending on the setting of bigrams (true is more) or alpha (larger is more). - 'chi2_top_k' reduces the number of words to at most 'max_feature_count', + ``chi2_top_k`` reduces the number of words to at most ``max_feature_count``, dropping values based on p-value. - 'random' reduces the number to at most 'max_feature_count', + ``random`` reduces the number to at most ``max_feature_count``, by randomly selecting features. - 'none' does not apply any feature selection and yields large bag of words + ``none`` does not apply any feature selection and yields large bag of words max_feature_count : int, default=30_000 size of the dictionary - number of words to use - if feature_selection set to "chi2" or "random". Else ignored. 
diff --git a/aeon/classification/distance_based/tests/test_elastic_ensemble.py b/aeon/classification/distance_based/tests/test_elastic_ensemble.py index 53215dad2f..6605e99a31 100644 --- a/aeon/classification/distance_based/tests/test_elastic_ensemble.py +++ b/aeon/classification/distance_based/tests/test_elastic_ensemble.py @@ -83,7 +83,7 @@ def test_proportion_train_in_param_finding(): def test_all_distance_measures(): - """Test the 'all' option of the distance_measures parameter.""" + """Test the ``all`` option of the distance_measures parameter.""" X = np.random.random(size=(10, 1, 10)) y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) ee = ElasticEnsemble(distance_measures="all", proportion_train_in_param_finding=0.2) diff --git a/aeon/classification/feature_based/_catch22.py b/aeon/classification/feature_based/_catch22.py index 26a56d0a91..78cc7d506e 100644 --- a/aeon/classification/feature_based/_catch22.py +++ b/aeon/classification/feature_based/_catch22.py @@ -67,7 +67,7 @@ class Catch22Classifier(BaseClassifier): ``-1`` means using all processors. parallel_backend : str, ParallelBackendBase instance or None, default=None Specify the parallelisation backend implementation in joblib for Catch22, - if None a 'prefer' value of "threads" is used by default. + if None a ``prefer`` value of "threads" is used by default. Valid options are "loky", "multiprocessing", "threading" or a custom backend. See the joblib Parallel documentation for more details. class_weight{“balanced”, “balanced_subsample”}, dict or list of dicts, default=None diff --git a/aeon/classification/hybrid/_hivecote_v1.py b/aeon/classification/hybrid/_hivecote_v1.py index 22925487a6..73ad1f4a94 100644 --- a/aeon/classification/hybrid/_hivecote_v1.py +++ b/aeon/classification/hybrid/_hivecote_v1.py @@ -60,7 +60,7 @@ class HIVECOTEV1(BaseClassifier): ``-1`` means using all processors. 
parallel_backend : str, ParallelBackendBase instance or None, default=None Specify the parallelisation backend implementation in joblib for Catch22, - if None a 'prefer' value of "threads" is used by default. + if None a ``prefer`` value of "threads" is used by default. Valid options are "loky", "multiprocessing", "threading" or a custom backend. See the joblib Parallel documentation for more details. diff --git a/aeon/classification/hybrid/_hivecote_v2.py b/aeon/classification/hybrid/_hivecote_v2.py index f53167cc8a..bedb9de851 100644 --- a/aeon/classification/hybrid/_hivecote_v2.py +++ b/aeon/classification/hybrid/_hivecote_v2.py @@ -59,7 +59,7 @@ class HIVECOTEV2(BaseClassifier): ``-1`` means using all processors. parallel_backend : str, ParallelBackendBase instance or None, default=None Specify the parallelisation backend implementation in joblib for Catch22, - if None a 'prefer' value of "threads" is used by default. + if None a ``prefer`` value of "threads" is used by default. Valid options are "loky", "multiprocessing", "threading" or a custom backend. See the joblib Parallel documentation for more details.