Merge pull request #53 from petrobras/documentation_improvements

ricardoevvargas · web-flow · commit ed6e43cb1a94 · 2023-01-27T12:52:30.000-03:00
Incorporation of MAIS in the 3W toolkit: conclusion of the 1st phase.
diff --git a/toolkit/mais/README.md b/toolkit/mais/README.md
@@ -73,6 +73,15 @@ The name of theses experiments reflect what they implements, for examples, the e
 * mixed = both statistical and wavelet features; 
 * select = Feature selector; and
 * wavelets = Wavelets
+
+For example, the difference between experiments 4 and 6 is the kind of features that will be computed. In the first one, the exponentially weighted statistical features are used, and in the second one, just the wavelets. To achieve that, the only change needed is to assign the corresponding feature wrapper. The following image shows the wrapper for Experiment 4 of the examples list.
+
+![Statistical Wrapper](images/README/stats.jpg "Statistical features wrapper")
+
+And for experiment 6, we have:
+
+![Wavelets Wrapper](images/README/wavelets.jpg "Wavelets features wrapper")
+
 # How to use
 
 After creating the experiment and putting it into the experiment folder (for example, 'experiments/multiclass/experiments/example.py', 
diff --git a/toolkit/mais/experiments/multiclass/experiments/multi_stats_mrl_nonan.py b/toolkit/mais/experiments/multiclass/experiments/multi_stats_mrl_nonan.py
@@ -6,6 +6,7 @@
 """
 import numpy as np
 
+from sklearn.metrics import accuracy_score, get_scorer
 from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.impute import SimpleImputer
 from sklearn.decomposition import PCA
@@ -58,6 +59,7 @@ def raw_transform(self, event, transient_only=True, no_nans=True):
         labels = event["labels"]
         event_type = event["event_type"]
 
+        # trim the established fault if the event has a transient
         if transient_only and MAEDataset.TRANSIENT_CLASS[event_type]:
             transients = labels.values != event_type
             tags = tags[transients]
diff --git a/toolkit/mais/experiments/multiclass/experiments/multi_wavelets_mrl_nonan.py b/toolkit/mais/experiments/multiclass/experiments/multi_wavelets_mrl_nonan.py
@@ -60,6 +60,7 @@ def raw_transform(self, event, transient_only=True, no_nans=True):
         labels = event["labels"]
         event_type = event["event_type"]
 
+        # trim the established fault if the event has a transient
         if transient_only and MAEDataset.TRANSIENT_CLASS[event_type]:
             transients = labels.values != event_type
             tags = tags[transients]
diff --git a/toolkit/mais/images/README/stats.jpg b/toolkit/mais/images/README/stats.jpg
diff --git a/toolkit/mais/images/README/wavelets.jpg b/toolkit/mais/images/README/wavelets.jpg
diff --git a/toolkit/mais/mais/data/dataset.py b/toolkit/mais/mais/data/dataset.py
@@ -36,7 +36,7 @@ class MAEDataset:
 
         - **n_jobs: INT** -- number of processes to use
 
-        -- **events**
+        - **events**
 
     * Important fields:
 
@@ -57,17 +57,17 @@ class MAEDataset:
 
         - **_make_set()**
 
-        -  **process(fname, event_type)**
+        - **process(fname, event_type)**
 
     """
 
-    # tag corresponding to instance label
+    # Tag corresponding to instance label
     LABEL_NAME = "class"
 
-    # tag corresponding to index
+    # Tag corresponding to index
     INDEX_NAME = "timestamp"
 
-    # fault description
+    # Fault description
     CLASS_NAMES = {
         0: "NORMAL",
         1: "ABRUPT_INCREASE_OF_BSW",
@@ -80,10 +80,10 @@ class MAEDataset:
         8: "HYDRATE_IN_PRODUCTION_LINE",
     }
 
-    # list of used classes
+    # List of used classes
     KNOWN_CLASSES = list(CLASS_NAMES.keys())
 
-    # transient properties of events
+    # Transient properties of events
     TRANSIENT_CLASS = {
         0: False,
         1: True,
@@ -118,7 +118,9 @@ def __init__(
         feature_mapper=tuple,  # transformer from event to features
         n_jobs=-1,
     ):
-        """Load and process dataset using supplied strategies"""
+        """
+        Load and process dataset using supplied strategies.
+        """
 
         # save parameters
         self.root_dir = root_dir
@@ -127,14 +129,16 @@ def __init__(
         self.n_jobs = n_jobs
         self.feature_mapper = feature_mapper
 
-        # call the heavy load _make_set passing the (maybe Null) events
+        # Call the heavy load _make_set passing the (maybe Null) events
         self._make_set(events)
 
     def _instance_type(fname):
-        """Detects if instance type is selected
+        """
+        Detects if instance type is selected.
 
         * Parameters:
             - **fname**: STRING - name of the instance file
+
         * Returns:
             - **STRING** - string representing the instance type of the input file name
 
@@ -147,9 +151,26 @@ def _instance_type(fname):
             return "real"
 
     def load_events(data_root, n_jobs=-1):
-        """scan data_root for raw files and return dict. useful for preloads"""
+        """
+        Scan data_root for raw files and return dict. Useful for preloads.
+
+        * Parameters:
+            - **data_root: STRING** - base location of events separated by event type
+
+        * Returns:
+            - **events**: [LIST] - Optional list of preloaded events
+        """
 
         def _read(tgt, fname):
+            """
+            Return a dict with the summary of a target.
+
+            * Parameters:
+                - **tgt: STRING** - Target location
+                - **fname: STRING** - Filename
+
+            * Returns: **DICT** - summary of the target
+            """
             df = pandas.read_csv(
                 fname,
                 index_col=MAEDataset.INDEX_NAME,
@@ -179,7 +200,9 @@ def _read(tgt, fname):
     def transform_events(
         events, raw_mapper, tgt_events=None, instance_types=None, n_jobs=-1
     ):
-        """apply raw_mapper to list of events, filtering by target events and instance types"""
+        """
+        Apply raw_mapper to list of events, filtering by target events and instance types
+        """
         if tgt_events is not None:
             events = [e for e in events if (e["event_type"] in tgt_events)]
         if instance_types is not None:
@@ -205,9 +228,9 @@ def gather(transformed_events):
         return Dataset(X=X, y=y, g=g, g_class=g_class)
 
     def _make_set(self, events=None):
-        """Loads all instances of target classes from the desired types,
-           transforming the raw data to obtain its features by calling the
-           *feature_mapper()* method for each instance.
+        """
+        Loads all instances of target classes from the desired types, transforming the raw data to obtain its
+        features by calling the *feature_mapper()* method for each instance.
 
         * Parameters:
             - **events**: [LIST] - Optional list of preloaded events
diff --git a/toolkit/mais/mais/data/feature_mappers.py b/toolkit/mais/mais/data/feature_mappers.py
@@ -9,7 +9,10 @@
 
 
 class StatisticalFeatureMapper:
-    """generates statistical descriptor for a window of our data"""
+    """
+    Generates statistical descriptor for a window of our data
+
+    """
 
     FEATURES = {
         "mean": lambda x: x.mean(),
@@ -48,7 +51,9 @@ def __call__(self, tags, event_type=None):
 
 
 class TorchStatisticalFeatureMapper:
-    """PyTorch implementation of the statistical feature mapper"""
+    """
+    PyTorch implementation of the statistical feature mapper
+    """
 
     FEATURES = ["mean", "std", "skew", "kurt", "min", "1qrt", "med", "3qrt", "max"]
 
@@ -250,7 +255,10 @@ def __call__(self, tags, event_type=None):
 
 
 class MixedMapper:
-    """join features of multiple mappers. Feature sizes must be consistent"""
+    """
+    Join features of multiple mappers. Feature sizes must be consistent.
+
+    """
 
     def __init__(self, *args):
         self.mappers = args
diff --git a/toolkit/mais/mais/data/label_mappers.py b/toolkit/mais/mais/data/label_mappers.py
@@ -1,5 +1,3 @@
-""" Strategies for deciding on label for a given region of data, using pandas or torch backends """
-
 import pandas as pd
 import numpy as np
 import scipy.stats as sp
@@ -9,8 +7,25 @@
 
 
 class RollingLabelStrategy:
-    """Base class that just wraps applications of apply,
-    leverages pandas' Rolling function"""
+    """
+    Base class that just wraps applications of apply,
+    leverages pandas' Rolling function
+
+
+    * Constructor arguments:
+        - **window_size: INT** -- Size of sliding window
+
+        - **stride: INT** -- Number of samples between consecutive windows
+
+        - **offset: INT** -- Control how much to offset each window
+
+    * Methods:
+
+        - **apply(y, event_type)**
+
+        - **__call__(labels, event_type)**
+
+    """
 
     def __init__(self, window_size, stride=1, offset=0):
         self.window_size = window_size
@@ -29,26 +44,36 @@ def f(y):
 
 
 class BinaryMCLStrategy(RollingLabelStrategy):
-    """Window label gets assigned to most common value,
-    mapping transients and faults of ALL classes to true"""
+    """
+    Window label gets assigned to most common value,
+    mapping transients and faults of ALL classes to true
+    """
 
     def apply(self, y, event_type=None):
-        """map all fault types to True and apply mode over window"""
+        """
+        Map all fault types to True and apply mode over window
+        """
         return sp.mode(y > 0)[0]
 
 
 class MulticlassMCLStrategy(RollingLabelStrategy):
-    """Window label gets assigned to most common value,
-    mapping transients and faults to the CORRESPONDING CLASS CODE"""
+    """
+    Window label gets assigned to most common value,
+    mapping transients and faults to the CORRESPONDING CLASS CODE
+    """
 
     def apply(self, y, event_type=None):
-        """map transient codes to fault codes and apply mode over window"""
+        """
+        Map transient codes to fault codes and apply mode over window
+        """
         return sp.mode(y % 100)[0]
 
 
 class OVAMCLStrategy(RollingLabelStrategy):
-    """Window label gets assigned to most common value,
-    mapping transients and faults of SPECIFIC CLASS to true"""
+    """
+    Window label gets assigned to most common value,
+    mapping transients and faults of SPECIFIC CLASS to true
+    """
 
     def __init__(self, fault_code, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -59,8 +84,10 @@ def apply(self, y, event_type=None):
 
 
 class TorchLabelStrategy:
-    """Base class that just wraps applications of apply,
-    leverages pytorch unfold function"""
+    """
+    Base class that just wraps applications of apply,
+    leverages pytorch unfold function
+    """
 
     def __init__(self, window_size, stride=1, offset=0):
         self.window_size = window_size
@@ -99,21 +126,27 @@ def __call__(self, labels, event_type):
 
 
 class TorchBinaryMCLStrategy(TorchLabelStrategy):
-    """any fault indicator, most common label"""
+    """
+    Any fault indicator, most common label
+    """
 
     def apply(self, y, event_type=None):
         return torch.mode(y, dim=-1)[0] > 0
 
 
 class TorchBinaryMRLStrategy(TorchLabelStrategy):
-    """any fault indicator, most recent label"""
+    """
+    Any fault indicator, most recent label
+    """
 
     def apply(self, y, event_type=None):
         return y[:, -1] > 0
 
 
 class TorchOVAMCLStrategy(TorchLabelStrategy):
-    """specific class indicator, most common label"""
+    """
+    Specific class indicator, most common label
+    """
 
     def __init__(self, fault_code, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -124,7 +157,9 @@ def apply(self, y, event_type=None):
 
 
 class TorchOVATransientMCLStrategy(TorchLabelStrategy):
-    """transients of specific class, most common label"""
+    """
+    Transients of specific class, most common label
+    """
 
     def __init__(self, fault_code, *args, **kwargs):
         super().__init__(*args, **kwargs)
diff --git a/toolkit/mais/mais/data/utils.py b/toolkit/mais/mais/data/utils.py
@@ -3,13 +3,37 @@
 
 
 class StratifiedGroupKFold(BaseCrossValidator):
-    """GroupKFold with stratification based on the event type"""
+    """
+    GroupKFold with stratification based on the event type.
+
+    * Constructor arguments:
+        - **n_splits: int** -- Number of folds
+        - **event_types** -- Event type of each group, indexed by group id; used to stratify the folds
+
+    * Methods:
+        - **split()**
+
+        - **get_n_splits()**
+    """
 
     def __init__(self, n_splits, event_types):
         self.base_splitter = StratifiedKFold(n_splits)
         self.event_types = event_types
 
     def split(self, X, y, groups):
+        """
+        Create the splits.
+
+        * Parameters:
+            - **X: np.ndarray** - Data
+
+            - **y: np.array** - Labels
+
+            - **groups: np.array** - Groups
+
+        * Yields:
+            - **splits**: [TUPLE] - Tuple with the index of training and test samples for each fold.
+        """
         unique_g = np.unique(groups)
         event_y = np.array([self.event_types[_] for _ in unique_g])
         indices = np.arange(groups.size)
diff --git a/toolkit/mais/mais/visualization/generate_report.py b/toolkit/mais/mais/visualization/generate_report.py