Merge pull request #450 from jspeed-meyers/add_comments

cglewis · web-flow · commit 839b89089c40 · 2019-10-16T10:30:46.000+13:00
added comments
diff --git a/networkml/NetworkML.py b/networkml/NetworkML.py
@@ -16,35 +16,65 @@
 
 class NetworkML():
     """'
-    Main class to run different algorithms against different network
-    traffic data sources
+    Main class that instantiates prediction models of the types of devices found
+    in computer network traffic and whether that device is acting normally
+    given its type (also based on network traffic). The three model types
+    built in to this class are random forests, neural networks, and stochastic
+    outlier selection (SOS).
     """
 
     def __init__(self):
+
+        ## Set logging information for instance
         self.logger = logging.getLogger(__name__)
         logging.basicConfig(level=logging.INFO)
 
+        ## Take arguments from command line
         self.args = None
         self.read_args()
+
+        ## Take input from configuration file
         self.get_config()
         self.common = Common(config=self.config)
+
+        ## Instantiate a logger to log messages to aid debugging
         self.logger = Common().setup_logger(self.logger)
+
+        ## Add network traffic files for parsing
         self.get_files()
         self.model_hash = None
         self.model = Model(duration=self.duration, hidden_size=None,
                            model_type=self.args.algorithm)
+
+        ## Check whether operation is evaluation, train, or test
+        ## Evaluation returns predictions that are useful for the deployment
+        ## of networkml in an operational environment.
         if self.args.operation == 'eval':
             self.load_model()
+
+            ## Check for model type specified
+            ## onelayer refers to a one-layer neural network
             if self.args.algorithm == 'onelayer':
                 BaseAlgorithm(files=self.files, config=self.config,
-                              model=self.model, model_hash=self.model_hash, model_path=self.args.trained_model).eval(self.args.algorithm)
+                              model=self.model, model_hash=self.model_hash,
+                              model_path=self.args.trained_model).eval(self.args.algorithm)
+
+            ## Random forests refers to a decision tree-based model
             elif self.args.algorithm == 'randomforest':
                 BaseAlgorithm(files=self.files, config=self.config,
-                              model=self.model, model_hash=self.model_hash, model_path=self.args.trained_model).eval(self.args.algorithm)
+                              model=self.model, model_hash=self.model_hash,
+                              model_path=self.args.trained_model).eval(self.args.algorithm)
+
+            ## SOS refers to the stochastic outlier selection model
             elif self.args.algorithm == 'sos':
                 from networkml.algorithms.sos.eval_SoSModel import eval_pcap
                 eval_pcap(self.args.path, self.conf_labels, self.time_const)
+
+        ## Train entails training a new model on specific packet captures
         elif self.args.operation == 'train':
+
+            ## Check for model type specified
+            ## onelayer refers to a one-layer neural network
             if self.args.algorithm == 'onelayer':
                 m = MLPClassifier(
                     (self.state_size),
@@ -54,7 +84,10 @@ def __init__(self):
                 )
                 BaseAlgorithm(files=self.files, config=self.config,
                               model=self.model, model_hash=self.model_hash,
-                              model_path=self.args.trained_model).train(self.args.path, self.args.save, m, self.args.algorithm)
+                              model_path=self.args.trained_model).train(self.args.path,
+                                self.args.save, m, self.args.algorithm)
+
+            ## Random forests refers to a decision tree-based model
             elif self.args.algorithm == 'randomforest':
                 m = RandomForestClassifier(
                     n_estimators=100,
@@ -63,24 +96,44 @@ def __init__(self):
                 )
                 BaseAlgorithm(files=self.files, config=self.config,
                               model=self.model, model_hash=self.model_hash,
-                              model_path=self.args.trained_model).train(self.args.path, self.args.save, m, self.args.algorithm)
+                              model_path=self.args.trained_model).train(self.args.path,
+                                self.args.save, m, self.args.algorithm)
+
+            ## SOS refers to the stochastic outlier selection model
             elif self.args.algorithm == 'sos':
                 from networkml.algorithms.sos.train_SoSModel import train
                 train(self.args.path, self.time_const, self.rnn_size,
                       self.conf_labels, self.args.save)
+
+        ## Test is for checking overall performance of networkML models for
+        ## the device classification task. It is a benchmarking operation.
         elif self.args.operation == 'test':
             self.load_model()
+
+            ## Check for model type specified
+            ## onelayer refers to a one-layer neural network
             if self.args.algorithm == 'onelayer':
                 BaseAlgorithm(files=self.files, config=self.config,
-                              model=self.model, model_hash=self.model_hash, model_path=self.args.trained_model).test(self.args.path, self.args.save)
+                              model=self.model, model_hash=self.model_hash,
+                              model_path=self.args.trained_model).test(self.args.path,
+                                self.args.save)
+
+            ## Random forests refers to a decision tree-based model
             elif self.args.algorithm == 'randomforest':
                 BaseAlgorithm(files=self.files, config=self.config,
-                              model=self.model, model_hash=self.model_hash, model_path=self.args.trained_model).test(self.args.path, self.args.save)
+                              model=self.model, model_hash=self.model_hash,
+                              model_path=self.args.trained_model).test(self.args.path,
+                                self.args.save)
+
+            ## SOS refers to the stochastic outlier selection model
             elif self.args.algorithm == 'sos':
-                self.logger.info(
-                    'There is no testing operation for the SoSModel.')
+                self.logger.info('There is no testing operation for the SoSModel.')
 
     def read_args(self):
+        """
+        Read arguments from command line to determine what operations to
+        implement.
+        """
         parser = argparse.ArgumentParser()
         parser.add_argument('--algorithm', '-a', default='onelayer',
                             choices=['onelayer', 'randomforest', 'sos'],
@@ -102,7 +155,12 @@ def read_args(self):
         return
 
     def get_files(self):
-        # TODO checking extensions here should be moved to parsers, and it should probably use 'magic' rather than extensions
+        """
+        Add directory of files or file for parsing.
+        """
+        # TODO checking extensions here should be moved to parsers, and it should
+        # probably use 'magic' rather than extensions. See Python magic library
+
         self.files = []
         if Path(self.args.path).is_dir():
             for root, dirnames, filenames in os.walk(self.args.path):
@@ -121,31 +179,60 @@ def get_files(self):
                 'Did not find file(s) from \'%s\'.', str(self.args.path))
         return
 
-    def get_config(self, cfg_file='networkml/configs/config.json', labels_file='networkml/configs/label_assignments.json'):
+    def get_config(self, cfg_file='networkml/configs/config.json',
+                   labels_file='networkml/configs/label_assignments.json'):
+        """
+        Load values from configuration file.
+
+        Args:
+            cfg_file: path to configuration file
+            labels_file: path to labels (or the types of devices predicted)
+        """
         try:
             with open(cfg_file, 'r') as config_file:
                 self.config = json.load(config_file)
+
+            ## Time constant is used for creating a moving average
             self.time_const = self.config['time constant']
+
+            ## State size sets the number of nodes in the neural network
             self.state_size = self.config['state size']
-            self.look_time = self.config['look time']
+
+            ## An amount of time set between investigations of a potentially
+            ## suspicious device
+            self.look_time = self.config['look time'] ## time in seconds
+
+            ## Threshold sets the confidence needed to identify abnormal
+            ## behavior
             self.threshold = self.config['threshold']
+
+            ## Set parameter for SOS model
             self.rnn_size = self.config['rnn size']
+
+            ## Duration for time window of network traffic for which to compute
+            ## information on features
             self.duration = self.config['duration']
+
             #self.batch_size = self.config['batch size']
+
+            ## Import device label typology
             with open(labels_file, 'r') as label_file:
                 labels = json.load(label_file)
             self.conf_labels = []
             for label in labels:
                 self.conf_labels.append(labels[label])
             self.conf_labels.append('Unknown')
             self.config['conf labels'] = self.conf_labels
+
         except Exception as e:  # pragma: no cover
             self.logger.error(
                 "Unable to read '%s' properly because: %s", cfg_file, str(e))
         return
 
     def load_model(self):
-        # Compute model hash
+        """
+        Load trained machine learning model.
+        """
         with open(self.args.trained_model, 'rb') as handle:
             self.model_hash = hashlib.sha224(handle.read()).hexdigest()
 
diff --git a/networkml/configs/README.md b/networkml/configs/README.md
@@ -1,28 +1,38 @@
-# Config Files Explained
+# Configuration Files Explained
 
 ## Overview
-Config Files define the variables that the NetworkML model will monitor, how 
-those variables are analyzed to determine device types operating on a network, 
-and what are the typical operating characteristics of common network devices.  
+These configuration files define the variables that the NetworkML model will
+monitor, how those variables are computed, and how the machine learning model
+should be executed.
 
-## Config 
-Establishes limits on variables used by the code to identify types of devices on various networks. 
-Also consolidates the location of these variables, allowing for ease of customization.
+## Config
+This file consolidates these variables into one location to make future
+adjustments to these variable values easier.
 
-### Config File Value Definitions
+### Configuration File Value Definitions
 
-1. batch size
-2. duration
-3. look time
-4. max port
-6. rnn size
-7. session threshhold
-8. source identifier
-9. state size
-10. threshhold
-11. time constant
+1. Batch size - The number of training examples in a single pass. This is a
+parameter used to train the stochastic outlier selection model.
+2. Duration - This variable defines the time window of network traffic for which to compute information on features.
+3. Look time - This variable defines (in seconds) the minimum time between
+re-investigation of a potentially suspicious device.
+4. Max Port - This variable sets the maximum port number for feature creation.
+All ports below this number are included as part of the feature creation process.
+1024 is the value because these are the so-called well-known ports, i.e. the
+most common ports.
+6. RNN Size - This variable is a parameter in the stochastic outlier selection
+model.
+7. Session Threshold - This is the minimum number of packets needed for a
+session to be included in analysis.
+8. Source Identifier - Variable for how networkML determines what device is
+initiating a session.
+9. State Size - A variable for the number of neurons (or nodes) in the neural
+network model.
+10. Threshold - A percentage threshold for the confidence needed to deem a session
+bin abnormal. 99 is an arbitrary cut point.
+11. Time Constant - This variable is used as part of an operation to take a
+moving average. The value 86,400 is the number of seconds in a day. (60 * 60 * 24)
 
 ## Label Assignments
-Defines the various device classes that the model will identify on a network. The model builds 
-a profile of typical behavior of the various device classes and can identify when these devices 
-are acting abnormally. This can be customized to cover the specific device classes needed by individual users.
+These labels define the various device classes that the model will identify on a network. The model builds a profile of typical behavior of these device classes and can identify when these devices are acting abnormally, e.g. when a printer is
+acting abnormally. These labels can be customized to the specific device classes needed by individual users.