Skip to content

Commit 839b890

Browse files
authored
Merge pull request #450 from jspeed-meyers/add_comments
added comments
2 parents babfb70 + f655f4d commit 839b890

2 files changed

Lines changed: 132 additions & 35 deletions

File tree

networkml/NetworkML.py

100755100644
Lines changed: 101 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,35 +16,65 @@
1616

1717
class NetworkML():
1818
"""
19-
Main class to run different algorithms against different network
20-
traffic data sources
19+
Main class that instantiates prediction models of the types of devices found
20+
in computer network traffic and whether that device is acting normally
21+
given its type (also based on network traffic). The three model types
22+
built in to this class are random forests, neural networks, and stochastic
23+
outlier selection (SOS).
2124
"""
2225

2326
def __init__(self):
27+
28+
## Set logging information for instance
2429
self.logger = logging.getLogger(__name__)
2530
logging.basicConfig(level=logging.INFO)
2631

32+
## Take arguments from command line
2733
self.args = None
2834
self.read_args()
35+
36+
## Take input from configuration file
2937
self.get_config()
3038
self.common = Common(config=self.config)
39+
40+
## Instantiate a logger to log messages to aid debugging
3141
self.logger = Common().setup_logger(self.logger)
42+
43+
## Add network traffic files for parsing
3244
self.get_files()
3345
self.model_hash = None
3446
self.model = Model(duration=self.duration, hidden_size=None,
3547
model_type=self.args.algorithm)
48+
49+
## Check whether operation is evaluation, train, or test
50+
## Evaluation returns predictions that are useful for the deployment
51+
## of networkml in an operational environment.
3652
if self.args.operation == 'eval':
3753
self.load_model()
54+
55+
## Check for model type specified
56+
## onelayer refers to a one-layer neural network
3857
if self.args.algorithm == 'onelayer':
3958
BaseAlgorithm(files=self.files, config=self.config,
40-
model=self.model, model_hash=self.model_hash, model_path=self.args.trained_model).eval(self.args.algorithm)
59+
model=self.model, model_hash=self.model_hash,
60+
model_path=self.args.trained_model).eval(self.args.algorithm)
61+
62+
## Random forests refers to a decision tree-based model
4163
elif self.args.algorithm == 'randomforest':
4264
BaseAlgorithm(files=self.files, config=self.config,
43-
model=self.model, model_hash=self.model_hash, model_path=self.args.trained_model).eval(self.args.algorithm)
65+
model=self.model, model_hash=self.model_hash,
66+
model_path=self.args.trained_model).eval(self.args.algorithm)
67+
68+
## SOS refers to the stochastic outlier selection model
4469
elif self.args.algorithm == 'sos':
4570
from networkml.algorithms.sos.eval_SoSModel import eval_pcap
4671
eval_pcap(self.args.path, self.conf_labels, self.time_const)
72+
73+
## Train entails training a new model on specific packet captures
4774
elif self.args.operation == 'train':
75+
76+
## Check for model type specified
77+
## onelayer refers to a one-layer neural network
4878
if self.args.algorithm == 'onelayer':
4979
m = MLPClassifier(
5080
(self.state_size),
@@ -54,7 +84,10 @@ def __init__(self):
5484
)
5585
BaseAlgorithm(files=self.files, config=self.config,
5686
model=self.model, model_hash=self.model_hash,
57-
model_path=self.args.trained_model).train(self.args.path, self.args.save, m, self.args.algorithm)
87+
model_path=self.args.trained_model).train(self.args.path,
88+
self.args.save, m, self.args.algorithm)
89+
90+
## Random forests refers to a decision tree-based model
5891
elif self.args.algorithm == 'randomforest':
5992
m = RandomForestClassifier(
6093
n_estimators=100,
@@ -63,24 +96,44 @@ def __init__(self):
6396
)
6497
BaseAlgorithm(files=self.files, config=self.config,
6598
model=self.model, model_hash=self.model_hash,
66-
model_path=self.args.trained_model).train(self.args.path, self.args.save, m, self.args.algorithm)
99+
model_path=self.args.trained_model).train(self.args.path,
100+
self.args.save, m, self.args.algorithm)
101+
102+
## SOS refers to the stochastic outlier selection model
67103
elif self.args.algorithm == 'sos':
68104
from networkml.algorithms.sos.train_SoSModel import train
69105
train(self.args.path, self.time_const, self.rnn_size,
70106
self.conf_labels, self.args.save)
107+
108+
## Test is for checking overall performance of networkML models for
109+
## the device classification task. It is a benchmarking operation.
71110
elif self.args.operation == 'test':
72111
self.load_model()
112+
113+
## Check for model type specified
114+
## onelayer refers to a one-layer neural network
73115
if self.args.algorithm == 'onelayer':
74116
BaseAlgorithm(files=self.files, config=self.config,
75-
model=self.model, model_hash=self.model_hash, model_path=self.args.trained_model).test(self.args.path, self.args.save)
117+
model=self.model, model_hash=self.model_hash,
118+
model_path=self.args.trained_model).test(self.args.path,
119+
self.args.save)
120+
121+
# Random forests refers to a decision tree-based model
76122
elif self.args.algorithm == 'randomforest':
77123
BaseAlgorithm(files=self.files, config=self.config,
78-
model=self.model, model_hash=self.model_hash, model_path=self.args.trained_model).test(self.args.path, self.args.save)
124+
model=self.model, model_hash=self.model_hash,
125+
model_path=self.args.trained_model).test(self.args.path,
126+
self.args.save)
127+
128+
## SOS refers to the stochastic outlier selection model
79129
elif self.args.algorithm == 'sos':
80-
self.logger.info(
81-
'There is no testing operation for the SoSModel.')
130+
self.logger.info('There is no testing operation for the SoSModel.')
82131

83132
def read_args(self):
133+
"""
134+
Read arguments from command line to determine what operations to
135+
implement.
136+
"""
84137
parser = argparse.ArgumentParser()
85138
parser.add_argument('--algorithm', '-a', default='onelayer',
86139
choices=['onelayer', 'randomforest', 'sos'],
@@ -102,7 +155,12 @@ def read_args(self):
102155
return
103156

104157
def get_files(self):
105-
# TODO checking extensions here should be moved to parsers, and it should probably use 'magic' rather than extensions
158+
"""
159+
Add directory of files or file for parsing.
160+
"""
161+
# TODO checking extensions here should be moved to parsers, and it should
162+
# probably use 'magic' rather than extensions. See Python magic library
163+
106164
self.files = []
107165
if Path(self.args.path).is_dir():
108166
for root, dirnames, filenames in os.walk(self.args.path):
@@ -121,31 +179,60 @@ def get_files(self):
121179
'Did not find file(s) from \'%s\'.', str(self.args.path))
122180
return
123181

124-
def get_config(self, cfg_file='networkml/configs/config.json', labels_file='networkml/configs/label_assignments.json'):
182+
def get_config(self, cfg_file='networkml/configs/config.json',
183+
labels_file='networkml/configs/label_assignments.json'):
184+
"""
185+
Load values from configuration file.
186+
187+
Args:
188+
cfg_file: path to configuration file
189+
labels_file: path to labels (or the types of devices predicted)
190+
"""
125191
try:
126192
with open(cfg_file, 'r') as config_file:
127193
self.config = json.load(config_file)
194+
195+
## Time constant is used for creating a moving average
128196
self.time_const = self.config['time constant']
197+
198+
## State size sets the number of nodes in the neural network
129199
self.state_size = self.config['state size']
130-
self.look_time = self.config['look time']
200+
201+
## An amount of time set between investigations of a potentially
202+
## suspicious device
203+
self.look_time = self.config['look time'] ## time in seconds
204+
205+
## Threshold sets the confidence needed to identify abnormal
206+
## behavior
131207
self.threshold = self.config['threshold']
208+
209+
## Set parameter for SOS model
132210
self.rnn_size = self.config['rnn size']
211+
212+
## Duration for time window of network traffic for which to compute
213+
## information on features
133214
self.duration = self.config['duration']
215+
134216
#self.batch_size = self.config['batch size']
217+
218+
## Import device label typology
135219
with open(labels_file, 'r') as label_file:
136220
labels = json.load(label_file)
137221
self.conf_labels = []
138222
for label in labels:
139223
self.conf_labels.append(labels[label])
140224
self.conf_labels.append('Unknown')
141225
self.config['conf labels'] = self.conf_labels
226+
142227
except Exception as e: # pragma: no cover
143228
self.logger.error(
144229
"Unable to read '%s' properly because: %s", cfg_file, str(e))
145230
return
146231

147232
def load_model(self):
148-
# Compute model hash
233+
"""
234+
Load trained machine learning model.
235+
"""
149236
with open(self.args.trained_model, 'rb') as handle:
150237
self.model_hash = hashlib.sha224(handle.read()).hexdigest()
151238

networkml/configs/README.md

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,38 @@
1-
# Config Files Explained
1+
# Configuration Files Explained
22

33
## Overview
4-
Config Files define the variables that the NetworkML model will monitor, how
5-
those variables are analyzed to determine device types operating on a network,
6-
and what are the typical operating characteristics of common network devices.
4+
These configuration files define the variables that the NetworkML model will
5+
monitor, how those variables are computed, and how the machine learning model
6+
should be executed.
77

8-
## Config
9-
Establishes limits on variables used by the code to identify types of devices on various networks.
10-
Also consolidates the location of these variables, allowing for ease of customization.
8+
## Config
9+
This file consolidates these variables into one location to make future
10+
adjustments to these variable values easier.
1111

12-
### Config File Value Definitions
12+
### Configuration File Value Definitions
1313

14-
1. batch size
15-
2. duration
16-
3. look time
17-
4. max port
18-
6. rnn size
19-
7. session threshhold
20-
8. source identifier
21-
9. state size
22-
10. threshhold
23-
11. time constant
14+
1. Batch size - The number of training examples in a single pass. This is a
15+
parameter used to train the stochastic outlier selection model.
16+
2. Duration - This variable defines the time window of network traffic for which to compute information on features.
17+
3. Look time - This variable defines (in seconds) the minimum time between
18+
re-investigation of a potentially suspicious device.
19+
4. Max Port - This variable sets the maximum port number for feature creation.
20+
All ports below this number are included as part of the feature creation process.
21+
The value is 1024 because ports below 1024 are the so-called well-known ports, i.e. the
22+
most common ports.
23+
6. RNN Size - This variable is a parameter in the stochastic outlier selection
24+
model.
25+
7. Session Threshold - This is the minimum number of packets needed for a
26+
session to be included in analysis.
27+
8. Source Identifier - Variable for how networkML determines what device is
28+
initiating a session.
29+
9. State Size - A variable for the number of neurons (or nodes) in the neural
30+
network model.
31+
10. Threshold - A percentage threshold for the confidence needed to deem a session
32+
bin abnormal. 99 is an arbitrary cut point.
33+
11. Time Constant - This variable is used as part of an operation to take a
34+
moving average. The value 86,400 is the number of seconds in a day. (60 * 60 * 24)
2435

2536
## Label Assignments
26-
Defines the various device classes that the model will identify on a network. The model builds
27-
a profile of typical behavior of the various device classes and can identify when these devices
28-
are acting abnormally. This can be customized to cover the specific device classes needed by individual users.
37+
These labels define the various device classes that the model will identify on a network. The model builds a profile of typical behavior of these device classes and can identify when these devices are acting abnormally, e.g. when a printer is
38+
acting abnormally. These labels can be customized to the specific device classes needed by individual users.

0 commit comments

Comments
 (0)