Skip to content

Commit 31e98ec

Browse files
committed
Update
1 parent 8558b6a commit 31e98ec

File tree

6 files changed

+38
-28
lines changed

6 files changed

+38
-28
lines changed
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"qps_progress": [0.0, 0.0, 0.0, 0.0, 0.01960, 0.03921, 0.05882, 0.07843, 0.09803, 0.11764, 0.13725, 0.15686, 0.17647, 0.19607, 0.21568, 0.23529, 0.25490, 0.27450, 0.29411, 0.31372, 0.33333, 0.35294, 0.37254, 0.39215, 0.41176, 0.43137, 0.45098, 0.47058, 0.49019, 0.50980, 0.52941, 0.54901, 0.56862, 0.58823, 0.60784, 0.62745, 0.64705, 0.66666, 0.68627, 0.70588, 0.72549, 0.74509, 0.76470, 0.78431, 0.80392, 0.82352, 0.84313, 0.86274, 0.88235, 0.90196, 0.92156, 0.94117, 0.96078, 0.98039, 1.0], "qps_mean": [0.10791, 1.87514, 2.57384, 2.67391, 3.02341, 3.12603, 3.32061, 3.53444, 3.46963, 3.26175, 3.26541, 3.25010, 3.12243, 3.58137, 3.23714, 3.23993, 3.01573, 2.96503, 2.97348, 2.94934, 2.89418, 2.66501, 2.35917, 2.42358, 2.69232, 2.77047, 2.54807, 2.37015, 2.47146, 2.20014, 2.17202, 2.25608, 2.19878, 2.25824, 1.92781, 1.99448, 2.14607, 2.05767, 1.95278, 2.01574, 1.68691, 1.55761, 1.66053, 1.67427, 1.86622, 1.76643, 1.60481, 1.53903, 1.67944, 1.72388, 1.51765, 1.32410, 1.52983, 1.55146, 1.87647], "qps_stddev": [0, 1.25455, 1.46699, 1.25969, 1.04913, 1.22202, 1.40702, 1.45786, 1.42813, 1.35803, 1.43339, 1.41791, 1.35332, 1.36855, 1.37899, 1.41477, 1.29630, 1.38904, 1.43945, 1.31643, 1.10465, 1.17494, 1.17286, 1.36767, 1.29234, 1.36281, 1.18392, 1.26153, 1.27737, 1.07307, 1.00691, 1.19452, 1.03432, 1.14718, 0.92830, 0.94255, 1.03902, 1.05562, 0.93414, 0.96145, 0.77778, 0.58446, 0.72601, 0.67886, 0.70404, 0.60771, 0.59011, 0.89848, 1.02205, 1.00180, 0.63176, 0.31394, 0.70842, 0.66721, 0.66586], "cpu_progress": [0.0, 0.00806, 0.03225, 0.05645, 0.08064, 0.10483, 0.12903, 0.15322, 0.17741, 0.20161, 0.22580, 0.25, 0.27419, 0.29838, 0.32258, 0.34677, 0.37096, 0.39516, 0.41935, 0.44354, 0.46774, 0.49193, 0.51612, 0.54032, 0.56451, 0.58870, 0.61290, 0.63709, 0.66129, 0.68548, 0.70967, 0.73387, 0.75806, 0.78225, 0.80645, 0.83064, 0.85483, 0.87903, 0.90322, 0.92741, 0.95161, 0.97580, 1.0], "cpu_util": [1.73577, 14.50419, 25.70489, 52.76610, 63.46297, 80.74637, 88.01955, 93.27711, 
94.66833, 98.84798, 98.16609, 99.82730, 99.97933, 99.97475, 99.94684, 99.97874, 99.98940, 99.98598, 99.98760, 99.98402, 99.98447, 99.98795, 99.98720, 99.98923, 99.98865, 99.97695, 99.96919, 99.98981, 99.98569, 99.98836, 99.98894, 99.98940, 99.99044, 99.99143, 99.98836, 99.99143, 99.99096, 99.98766, 99.99085, 99.98963, 99.99125, 99.98842, 99.99154]}
1+
{"qps_progress": [0.0, 0.0, 0.0, 0.01960, 0.03921, 0.05882, 0.07843, 0.09803, 0.11764, 0.13725, 0.15686, 0.17647, 0.19607, 0.21568, 0.23529, 0.25490, 0.27450, 0.29411, 0.31372, 0.33333, 0.35294, 0.37254, 0.39215, 0.41176, 0.43137, 0.45098, 0.47058, 0.49019, 0.50980, 0.52941, 0.54901, 0.56862, 0.58823, 0.60784, 0.62745, 0.64705, 0.66666, 0.68627, 0.70588, 0.72549, 0.74509, 0.76470, 0.78431, 0.80392, 0.82352, 0.84313, 0.86274, 0.88235, 0.90196, 0.92156, 0.94117, 0.96078, 0.98039, 1.0], "qps_mean": [2.64677, 3.57961, 3.68220, 4.19988, 3.85661, 4.03202, 4.01345, 4.28924, 4.15916, 4.18421, 4.00328, 4.11490, 3.91890, 3.64332, 3.76922, 3.70205, 3.53869, 3.65586, 3.58430, 3.68154, 3.62205, 3.72631, 3.81293, 3.54382, 3.72380, 3.75496, 3.88210, 3.75489, 3.84858, 3.62499, 3.67680, 3.64462, 3.68641, 3.55463, 3.47014, 3.47452, 3.50587, 3.40765, 3.59155, 3.47037, 3.32335, 3.02397, 2.94533, 2.98257, 2.88237, 2.88561, 2.91041, 2.97865, 3.04269, 3.13992, 3.19752, 3.20575, 3.03941, 2.74009], "qps_stddev": [1.97793, 1.81928, 1.67036, 0.93039, 1.10248, 0.90809, 0.86348, 0.65000, 0.85314, 0.89402, 0.96294, 0.82436, 1.02399, 1.00079, 0.92081, 0.81030, 0.87219, 0.95310, 0.86728, 0.90045, 0.83746, 0.84886, 1.00717, 1.02732, 1.01025, 0.77168, 0.78320, 0.95893, 1.19147, 1.23287, 1.09565, 0.95003, 0.90181, 0.89878, 0.97484, 1.04238, 1.08577, 1.11040, 0.98032, 0.83285, 0.83222, 0.66948, 0.84222, 0.78958, 0.70269, 0.61645, 0.74523, 0.89199, 1.09117, 1.21397, 1.09409, 0.98614, 1.03202, 0.89128], "cpu_progress": [0.0, 0.14, 0.29, 0.43, 0.57, 0.71, 0.86, 1.0], "cpu_util": [44.75, 52.73, 76.51, 73.48, 90.63, 90.83, 99.17, 99.47]}

docs/source/performance_analysis/analysis.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Analyzing the performance
1+
Analyzing the Performance
22
=========================
33

44
.. currentmodule:: spdl.pipeline

docs/source/performance_analysis/logging.rst

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
Measuring the performance
2-
=========================
1+
Collecting the Runtime Statistics
2+
=================================
33

44
.. note::
55

6-
Meta employees, please refer to `this <https://fburl.com/workplace/goxtxyng>`_.
6+
If you are a Meta employee, please refer to `this <https://fburl.com/workplace/goxtxyng>`_.
77

88
.. py:currentmodule:: spdl.pipeline
99
@@ -31,7 +31,7 @@ Similarly for :py:class:`TaskPerfStats`
3131
#. In the ``interval_stats_callback`` method, save the fields of ``TaskPerfStats`` to
3232
a location you can access later. ††
3333
#. Create a factory function that takes the name of the stage function and
34-
returns a list of :py:class:`TaskHook`s applied to the stage.
34+
returns a list of :py:class:`TaskHook` s applied to the stage.
3535
#. Provide the factory function to :py:meth:`PipelineBuilder.build` method.
3636

3737
.. note::

docs/source/performance_analysis/noisy_neighbour.rst

+7-4
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@ for the CPU to launch GPU kernels in timely manner.
2222
We call this phenomenon "noisy neighbour".
2323

2424
To show the effect of the noisy neighbour, we conducted an experiment.
25-
We ran a training pipeline, and at the end of each epoch, we added a subprocess
26-
that serves no functionality but consumes significant CPU resource.
27-
We kept adding subprocesses until (in fact even after) the CPU utilization hit 100%.
25+
We ran a pipeline that trains an image classification model.
26+
After some steps, we spawned a subprocess that serves no functionality but consumes
27+
CPU resource.
28+
We repeated this until the CPU utilization hit 100%.
2829

2930
The following plot shows how the training speed (batch per second) drops as
3031
we add more and more CPU loads.
@@ -36,7 +37,9 @@ we add more and more CPU loads.
3637

3738
.. include:: ../plots/noisy_neighbour.txt
3839

39-
What this means is that data loading needs to be not only fast but also efficient.
40+
This suggests that data loading needs to be not only fast but also efficient.
4041
At some point, we thought that since we are using GPUs for model computation, we
4142
can use CPU resources for data loading, and we should be utilizing CPU as much as
4243
possible. This turned out to be an anti-pattern.
44+
45+
We now recommend keeping the CPU utilization at or below 40% for efficient training.

docs/source/performance_analysis/setup.rst

+6-8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
Bulding a Pipeline
2-
==================
1+
Building a Pipeline
2+
===================
33

44
.. py:currentmodule:: spdl.pipeline
55
@@ -69,13 +69,11 @@ Since downloading takes some time but does not consume much CPU resources,
6969
we make multiple download calls concurrently.
7070

7171
Decoding the raw data and applying preprocessing can be time-consuming and
72-
compute-intensive. In AI model training, it is recommended to keep the total
73-
CPU utilization at most around 40% for the sake of keeping training QPS high.
74-
(When CPU utilization is high, CPU cannot schedule GPU kernel launch in
75-
timely manner. Every increase in CPU utilization comes with a drop in
76-
training QPS.)
72+
compute-intensive.
73+
As `previously described <./noisy_neighbour.html>`_, it is recommended to keep
74+
the total CPU utilization at most around 40% to avoid QPS drop.
7775
However, we want to prevent the training from suffering from data starvation.
78-
For this purpose, we want to apply some concurrency to preprocessing stage.
76+
For this purpose, we apply some concurrency to the preprocessing stage.
7977

8078
The data are then batched and transferred to GPU. These stages usually
8179
do not require concurrency. Concurrent GPU data transfer is not feasible

docs/source/plots/noisy_neighbour.txt

+19-10
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,29 @@
33
<script src='https://cdn.plot.ly/plotly-2.34.0.min.js'></script>
44
<script>
55
function plot(div_id, data) {
6-
console.log(data);
6+
const x = data['qps_progress'].map(v => 100*v);
7+
const upper = data['qps_mean'].map((v, i) => v + data['qps_stddev'][i]);
8+
const lower = data['qps_mean'].map((v, i) => v - data['qps_stddev'][i]);
9+
const env_x = x.concat(x.slice().reverse());
10+
const env_y = upper.concat(lower.reverse());
711
const plot_data = [
812
{
9-
x: data['qps_progress'].map(v => 100*v),
13+
x,
1014
y: data['qps_mean'],
1115
name: 'Training Speed [Batch/sec]',
1216
line: {
1317
size: 0.1,
1418
},
1519
},
20+
{
21+
x: env_x,
22+
y: env_y,
23+
fill: "tozerox",
24+
fillcolor: "rgba(0, 100, 80, 0.2)",
25+
line: {color: "transparent"},
26+
showlegend: false,
27+
type: "scatter",
28+
},
1629
{
1730
x: data['cpu_progress'].map(v => 100*v),
1831
y: data['cpu_util'],
@@ -38,15 +51,10 @@
3851
yanchor: 'top'
3952
},
4053
xaxis: {
41-
title: {
42-
text: 'Training Progress',
43-
},
54+
title: 'Progress [%]',
4455
},
4556
yaxis: {
46-
title: {
47-
text: 'Training Speed [Batch/sec]',
48-
},
49-
range: [0, null],
57+
title: 'Training Speed [Batch/sec]',
5058
},
5159
yaxis2: {
5260
title: {
@@ -55,7 +63,8 @@
5563
showgrid: false,
5664
overlaying: 'y',
5765
side: 'right',
58-
range: [0, null],
66+
range: [0, null],
67+
zeroline: false,
5968
},
6069
},
6170
},

0 commit comments

Comments
 (0)