Skip to content

Commit 31e98ec

Browse files
committed
Update
1 parent 8558b6a commit 31e98ec

File tree

6 files changed

+38
-28
lines changed

6 files changed

+38
-28
lines changed
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"qps_progress": [0.0, 0.0, 0.0, 0.0, 0.01960, 0.03921, 0.05882, 0.07843, 0.09803, 0.11764, 0.13725, 0.15686, 0.17647, 0.19607, 0.21568, 0.23529, 0.25490, 0.27450, 0.29411, 0.31372, 0.33333, 0.35294, 0.37254, 0.39215, 0.41176, 0.43137, 0.45098, 0.47058, 0.49019, 0.50980, 0.52941, 0.54901, 0.56862, 0.58823, 0.60784, 0.62745, 0.64705, 0.66666, 0.68627, 0.70588, 0.72549, 0.74509, 0.76470, 0.78431, 0.80392, 0.82352, 0.84313, 0.86274, 0.88235, 0.90196, 0.92156, 0.94117, 0.96078, 0.98039, 1.0], "qps_mean": [0.10791, 1.87514, 2.57384, 2.67391, 3.02341, 3.12603, 3.32061, 3.53444, 3.46963, 3.26175, 3.26541, 3.25010, 3.12243, 3.58137, 3.23714, 3.23993, 3.01573, 2.96503, 2.97348, 2.94934, 2.89418, 2.66501, 2.35917, 2.42358, 2.69232, 2.77047, 2.54807, 2.37015, 2.47146, 2.20014, 2.17202, 2.25608, 2.19878, 2.25824, 1.92781, 1.99448, 2.14607, 2.05767, 1.95278, 2.01574, 1.68691, 1.55761, 1.66053, 1.67427, 1.86622, 1.76643, 1.60481, 1.53903, 1.67944, 1.72388, 1.51765, 1.32410, 1.52983, 1.55146, 1.87647], "qps_stddev": [0, 1.25455, 1.46699, 1.25969, 1.04913, 1.22202, 1.40702, 1.45786, 1.42813, 1.35803, 1.43339, 1.41791, 1.35332, 1.36855, 1.37899, 1.41477, 1.29630, 1.38904, 1.43945, 1.31643, 1.10465, 1.17494, 1.17286, 1.36767, 1.29234, 1.36281, 1.18392, 1.26153, 1.27737, 1.07307, 1.00691, 1.19452, 1.03432, 1.14718, 0.92830, 0.94255, 1.03902, 1.05562, 0.93414, 0.96145, 0.77778, 0.58446, 0.72601, 0.67886, 0.70404, 0.60771, 0.59011, 0.89848, 1.02205, 1.00180, 0.63176, 0.31394, 0.70842, 0.66721, 0.66586], "cpu_progress": [0.0, 0.00806, 0.03225, 0.05645, 0.08064, 0.10483, 0.12903, 0.15322, 0.17741, 0.20161, 0.22580, 0.25, 0.27419, 0.29838, 0.32258, 0.34677, 0.37096, 0.39516, 0.41935, 0.44354, 0.46774, 0.49193, 0.51612, 0.54032, 0.56451, 0.58870, 0.61290, 0.63709, 0.66129, 0.68548, 0.70967, 0.73387, 0.75806, 0.78225, 0.80645, 0.83064, 0.85483, 0.87903, 0.90322, 0.92741, 0.95161, 0.97580, 1.0], "cpu_util": [1.73577, 14.50419, 25.70489, 52.76610, 63.46297, 80.74637, 88.01955, 93.27711, 
94.66833, 98.84798, 98.16609, 99.82730, 99.97933, 99.97475, 99.94684, 99.97874, 99.98940, 99.98598, 99.98760, 99.98402, 99.98447, 99.98795, 99.98720, 99.98923, 99.98865, 99.97695, 99.96919, 99.98981, 99.98569, 99.98836, 99.98894, 99.98940, 99.99044, 99.99143, 99.98836, 99.99143, 99.99096, 99.98766, 99.99085, 99.98963, 99.99125, 99.98842, 99.99154]}
1+
{"qps_progress": [0.0, 0.0, 0.0, 0.01960, 0.03921, 0.05882, 0.07843, 0.09803, 0.11764, 0.13725, 0.15686, 0.17647, 0.19607, 0.21568, 0.23529, 0.25490, 0.27450, 0.29411, 0.31372, 0.33333, 0.35294, 0.37254, 0.39215, 0.41176, 0.43137, 0.45098, 0.47058, 0.49019, 0.50980, 0.52941, 0.54901, 0.56862, 0.58823, 0.60784, 0.62745, 0.64705, 0.66666, 0.68627, 0.70588, 0.72549, 0.74509, 0.76470, 0.78431, 0.80392, 0.82352, 0.84313, 0.86274, 0.88235, 0.90196, 0.92156, 0.94117, 0.96078, 0.98039, 1.0], "qps_mean": [2.64677, 3.57961, 3.68220, 4.19988, 3.85661, 4.03202, 4.01345, 4.28924, 4.15916, 4.18421, 4.00328, 4.11490, 3.91890, 3.64332, 3.76922, 3.70205, 3.53869, 3.65586, 3.58430, 3.68154, 3.62205, 3.72631, 3.81293, 3.54382, 3.72380, 3.75496, 3.88210, 3.75489, 3.84858, 3.62499, 3.67680, 3.64462, 3.68641, 3.55463, 3.47014, 3.47452, 3.50587, 3.40765, 3.59155, 3.47037, 3.32335, 3.02397, 2.94533, 2.98257, 2.88237, 2.88561, 2.91041, 2.97865, 3.04269, 3.13992, 3.19752, 3.20575, 3.03941, 2.74009], "qps_stddev": [1.97793, 1.81928, 1.67036, 0.93039, 1.10248, 0.90809, 0.86348, 0.65000, 0.85314, 0.89402, 0.96294, 0.82436, 1.02399, 1.00079, 0.92081, 0.81030, 0.87219, 0.95310, 0.86728, 0.90045, 0.83746, 0.84886, 1.00717, 1.02732, 1.01025, 0.77168, 0.78320, 0.95893, 1.19147, 1.23287, 1.09565, 0.95003, 0.90181, 0.89878, 0.97484, 1.04238, 1.08577, 1.11040, 0.98032, 0.83285, 0.83222, 0.66948, 0.84222, 0.78958, 0.70269, 0.61645, 0.74523, 0.89199, 1.09117, 1.21397, 1.09409, 0.98614, 1.03202, 0.89128], "cpu_progress": [0.0, 0.14, 0.29, 0.43, 0.57, 0.71, 0.86, 1.0], "cpu_util": [44.75, 52.73, 76.51, 73.48, 90.63, 90.83, 99.17, 99.47]}

docs/source/performance_analysis/analysis.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Analyzing the performance
1+
Analyzing the Performance
22
=========================
33

44
.. currentmodule:: spdl.pipeline

docs/source/performance_analysis/logging.rst

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
Measuring the performance
2-
=========================
1+
Collecting the Runtime Statistics
2+
=================================
33

44
.. note::
55

6-
Meta employees, please refer to `this <https://fburl.com/workplace/goxtxyng>`_.
6+
If you are a Meta employee, please refer to `this <https://fburl.com/workplace/goxtxyng>`_.
77

88
.. py:currentmodule:: spdl.pipeline
99
@@ -31,7 +31,7 @@ Similarly for :py:class:`TaskPerfStats`
3131
#. In the ``interval_stats_callback`` method, save the fields of ``TaskPerfStats`` to
3232
a location you can access later. ††
3333
#. Create a factory function that takes the name of the stage function and
34-
returns a list of :py:class:`TaskHook`s applied to the stage.
34+
returns a list of :py:class:`TaskHook` s applied to the stage.
3535
#. Provide the factory function to :py:meth:`PipelineBuilder.build` method.
3636

3737
.. note::

docs/source/performance_analysis/noisy_neighbour.rst

+7-4
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@ for the CPU to launch GPU kernels in timely manner.
2222
We call this phenomenon "noisy neighbour".
2323

2424
To show the effect of the noisy neighbour, we conducted an experiment.
25-
We ran a training pipeline, and at the end of each epoch, we added a subprocess
26-
that serves no functionality but consumes significant CPU resource.
27-
We kept adding subprocesses until (in fact even after) the CPU utilization hit 100%.
25+
We ran a pipeline that trains an image classification model.
26+
After some steps, we spawned a subprocess that serves no functionality but consumes
27+
CPU resource.
28+
We repeated this until the CPU utilization hit 100%.
2829

2930
The following plot shows how the training speed (batch per second) drops as
3031
we add more and more CPU loads.
@@ -36,7 +37,9 @@ we add more and more CPU loads.
3637

3738
.. include:: ../plots/noisy_neighbour.txt
3839

39-
What this means is that data loading needs to be not only fast but also efficient.
40+
This suggests that data loading needs to be not only fast but also efficient.
4041
At some point, we thought that since we are using GPUs for model computation, we
4142
can use CPU resources for data loading, and we should be utilizing CPU as much as
4243
possible. This turned out to be an anti-pattern.
44+
45+
We now recommend keeping the CPU utilization at or below 40% for efficient training.

docs/source/performance_analysis/setup.rst

+6-8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
Bulding a Pipeline
2-
==================
1+
Building a Pipeline
2+
===================
33

44
.. py:currentmodule:: spdl.pipeline
55
@@ -69,13 +69,11 @@ Since downloading takes some time but does not consume much CPU resources,
6969
we make multiple download calls concurrently.
7070

7171
Decoding the raw data and applying preprocessing can be time-consuming and
72-
compute-intensive. In AI model training, it is recommended to keep the total
73-
CPU utilization at most around 40% for the sake of keeping training QPS high.
74-
(When CPU utilization is high, CPU cannot schedule GPU kernel launch in
75-
timely manner. Every increase in CPU utilization comes with a drop in
76-
training QPS.)
72+
compute-intensive.
73+
As `previously described <./noisy_neighbour.html>`_, it is recommended to keep
74+
the total CPU utilization at most around 40% to avoid QPS drop.
7775
However, we want to prevent the training from suffering from data starvation.
78-
For this purpose, we want to apply some concurrency to preprocessing stage.
76+
For this purpose, we apply some concurrency to the preprocessing stage.
7977

8078
The data are then batched and transferred to GPU. These stages usually
8179
do not require concurrency. Concurrent GPU data transfer is not feasible

docs/source/plots/noisy_neighbour.txt

+19-10
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,29 @@
33
<script src='https://cdn.plot.ly/plotly-2.34.0.min.js'></script>
44
<script>
55
function plot(div_id, data) {
6-
console.log(data);
6+
const x = data['qps_progress'].map(v => 100*v);
7+
const upper = data['qps_mean'].map((v, i) => v + data['qps_stddev'][i]);
8+
const lower = data['qps_mean'].map((v, i) => v - data['qps_stddev'][i]);
9+
const env_x = x.concat(x.slice().reverse());
10+
const env_y = upper.concat(lower.reverse());
711
const plot_data = [
812
{
9-
x: data['qps_progress'].map(v => 100*v),
13+
x,
1014
y: data['qps_mean'],
1115
name: 'Training Speed [Batch/sec]',
1216
line: {
1317
size: 0.1,
1418
},
1519
},
20+
{
21+
x: env_x,
22+
y: env_y,
23+
fill: "tozerox",
24+
fillcolor: "rgba(0, 100, 80, 0.2)",
25+
line: {color: "transparent"},
26+
showlegend: false,
27+
type: "scatter",
28+
},
1629
{
1730
x: data['cpu_progress'].map(v => 100*v),
1831
y: data['cpu_util'],
@@ -38,15 +51,10 @@
3851
yanchor: 'top'
3952
},
4053
xaxis: {
41-
title: {
42-
text: 'Training Progress',
43-
},
54+
title: 'Progress [%]',
4455
},
4556
yaxis: {
46-
title: {
47-
text: 'Training Speed [Batch/sec]',
48-
},
49-
range: [0, null],
57+
title: 'Training Speed [Batch/sec]',
5058
},
5159
yaxis2: {
5260
title: {
@@ -55,7 +63,8 @@
5563
showgrid: false,
5664
overlaying: 'y',
5765
side: 'right',
58-
range: [0, null],
66+
range: [0, null],
67+
zeroline: false,
5968
},
6069
},
6170
},

0 commit comments

Comments
 (0)