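Gamma parameter gradients deviate strongly (mean abs diff ~3e-2 to ~6e-1, max abs diff up to 2.66), whereas all other parameter gradients, including the conv kernels, agree to within ~1.4e-13. Perhaps add a streamable LayerScale module?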
0.0.bias: mean abs diff=7.424616e-17, max abs diff=4.440892e-16,
0.0.weight: mean abs diff=1.307843e-14, max abs diff=1.403322e-13,
1.0.blocks.0.conv_dw.bias: mean abs diff=6.397660e-16, max abs diff=7.105427e-15,
1.0.blocks.0.conv_dw.weight: mean abs diff=7.089380e-16, max abs diff=3.641532e-14,
1.0.blocks.0.gamma: mean abs diff=5.871523e-01, max abs diff=2.660054e+00,
1.0.blocks.0.mlp.fc1.bias: mean abs diff=1.224335e-17, max abs diff=2.220446e-16,
1.0.blocks.0.mlp.fc1.weight: mean abs diff=5.823970e-17, max abs diff=1.776357e-15,
1.0.blocks.0.mlp.fc2.bias: mean abs diff=8.630249e-17, max abs diff=4.440892e-16,
1.0.blocks.0.mlp.fc2.weight: mean abs diff=1.474591e-16, max abs diff=8.881784e-15,
1.0.blocks.1.conv_dw.bias: mean abs diff=1.082467e-16, max abs diff=8.881784e-16,
1.0.blocks.1.conv_dw.weight: mean abs diff=7.973810e-16, max abs diff=1.243450e-14,
1.0.blocks.1.gamma: mean abs diff=2.162112e-01, max abs diff=1.134846e+00,
1.0.blocks.1.mlp.fc1.bias: mean abs diff=9.008534e-18, max abs diff=1.110223e-16,
1.0.blocks.1.mlp.fc1.weight: mean abs diff=1.266210e-16, max abs diff=7.105427e-15,
1.0.blocks.1.mlp.fc2.bias: mean abs diff=8.951173e-17, max abs diff=4.440892e-16,
1.0.blocks.1.mlp.fc2.weight: mean abs diff=7.805527e-17, max abs diff=4.440892e-15,
1.1.blocks.0.conv_dw.bias: mean abs diff=7.285839e-16, max abs diff=5.329071e-15,
1.1.blocks.0.conv_dw.weight: mean abs diff=1.218858e-16, max abs diff=3.552714e-15,
1.1.blocks.0.gamma: mean abs diff=2.083396e-01, max abs diff=1.931796e+00,
1.1.blocks.0.mlp.fc1.bias: mean abs diff=9.041187e-18, max abs diff=1.110223e-16,
1.1.blocks.0.mlp.fc1.weight: mean abs diff=3.701872e-17, max abs diff=3.552714e-15,
1.1.blocks.0.mlp.fc2.bias: mean abs diff=1.157061e-16, max abs diff=8.881784e-16,
1.1.blocks.0.mlp.fc2.weight: mean abs diff=6.458364e-17, max abs diff=3.552714e-15,
1.1.blocks.1.conv_dw.bias: mean abs diff=5.091413e-16, max abs diff=3.552714e-15,
1.1.blocks.1.conv_dw.weight: mean abs diff=2.511419e-16, max abs diff=9.769963e-15,
1.1.blocks.1.gamma: mean abs diff=1.005777e-01, max abs diff=9.112262e-01,
1.1.blocks.1.mlp.fc1.bias: mean abs diff=8.898251e-18, max abs diff=2.220446e-16,
1.1.blocks.1.mlp.fc1.weight: mean abs diff=5.329241e-17, max abs diff=2.842171e-14,
1.1.blocks.1.mlp.fc2.bias: mean abs diff=1.420739e-16, max abs diff=8.881784e-16,
1.1.blocks.1.mlp.fc2.weight: mean abs diff=7.804615e-17, max abs diff=3.108624e-15,
1.1.downsample.1.bias: mean abs diff=2.175777e-16, max abs diff=1.776357e-15,
1.1.downsample.1.weight: mean abs diff=2.291909e-15, max abs diff=1.030287e-13,
1.2.blocks.0.conv_dw.bias: mean abs diff=4.158349e-16, max abs diff=5.329071e-15,
1.2.blocks.0.conv_dw.weight: mean abs diff=3.973715e-17, max abs diff=1.332268e-15,
1.2.blocks.0.gamma: mean abs diff=1.674242e-01, max abs diff=1.467530e+00,
1.2.blocks.0.mlp.fc1.bias: mean abs diff=4.992518e-18, max abs diff=1.110223e-16,
1.2.blocks.0.mlp.fc1.weight: mean abs diff=1.743732e-17, max abs diff=2.220446e-15,
1.2.blocks.0.mlp.fc2.bias: mean abs diff=9.285650e-17, max abs diff=8.881784e-16,
1.2.blocks.0.mlp.fc2.weight: mean abs diff=2.115008e-17, max abs diff=1.776357e-15,
1.2.blocks.1.conv_dw.bias: mean abs diff=3.696262e-16, max abs diff=1.776357e-15,
1.2.blocks.1.conv_dw.weight: mean abs diff=4.248679e-17, max abs diff=1.776357e-15,
1.2.blocks.1.gamma: mean abs diff=9.885591e-02, max abs diff=9.602708e-01,
1.2.blocks.1.mlp.fc1.bias: mean abs diff=4.519307e-18, max abs diff=1.110223e-16,
1.2.blocks.1.mlp.fc1.weight: mean abs diff=1.894024e-17, max abs diff=3.996803e-15,
1.2.blocks.1.mlp.fc2.bias: mean abs diff=8.100074e-17, max abs diff=8.881784e-16,
1.2.blocks.1.mlp.fc2.weight: mean abs diff=1.888286e-17, max abs diff=3.108624e-15,
1.2.blocks.2.conv_dw.bias: mean abs diff=3.841870e-16, max abs diff=1.776357e-15,
1.2.blocks.2.conv_dw.weight: mean abs diff=6.055337e-17, max abs diff=5.329071e-15,
1.2.blocks.2.gamma: mean abs diff=5.728735e-02, max abs diff=4.270648e-01,
1.2.blocks.2.mlp.fc1.bias: mean abs diff=4.173268e-18, max abs diff=1.110223e-16,
1.2.blocks.2.mlp.fc1.weight: mean abs diff=1.599104e-17, max abs diff=3.996803e-15,
1.2.blocks.2.mlp.fc2.bias: mean abs diff=9.401117e-17, max abs diff=1.332268e-15,
1.2.blocks.2.mlp.fc2.weight: mean abs diff=1.953590e-17, max abs diff=1.110223e-15,
1.2.blocks.3.conv_dw.bias: mean abs diff=2.494424e-16, max abs diff=1.776357e-15,
1.2.blocks.3.conv_dw.weight: mean abs diff=8.190154e-17, max abs diff=2.131628e-14,
1.2.blocks.3.gamma: mean abs diff=5.176520e-02, max abs diff=2.586028e-01,
1.2.blocks.3.mlp.fc1.bias: mean abs diff=3.705144e-18, max abs diff=1.110223e-16,
1.2.blocks.3.mlp.fc1.weight: mean abs diff=1.154254e-17, max abs diff=4.440892e-15,
1.2.blocks.3.mlp.fc2.bias: mean abs diff=9.627038e-17, max abs diff=8.881784e-16,
1.2.blocks.3.mlp.fc2.weight: mean abs diff=1.523018e-17, max abs diff=1.776357e-15,
1.2.blocks.4.conv_dw.bias: mean abs diff=8.977601e-17, max abs diff=4.440892e-16,
1.2.blocks.4.conv_dw.weight: mean abs diff=2.181298e-17, max abs diff=6.661338e-16,
1.2.blocks.4.gamma: mean abs diff=4.759725e-02, max abs diff=4.262359e-01,
1.2.blocks.4.mlp.fc1.bias: mean abs diff=2.401824e-18, max abs diff=5.551115e-17,
1.2.blocks.4.mlp.fc1.weight: mean abs diff=1.352729e-17, max abs diff=2.220446e-15,
1.2.blocks.4.mlp.fc2.bias: mean abs diff=6.478311e-17, max abs diff=6.661338e-16,
1.2.blocks.4.mlp.fc2.weight: mean abs diff=1.075652e-17, max abs diff=1.998401e-15,
1.2.blocks.5.conv_dw.bias: mean abs diff=5.110116e-17, max abs diff=4.440892e-16,
1.2.blocks.5.conv_dw.weight: mean abs diff=1.832375e-17, max abs diff=6.661338e-16,
1.2.blocks.5.gamma: mean abs diff=3.021551e-02, max abs diff=1.612601e-01,
1.2.blocks.5.mlp.fc1.bias: mean abs diff=2.080357e-18, max abs diff=1.110223e-16,
1.2.blocks.5.mlp.fc1.weight: mean abs diff=1.110075e-17, max abs diff=3.774758e-15,
1.2.blocks.5.mlp.fc2.bias: mean abs diff=6.109615e-17, max abs diff=5.551115e-16,
1.2.blocks.5.mlp.fc2.weight: mean abs diff=9.006916e-18, max abs diff=1.332268e-15,
1.2.downsample.1.bias: mean abs diff=3.225854e-16, max abs diff=2.664535e-15,
1.2.downsample.1.weight: mean abs diff=5.522733e-16, max abs diff=3.952394e-14,
1.3.blocks.0.conv_dw.bias: mean abs diff=2.033530e-16, max abs diff=1.776357e-15,
1.3.blocks.0.conv_dw.weight: mean abs diff=9.865559e-18, max abs diff=3.330669e-16,
1.3.blocks.0.gamma: mean abs diff=5.768254e-02, max abs diff=1.245738e+00,
1.3.blocks.0.mlp.fc1.bias: mean abs diff=4.043902e-18, max abs diff=1.110223e-16,
1.3.blocks.0.mlp.fc1.weight: mean abs diff=7.278092e-17, max abs diff=3.019807e-14,
1.3.blocks.0.mlp.fc2.bias: mean abs diff=5.866923e-17, max abs diff=8.881784e-16,
1.3.blocks.0.mlp.fc2.weight: mean abs diff=1.548034e-16, max abs diff=2.797762e-14,
1.3.blocks.1.conv_dw.bias: mean abs diff=7.448469e-17, max abs diff=8.881784e-16,
1.3.blocks.1.conv_dw.weight: mean abs diff=7.977038e-18, max abs diff=6.661338e-16,
1.3.blocks.1.gamma: mean abs diff=3.968934e-02, max abs diff=1.628441e-01,
1.3.blocks.1.mlp.fc1.bias: mean abs diff=3.709433e-18, max abs diff=1.110223e-16,
1.3.blocks.1.mlp.fc1.weight: mean abs diff=8.546961e-17, max abs diff=2.753353e-14,
1.3.blocks.1.mlp.fc2.bias: mean abs diff=4.922278e-17, max abs diff=3.330669e-16,
1.3.blocks.1.mlp.fc2.weight: mean abs diff=2.174300e-16, max abs diff=2.331468e-14,
1.3.downsample.1.bias: mean abs diff=8.187827e-17, max abs diff=8.881784e-16,
1.3.downsample.1.weight: mean abs diff=8.041167e-17, max abs diff=7.549517e-15,
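Gamma parameter gradients deviate strongly (mean abs diff ~3e-2 to ~6e-1, max abs diff up to 2.66), whereas all other parameter gradients, including the conv kernels, agree to within ~1.4e-13. Perhaps add a streamable LayerScale module?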
0.0.bias: mean abs diff=7.424616e-17, max abs diff=4.440892e-16,
0.0.weight: mean abs diff=1.307843e-14, max abs diff=1.403322e-13,
1.0.blocks.0.conv_dw.bias: mean abs diff=6.397660e-16, max abs diff=7.105427e-15,
1.0.blocks.0.conv_dw.weight: mean abs diff=7.089380e-16, max abs diff=3.641532e-14,
1.0.blocks.0.gamma: mean abs diff=5.871523e-01, max abs diff=2.660054e+00,
1.0.blocks.0.mlp.fc1.bias: mean abs diff=1.224335e-17, max abs diff=2.220446e-16,
1.0.blocks.0.mlp.fc1.weight: mean abs diff=5.823970e-17, max abs diff=1.776357e-15,
1.0.blocks.0.mlp.fc2.bias: mean abs diff=8.630249e-17, max abs diff=4.440892e-16,
1.0.blocks.0.mlp.fc2.weight: mean abs diff=1.474591e-16, max abs diff=8.881784e-15,
1.0.blocks.1.conv_dw.bias: mean abs diff=1.082467e-16, max abs diff=8.881784e-16,
1.0.blocks.1.conv_dw.weight: mean abs diff=7.973810e-16, max abs diff=1.243450e-14,
1.0.blocks.1.gamma: mean abs diff=2.162112e-01, max abs diff=1.134846e+00,
1.0.blocks.1.mlp.fc1.bias: mean abs diff=9.008534e-18, max abs diff=1.110223e-16,
1.0.blocks.1.mlp.fc1.weight: mean abs diff=1.266210e-16, max abs diff=7.105427e-15,
1.0.blocks.1.mlp.fc2.bias: mean abs diff=8.951173e-17, max abs diff=4.440892e-16,
1.0.blocks.1.mlp.fc2.weight: mean abs diff=7.805527e-17, max abs diff=4.440892e-15,
1.1.blocks.0.conv_dw.bias: mean abs diff=7.285839e-16, max abs diff=5.329071e-15,
1.1.blocks.0.conv_dw.weight: mean abs diff=1.218858e-16, max abs diff=3.552714e-15,
1.1.blocks.0.gamma: mean abs diff=2.083396e-01, max abs diff=1.931796e+00,
1.1.blocks.0.mlp.fc1.bias: mean abs diff=9.041187e-18, max abs diff=1.110223e-16,
1.1.blocks.0.mlp.fc1.weight: mean abs diff=3.701872e-17, max abs diff=3.552714e-15,
1.1.blocks.0.mlp.fc2.bias: mean abs diff=1.157061e-16, max abs diff=8.881784e-16,
1.1.blocks.0.mlp.fc2.weight: mean abs diff=6.458364e-17, max abs diff=3.552714e-15,
1.1.blocks.1.conv_dw.bias: mean abs diff=5.091413e-16, max abs diff=3.552714e-15,
1.1.blocks.1.conv_dw.weight: mean abs diff=2.511419e-16, max abs diff=9.769963e-15,
1.1.blocks.1.gamma: mean abs diff=1.005777e-01, max abs diff=9.112262e-01,
1.1.blocks.1.mlp.fc1.bias: mean abs diff=8.898251e-18, max abs diff=2.220446e-16,
1.1.blocks.1.mlp.fc1.weight: mean abs diff=5.329241e-17, max abs diff=2.842171e-14,
1.1.blocks.1.mlp.fc2.bias: mean abs diff=1.420739e-16, max abs diff=8.881784e-16,
1.1.blocks.1.mlp.fc2.weight: mean abs diff=7.804615e-17, max abs diff=3.108624e-15,
1.1.downsample.1.bias: mean abs diff=2.175777e-16, max abs diff=1.776357e-15,
1.1.downsample.1.weight: mean abs diff=2.291909e-15, max abs diff=1.030287e-13,
1.2.blocks.0.conv_dw.bias: mean abs diff=4.158349e-16, max abs diff=5.329071e-15,
1.2.blocks.0.conv_dw.weight: mean abs diff=3.973715e-17, max abs diff=1.332268e-15,
1.2.blocks.0.gamma: mean abs diff=1.674242e-01, max abs diff=1.467530e+00,
1.2.blocks.0.mlp.fc1.bias: mean abs diff=4.992518e-18, max abs diff=1.110223e-16,
1.2.blocks.0.mlp.fc1.weight: mean abs diff=1.743732e-17, max abs diff=2.220446e-15,
1.2.blocks.0.mlp.fc2.bias: mean abs diff=9.285650e-17, max abs diff=8.881784e-16,
1.2.blocks.0.mlp.fc2.weight: mean abs diff=2.115008e-17, max abs diff=1.776357e-15,
1.2.blocks.1.conv_dw.bias: mean abs diff=3.696262e-16, max abs diff=1.776357e-15,
1.2.blocks.1.conv_dw.weight: mean abs diff=4.248679e-17, max abs diff=1.776357e-15,
1.2.blocks.1.gamma: mean abs diff=9.885591e-02, max abs diff=9.602708e-01,
1.2.blocks.1.mlp.fc1.bias: mean abs diff=4.519307e-18, max abs diff=1.110223e-16,
1.2.blocks.1.mlp.fc1.weight: mean abs diff=1.894024e-17, max abs diff=3.996803e-15,
1.2.blocks.1.mlp.fc2.bias: mean abs diff=8.100074e-17, max abs diff=8.881784e-16,
1.2.blocks.1.mlp.fc2.weight: mean abs diff=1.888286e-17, max abs diff=3.108624e-15,
1.2.blocks.2.conv_dw.bias: mean abs diff=3.841870e-16, max abs diff=1.776357e-15,
1.2.blocks.2.conv_dw.weight: mean abs diff=6.055337e-17, max abs diff=5.329071e-15,
1.2.blocks.2.gamma: mean abs diff=5.728735e-02, max abs diff=4.270648e-01,
1.2.blocks.2.mlp.fc1.bias: mean abs diff=4.173268e-18, max abs diff=1.110223e-16,
1.2.blocks.2.mlp.fc1.weight: mean abs diff=1.599104e-17, max abs diff=3.996803e-15,
1.2.blocks.2.mlp.fc2.bias: mean abs diff=9.401117e-17, max abs diff=1.332268e-15,
1.2.blocks.2.mlp.fc2.weight: mean abs diff=1.953590e-17, max abs diff=1.110223e-15,
1.2.blocks.3.conv_dw.bias: mean abs diff=2.494424e-16, max abs diff=1.776357e-15,
1.2.blocks.3.conv_dw.weight: mean abs diff=8.190154e-17, max abs diff=2.131628e-14,
1.2.blocks.3.gamma: mean abs diff=5.176520e-02, max abs diff=2.586028e-01,
1.2.blocks.3.mlp.fc1.bias: mean abs diff=3.705144e-18, max abs diff=1.110223e-16,
1.2.blocks.3.mlp.fc1.weight: mean abs diff=1.154254e-17, max abs diff=4.440892e-15,
1.2.blocks.3.mlp.fc2.bias: mean abs diff=9.627038e-17, max abs diff=8.881784e-16,
1.2.blocks.3.mlp.fc2.weight: mean abs diff=1.523018e-17, max abs diff=1.776357e-15,
1.2.blocks.4.conv_dw.bias: mean abs diff=8.977601e-17, max abs diff=4.440892e-16,
1.2.blocks.4.conv_dw.weight: mean abs diff=2.181298e-17, max abs diff=6.661338e-16,
1.2.blocks.4.gamma: mean abs diff=4.759725e-02, max abs diff=4.262359e-01,
1.2.blocks.4.mlp.fc1.bias: mean abs diff=2.401824e-18, max abs diff=5.551115e-17,
1.2.blocks.4.mlp.fc1.weight: mean abs diff=1.352729e-17, max abs diff=2.220446e-15,
1.2.blocks.4.mlp.fc2.bias: mean abs diff=6.478311e-17, max abs diff=6.661338e-16,
1.2.blocks.4.mlp.fc2.weight: mean abs diff=1.075652e-17, max abs diff=1.998401e-15,
1.2.blocks.5.conv_dw.bias: mean abs diff=5.110116e-17, max abs diff=4.440892e-16,
1.2.blocks.5.conv_dw.weight: mean abs diff=1.832375e-17, max abs diff=6.661338e-16,
1.2.blocks.5.gamma: mean abs diff=3.021551e-02, max abs diff=1.612601e-01,
1.2.blocks.5.mlp.fc1.bias: mean abs diff=2.080357e-18, max abs diff=1.110223e-16,
1.2.blocks.5.mlp.fc1.weight: mean abs diff=1.110075e-17, max abs diff=3.774758e-15,
1.2.blocks.5.mlp.fc2.bias: mean abs diff=6.109615e-17, max abs diff=5.551115e-16,
1.2.blocks.5.mlp.fc2.weight: mean abs diff=9.006916e-18, max abs diff=1.332268e-15,
1.2.downsample.1.bias: mean abs diff=3.225854e-16, max abs diff=2.664535e-15,
1.2.downsample.1.weight: mean abs diff=5.522733e-16, max abs diff=3.952394e-14,
1.3.blocks.0.conv_dw.bias: mean abs diff=2.033530e-16, max abs diff=1.776357e-15,
1.3.blocks.0.conv_dw.weight: mean abs diff=9.865559e-18, max abs diff=3.330669e-16,
1.3.blocks.0.gamma: mean abs diff=5.768254e-02, max abs diff=1.245738e+00,
1.3.blocks.0.mlp.fc1.bias: mean abs diff=4.043902e-18, max abs diff=1.110223e-16,
1.3.blocks.0.mlp.fc1.weight: mean abs diff=7.278092e-17, max abs diff=3.019807e-14,
1.3.blocks.0.mlp.fc2.bias: mean abs diff=5.866923e-17, max abs diff=8.881784e-16,
1.3.blocks.0.mlp.fc2.weight: mean abs diff=1.548034e-16, max abs diff=2.797762e-14,
1.3.blocks.1.conv_dw.bias: mean abs diff=7.448469e-17, max abs diff=8.881784e-16,
1.3.blocks.1.conv_dw.weight: mean abs diff=7.977038e-18, max abs diff=6.661338e-16,
1.3.blocks.1.gamma: mean abs diff=3.968934e-02, max abs diff=1.628441e-01,
1.3.blocks.1.mlp.fc1.bias: mean abs diff=3.709433e-18, max abs diff=1.110223e-16,
1.3.blocks.1.mlp.fc1.weight: mean abs diff=8.546961e-17, max abs diff=2.753353e-14,
1.3.blocks.1.mlp.fc2.bias: mean abs diff=4.922278e-17, max abs diff=3.330669e-16,
1.3.blocks.1.mlp.fc2.weight: mean abs diff=2.174300e-16, max abs diff=2.331468e-14,
1.3.downsample.1.bias: mean abs diff=8.187827e-17, max abs diff=8.881784e-16,
1.3.downsample.1.weight: mean abs diff=8.041167e-17, max abs diff=7.549517e-15,