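One more iteration: tune the parallel Apply thresholds (minChunkSize 100 -> 1_000, new minChunkCount = 4, parallelize only when coreCount >= minChunkCount) and require x.Length > Vector<T>.Count (previously >=) for the vectorized path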

aalmada · aalmada · commit d4c13b0e96b6 · 2024-04-26T22:03:49.000+01:00
diff --git a/src/NetFabric.Numerics.Tensors/ApplyBinary.cs b/src/NetFabric.Numerics.Tensors/ApplyBinary.cs
@@ -1,3 +1,5 @@
+using System;
+
 namespace NetFabric.Numerics.Tensors;
 
 public static partial class Tensor
@@ -35,28 +37,32 @@ public static void Apply<T1, T2, TResult, TOperator>(ReadOnlyMemory<T1> x, ReadO
         if (x.Length > destination.Length)
             Throw.ArgumentException(nameof(destination), "Destination span is too small.");
 
-        if(x.Length > 2 * minChunkSize)
-            ParallelApply(x, y, destination);
+        var coreCount = AvailableCores();
+
+        if (coreCount >= minChunkCount && x.Length > minChunkCount * minChunkSize)
+            ParallelApply(x, y, destination, coreCount);
         else
             Apply<T1, T2, TResult, TOperator>(x.Span, y.Span, destination.Span);
 
-        static void ParallelApply(ReadOnlyMemory<T1> x, ReadOnlyMemory<T2> y, Memory<TResult> destination)
+        static void ParallelApply(ReadOnlyMemory<T1> x, ReadOnlyMemory<T2> y, Memory<TResult> destination, int coreCount)
         {
-            var size = x.Length;
-            var chunkSize = int.Max(size / AvailableCores(), minChunkSize);
+            var totalSize = x.Length;
+            var chunkSize = int.Max(totalSize / coreCount, minChunkSize);
 
-            var actions = new Action[size / chunkSize];
+            var actions = new Action[totalSize / chunkSize];
+            var start = 0;
             for (var index = 0; index < actions.Length; index++)
             {
-                var start = index * chunkSize;
-                var length = (index == actions.Length - 1) 
-                    ? size - start
+                var length = (index == actions.Length - 1)
+                    ? totalSize - start
                     : chunkSize;
 
                 var xSlice = x.Slice(start, length);
                 var ySlice = y.Slice(start, length);
                 var destinationSlice = destination.Slice(start, length);
                 actions[index] = () => Apply<T1, T2, TResult, TOperator>(xSlice.Span, ySlice.Span, destinationSlice.Span);
+
+                start += length;
             }
             Parallel.Invoke(actions);
         }
@@ -111,13 +117,13 @@ public static void Apply<T1, T2, TResult, TOperator>(ReadOnlySpan<T1> x, ReadOnl
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T1>.IsSupported &&
             Vector<T2>.IsSupported &&
             Vector<TResult>.IsSupported &&
-            x.Length >= Vector<T1>.Count)
+            x.Length > Vector<T1>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
@@ -220,13 +226,13 @@ public static void Apply<T1, T2, TResult, TOperator>(ReadOnlySpan<T1> x, T2 y, S
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T1>.IsSupported &&
             Vector<T2>.IsSupported &&
             Vector<TResult>.IsSupported &&
-            x.Length >= Vector<T1>.Count)
+            x.Length > Vector<T1>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
@@ -328,15 +334,15 @@ public static void Apply<T1, T2, TResult, TOperator>(ReadOnlySpan<T1> x, (T2, T2
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T1>.IsSupported &&
             Vector<T2>.IsSupported &&
             Vector<TResult>.IsSupported &&
             Vector<T1>.Count > 2 &&
             Vector<T1>.Count % 2 is 0 &&
-            x.Length >= Vector<T1>.Count)
+            x.Length > Vector<T1>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
diff --git a/src/NetFabric.Numerics.Tensors/ApplyTernary.cs b/src/NetFabric.Numerics.Tensors/ApplyTernary.cs
@@ -57,14 +57,14 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, Rea
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T1>.IsSupported &&
             Vector<T2>.IsSupported &&
             Vector<T3>.IsSupported &&
             Vector<TResult>.IsSupported &&
-            x.Length >= Vector<T1>.Count)
+            x.Length > Vector<T1>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
@@ -178,14 +178,14 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, T2
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T1>.IsSupported &&
             Vector<T2>.IsSupported &&
             Vector<T3>.IsSupported &&
             Vector<TResult>.IsSupported &&
-            x.Length >= Vector<T1>.Count)
+            x.Length > Vector<T1>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
@@ -299,7 +299,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, (T2
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T1>.IsSupported &&
@@ -308,7 +308,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, (T2
             Vector<TResult>.IsSupported &&
             Vector<T1>.Count > 2 &&
             Vector<T1>.Count % 2 is 0 &&
-            x.Length >= Vector<T1>.Count)
+            x.Length > Vector<T1>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
@@ -474,14 +474,14 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, Rea
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T1>.IsSupported &&
             Vector<T2>.IsSupported &&
             Vector<T3>.IsSupported &&
             Vector<TResult>.IsSupported &&
-            x.Length >= Vector<T1>.Count)
+            x.Length > Vector<T1>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
@@ -595,7 +595,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, Rea
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T1>.IsSupported &&
@@ -604,7 +604,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, Rea
             Vector<TResult>.IsSupported &&
             Vector<T1>.Count > 2 &&
             Vector<T1>.Count % 2 is 0 &&
-            x.Length >= Vector<T1>.Count)
+            x.Length > Vector<T1>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
@@ -776,14 +776,14 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, T2
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T1>.IsSupported &&
             Vector<T2>.IsSupported &&
             Vector<T3>.IsSupported &&
             Vector<TResult>.IsSupported &&
-            x.Length >= Vector<T1>.Count)
+            x.Length > Vector<T1>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
@@ -890,7 +890,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, (T2
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T1>.IsSupported &&
@@ -899,7 +899,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, (T2
             Vector<TResult>.IsSupported &&
             Vector<T1>.Count > 2 &&
             Vector<T1>.Count % 2 is 0 &&
-            x.Length >= Vector<T1>.Count)
+            x.Length > Vector<T1>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
diff --git a/src/NetFabric.Numerics.Tensors/ApplyUnary.cs b/src/NetFabric.Numerics.Tensors/ApplyUnary.cs
@@ -31,27 +31,31 @@ public static void Apply<T, TResult, TOperator>(ReadOnlyMemory<T> x, Memory<TRes
         if (x.Length > destination.Length)
             Throw.ArgumentException(nameof(destination), "Destination span is too small.");
 
-        if(x.Length > 2 * minChunkSize)
-            ParallelApply(x, destination);
+        var coreCount = AvailableCores();
+
+        if (coreCount >= minChunkCount && x.Length > minChunkCount * minChunkSize)
+            ParallelApply(x, destination, coreCount);
         else
             Apply<T, TResult, TOperator>(x.Span, destination.Span);
 
-        static void ParallelApply(ReadOnlyMemory<T> source, Memory<TResult> destination)
+        static void ParallelApply(ReadOnlyMemory<T> x, Memory<TResult> destination, int coreCount)
         {
-            var size = source.Length;
-            var chunkSize = int.Max(size / AvailableCores(), minChunkSize);
+            var totalSize = x.Length;
+            var chunkSize = int.Max(totalSize / coreCount, minChunkSize);
 
-            var actions = new Action[size / chunkSize];
+            var actions = new Action[totalSize / chunkSize];
+            var start = 0;
             for (var index = 0; index < actions.Length; index++)
             {
-                var start = index * chunkSize;
-                var length = (index == actions.Length - 1) 
-                    ? size - start
+                var length = (index == actions.Length - 1)
+                    ? totalSize - start
                     : chunkSize;
 
-                var sourceSlice = source.Slice(start, length);
+                var xSlice = x.Slice(start, length);
                 var destinationSlice = destination.Slice(start, length);
-                actions[index] = () => Apply<T, TResult, TOperator>(sourceSlice.Span, destinationSlice.Span);
+                actions[index] = () => Apply<T, TResult, TOperator>(xSlice.Span, destinationSlice.Span);
+
+                start += length;
             }
             Parallel.Invoke(actions);
         }
@@ -98,12 +102,12 @@ public static void Apply<T, TResult, TOperator>(ReadOnlySpan<T> x, Span<TResult>
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T>.IsSupported &&
             Vector<TResult>.IsSupported &&
-            x.Length >= Vector<T>.Count)
+            x.Length > Vector<T>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var sourceVectors = MemoryMarshal.Cast<T, Vector<T>>(x);
@@ -215,14 +219,14 @@ public static void Apply2<T, TResult1, TResult2, TOperator1, TOperator2>(ReadOnl
         var indexSource = 0;
 
         // Check if hardware acceleration and Vector<T> support are available,
-        // and if the length of the x is greater than the Vector<T>.Count.
+        // and if the length of the x is greater than the length of Vector<T>.
         if (TOperator1.IsVectorizable &&
             TOperator2.IsVectorizable &&
             Vector.IsHardwareAccelerated &&
             Vector<T>.IsSupported &&
             Vector<TResult1>.IsSupported &&
             Vector<TResult2>.IsSupported &&
-            x.Length >= Vector<T>.Count)
+            x.Length > Vector<T>.Count)
         {
             // Cast the spans to vectors for hardware acceleration.
             var sourceVectors = MemoryMarshal.Cast<T, Vector<T>>(x);
diff --git a/src/NetFabric.Numerics.Tensors/Tensor.cs b/src/NetFabric.Numerics.Tensors/Tensor.cs
@@ -10,7 +10,8 @@ namespace NetFabric.Numerics.Tensors;
 /// </remarks>
 public static partial class Tensor
 {
-    const int minChunkSize = 100;
+    const int minChunkSize = 1_000;
+    const int minChunkCount = 4;
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     static int AvailableCores() 

Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,8 @@ namespace NetFabric.Numerics.Tensors;`
`10`	`10`	`/// </remarks>`
`11`	`11`	`public static partial class Tensor`
`12`	`12`	`{`
`13`		`- const int minChunkSize = 100;`
	`13`	`+ const int minChunkSize = 1_000;`
	`14`	`+ const int minChunkCount = 4;`
`14`	`15`
`15`	`16`	`[MethodImpl(MethodImplOptions.AggressiveInlining)]`
`16`	`17`	`static int AvailableCores()`