Skip to content

Commit 5ca5331

Browse files
authored
Awaitable parallel loops (#86)
* Rename sync(Weave) to syncRoot(Weave) to make clear that it is not composable * Introduce fine-grained awaitable for-loop * fix comment in capture section * make parallel reduction compile standalone * Add yet-to-be-proper sync on awaitable loops * Sometimes the task you try to split is not the current task anymore * update changelog * update histogram and logsumexp to use the awaitable loops * Fighting your way through recursive imports, and static early symbol resolution * Allox sync on not iterated loop (i.e. iterations = 0) * Well, it seems like awaitable loops are not enough to describe the data dependencies of GEMM :sad_face: * fix lazyFLowvar symbol resolution * Fix LazyFlowVar with reduction and awaitable loops * mention that awaitable might still change [skip ci]
1 parent 5a5f2cb commit 5ca5331

21 files changed

+564
-173
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ exit(Weave)
135135
- `init(Weave)`, `exit(Weave)` to start and stop the runtime. Forgetting this will give you nil pointer exceptions on spawn.
136136
- `spawn fnCall(args)` which spawns a function that may run on another thread and gives you an awaitable Flowvar handle.
137137
- `sync(Flowvar)` will await a Flowvar and block until you receive a result.
138-
- `sync(Weave)` is a global barrier for the main thread on the main task. Allowing nestable barriers for any thread is work-in-progress.
138+
- `syncRoot(Weave)` is a global barrier for the main thread on the main task.
139139
- `parallelFor`, `parallelForStrided`, `parallelForStaged`, `parallelForStagedStrided` are described above and in the experimental section.
140140
- `loadBalance(Weave)` gives the runtime the opportunity to distribute work. Insert this within long computation as due to Weave design, it's busy workers that are also in charge of load balancing. This is done automatically when using `parallelFor`.
141141
- `isSpawned` allows you to build speculative algorithm where a thread is spawned only if certain conditions are valid. See the `nqueens` benchmark for an example.

benchmarks/bouncing_producer_consumer/weave_bpc.nim

+1-1
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ proc main() =
124124
let start = wtime_msec()
125125

126126
bpc_produce(NumTasksPerDepth, Depth)
127-
sync(Weave)
127+
syncRoot(Weave)
128128

129129
let stop = wtime_msec()
130130

benchmarks/histogram_2D/weave_histogram.nim

+2-1
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ proc generateHistogramWeaveStaged[T](matrix: Matrix[T], hist: Histogram): T =
216216
# Parallel reduction
217217
parallelForStaged i in 1 ..< matrix.ld-1:
218218
captures: {maxAddr, lockAddr, hist, matrix, boxes}
219+
awaitable: histoLoop
219220
prologue:
220221
let threadHist = newHistogram(boxes)
221222
var threadMax = T(-Inf)
@@ -239,7 +240,7 @@ proc generateHistogramWeaveStaged[T](matrix: Matrix[T], hist: Histogram): T =
239240
lockAddr[].release()
240241
wv_free(threadHist.buffer)
241242

242-
sync(Weave)
243+
sync(histoLoop)
243244
lock.deinitLock()
244245
return max
245246

benchmarks/logsumexp/weave_logsumexp.nim

+4-2
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ proc maxWeaveStaged[T: SomeFloat](M: Matrix[T]) : T =
249249

250250
parallelForStaged i in 0 ..< M.nrows:
251251
captures:{maxAddr, lockAddr, M}
252+
awaitable: maxLoop
252253
prologue:
253254
var localMax = T(-Inf)
254255
loop:
@@ -260,7 +261,7 @@ proc maxWeaveStaged[T: SomeFloat](M: Matrix[T]) : T =
260261
maxAddr[] = max(maxAddr[], localMax)
261262
lockAddr[].release()
262263

263-
sync(Weave)
264+
sync(maxLoop)
264265
lock.deinitLock()
265266

266267
proc logsumexpWeaveStaged[T: SomeFloat](M: Matrix[T]): T =
@@ -279,6 +280,7 @@ proc logsumexpWeaveStaged[T: SomeFloat](M: Matrix[T]): T =
279280

280281
parallelForStaged i in 0 ..< M.nrows:
281282
captures:{lseAddr, lockAddr, alpha, M}
283+
awaitable: logSumExpLoop
282284
prologue:
283285
var localLSE = 0.T
284286
loop:
@@ -290,7 +292,7 @@ proc logsumexpWeaveStaged[T: SomeFloat](M: Matrix[T]): T =
290292
lseAddr[] += localLSE
291293
lockAddr[].release()
292294

293-
sync(Weave)
295+
sync(logSumExpLoop)
294296
result = alpha + ln(lse)
295297
lock.deinitLock()
296298

benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_packing_weave.nim

+4-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ proc pack_A_mc_kc*[T; ukernel: static MicroKernel](
5959
# Packing B
6060
#
6161
# ############################################################
62-
6362
proc pack_B_kc_nc*[T; ukernel: static MicroKernel](
6463
packedB: ptr UncheckedArray[T],
6564
kc, nc: int,
@@ -70,6 +69,8 @@ proc pack_B_kc_nc*[T; ukernel: static MicroKernel](
7069
## Concretely the outer dimension of packed matrices
7170
## is k so that C[i, j] = A[i, k] * B[k, j]
7271
## does not require strided access
72+
mixin packingLoop
73+
7374
let buffer{.restrict.} = assume_aligned packedB
7475
const NR = ukernel.extract_nr()
7576
let unroll_stop = nc.round_step_down(NR)
@@ -91,3 +92,5 @@ proc pack_B_kc_nc*[T; ukernel: static MicroKernel](
9192
offBuf[k*NR + j] = B[k, unroll_stop+j]
9293
for j in remainder ..< NR: # Pad with 0 if packing over the edge
9394
offBuf[k*NR + j] = 0.T
95+
96+
syncRoot(Weave)

benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim

+1-3
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,6 @@ proc gemm_impl[T; ukernel: static MicroKernel](
161161
# First time writing to C, we scale it, otherwise accumulate
162162
let beta = if pc == 0: beta else: 1.T
163163

164-
sync(Weave) # TODO: this cannot be nested
165164
# ####################################
166165
# 3. for ic = 0,...,m−1 in steps of mc
167166
parallelFor icb in 0 ..< tiles.ic_num_tasks:
@@ -180,7 +179,7 @@ proc gemm_impl[T; ukernel: static MicroKernel](
180179
alpha, packA, tiles.b, # αA[ic:ic+mc, pc:pc+kc] * B[pc:pc+kc, jc:jc+nc] +
181180
beta, vC.stride(ic, 0) # βC[ic:ic+mc, jc:jc+nc]
182181
)
183-
sync(Weave) # TODO: this cannot be nested
182+
syncRoot(Weave)
184183

185184
# ############################################################
186185
#
@@ -253,7 +252,6 @@ proc gemm_strided*[T: SomeNumber](
253252
if hasAvx512f(): dispatch(x86_AVX512)
254253
elif hasSse2(): dispatch(x86_SSE2)
255254
dispatch(x86_Generic)
256-
sync(Weave)
257255

258256
# ############################################################
259257
#

benchmarks/matrix_transposition/weave_transposes.nim

+4-4
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ template runBench(transposeName: typed, reorderCompute, isSequential: bool): unt
204204
for _ in 0 ..< nrounds:
205205
transposeName(M, N, bufIn, bufOut)
206206
if not isSequential:
207-
sync(Weave)
207+
syncRoot(Weave)
208208
when not defined(windows):
209209
let stop = wtime_msec()
210210
mxnTime = stop - start
@@ -215,7 +215,7 @@ template runBench(transposeName: typed, reorderCompute, isSequential: bool): unt
215215
for _ in 0 ..< nrounds:
216216
transposeName(N, M, bufIn, bufOut)
217217
if not isSequential:
218-
sync(Weave)
218+
syncRoot(Weave)
219219
when not defined(windows):
220220
let stop = wtime_msec()
221221
nxmTime = stop - start
@@ -238,7 +238,7 @@ template runBench(transposeName: typed, reorderCompute, isSequential: bool): unt
238238
for _ in 0 ..< nrounds:
239239
transposeName(N, M, bufIn, bufOut)
240240
if not isSequential:
241-
sync(Weave)
241+
syncRoot(Weave)
242242
when not defined(windows):
243243
let stop = wtime_msec()
244244
nxmTime = stop - start
@@ -249,7 +249,7 @@ template runBench(transposeName: typed, reorderCompute, isSequential: bool): unt
249249
for _ in 0 ..< nrounds:
250250
transposeName(M, N, bufIn, bufOut)
251251
if not isSequential:
252-
sync(Weave)
252+
syncRoot(Weave)
253253
when not defined(windows):
254254
let stop = wtime_msec()
255255
mxnTime = stop - start

benchmarks/single_task_producer/weave_spc.nim

+1-1
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ proc main() =
115115

116116
# spc_produce_seq(NumTasksTotal)
117117
spc_produce(NumTasksTotal)
118-
sync(Weave)
118+
syncRoot(Weave)
119119

120120
let stop = wtime_msec()
121121

changelog.md

+47
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,52 @@
11
# Changelog
22

3+
### v0.3.0 - unreleased
4+
5+
`sync(Weave)` has been renamed `syncRoot(Weave)` to highlight that it is only valid on the root task in the main thread. In particular, a procedure that uses syncRoot should not be called in a multithreaded section. This is a breaking change. In the future such changes will have a deprecation path but the library is only 2 weeks old at the moment.
6+
7+
`parallelFor`, `parallelForStrided`, `parallelForStaged`, `parallelForStagedStrided`
8+
now support an "awaitable" statement to allow fine-grain sync.
9+
10+
Fine-grained data-dependencies are under research (for example, launch a task when the first 50 iterations are done out of a 100-iteration loop), so "awaitable" may change
11+
to have an unified syntax for delayed tasks depending on a task, a whole loop or a subset of it.
12+
If possible, it is recommended to use "awaitable" instead of `syncRoot()` to allow composable parallelism; `syncRoot()` can only be called in a serial section of the code.
13+
14+
Weave can now be compiled with Microsoft Visual Studio in C++ mode.
15+
16+
"LastVictim" and "LastThief" WV_Target policies have been added.
17+
The default is still "Random", pass "-d:WV_Target=LastVictim" to explore performance on your workload
18+
19+
"StealEarly" has been implemented, the default is not to steal early,
20+
pass "-d:WV_StealEarly=2" for example to allow workers to initiate a steal request
21+
when 2 tasks or less are left in their queue.
22+
23+
#### Performance
24+
25+
Weave has been thoroughly tested and tuned on state-of-the-art matrix multiplication implementation
26+
against competing pure Assembly, hand-tuned BLAS implementations to reach High-performance Computing scalability standards.
27+
28+
3 cases can trigger loop splitting in Weave:
29+
- loadBalance(Weave),
30+
- sharing work to idle child threads
31+
- incoming thieves
32+
The first 2 were not working properly and resulted in pathological performance cases.
33+
This has been fixed.
34+
35+
Fixed strided loop iteration rounding
36+
Fixed compilation with metrics
37+
38+
Executing a loop now counts as a single task for the adaptive steal policy.
39+
This prevents short loops from hindering steal-half strategy as it depends
40+
on the number of tasks executed per steal requests interval.
41+
42+
#### Internals
43+
- Weave uses explicit finite state machines in several places.
44+
- The memory pool now has the same interface as malloc/free; in the past
45+
freeing a block required passing a threadID as this avoided an expensive getThreadID syscall.
46+
The new solution uses assembly code to get the address of the current thread's thread-local storage
47+
as an unique threadID.
48+
- Weave memory subsystem now supports LLVM AddressSanitizer to detect memory bugs.
49+
Spurious (?) errors from Nim and Weave were not removed and are left as a future task.
350

451
### v0.2.0 - December 2019
552

weave.nim

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66
# at your option. This file may not be copied, modified, or distributed except according to those terms.
77

88
import
9-
weave/[parallel_tasks, parallel_for, parallel_for_staged, runtime, runtime_fsm],
9+
weave/[parallel_tasks, parallel_for, parallel_for_staged, runtime, runtime_fsm, await_fsm],
1010
weave/datatypes/flowvars
1111

1212
export
1313
Flowvar, Weave,
14-
spawn, sync,
14+
spawn, sync, syncRoot,
1515
parallelFor, parallelForStrided, parallelForStaged, parallelForStagedStrided,
1616
init, exit,
1717
loadBalance,

weave.nimble

+2-2
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,12 @@ task test, "Run Weave tests":
4242
test "", "weave/parallel_tasks.nim"
4343
test "", "weave/parallel_for.nim"
4444
test "", "weave/parallel_for_staged.nim"
45-
# test "", "weave/parallel_reduce.nim"
45+
test "", "weave/parallel_reduce.nim"
4646

4747
test "-d:WV_LazyFlowvar", "weave/parallel_tasks.nim"
4848
test "-d:WV_LazyFlowvar", "weave/parallel_for.nim"
4949
test "-d:WV_LazyFlowvar", "weave/parallel_for_staged.nim"
50-
# test "-d:WV_LazyFlowvar", "weave/parallel_reduce.nim" # Experimental
50+
test "-d:WV_LazyFlowvar", "weave/parallel_reduce.nim"
5151

5252
test "", "benchmarks/dfs/weave_dfs.nim"
5353
test "", "benchmarks/fibonacci/weave_fib.nim"

weave/await_fsm.nim

+45-12
Original file line numberDiff line numberDiff line change
@@ -50,19 +50,8 @@ setTerminalState(awaitFSA, AW_Exit)
5050

5151
# -------------------------------------------
5252

53-
EagerFV:
54-
template isFutReady(): untyped =
55-
fv.chan[].tryRecv(parentResult)
56-
LazyFV:
57-
template isFutReady(): untyped =
58-
if fv.lfv.hasChannel:
59-
ascertain: not fv.lfv.lazy.chan.isNil
60-
fv.lfv.lazy.chan[].tryRecv(parentResult)
61-
else:
62-
fv.lfv.isReady
63-
6453
implEvent(awaitFSA, AWE_FutureReady):
65-
isFutReady()
54+
isFutReady(fv)
6655

6756
behavior(awaitFSA):
6857
# In AW_Steal we might recv tasks and steal requests which get stuck in our queues
@@ -184,3 +173,47 @@ behavior(awaitFSA):
184173

185174
synthesize(awaitFSA):
186175
proc forceFuture*[T](fv: Flowvar[T], parentResult: var T)
176+
177+
# -------------------------------------------
178+
179+
EagerFV:
180+
proc forceComplete*[T](fv: Flowvar[T], parentResult: var T) {.inline.} =
181+
## From the parent thread awaiting on the result, force its computation
182+
## by eagerly processing only the child tasks spawned by the awaited task
183+
fv.forceFuture(parentResult)
184+
recycleChannel(fv)
185+
186+
LazyFV:
187+
template forceComplete*[T](fv: Flowvar[T], parentResult: var T) =
188+
fv.forceFuture(parentResult)
189+
# Reclaim memory
190+
if not fv.lfv.hasChannel:
191+
ascertain: fv.lfv.isReady
192+
parentResult = cast[ptr T](fv.lfv.lazy.buf.addr)[]
193+
else:
194+
ascertain: not fv.lfv.lazy.chan.isNil
195+
recycleChannel(fv)
196+
197+
# Public
198+
# -------------------------------------------
199+
200+
type Dummy* = object
201+
## A dummy return type (Flowvar[Dummy])
202+
## for waitable for-loops
203+
# Do we add a dummy field to avoid a size of 0?
204+
205+
proc sync*[T](fv: FlowVar[T]): T {.inline.} =
206+
## Blocks the current thread until the flowvar is available
207+
## and returned.
208+
## The thread is not idle and will complete pending tasks.
209+
fv.forceComplete(result)
210+
211+
template sync*(fv: FlowVar[Dummy]) =
212+
## Blocks the current thread until the full loop task
213+
## associated with the dummy has finished
214+
## The thread is not idle and will complete pending tasks.
215+
# This must be a template to avoid recursive dependency
216+
# as forceFuture is in await_fsm and await_fsm depends
217+
# on this module.
218+
var dummy: Dummy
219+
forceComplete(fv, dummy)

weave/channels/channels_spsc_single.nim

+6-2
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ func tryRecv*[T](chan: var ChannelSPSCSingle, dst: var T): bool {.inline.} =
7070
## Returns true if successful (channel was not empty)
7171
##
7272
## ⚠ Use only in the consumer thread that reads from the channel.
73-
preCondition: sizeof(T) == chan.itemsize.int
73+
preCondition: (sizeof(T) == chan.itemsize.int) or
74+
# Support dummy object
75+
(sizeof(T) == 0 and chan.itemsize == 1)
7476

7577
let full = chan.full.load(moAcquire)
7678
if not full:
@@ -84,7 +86,9 @@ func trySend*[T](chan: var ChannelSPSCSingle, src: sink T): bool {.inline.} =
8486
## Returns true if successful (channel was empty)
8587
##
8688
## ⚠ Use only in the producer thread that writes from the channel.
87-
preCondition: sizeof(T) == chan.itemsize.int
89+
preCondition: (sizeof(T) == chan.itemsize.int) or
90+
# Support dummy object
91+
(sizeof(T) == 0 and chan.itemsize == 1)
8892

8993
let full = chan.full.load(moAcquire)
9094
if full:

weave/config.nim

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ template debug*(body: untyped): untyped =
105105
block: {.noSideEffect, gcsafe.}: body
106106

107107
template debugSplit*(body: untyped): untyped =
108-
when defined(WV_DebugSplit):
108+
when defined(WV_DebugSplit) or defined(WV_Debug):
109109
block: {.noSideEffect, gcsafe.}: body
110110

111111
template StealAdaptative*(body: untyped): untyped =

0 commit comments

Comments
 (0)