Skip to content

Commit af37d14

Browse files
authored
fix: adjust mutex handling around ddwaf_context (#52)
- Removes a leftover mutex on `Handle` that no longer served any purpose - Acquiring the mutex on `Context` prior to calling `ddwaf_context_destroy` to avoid concurrent calls to `ddwaf_run`, as these are not thread-safe.
1 parent 10615b9 commit af37d14

File tree

6 files changed

+136
-61
lines changed

6 files changed

+136
-61
lines changed

context.go

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,18 @@ import (
1616
// when calling it multiple times to run its rules every time new addresses
1717
// become available. Each request must have its own Context.
1818
type Context struct {
19-
// Instance of the WAF
20-
handle *Handle
21-
cContext wafContext
22-
// cgoRefs is used to retain go references to WafObjects until the context is destroyed.
23-
// As per libddwaf documentation, WAF Objects must be alive during all the context lifetime
24-
cgoRefs cgoRefPool
25-
// Mutex protecting the use of cContext which is not thread-safe and cgoRefs.
26-
mutex sync.Mutex
19+
handle *Handle // Instance of the WAF
20+
21+
cgoRefs cgoRefPool // Used to retain go data referenced by WAF Objects the context holds
22+
cContext wafContext // The C ddwaf_context pointer
2723

2824
// Stats
29-
// Cumulated internal WAF run time - in nanoseconds - for this context.
30-
totalRuntimeNs atomic.Uint64
31-
// Cumulated overall run time - in nanoseconds - for this context.
32-
totalOverallRuntimeNs atomic.Uint64
33-
// Cumulated timeout count for this context.
34-
timeoutCount atomic.Uint64
25+
totalRuntimeNs atomic.Uint64 // Cumulative internal WAF run time - in nanoseconds - for this context.
26+
totalOverallRuntimeNs atomic.Uint64 // Cumulative overall run time - in nanoseconds - for this context.
27+
timeoutCount atomic.Uint64 // Cumulative timeout count for this context.
28+
29+
// Mutex protecting the use of cContext which is not thread-safe and cgoRefs.
30+
mutex sync.Mutex
3531
}
3632

3733
// NewContext returns a new WAF context of to the given WAF handle.
@@ -41,13 +37,13 @@ type Context struct {
4137
// or the WAF context couldn't be created.
4238
func NewContext(handle *Handle) *Context {
4339
// Handle has been released
44-
if handle.addRefCounter(1) == 0 {
40+
if !handle.retain() {
4541
return nil
4642
}
4743

4844
cContext := wafLib.wafContextInit(handle.cHandle)
4945
if cContext == 0 {
50-
handle.addRefCounter(-1)
46+
handle.release() // We couldn't get a context, so we no longer have an implicit reference to the Handle in it...
5147
return nil
5248
}
5349

@@ -120,12 +116,9 @@ func (context *Context) Run(addressData RunAddressData, timeout time.Duration) (
120116
return
121117
}
122118

119+
// run executes the ddwaf_run call with the provided data on this context. The caller is responsible for locking the
120+
// context appropriately around this call.
123121
func (context *Context) run(persistentData, ephemeralData *wafObject, timeout time.Duration, cgoRefs *cgoRefPool) (Result, error) {
124-
// RLock the handle to safely get read access to the WAF handle and prevent concurrent changes of it
125-
// such as a rules-data update.
126-
context.handle.mutex.RLock()
127-
defer context.handle.mutex.RUnlock()
128-
129122
result := new(wafResult)
130123
defer wafLib.wafResultFree(result)
131124

@@ -173,13 +166,20 @@ func unwrapWafResult(ret wafReturnCode, result *wafResult) (res Result, err erro
173166
return res, err
174167
}
175168

176-
// Close calls handle.closeContext which calls ddwaf_context_destroy and maybe also close the handle if it in termination state.
169+
// Close the underlying `ddwaf_context` and releases the associated internal
170+
// data. Also decreases the reference count of the `ddwaf_hadnle` which created
171+
// this context, possibly releasing it completely (if this was the last context
172+
// created from this handle & it was released by its creator).
177173
func (context *Context) Close() {
178-
defer context.handle.closeContext(context)
179-
// Keep the Go pointer references until the end of the context
180-
keepAlive(context.cgoRefs)
181-
// The context is no longer used so we can try releasing the Go pointer references asap by nulling them
182-
context.cgoRefs = cgoRefPool{}
174+
context.mutex.Lock()
175+
defer context.mutex.Unlock()
176+
177+
wafLib.wafContextDestroy(context.cContext)
178+
keepAlive(context.cgoRefs) // Keep the Go pointer references until the end of the context
179+
defer context.handle.release() // Reduce the reference counter of the Handle.
180+
181+
context.cgoRefs = cgoRefPool{} // The data in context.cgoRefs is no longer needed, explicitly release
182+
context.cContext = 0 // Makes it easy to spot use-after-free/double-free issues
183183
}
184184

185185
// TotalRuntime returns the cumulated WAF runtime across various run calls within the same WAF context.

encoder.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ import (
2020
// reference now or in the future, are stored and referenced in the `cgoRefs` field. The user MUST leverage
2121
// `keepAlive()` with it according to its ddwaf use-case.
2222
type encoder struct {
23+
cgoRefs cgoRefPool
2324
containerMaxSize int
2425
stringMaxSize int
2526
objectMaxDepth int
26-
cgoRefs cgoRefPool
2727
}
2828

2929
type native interface {

handle.go

Lines changed: 42 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,15 @@ package waf
88
import (
99
"errors"
1010
"fmt"
11-
"sync"
1211

1312
"github.com/DataDog/go-libddwaf/v2/internal/noopfree"
1413
"go.uber.org/atomic"
1514
)
1615

1716
// Handle represents an instance of the WAF for a given ruleset.
1817
type Handle struct {
19-
// Instance of the WAF
20-
cHandle wafHandle
18+
// diagnostics holds information about rules initialization
19+
diagnostics Diagnostics
2120

2221
// Lock-less reference counter avoiding blocking calls to the Close() method
2322
// while WAF contexts are still using the WAF handle. Instead, we let the
@@ -31,12 +30,8 @@ type Handle struct {
3130
// block the request handlers for the time of the security rules update.
3231
refCounter *atomic.Int32
3332

34-
// RWMutex protecting the R/W accesses to the internal rules data (stored
35-
// in the handle).
36-
mutex sync.RWMutex
37-
38-
// diagnostics holds information about rules initialization
39-
diagnostics Diagnostics
33+
// Instance of the WAF
34+
cHandle wafHandle
4035
}
4136

4237
// NewHandle creates and returns a new instance of the WAF with the given security rules and configuration
@@ -115,7 +110,6 @@ func (handle *Handle) Addresses() []string {
115110
// Update the ruleset of a WAF instance into a new handle on its own
116111
// the previous handle still needs to be closed manually
117112
func (handle *Handle) Update(newRules any) (*Handle, error) {
118-
119113
encoder := newMaxEncoder()
120114
obj, err := encoder.Encode(newRules)
121115
if err != nil {
@@ -142,36 +136,56 @@ func (handle *Handle) Update(newRules any) (*Handle, error) {
142136
}, nil
143137
}
144138

145-
// closeContext calls ddwaf_context_destroy and eventually ddwaf_destroy on the handle
146-
func (handle *Handle) closeContext(context *Context) {
147-
wafLib.wafContextDestroy(context.cContext)
148-
if handle.addRefCounter(-1) == 0 {
149-
wafLib.wafDestroy(handle.cHandle)
150-
}
151-
}
152-
153139
// Close puts the handle in termination state, when all the contexts are closed the handle will be destroyed
154140
func (handle *Handle) Close() {
155-
if handle.addRefCounter(-1) > 0 {
156-
// There are still Contexts that are not closed
141+
if handle.addRefCounter(-1) != 0 {
142+
// Either the counter is still positive (this Handle is still referenced), or it had previously
143+
// reached 0 and some other call has done the cleanup already.
157144
return
158145
}
159146

160147
wafLib.wafDestroy(handle.cHandle)
148+
handle.diagnostics = Diagnostics{} // Data in diagnostics may no longer be valid (e.g: strings from libddwaf)
149+
handle.cHandle = 0 // Makes it easy to spot use-after-free/double-free issues
150+
}
151+
152+
// retain increments the reference counter of this Handle. Returns true if the
153+
// Handle is still valid, false if it is no longer usable. Calls to retain()
154+
// must be balanced with calls to release() in order to avoid leaking Handles.
155+
func (handle *Handle) retain() bool {
156+
return handle.addRefCounter(1) > 0
161157
}
162158

163-
// addRefCounter add x to Handle.refCounter.
164-
// It relies on a CAS spin-loop implementation in order to avoid changing the
165-
// counter when 0 has been reached.
159+
// release decrements the reference counter of this Handle, possibly causing it
160+
// to be completely closed if no other reference to it exist.
161+
func (handle *Handle) release() {
162+
handle.Close()
163+
}
164+
165+
// addRefCounter adds x to Handle.refCounter. The return valid indicates whether the refCounter reached 0 as part of
166+
// this call or not, which can be used to perform "only-once" activities:
167+
// - result > 0 => the Handle is still usable
168+
// - result == 0 => the handle is no longer usable, ref counter reached 0 as part of this call
169+
// - result == -1 => the handle is no longer usable, ref counter was already 0 previously
166170
func (handle *Handle) addRefCounter(x int32) int32 {
171+
// We use a CAS loop to avoid setting the refCounter to a negative value.
167172
for {
168173
current := handle.refCounter.Load()
169-
if current == 0 {
170-
// The object was released
171-
return 0
174+
if current <= 0 {
175+
// The object had already been released
176+
return -1
172177
}
173-
if swapped := handle.refCounter.CompareAndSwap(current, current+x); swapped {
174-
return current + x
178+
179+
next := current + x
180+
if swapped := handle.refCounter.CompareAndSwap(current, next); swapped {
181+
if next < 0 {
182+
// TODO(romain.marcadier): somehow signal unexpected behavior to the
183+
// caller (panic? error?). We currently clamp to 0 in order to avoid
184+
// causing a customer program crash, but this is the symptom of a bug
185+
// and should be investigated (however this clamping hides the issue).
186+
return 0
187+
}
188+
return next
175189
}
176190
}
177191
}

safe.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@ import (
1919
// error is unreliable and the caller must rather stop using the library.
2020
// Examples include safety checks errors.
2121
type PanicError struct {
22-
// The function symbol name that was given to `tryCall()`.
23-
in string
2422
// The recovered panic error while executing the function `in`.
2523
Err error
24+
// The function symbol name that was given to `tryCall()`.
25+
in string
2626
}
2727

2828
func newPanicError(in func() error, err error) *PanicError {

waf.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@ import (
1515

1616
// Diagnostics stores the information - provided by the WAF - about WAF rules initialization.
1717
type Diagnostics struct {
18-
Version string
1918
Rules *DiagnosticEntry
2019
CustomRules *DiagnosticEntry
2120
Exclusions *DiagnosticEntry
2221
RulesOverrides *DiagnosticEntry
2322
RulesData *DiagnosticEntry
2423
Processors *DiagnosticEntry
2524
Scanners *DiagnosticEntry
25+
Version string
2626
}
2727

2828
// TopLevelErrors returns the list of top-level errors reported by the WAF on any of the Diagnostics
@@ -56,10 +56,10 @@ func (d *Diagnostics) TopLevelError() error {
5656
// for a specific entry in the WAF ruleset
5757
type DiagnosticEntry struct {
5858
Addresses *DiagnosticAddresses
59+
Errors map[string][]string // Item-level errors (map of error message to entity identifiers or index:#)
60+
Error string // If the entire entry was in error (e.g: invalid format)
5961
Loaded []string // Successfully loaded entity identifiers (or index:#)
6062
Failed []string // Failed entity identifiers (or index:#)
61-
Error string // If the entire entry was in error (e.g: invalid format)
62-
Errors map[string][]string // Item-level errors (map of error message to entity identifiers or index:#)
6363
}
6464

6565
// DiagnosticAddresses stores the information - provided by the WAF - about the known addresses and

waf_test.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -817,6 +817,67 @@ func TestConcurrency(t *testing.T) {
817817
// The test mustn't crash and ref-counter must be 0
818818
require.Zero(t, waf.refCounter.Load())
819819
})
820+
821+
t.Run("concurrent-context-use-destroy", func(t *testing.T) {
822+
// This test validates that the WAF Context can be used from multiple
823+
// threads, with mixed calls to `ddwaf_run` and `ddwaf_context_destroy`,
824+
// which are not thread-safe.
825+
826+
waf, err := newDefaultHandle(testArachniRule)
827+
require.NoError(t, err)
828+
829+
wafCtx := NewContext(waf)
830+
require.NotNil(t, wafCtx)
831+
832+
var startBarrier, stopBarrier sync.WaitGroup
833+
startBarrier.Add(1)
834+
stopBarrier.Add(nbUsers + 1)
835+
836+
data := map[string]any{
837+
"server.request.headers.no_cookies": map[string][]string{
838+
"user-agent": {"Arachni/test"},
839+
},
840+
}
841+
842+
for n := 0; n < nbUsers; n++ {
843+
n := n
844+
go func() {
845+
startBarrier.Wait()
846+
defer stopBarrier.Done()
847+
848+
// A microsecond sleep gives us some scheduler-backed order of execution randomization
849+
time.Sleep(time.Microsecond)
850+
851+
// Half of these goroutines will try to use wafCtx.Run(...), while the other half will try
852+
// to use wafCtx.Close(). The expected outcome is that exactly one call to wafCtx.Close()
853+
// effectively releases the WAF context, and between 0 and N calls to wafCtx.Run(...) are
854+
// done (those that land after `wafCtx.Close()` happened will be silent no-ops).
855+
if n%2 == 0 {
856+
wafCtx.Run(RunAddressData{Ephemeral: data}, time.Second)
857+
} else {
858+
wafCtx.Close()
859+
}
860+
}()
861+
}
862+
863+
go func() {
864+
startBarrier.Wait()
865+
defer stopBarrier.Done()
866+
867+
// A microsecond sleep gives us some scheduler-backed order of execution randomization
868+
time.Sleep(time.Microsecond)
869+
870+
// We also asynchronously release the WAF handle, which is fine to do as the WAF context is
871+
// still in use, as the WAF handle has a reference counter guarding it's destruction.
872+
waf.Close()
873+
}()
874+
875+
startBarrier.Done()
876+
stopBarrier.Wait()
877+
878+
// Verify the WAF Handle was properly released.
879+
require.Zero(t, waf.refCounter.Load())
880+
})
820881
}
821882

822883
func TestRunError(t *testing.T) {

0 commit comments

Comments
 (0)