
Commit e2d7885

buffer: Avoid to process discarded chunks in write_step_by_step (for v1.16) (#4363)
It fixes the following error, which occurs when many `chunk bytes limit exceeds` errors are raised:

```
2020-07-28 14:59:26 +0000 [warn]: #0 emit transaction failed: error_class=IOError error="closed stream" location="/fluentd/vendor/bundle/ruby/2.6.0/gems/fluentd-1.11.1/lib/fluent/plugin/buffer/file_chunk.rb:82:in `pos'" tag="cafiscode-eks-cluster.default"
2020-07-28 14:59:26 +0000 [warn]: #0 /fluentd/vendor/bundle/ruby/2.6.0/gems/fluentd-1.11.1/lib/fluent/plugin/buffer/file_chunk.rb:82:in `pos'
2020-07-28 14:59:26 +0000 [warn]: #0 /fluentd/vendor/bundle/ruby/2.6.0/gems/fluentd-1.11.1/lib/fluent/plugin/buffer/file_chunk.rb:82:in `rollback'
2020-07-28 14:59:26 +0000 [warn]: #0 /fluentd/vendor/bundle/ruby/2.6.0/gems/fluentd-1.11.1/lib/fluent/plugin/buffer.rb:339:in `rescue in block in write'
2020-07-28 14:59:26 +0000 [warn]: #0 /fluentd/vendor/bundle/ruby/2.6.0/gems/fluentd-1.11.1/lib/fluent/plugin/buffer.rb:332:in `block in write'
2020-07-28 14:59:26 +0000 [warn]: #0 /fluentd/vendor/bundle/ruby/2.6.0/gems/fluentd-1.11.1/lib/fluent/plugin/buffer.rb:331:in `each'
...
```

Fix #3089

Signed-off-by: Takuro Ashie <[email protected]>
1 parent: d3cf2e0
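In outline: before this change, `write_step_by_step` handed each chunk to the caller's block as soon as that chunk was filled. If a later split then raised `ShouldRetry`, the retry path purged unstaged chunks, yet the rescue in `Buffer#write` could still call `rollback` on a purged chunk whose file was already closed, producing the `IOError` above (note `file_chunk.rb:82:in `pos'` in the trace). A toy Ruby model of that sequence; `DemoChunk` is a hypothetical stand-in, not fluentd's class:

```ruby
# DemoChunk is a hypothetical stand-in for a file-backed buffer chunk:
# purging discards it and closes its file, after which rollback fails
# the same way FileChunk#pos fails on a closed stream.
class DemoChunk
  def initialize
    @closed = false
  end

  def purge
    @closed = true # discard the chunk and close its underlying file
  end

  def rollback
    raise IOError, "closed stream" if @closed
  end
end

chunks = [DemoChunk.new, DemoChunk.new]
begin
  chunks[1].purge     # discarded mid-write (e.g. on "chunk bytes limit exceeds")
  raise "ShouldRetry" # a later split forces the whole write to be retried
rescue RuntimeError
  chunks.each do |c|
    begin
      c.rollback # pre-fix: also called on the purged chunk -> IOError
    rescue IOError => e
      puts "emit transaction failed: #{e.message}"
    end
  end
end
```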

2 files changed: +70 −17 lines

Diff for: lib/fluent/plugin/buffer.rb (+19 −17)
```diff
@@ -728,7 +728,6 @@ def write_once(metadata, data, format: nil, size: nil, &block)
 
     def write_step_by_step(metadata, data, format, splits_count, &block)
       splits = []
-      errors = []
       if splits_count > data.size
         splits_count = data.size
       end
@@ -749,23 +748,23 @@ def write_step_by_step(metadata, data, format, splits_count, &block)
       modified_chunks = []
       modified_metadata = metadata
       get_next_chunk = ->(){
-        c = if staged_chunk_used
-              # Staging new chunk here is bad idea:
-              # Recovering whole state including newly staged chunks is much harder than current implementation.
-              modified_metadata = modified_metadata.dup_next
-              generate_chunk(modified_metadata)
-            else
-              synchronize { @stage[modified_metadata] ||= generate_chunk(modified_metadata).staged! }
-            end
-        modified_chunks << c
-        c
+        if staged_chunk_used
+          # Staging new chunk here is bad idea:
+          # Recovering whole state including newly staged chunks is much harder than current implementation.
+          modified_metadata = modified_metadata.dup_next
+          generate_chunk(modified_metadata)
+        else
+          synchronize { @stage[modified_metadata] ||= generate_chunk(modified_metadata).staged! }
+        end
       }
 
       writing_splits_index = 0
       enqueue_chunk_before_retry = false
 
       while writing_splits_index < splits.size
         chunk = get_next_chunk.call
+        errors = []
+        modified_chunks << {chunk: chunk, adding_bytesize: 0, errors: errors}
         chunk.synchronize do
           raise ShouldRetry unless chunk.writable?
           staged_chunk_used = true if chunk.staged?
@@ -851,15 +850,18 @@ def write_step_by_step(metadata, data, format, splits_count, &block)
             raise
           end
 
-          block.call(chunk, chunk.bytesize - original_bytesize, errors)
-          errors = []
+          modified_chunks.last[:adding_bytesize] = chunk.bytesize - original_bytesize
         end
       end
+      modified_chunks.each do |data|
+        block.call(data[:chunk], data[:adding_bytesize], data[:errors])
+      end
     rescue ShouldRetry
-      modified_chunks.each do |mc|
-        mc.rollback rescue nil
-        if mc.unstaged?
-          mc.purge rescue nil
+      modified_chunks.each do |data|
+        chunk = data[:chunk]
+        chunk.rollback rescue nil
+        if chunk.unstaged?
+          chunk.purge rescue nil
         end
       end
       enqueue_chunk(metadata) if enqueue_chunk_before_retry
```
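The shape of the fix: instead of invoking the caller's block eagerly per chunk, `write_step_by_step` now records each chunk together with its `adding_bytesize` and `errors` in `modified_chunks`, and only walks that list once every split has been written. A minimal, self-contained sketch of this "collect first, run callbacks last" pattern (the `Chunk` struct and `write_all` below are illustrative stand-ins, not fluentd's API):

```ruby
# A chunk here is just a byte counter; real chunks are file- or memory-backed.
Chunk = Struct.new(:bytesize) do
  def write(str)
    self.bytesize += str.bytesize
  end
end

def write_all(splits, &block)
  written = [] # mirrors modified_chunks: {chunk:, adding_bytesize:, errors:}
  splits.each do |split|
    chunk = Chunk.new(0)
    errors = []
    before = chunk.bytesize
    chunk.write(split)
    written << {chunk: chunk, adding_bytesize: chunk.bytesize - before, errors: errors}
  end
  # Callbacks run only after every split was written successfully, so an
  # exception above can no longer leave half-committed chunks behind.
  written.each { |w| block.call(w[:chunk], w[:adding_bytesize], w[:errors]) }
end

write_all(["foo", "quux"]) do |chunk, added, errors|
  puts "chunk grew by #{added} bytes (#{errors.size} errors)"
end
```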

Diff for: test/plugin/test_buffer.rb (+51 −0)
```diff
@@ -850,6 +850,57 @@ def create_chunk_es(metadata, es)
     test '#compress returns :text' do
       assert_equal :text, @p.compress
     end
+
+    # https://github.com/fluent/fluentd/issues/3089
+    test "closed chunk should not be committed" do
+      assert_equal 8 * 1024 * 1024, @p.chunk_limit_size
+      assert_equal 0.95, @p.chunk_full_threshold
+
+      purge_count = 0
+
+      stub.proxy(@p).generate_chunk(anything) do |chunk|
+        stub.proxy(chunk).purge do |result|
+          purge_count += 1
+          result
+        end
+        stub.proxy(chunk).commit do |result|
+          assert_false(chunk.closed?)
+          result
+        end
+        stub.proxy(chunk).rollback do |result|
+          assert_false(chunk.closed?)
+          result
+        end
+        chunk
+      end
+
+      m = @p.metadata(timekey: Time.parse('2016-04-11 16:40:00 +0000').to_i)
+      small_row = "x" * 1024 * 400
+      big_row = "x" * 1024 * 1024 * 8 # just `chunk_size_limit`, it doesn't cause BufferOverFlowError.
+
+      # Write 42 events in 1 event stream, the last one is for triggering `ShouldRetry`
+      @p.write({m => [small_row] * 40 + [big_row] + ["x"]})
+
+      # The above event stream will be split twice by `Buffer#write_step_by_step`
+      #
+      # 1. `write_once`: 42 [events] * 1 [stream]
+      # 2. `write_step_by_step`: 4 [events] * 10 [streams] + 2 [events] * 1 [stream]
+      # 3. `write_step_by_step` (by `ShouldRetry`): 1 [event] * 42 [streams]
+      #
+      # The problematic data is built in the 2nd stage.
+      # In the 2nd stage, 5 streams are packed in a chunk
+      # ((1024 * 400) [bytes] * 4 [events] * 5 [streams] = 8192000 [bytes] < `chunk_limit_size` (8MB)).
+      # So 3 chunks are used to store all data.
+      # The 1st chunk is already staged by `write_once`.
+      # The 2nd & 3rd chunks are newly created as unstaged.
+      # The 3rd chunk is purged before `ShouldRetry`, so it's no problem:
+      # https://github.com/fluent/fluentd/blob/7e9eba736ff40ad985341be800ddc46558be75f2/lib/fluent/plugin/buffer.rb#L850
+      # The 2nd chunk is purged in `rescue ShouldRetry`:
+      # https://github.com/fluent/fluentd/blob/7e9eba736ff40ad985341be800ddc46558be75f2/lib/fluent/plugin/buffer.rb#L862
+      # It causes the issue described in https://github.com/fluent/fluentd/issues/3089#issuecomment-1811839198
+
+      assert_equal 2, purge_count
+    end
   end
 
   sub_test_case 'standard format with configuration for test with lower chunk limit size' do
```
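The arithmetic behind the test's comment, assuming a chunk is treated as full once its bytesize reaches `chunk_limit_size * chunk_full_threshold` (an assumption consistent with the two `assert_equal` lines at the top of the test):

```ruby
chunk_limit_size = 8 * 1024 * 1024   # 8_388_608 bytes, asserted by the test
chunk_full_threshold = 0.95          # asserted by the test
full_at = chunk_limit_size * chunk_full_threshold # 7_969_177.6 bytes

stream_bytes = 1024 * 400 * 4        # one split: 4 small_row events
five_streams = stream_bytes * 5      # 8_192_000 bytes

puts five_streams < chunk_limit_size # true  -> fits under the hard limit
puts five_streams >= full_at         # true  -> chunk counts as full, so the
                                     #          6th stream opens a new chunk
```

This is why exactly 5 of the 11 streams from stage 2 land in the first chunk, leaving 3 chunks in total for the written data.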
