CT-594 truncate large events #64
base: master
@@ -200,6 +200,13 @@ class LogStash::Outputs::Scalyr < LogStash::Outputs::Base | |||||
# for large batches, it may make sense to disable this option when logstash batch size is configured in a way that
# Scalyr single request limit won't be reached.
config :estimate_each_event_size, :validate => :boolean, :default => true
# The following settings tune event truncation, which will truncate the
# message field to below `max_field_size_bytes`, drop any other field that
# exceeds `max_field_size_bytes`, and drop enough fields to ensure the record
# does not exceed `max_record_size_bytes`. This feature is only active when
# `estimate_each_event_size` is enabled.
config :max_record_size_bytes, :validate => :number, :default => 200 * 1024
config :max_field_size_bytes, :validate => :number, :default => 50 * 1024

# Library to use for JSON serialization. Valid values are "stdlib" and "jrjackson". The latter may offer 2-4x performance
# improvements on serialization.
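For reference, a pipeline configuration that opts into these limits might look roughly like the following. The `api_write_token` option name and the 100 KiB / 25 KiB values are illustrative assumptions, not defaults from this PR (the PR's defaults are 200 KiB and 50 KiB):

```
output {
  scalyr {
    api_write_token => "YOUR_WRITE_TOKEN"
    # Truncation only runs when per-event size estimation is enabled.
    estimate_each_event_size => true
    # Illustrative values; the PR's defaults are 200 * 1024 and 50 * 1024.
    max_record_size_bytes => 102400   # ~100 KiB per serialized event
    max_field_size_bytes => 25600     # ~25 KiB per individual field
  }
}
```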
@@ -930,6 +937,12 @@ def build_multi_event_request_array(logstash_events) | |||||
    end
  end

  if event_json.bytesize > @max_record_size_bytes
Contributor

Just thinking out loud - I wonder what kind of performance impact this may have in the worst case (a lot of events which hit this limit, and possibly also events with many such fields - we may want to add some quick micro benchmarks to better understand the impact). Previously we wouldn't perform any additional processing in this case on the client side, right (aka we would send it as-is and the server would reject / truncate it)? It may be worthwhile to add some instrumentation / periodic logging (number of such events, duration of the truncation operation). I also wonder if we may want to only periodically log the "Event size exceeds max_record_size_bytes" message (something similar to the rate-limited warn in the agent code) to avoid a potential flood of those messages in case we encounter many such events. In addition to that, we may also want to add a periodic DEBUG log with the pre-truncation event content to make troubleshooting easier.

Contributor (Author)

Right. This has actually been weighing on my mind a lot. I don't actually know how often logstash handles records that exceed the max record size (whatever that value is), or the arbitrary value we chose of 200KiB. How does the server deal with events that exceed the max record size? Our docs don't mention a record size limit, only a post body size limit.

Contributor

As mentioned on Slack, agent datasets are usually different - there is usually a single potentially large field. Having said that, I don't know how the agent handles this, so I will dig into the code and also try to test it out.
    @logger.warn("Event size exceeds max_record_size_bytes, and will be truncated")
    truncate_event(scalyr_event)
    event_json = self.json_encode(scalyr_event)
  end

  # generate new request if the json size of events in the array exceeds the maximum request buffer size
  append_event = true
  add_bytes = event_json.bytesize
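To make the rate-limited warning idea from the review thread above concrete, here is a minimal sketch; the method name, instance variables, and the 60-second interval are hypothetical and not part of this PR:

```ruby
# Hypothetical sketch of a rate-limited truncation warning, roughly in the
# spirit of the agent's rate-limited warn. Names and interval are assumptions.
TRUNCATION_WARN_INTERVAL_SECONDS = 60

def warn_about_truncation(event_json)
  @truncated_event_count = (@truncated_event_count || 0) + 1
  now = Time.now.to_i
  if @last_truncation_warn.nil? || now - @last_truncation_warn >= TRUNCATION_WARN_INTERVAL_SECONDS
    @last_truncation_warn = now
    @logger.warn("Event size exceeds max_record_size_bytes and will be truncated",
                 :event_bytesize => event_json.bytesize,
                 :truncated_since_last_warn => @truncated_event_count)
    # Opt-in troubleshooting aid: pre-truncation content at DEBUG level only.
    @logger.debug("Pre-truncation event content", :event_json => event_json) if @logger.debug?
    @truncated_event_count = 0
  end
end
```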
@@ -991,6 +1004,46 @@ def add_client_timestamp_to_body(body) | |||||
  body[:client_timestamp] = current_time_millis.to_s
end

# This should only be called on events that have already been determined to be too large
def truncate_event(event)
  attrs = event[:attrs]
  new_attrs = Hash.new
  total_size = 0
  priority_events = ["serverHost", "parser", "logfile", "severity", "message"]
Contributor

We could define this as a top / module-level constant, and perhaps call it something along the lines of "special / reserved" field names or similar.

Contributor

Depending on where this processing happens (either before or after the special event-level serverHost field value is set), we may also need to include that field here.

On a related note, IIRC post-processing (in what gets sent to the server side) the severity field value is stored in sev - do we also want to include that?

Contributor

Also worth noting that some of those field values (top-level serverHost, logs, etc.) are, in some / all cases, sent once as session-level parameters and not as event-level attributes. Not sure how things would behave in case one of the session-level fields is very large. Probably also not a blocker and not fully relevant to this change, but worth thinking about.

Contributor (Author)

That is correct. That field is also in the parent object, so if it is present I don't touch it.

Also correct. I decided against touching anything in there since that isn't where the problem currently lies. For the sake of expediency, and to avoid introducing a regression, I felt it better to be surgical in my approach.

Contributor (Author)

It happens after. I'll look into this. I think that data is stored outside of the attrs object.

Contributor

Yeah, correct, some of those attributes (e.g. sev) are stored outside of the attrs object.
  priority_events.each do |key|
    next unless attrs.has_key? key
    total_size += key.bytesize
    value = attrs.delete key
    value_size = _get_size(value)
    if value_size > @max_field_size_bytes
      value.slice!(@max_field_size_bytes, value_size)
Contributor

Do we want to add "..." or similar to the end to signal that the field value has been truncated? (IIRC, we do the same on the server side for truncated field values.)

Contributor (Author)

Makes sense. I'll add that.
      total_size += @max_field_size_bytes
    else
      total_size += value_size
    end
    new_attrs[key] = value
  end
  attrs.each do |key, value|
    kv_size = key.bytesize + _get_size(value)
    # skip field if the combined size of the key and value exceeds the max field size.
Contributor

Yeah, I think this is a good call, since doing that would be somewhat complex and possibly also add a lot of overhead in the worst-case scenario. Having said that, I wonder if we still want to add some periodic, opt-in debug logging of those offending fields to make troubleshooting easier for the end user?

Contributor (Author)

That makes sense.
    # We do this so we don't have to deal with figuring out how to truncate complex types
    next if kv_size > @max_field_size_bytes
    # Stop copying fields over if we would exceed the max record size
    break if kv_size + total_size > @max_record_size_bytes
    total_size += kv_size
    new_attrs[key] = value
  end
  event[:attrs] = new_attrs
  return event
end
def _get_size(value)
  if not value.kind_of? String
    value = value.to_s
  end
  return value.bytesize
end
# A request comprises multiple Scalyr Events. This function creates a request hash for
# final upload to Scalyr (from an array of events, and an optional hash of current threads)
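A rough sketch of how the two suggestions the author agreed to above (a module-level constant for the reserved field names, and a "..." marker on truncated values) might look; the constant names, the marker, and the helper are assumptions, not code from this PR:

```ruby
# Hypothetical follow-up sketch, not part of this diff.
RESERVED_FIELD_NAMES = ["serverHost", "parser", "logfile", "severity", "message"].freeze
TRUNCATION_MARKER = "..."

def truncate_value(value)
  # Keep the marker within the max_field_size_bytes budget so the truncated
  # field still fits. Like the original code, this slices by character count,
  # which can differ from the byte size for multibyte input.
  keep = @max_field_size_bytes - TRUNCATION_MARKER.bytesize
  value.slice(0, keep) + TRUNCATION_MARKER
end
```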
IIRC, the server-side limit for those accounts right now is 200k (and the general one is 100k). Would be good to double check and decide how to proceed (e.g. change the default value here, or tell the user to bump this value to 200k in their config, or similar).

I didn't know where to find that limit, so I used the defaults suggested by Steven. If you can direct me towards where that limit is defined, I'd be happy to make sure the defaults here match the server-side defaults.

Actually, I just remembered something: according to our docs on how we bill on logging volume (https://app.scalyr.com/help/scalyr-agent#billing), each key in the attrs object only counts as 1 byte. Since our concern is serialization size, we should set our max record size to be larger than the actual max record size to allow some wiggle room.
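To make that wiggle-room argument concrete, a rough, hypothetical calculation (the 100 keys and 12-byte average are made up for illustration):

```ruby
# Billed size counts each attrs key as 1 byte, but the serialized JSON carries
# the full key names plus quotes, colons and commas.
keys                 = 100
avg_key_bytes        = 12
billed_key_bytes     = keys * 1              # => 100
serialized_key_bytes = keys * avg_key_bytes  # => 1200, before JSON punctuation
slack_needed         = serialized_key_bytes - billed_key_bytes  # => 1100 bytes
```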