From 517945167fbfede81c6a63d63d1b8e1563e7224b Mon Sep 17 00:00:00 2001 From: Kavindu Dodanduwa Date: Mon, 2 Feb 2026 11:29:18 -0800 Subject: [PATCH 1/2] Intrduce streaming support for aws logs Signed-off-by: Kavindu Dodanduwa # Conflicts: # extension/encoding/awslogsencodingextension/go.mod # extension/encoding/awslogsencodingextension/internal/unmarshaler/subscription-filter/unmarshaler.go --- .chloggen/feat_streaming-for-aws-logs.yaml | 27 +++ .../awslogsencodingextension/README.md | 17 ++ .../awslogsencodingextension/extension.go | 109 +++++---- .../testdata/cloudtrail_log_expected.yaml | 40 ++-- ...udtrail_log_expected_with_uid_feature.yaml | 28 +-- .../testdata/stream/cloudtrail_log.json | 192 +++++++++++++++ .../stream/cloudtrail_log_expect_1.yaml | 183 ++++++++++++++ .../stream/cloudtrail_log_expect_2.yaml | 123 ++++++++++ .../stream/cloudtrail_log_expect_3.yaml | 143 +++++++++++ .../unmarshaler/cloudtraillog/unmarshaler.go | 223 +++++++++++++----- .../cloudtraillog/unmarshaler_test.go | 57 +++++ .../elb-access-log/benchmark_test.go | 1 + .../unmarshaler/elb-access-log/elb.go | 12 +- .../testdata/alb_al_valid_logs.log | 2 +- .../testdata/stream_alb/alb_al_valid_logs.log | 3 + .../alb_al_valid_logs_expected_1.yaml | 95 ++++++++ .../alb_al_valid_logs_expected_2.yaml | 95 ++++++++ .../alb_al_valid_logs_expected_3.yaml | 104 ++++++++ .../unmarshaler/elb-access-log/unmarshaler.go | 166 ++++++++----- .../elb-access-log/unmarshaler_test.go | 122 +++++++++- .../testdata/stream/alert_log_multi.json | 2 + .../testdata/stream/alert_log_multi_1.yaml | 65 +++++ .../testdata/stream/alert_log_multi_2.yaml | 56 +++++ .../network-firewall-log/unmarshaler.go | 126 +++++++--- .../network-firewall-log/unmarshaler_test.go | 65 +++++ .../testdata/stream/valid_log_multi.log | 3 + .../stream/valid_s3_access_multi_1.yaml | 86 +++++++ .../stream/valid_s3_access_multi_2.yaml | 83 +++++++ .../stream/valid_s3_access_multi_3.yaml | 86 +++++++ .../unmarshaler/s3-access-log/unmarshaler.go 
| 72 +++++- .../s3-access-log/unmarshaler_test.go | 65 +++++ .../subscription-filter/unmarshaler.go | 67 ++++-- .../internal/unmarshaler/unmarshaler.go | 3 + .../stream/valid_vpc_flow_log_multi.log | 3 + .../stream/valid_vpc_flow_log_multi_1.yaml | 56 +++++ .../stream/valid_vpc_flow_log_multi_2.yaml | 56 +++++ .../unmarshaler/vpc-flow-log/unmarshaler.go | 131 +++++++++- .../vpc-flow-log/unmarshaler_test.go | 70 ++++++ .../waf/testdata/missing_webaclid_log.json | 101 +------- .../unmarshaler/waf/testdata/valid_log.json | 102 +------- .../waf/testdata/valid_log_multi.json | 2 + .../waf/testdata/valid_log_multi_1.yaml | 113 +++++++++ .../waf/testdata/valid_log_multi_2.yaml | 86 +++++++ .../internal/unmarshaler/waf/unmarshaler.go | 197 ++++++++++------ .../unmarshaler/waf/unmarshaler_test.go | 68 +++++- internal/tidylist/tidylist.txt | 2 +- 46 files changed, 2957 insertions(+), 551 deletions(-) create mode 100644 .chloggen/feat_streaming-for-aws-logs.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log.json create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_1.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_2.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_3.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs.log create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_1.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_2.yaml create mode 100644 
extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_3.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi.json create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi_1.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi_2.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_log_multi.log create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_1.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_2.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_3.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi.log create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi_1.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi_2.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi.json create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi_1.yaml create mode 100644 extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi_2.yaml diff --git a/.chloggen/feat_streaming-for-aws-logs.yaml 
b/.chloggen/feat_streaming-for-aws-logs.yaml new file mode 100644 index 0000000000000..b1242080300d1 --- /dev/null +++ b/.chloggen/feat_streaming-for-aws-logs.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. receiver/filelog) +component: extension/awslogs_encoding + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Adopt encoder streaming support for AWS Logs Encoding Extension + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [45567] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. 
+# Default: '[user]' +change_logs: [] diff --git a/extension/encoding/awslogsencodingextension/README.md b/extension/encoding/awslogsencodingextension/README.md index 4d0bfb5db45f7..53afeed014ebf 100644 --- a/extension/encoding/awslogsencodingextension/README.md +++ b/extension/encoding/awslogsencodingextension/README.md @@ -171,6 +171,23 @@ otelcol --config=config.yaml --feature-gates --feature-gates= | `userIdentity.arn` | `aws.principal.arn` | `aws.user_identity.principal.arn` | | `userIdentity.type` | `aws.principal.type` | `aws.user_identity.principal.type` | +## Streaming Support + +All sub formats support both streaming & non-streaming unmarshaling. +The table below summarizes streaming support details for each log type, along with the offset tracking mechanism, + +| Log Type | Sub Log Type/Source | Offset Tracking | +|---------------------|--------------------------------|-----------------------------------| +| CloudTrail | Generic records | Number of records processed | +| CloudTrail | Digest record | Always 0 (full payload processed) | +| ELB Access Logs | ALB/NLB/CLB | Bytes processed | +| Network Firewall | Alert/Flow/TLS | Bytes processed | +| S3 Access Logs | - | Bytes processed | +| Subscription filter | - | Always 0 (full payload processed) | +| VPC Flow Logs | S3 plain text | Bytes processed | +| VPC Flow Logs | CloudWatch subscription filter | Always 0 (full payload processed) | +| WAF Logs | - | Bytes processed | + ## Produced Records per Format ### VPC flow log record fields diff --git a/extension/encoding/awslogsencodingextension/extension.go b/extension/encoding/awslogsencodingextension/extension.go index ecec03dabd360..43cdff81882c6 100644 --- a/extension/encoding/awslogsencodingextension/extension.go +++ b/extension/encoding/awslogsencodingextension/extension.go @@ -55,7 +55,10 @@ func init() { featuregate.WithRegisterReferenceURL("https://github.com/open-telemetry/opentelemetry-collector-contrib/pull/45459")) } -var _ 
encoding.LogsUnmarshalerExtension = (*encodingExtension)(nil) +var ( + _ encoding.LogsUnmarshalerExtension = (*encodingExtension)(nil) + _ encoding.LogsDecoderExtension = (*encodingExtension)(nil) +) type encodingExtension struct { cfg *Config @@ -70,8 +73,8 @@ func newExtension(cfg *Config, settings extension.Settings) (*encodingExtension, case constants.FormatCloudWatchLogsSubscriptionFilter, constants.FormatCloudWatchLogsSubscriptionFilterV1: if cfg.Format == constants.FormatCloudWatchLogsSubscriptionFilterV1 { settings.Logger.Warn("using old format value. This format will be removed in version 0.138.0.", - zap.String("old_format", string(constants.FormatCloudWatchLogsSubscriptionFilterV1)), - zap.String("new_format", string(constants.FormatCloudWatchLogsSubscriptionFilter)), + zap.String("old_format", constants.FormatCloudWatchLogsSubscriptionFilterV1), + zap.String("new_format", constants.FormatCloudWatchLogsSubscriptionFilter), ) } return &encodingExtension{ @@ -81,8 +84,8 @@ func newExtension(cfg *Config, settings extension.Settings) (*encodingExtension, case constants.FormatVPCFlowLog, constants.FormatVPCFlowLogV1: if cfg.Format == constants.FormatVPCFlowLogV1 { settings.Logger.Warn("using old format value. This format will be removed in version 0.138.0.", - zap.String("old_format", string(constants.FormatVPCFlowLogV1)), - zap.String("new_format", string(constants.FormatVPCFlowLog)), + zap.String("old_format", constants.FormatVPCFlowLogV1), + zap.String("new_format", constants.FormatVPCFlowLog), ) } @@ -100,8 +103,8 @@ func newExtension(cfg *Config, settings extension.Settings) (*encodingExtension, case constants.FormatS3AccessLog, constants.FormatS3AccessLogV1: if cfg.Format == constants.FormatS3AccessLogV1 { settings.Logger.Warn("using old format value. 
This format will be removed in version 0.138.0.", - zap.String("old_format", string(constants.FormatS3AccessLogV1)), - zap.String("new_format", string(constants.FormatS3AccessLog)), + zap.String("old_format", constants.FormatS3AccessLogV1), + zap.String("new_format", constants.FormatS3AccessLog), ) } return &encodingExtension{ @@ -111,8 +114,8 @@ func newExtension(cfg *Config, settings extension.Settings) (*encodingExtension, case constants.FormatWAFLog, constants.FormatWAFLogV1: if cfg.Format == constants.FormatWAFLogV1 { settings.Logger.Warn("using old format value. This format will be removed in version 0.138.0.", - zap.String("old_format", string(constants.FormatWAFLogV1)), - zap.String("new_format", string(constants.FormatWAFLog)), + zap.String("old_format", constants.FormatWAFLogV1), + zap.String("new_format", constants.FormatWAFLog), ) } return &encodingExtension{ @@ -122,8 +125,8 @@ func newExtension(cfg *Config, settings extension.Settings) (*encodingExtension, case constants.FormatCloudTrailLog, constants.FormatCloudTrailLogV1: if cfg.Format == constants.FormatCloudTrailLogV1 { settings.Logger.Warn("using old format value. This format will be removed in version 0.138.0.", - zap.String("old_format", string(constants.FormatCloudTrailLogV1)), - zap.String("new_format", string(constants.FormatCloudTrailLog)), + zap.String("old_format", constants.FormatCloudTrailLogV1), + zap.String("new_format", constants.FormatCloudTrailLog), ) } return &encodingExtension{ @@ -135,8 +138,8 @@ func newExtension(cfg *Config, settings extension.Settings) (*encodingExtension, case constants.FormatELBAccessLog, constants.FormatELBAccessLogV1: if cfg.Format == constants.FormatELBAccessLogV1 { settings.Logger.Warn("using old format value. 
This format will be removed in version 0.138.0.", - zap.String("old_format", string(constants.FormatELBAccessLogV1)), - zap.String("new_format", string(constants.FormatELBAccessLog)), + zap.String("old_format", constants.FormatELBAccessLogV1), + zap.String("new_format", constants.FormatELBAccessLog), ) } return &encodingExtension{ @@ -167,37 +170,30 @@ func (*encodingExtension) Shutdown(_ context.Context) error { return nil } -func (e *encodingExtension) getGzipReader(buf []byte) (io.Reader, error) { - var err error - gzipReader, ok := e.gzipPool.Get().(*gzip.Reader) - if !ok { - gzipReader, err = gzip.NewReader(bytes.NewReader(buf)) - } else { - err = gzipReader.Reset(bytes.NewBuffer(buf)) +func (e *encodingExtension) UnmarshalLogs(buf []byte) (plog.Logs, error) { + encodingReader, reader, err := e.getReaderFromFormat(buf) + if err != nil { + return plog.Logs{}, fmt.Errorf("failed to get reader for %q logs: %w", e.format, err) } - if err != nil { - if gzipReader != nil { - e.gzipPool.Put(gzipReader) + defer func() { + if encodingReader == gzipEncoding { + r := reader.(*gzip.Reader) + _ = r.Close() + e.gzipPool.Put(r) } - return nil, fmt.Errorf("failed to decompress content: %w", err) - } + }() - return gzipReader, nil -} + logs, err := e.unmarshaler.UnmarshalAWSLogs(reader) + if err != nil { + return plog.Logs{}, fmt.Errorf("failed to unmarshal logs as %q format: %w", e.format, err) + } -// isGzipData checks if the buffer contains gzip-compressed data by examining magic bytes -func isGzipData(buf []byte) bool { - return len(buf) > 2 && buf[0] == 0x1f && buf[1] == 0x8b + return logs, nil } -// getReaderForData returns the appropriate reader and encoding type based on data format -func (e *encodingExtension) getReaderForData(buf []byte) (string, io.Reader, error) { - if isGzipData(buf) { - reader, err := e.getGzipReader(buf) - return gzipEncoding, reader, err - } - return bytesEncoding, bytes.NewReader(buf), nil +func (e *encodingExtension) NewLogsDecoder(reader 
io.Reader, options ...encoding.DecoderOption) (encoding.LogsDecoder, error) { + return e.unmarshaler.NewLogsDecoder(reader, options...) } func (e *encodingExtension) getReaderFromFormat(buf []byte) (string, io.Reader, error) { @@ -229,24 +225,35 @@ func (e *encodingExtension) getReaderFromFormat(buf []byte) (string, io.Reader, } } -func (e *encodingExtension) UnmarshalLogs(buf []byte) (plog.Logs, error) { - encodingReader, reader, err := e.getReaderFromFormat(buf) - if err != nil { - return plog.Logs{}, fmt.Errorf("failed to get reader for %q logs: %w", e.format, err) +// getReaderForData returns the appropriate reader and encoding type based on data format +func (e *encodingExtension) getReaderForData(buf []byte) (string, io.Reader, error) { + if isGzipData(buf) { + reader, err := e.getGzipReader(buf) + return gzipEncoding, reader, err } + return bytesEncoding, bytes.NewReader(buf), nil +} - defer func() { - if encodingReader == gzipEncoding { - r := reader.(*gzip.Reader) - _ = r.Close() - e.gzipPool.Put(r) - } - }() +func (e *encodingExtension) getGzipReader(buf []byte) (io.Reader, error) { + var err error + gzipReader, ok := e.gzipPool.Get().(*gzip.Reader) + if !ok { + gzipReader, err = gzip.NewReader(bytes.NewReader(buf)) + } else { + err = gzipReader.Reset(bytes.NewBuffer(buf)) + } - logs, err := e.unmarshaler.UnmarshalAWSLogs(reader) if err != nil { - return plog.Logs{}, fmt.Errorf("failed to unmarshal logs as %q format: %w", e.format, err) + if gzipReader != nil { + e.gzipPool.Put(gzipReader) + } + return nil, fmt.Errorf("failed to decompress content: %w", err) } - return logs, nil + return gzipReader, nil +} + +// isGzipData checks if the buffer contains gzip-compressed data by examining magic bytes +func isGzipData(buf []byte) bool { + return len(buf) > 2 && buf[0] == 0x1f && buf[1] == 0x8b } diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/cloudtrail_log_expected.yaml 
b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/cloudtrail_log_expected.yaml index ac3f469834ce1..850b215eb04b9 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/cloudtrail_log_expected.yaml +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/cloudtrail_log_expected.yaml @@ -6,10 +6,10 @@ resourceLogs: stringValue: aws - key: cloud.region value: - stringValue: us-east-1 + stringValue: us-west-2 - key: cloud.account.id value: - stringValue: "123456789012" + stringValue: "111122223333" scopeLogs: - logRecords: - attributes: @@ -124,29 +124,29 @@ resourceLogs: values: - kvlistValue: values: - - key: instanceId - value: - stringValue: i-EXAMPLEaff4840c22 - - key: currentState + - key: previousState value: kvlistValue: values: - key: code value: - doubleValue: 0 + doubleValue: 80 - key: name value: - stringValue: pending - - key: previousState + stringValue: stopped + - key: instanceId + value: + stringValue: i-EXAMPLEaff4840c22 + - key: currentState value: kvlistValue: values: - key: code value: - doubleValue: 80 + doubleValue: 0 - key: name value: - stringValue: stopped + stringValue: pending - kvlistValue: values: - key: instanceId @@ -166,12 +166,12 @@ resourceLogs: value: kvlistValue: values: - - key: code - value: - doubleValue: 80 - key: name value: stringValue: stopped + - key: code + value: + doubleValue: 80 body: {} timeUnixNano: "1689801448000000000" - attributes: @@ -359,9 +359,6 @@ resourceLogs: value: kvlistValue: values: - - key: topicArn - value: - stringValue: arn:aws:sns:us-east-1:123456789012:ExampleSNSTopic - key: message value: stringValue: HIDDEN_DUE_TO_SECURITY_REASONS @@ -374,6 +371,9 @@ resourceLogs: - key: messageAttributes value: stringValue: HIDDEN_DUE_TO_SECURITY_REASONS + - key: topicArn + value: + stringValue: arn:aws:sns:us-east-1:123456789012:ExampleSNSTopic - key: aws.response.elements value: 
kvlistValue: @@ -611,15 +611,15 @@ resourceLogs: value: kvlistValue: values: + - key: MFAUsed + value: + boolValue: true - key: MobileVersion value: stringValue: "No" - key: LoginTo value: stringValue: https://console.aws.amazon.com/console/home?region=us - - key: MFAUsed - value: - boolValue: true body: {} timeUnixNano: "1749997800000000000" - attributes: diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/cloudtrail_log_expected_with_uid_feature.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/cloudtrail_log_expected_with_uid_feature.yaml index 4bc3c5ecb6107..843d069d77efe 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/cloudtrail_log_expected_with_uid_feature.yaml +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/cloudtrail_log_expected_with_uid_feature.yaml @@ -6,10 +6,10 @@ resourceLogs: stringValue: aws - key: cloud.region value: - stringValue: us-east-1 + stringValue: us-west-2 - key: cloud.account.id value: - stringValue: "123456789012" + stringValue: "111122223333" scopeLogs: - logRecords: - attributes: @@ -141,12 +141,12 @@ resourceLogs: value: kvlistValue: values: - - key: name - value: - stringValue: stopped - key: code value: doubleValue: 80 + - key: name + value: + stringValue: stopped - kvlistValue: values: - key: instanceId @@ -166,12 +166,12 @@ resourceLogs: value: kvlistValue: values: - - key: name - value: - stringValue: stopped - key: code value: doubleValue: 80 + - key: name + value: + stringValue: stopped body: {} timeUnixNano: "1689801448000000000" - attributes: @@ -418,9 +418,6 @@ resourceLogs: value: kvlistValue: values: - - key: state - value: - stringValue: Start - key: eventSource value: stringValue: ssm.amazonaws.com @@ -452,6 +449,9 @@ resourceLogs: - key: average value: doubleValue: 669 + - key: state + value: + stringValue: Start body: {} 
timeUnixNano: "1672627860000000000" - attributes: @@ -611,15 +611,15 @@ resourceLogs: value: kvlistValue: values: + - key: MFAUsed + value: + boolValue: true - key: MobileVersion value: stringValue: "No" - key: LoginTo value: stringValue: https://console.aws.amazon.com/console/home?region=us - - key: MFAUsed - value: - boolValue: true body: {} timeUnixNano: "1749997800000000000" - attributes: diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log.json b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log.json new file mode 100644 index 0000000000000..586f9162ad4d9 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log.json @@ -0,0 +1,192 @@ +{ + "Records": [ + { + "eventVersion": "1.08", + "userIdentity": { + "type": "IAMUser", + "principalId": "EXAMPLE6E4XEGITWATV6R", + "arn": "arn:aws:iam::123456789012:user/Mateo", + "accountId": "123456789012", + "accessKeyId": "AKIAIOSFODNN7EXAMPLE", + "userName": "Mateo", + "sessionContext": { + "sessionIssuer": {}, + "webIdFederationData": {}, + "attributes": { + "creationDate": "2023-07-19T21:11:57Z", + "mfaAuthenticated": "false" + } + } + }, + "eventTime": "2023-07-19T21:17:28Z", + "eventSource": "ec2.amazonaws.com", + "eventName": "StartInstances", + "awsRegion": "us-east-1", + "sourceIPAddress": "192.0.2.0", + "userAgent": "aws-cli/2.13.5 Python/3.11.4 Linux/4.14.255-314-253.539.amzn2.x86_64 exec-env/CloudShell exe/x86_64.amzn.2 prompt/off command/ec2.start-instances", + "requestParameters": { + "instancesSet": { + "items": [ + { + "instanceId": "i-EXAMPLE56126103cb" + }, + { + "instanceId": "i-EXAMPLEaff4840c22" + } + ] + } + }, + "responseElements": { + "requestId": "e4336db0-149f-4a6b-844d-EXAMPLEb9d16", + "instancesSet": { + "items": [ + { + "instanceId": "i-EXAMPLEaff4840c22", + "currentState": { + "code": 0, + "name": 
"pending" + }, + "previousState": { + "code": 80, + "name": "stopped" + } + }, + { + "instanceId": "i-EXAMPLE56126103cb", + "currentState": { + "code": 0, + "name": "pending" + }, + "previousState": { + "code": 80, + "name": "stopped" + } + } + ] + } + }, + "requestID": "e4336db0-149f-4a6b-844d-EXAMPLEb9d16", + "eventID": "e755e09c-42f9-4c5c-9064-EXAMPLE228c7", + "readOnly": false, + "eventType": "AwsApiCall", + "apiVersion": "v1", + "managementEvent": true, + "recipientAccountId": "123456789012", + "eventCategory": "Management", + "tlsDetails": { + "tlsVersion": "TLSv1.2", + "cipherSuite": "ECDHE-RSA-AES128-GCM-SHA256", + "clientProvidedHostHeader": "ec2.us-east-1.amazonaws.com" + }, + "sessionCredentialFromConsole": "true" + }, + { + "eventVersion": "1.08", + "userIdentity": { + "type": "IAMUser", + "principalId": "AIDA6ON6E4XEGITEXAMPLE", + "arn": "arn:aws:iam::123456789012:user/Mary", + "accountId": "123456789012", + "accessKeyId": "AKIAIOSFODNN7EXAMPLE", + "userName": "Mary", + "sessionContext": { + "sessionIssuer": {}, + "webIdFederationData": {}, + "attributes": { + "creationDate": "2023-07-19T21:11:57Z", + "mfaAuthenticated": "false" + } + } + }, + "eventTime": "2023-07-19T21:25:09Z", + "eventSource": "iam.amazonaws.com", + "eventName": "CreateUser", + "awsRegion": "us-east-1", + "sourceIPAddress": "192.0.2.0", + "userAgent": "aws-cli/2.13.5 Python/3.11.4 Linux/4.14.255-314-253.539.amzn2.x86_64 exec-env/CloudShell exe/x86_64.amzn.2 prompt/off command/iam.create-user", + "requestParameters": { + "userName": "Richard" + }, + "responseElements": { + "user": { + "path": "/", + "arn": "arn:aws:iam::123456789012:user/Richard", + "userId": "AIDA6ON6E4XEP7EXAMPLE", + "createDate": "Jul 19, 2023 9:25:09 PM", + "userName": "Richard" + } + }, + "requestID": "2d528c76-329e-410b-9516-EXAMPLE565dc", + "eventID": "ba0801a1-87ec-4d26-be87-EXAMPLE75bbb", + "readOnly": false, + "eventType": "AwsApiCall", + "managementEvent": true, + "recipientAccountId": "123456789012", + 
"eventCategory": "Management", + "tlsDetails": { + "tlsVersion": "TLSv1.2", + "cipherSuite": "ECDHE-RSA-AES128-GCM-SHA256", + "clientProvidedHostHeader": "iam.amazonaws.com" + }, + "sessionCredentialFromConsole": "true" + }, + { + "eventVersion": "1.08", + "userIdentity": { + "type": "AssumedRole", + "principalId": "EX_PRINCIPAL_ID", + "arn": "arn:aws:iam::123456789012:user/Bob", + "accountId": "123456789012", + "accessKeyId": "AKIAIOSFODNN7EXAMPLE", + "sessionContext": { + "sessionIssuer": { + "type": "Role", + "principalId": "AKIAIOSFODNN7EXAMPLE", + "arn": "arn:aws:iam::123456789012:role/Admin", + "accountId": "123456789012", + "userName": "ExampleUser" + }, + "attributes": { + "creationDate": "2023-08-21T16:44:05Z", + "mfaAuthenticated": "true" + } + } + }, + "eventTime": "2023-08-21T16:48:37Z", + "eventSource": "sns.amazonaws.com", + "eventName": "Publish", + "awsRegion": "us-east-1", + "sourceIPAddress": "192.0.2.0", + "userAgent": "aws-cli/1.29.16 md/Botocore#1.31.16 ua/2.0 os/linux#5.4.250-173.369.amzn2int.x86_64 md/arch#x86_64 lang/python#3.8.17 md/pyimpl#CPython cfg/retry-mode#legacy botocore/1.31.16", + "requestParameters": { + "topicArn": "arn:aws:sns:us-east-1:123456789012:ExampleSNSTopic", + "message": "HIDDEN_DUE_TO_SECURITY_REASONS", + "subject": "HIDDEN_DUE_TO_SECURITY_REASONS", + "messageStructure": "json", + "messageAttributes": "HIDDEN_DUE_TO_SECURITY_REASONS" + }, + "responseElements": { + "messageId": "0787cd1e-d92b-521c-a8b4-90434e8ef840" + }, + "requestID": "0a8ab208-11bf-5e01-bd2d-ef55861b545d", + "eventID": "bb3496d4-5252-4660-9c28-3c6aebdb21c0", + "readOnly": false, + "resources": [ + { + "accountId": "123456789012", + "type": "AWS::SNS::Topic", + "ARN": "arn:aws:sns:us-east-1:123456789012:ExampleSNSTopic" + } + ], + "eventType": "AwsApiCall", + "managementEvent": false, + "recipientAccountId": "123456789012", + "eventCategory": "Data", + "tlsDetails": { + "tlsVersion": "TLSv1.2", + "cipherSuite": "ECDHE-RSA-AES128-GCM-SHA256", + 
"clientProvidedHostHeader": "sns.us-east-1.amazonaws.com" + } + } + ] +} \ No newline at end of file diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_1.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_1.yaml new file mode 100644 index 0000000000000..a328f1d5d7bb2 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_1.yaml @@ -0,0 +1,183 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: cloud.region + value: + stringValue: us-east-1 + - key: cloud.account.id + value: + stringValue: "123456789012" + scopeLogs: + - logRecords: + - attributes: + - key: aws.cloudtrail.event_version + value: + stringValue: "1.08" + - key: aws.cloudtrail.event_id + value: + stringValue: e755e09c-42f9-4c5c-9064-EXAMPLE228c7 + - key: rpc.method + value: + stringValue: StartInstances + - key: rpc.system + value: + stringValue: AwsApiCall + - key: aws.cloudtrail.api_version + value: + stringValue: v1 + - key: rpc.service + value: + stringValue: ec2.amazonaws.com + - key: aws.request_id + value: + stringValue: e4336db0-149f-4a6b-844d-EXAMPLEb9d16 + - key: aws.event.category + value: + stringValue: Management + - key: aws.event.read_only + value: + boolValue: false + - key: aws.event.management + value: + boolValue: true + - key: source.address + value: + stringValue: 192.0.2.0 + - key: user_agent.original + value: + stringValue: aws-cli/2.13.5 Python/3.11.4 Linux/4.14.255-314-253.539.amzn2.x86_64 exec-env/CloudShell exe/x86_64.amzn.2 prompt/off command/ec2.start-instances + - key: aws.session.console + value: + boolValue: true + - key: user.name + value: + stringValue: Mateo + - key: aws.user_identity.account_id + value: + stringValue: "123456789012" + - key: aws.access_key.id + value: + stringValue: 
AKIAIOSFODNN7EXAMPLE + - key: aws.principal.id + value: + stringValue: EXAMPLE6E4XEGITWATV6R + - key: aws.principal.arn + value: + stringValue: arn:aws:iam::123456789012:user/Mateo + - key: aws.principal.type + value: + stringValue: IAMUser + - key: aws.user_identity.session_context.attributes.mfa_authenticated + value: + boolValue: false + - key: aws.user_identity.session_context.attributes.creation_date + value: + stringValue: "2023-07-19T21:11:57Z" + - key: tls.protocol.version + value: + stringValue: "1.2" + - key: tls.cipher + value: + stringValue: ECDHE-RSA-AES128-GCM-SHA256 + - key: server.address + value: + stringValue: ec2.us-east-1.amazonaws.com + - key: aws.request.parameters + value: + kvlistValue: + values: + - key: instancesSet + value: + kvlistValue: + values: + - key: items + value: + arrayValue: + values: + - kvlistValue: + values: + - key: instanceId + value: + stringValue: i-EXAMPLE56126103cb + - kvlistValue: + values: + - key: instanceId + value: + stringValue: i-EXAMPLEaff4840c22 + - key: aws.response.elements + value: + kvlistValue: + values: + - key: requestId + value: + stringValue: e4336db0-149f-4a6b-844d-EXAMPLEb9d16 + - key: instancesSet + value: + kvlistValue: + values: + - key: items + value: + arrayValue: + values: + - kvlistValue: + values: + - key: instanceId + value: + stringValue: i-EXAMPLEaff4840c22 + - key: currentState + value: + kvlistValue: + values: + - key: code + value: + doubleValue: 0 + - key: name + value: + stringValue: pending + - key: previousState + value: + kvlistValue: + values: + - key: code + value: + doubleValue: 80 + - key: name + value: + stringValue: stopped + - kvlistValue: + values: + - key: previousState + value: + kvlistValue: + values: + - key: code + value: + doubleValue: 80 + - key: name + value: + stringValue: stopped + - key: instanceId + value: + stringValue: i-EXAMPLE56126103cb + - key: currentState + value: + kvlistValue: + values: + - key: code + value: + doubleValue: 0 + - key: name + value: + 
stringValue: pending + body: {} + timeUnixNano: "1689801448000000000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.cloudtrail + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension + version: test-version diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_2.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_2.yaml new file mode 100644 index 0000000000000..a157bf9555f7e --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_2.yaml @@ -0,0 +1,123 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: cloud.region + value: + stringValue: us-east-1 + - key: cloud.account.id + value: + stringValue: "123456789012" + scopeLogs: + - logRecords: + - attributes: + - key: aws.cloudtrail.event_version + value: + stringValue: "1.08" + - key: aws.cloudtrail.event_id + value: + stringValue: ba0801a1-87ec-4d26-be87-EXAMPLE75bbb + - key: rpc.method + value: + stringValue: CreateUser + - key: rpc.system + value: + stringValue: AwsApiCall + - key: rpc.service + value: + stringValue: iam.amazonaws.com + - key: aws.request_id + value: + stringValue: 2d528c76-329e-410b-9516-EXAMPLE565dc + - key: aws.event.category + value: + stringValue: Management + - key: aws.event.read_only + value: + boolValue: false + - key: aws.event.management + value: + boolValue: true + - key: source.address + value: + stringValue: 192.0.2.0 + - key: user_agent.original + value: + stringValue: aws-cli/2.13.5 Python/3.11.4 Linux/4.14.255-314-253.539.amzn2.x86_64 exec-env/CloudShell exe/x86_64.amzn.2 prompt/off command/iam.create-user + - key: aws.session.console + value: + boolValue: true + - key: user.name + value: + stringValue: Mary + - key: 
aws.user_identity.account_id + value: + stringValue: "123456789012" + - key: aws.access_key.id + value: + stringValue: AKIAIOSFODNN7EXAMPLE + - key: aws.principal.id + value: + stringValue: AIDA6ON6E4XEGITEXAMPLE + - key: aws.principal.arn + value: + stringValue: arn:aws:iam::123456789012:user/Mary + - key: aws.principal.type + value: + stringValue: IAMUser + - key: aws.user_identity.session_context.attributes.mfa_authenticated + value: + boolValue: false + - key: aws.user_identity.session_context.attributes.creation_date + value: + stringValue: "2023-07-19T21:11:57Z" + - key: tls.protocol.version + value: + stringValue: "1.2" + - key: tls.cipher + value: + stringValue: ECDHE-RSA-AES128-GCM-SHA256 + - key: server.address + value: + stringValue: iam.amazonaws.com + - key: aws.request.parameters + value: + kvlistValue: + values: + - key: userName + value: + stringValue: Richard + - key: aws.response.elements + value: + kvlistValue: + values: + - key: user + value: + kvlistValue: + values: + - key: path + value: + stringValue: / + - key: arn + value: + stringValue: arn:aws:iam::123456789012:user/Richard + - key: userId + value: + stringValue: AIDA6ON6E4XEP7EXAMPLE + - key: createDate + value: + stringValue: Jul 19, 2023 9:25:09 PM + - key: userName + value: + stringValue: Richard + body: {} + timeUnixNano: "1689801909000000000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.cloudtrail + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension + version: test-version diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_3.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_3.yaml new file mode 100644 index 0000000000000..b51cefd8c7189 --- /dev/null +++ 
b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/testdata/stream/cloudtrail_log_expect_3.yaml @@ -0,0 +1,143 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: cloud.region + value: + stringValue: us-east-1 + - key: cloud.account.id + value: + stringValue: "123456789012" + scopeLogs: + - logRecords: + - attributes: + - key: aws.cloudtrail.event_version + value: + stringValue: "1.08" + - key: aws.cloudtrail.event_id + value: + stringValue: bb3496d4-5252-4660-9c28-3c6aebdb21c0 + - key: rpc.method + value: + stringValue: Publish + - key: rpc.system + value: + stringValue: AwsApiCall + - key: rpc.service + value: + stringValue: sns.amazonaws.com + - key: aws.request_id + value: + stringValue: 0a8ab208-11bf-5e01-bd2d-ef55861b545d + - key: aws.event.category + value: + stringValue: Data + - key: aws.event.read_only + value: + boolValue: false + - key: aws.event.management + value: + boolValue: false + - key: source.address + value: + stringValue: 192.0.2.0 + - key: user_agent.original + value: + stringValue: aws-cli/1.29.16 md/Botocore#1.31.16 ua/2.0 os/linux#5.4.250-173.369.amzn2int.x86_64 md/arch#x86_64 lang/python#3.8.17 md/pyimpl#CPython cfg/retry-mode#legacy botocore/1.31.16 + - key: aws.user_identity.account_id + value: + stringValue: "123456789012" + - key: aws.access_key.id + value: + stringValue: AKIAIOSFODNN7EXAMPLE + - key: aws.principal.id + value: + stringValue: EX_PRINCIPAL_ID + - key: aws.principal.arn + value: + stringValue: arn:aws:iam::123456789012:user/Bob + - key: aws.principal.type + value: + stringValue: AssumedRole + - key: aws.user_identity.session_context.attributes.mfa_authenticated + value: + boolValue: true + - key: aws.user_identity.session_context.attributes.creation_date + value: + stringValue: "2023-08-21T16:44:05Z" + - key: aws.user_identity.session_context.issuer.type + value: + stringValue: Role + - key: 
aws.user_identity.session_context.issuer.principal_id + value: + stringValue: AKIAIOSFODNN7EXAMPLE + - key: aws.user_identity.session_context.issuer.arn + value: + stringValue: arn:aws:iam::123456789012:role/Admin + - key: aws.user_identity.session_context.issuer.account_id + value: + stringValue: "123456789012" + - key: aws.user_identity.session_context.issuer.user_name + value: + stringValue: ExampleUser + - key: tls.protocol.version + value: + stringValue: "1.2" + - key: tls.cipher + value: + stringValue: ECDHE-RSA-AES128-GCM-SHA256 + - key: server.address + value: + stringValue: sns.us-east-1.amazonaws.com + - key: aws.request.parameters + value: + kvlistValue: + values: + - key: topicArn + value: + stringValue: arn:aws:sns:us-east-1:123456789012:ExampleSNSTopic + - key: message + value: + stringValue: HIDDEN_DUE_TO_SECURITY_REASONS + - key: subject + value: + stringValue: HIDDEN_DUE_TO_SECURITY_REASONS + - key: messageStructure + value: + stringValue: json + - key: messageAttributes + value: + stringValue: HIDDEN_DUE_TO_SECURITY_REASONS + - key: aws.response.elements + value: + kvlistValue: + values: + - key: messageId + value: + stringValue: 0787cd1e-d92b-521c-a8b4-90434e8ef840 + - key: aws.resources + value: + arrayValue: + values: + - kvlistValue: + values: + - key: account.id + value: + stringValue: "123456789012" + - key: type + value: + stringValue: AWS::SNS::Topic + - key: arn + value: + stringValue: arn:aws:sns:us-east-1:123456789012:ExampleSNSTopic + body: {} + timeUnixNano: "1692636517000000000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.cloudtrail + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension + version: test-version diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/unmarshaler.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/unmarshaler.go index 80fc05f1c02b8..fbadf828dd08f 
100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/unmarshaler.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/unmarshaler.go @@ -18,9 +18,11 @@ import ( "go.opentelemetry.io/collector/pdata/plog" conventions "go.opentelemetry.io/otel/semconv/v1.38.0" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/constants" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/metadata" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/unmarshaler" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/xstreamencoding" ) // readerBufferSize defines the buffer size for buffered readers. @@ -149,17 +151,50 @@ func NewCloudTrailLogUnmarshaler(buildInfo component.BuildInfo, uIDFeatureEnable } func (u *CloudTrailLogUnmarshaler) UnmarshalAWSLogs(reader io.Reader) (plog.Logs, error) { - bufferedReader := bufio.NewReaderSize(reader, readerBufferSize) + // Decode as a stream but flush all at once using flush options + streamDecoder, err := u.NewLogsDecoder(reader, encoding.WithFlushItems(0), encoding.WithFlushBytes(0)) + if err != nil { + return plog.Logs{}, err + } + + logs, err := streamDecoder.DecodeLogs() + if err != nil { + //nolint:errorlint + if err == io.EOF { + // EOF indicates no logs were found, return any logs that's available + return logs, nil + } + + return plog.Logs{}, err + } + + return logs, nil +} + +// NewLogsDecoder returns a streaming logs decoder. It detects format type for CloudTrail logs and processes accordingly. 
+// Supported sub formats, how they are processed and what offset conveys for each: +// - S3 Records: Offset tracked by the number of records processed +// - CloudWatch subscription filter: Processes full payload; offset is always 0 +// - Digest file: Single record output; offset is always 0 +func (u *CloudTrailLogUnmarshaler) NewLogsDecoder(reader io.Reader, options ...encoding.DecoderOption) (encoding.LogsDecoder, error) { + var bufferedReader *bufio.Reader + if br, ok := reader.(*bufio.Reader); ok { + bufferedReader = br + } else { + bufferedReader = bufio.NewReaderSize(reader, readerBufferSize) + } // Peek into the first 64 bytes to determine the type of CloudTrail log peekBytes, err := bufferedReader.Peek(64) - if err != nil && !errors.Is(err, io.EOF) { - return plog.Logs{}, fmt.Errorf("failed to peek into CloudTrail log: %w", err) + if err != nil { + if !errors.Is(err, io.EOF) { + return nil, fmt.Errorf("failed to peek into CloudTrail log: %w", err) + } } firstKey, err := extractFirstKey(peekBytes) if err != nil { - return plog.Logs{}, fmt.Errorf("failed to extract the first JSON key: %w", err) + return nil, fmt.Errorf("failed to extract the first JSON key: %w", err) } decoder := gojson.NewDecoder(bufferedReader) @@ -171,7 +206,7 @@ func (u *CloudTrailLogUnmarshaler) UnmarshalAWSLogs(reader io.Reader) (plog.Logs // Check for S3 CloudTrail log format (most common) if firstKey == "Records" { - return u.fromS3(decoder) + return u.processRecords(decoder, options...) 
} // Check for CloudWatch subscription filter format @@ -182,22 +217,130 @@ func (u *CloudTrailLogUnmarshaler) UnmarshalAWSLogs(reader io.Reader) (plog.Logs // Otherwise, assume it's a CloudTrail digest record and attempt to decode var cloudTrailDigest CloudTrailDigest - if err := decoder.Decode(&cloudTrailDigest); err != nil { - return plog.Logs{}, fmt.Errorf("failed to unmarshal payload as a CloudTrail digest: %w", err) + if err = decoder.Decode(&cloudTrailDigest); err != nil { + return nil, fmt.Errorf("failed to unmarshal payload as a CloudTrail digest: %w", err) } - return u.processDigestRecord(cloudTrailDigest) + isEOF := false + record, err := u.processDigestRecord(cloudTrailDigest) + + decoderF := func() (plog.Logs, error) { + if err != nil { + return plog.Logs{}, err + } + + if isEOF { + return plog.Logs{}, io.EOF + } + + isEOF = true + return record, nil + } + return xstreamencoding.NewLogsDecoderAdapter(decoderF, func() int64 { return 0 }), nil } -// fromCloudWatch handles CloudTrail logs from CloudWatch Logs subscription filter -func (u *CloudTrailLogUnmarshaler) fromCloudWatch(reader *bufio.Reader) (plog.Logs, error) { +// processRecords is specialized in processing CloudTrail log records with streaming support +// Implementation works with a gojson.Decoder to efficiently stream through potentially large log files. 
+func (u *CloudTrailLogUnmarshaler) processRecords(decoder *gojson.Decoder, options ...encoding.DecoderOption) (encoding.LogsDecoder, error) { + // Check opening bracket + if token, err := decoder.Token(); err != nil || token != gojson.Delim('{') { + return nil, fmt.Errorf("expected '{': %w", err) + } + + // Move to Records array + if _, err := decoder.Token(); err != nil { + return nil, fmt.Errorf("expected 'Records' key: %w", err) + } + + // Check for array opening + if token, err := decoder.Token(); err != nil || token != gojson.Delim('[') { + return nil, fmt.Errorf("expected '[': %w", err) + } + + offsetRecord := int64(0) + batchHelper := xstreamencoding.NewBatchHelper(options...) + + for offsetRecord < batchHelper.Options().Offset { + if !decoder.More() { + return nil, fmt.Errorf("EOF reached before offset %d records were discarded", batchHelper.Options().Offset) + } + var skip gojson.RawMessage + if err := decoder.Decode(&skip); err != nil { + return nil, err + } + offsetRecord++ + } + + offsetFunc := func() int64 { + return offsetRecord + } + + decoderF := func() (plog.Logs, error) { + logs := plog.NewLogs() + resourceLogs := logs.ResourceLogs().AppendEmpty() + scopeLogs := resourceLogs.ScopeLogs().AppendEmpty() + u.setCommonScopeAttributes(scopeLogs) + + logRecords := scopeLogs.LogRecords() + // Pre-allocate space for log records to improve performance + logRecords.EnsureCapacity(100) + + var record CloudTrailRecord + + for decoder.More() { + startOffset := decoder.InputOffset() + record = CloudTrailRecord{} + err := decoder.Decode(&record) + if err != nil { + return plog.Logs{}, err + } + + logRecord := logRecords.AppendEmpty() + if err := u.setLogRecord(logRecord, &record); err != nil { + return plog.Logs{}, err + } + + batchHelper.IncrementBytes(decoder.InputOffset() - startOffset) + batchHelper.IncrementItems(1) + offsetRecord++ + + if batchHelper.ShouldFlush() { + batchHelper.Reset() + // Set resource attributes before flushing + 
u.setResourceAttributes(resourceLogs.Resource().Attributes(), record) + return logs, nil + } + } + + if logRecords.Len() == 0 { + return logs, io.EOF + } + + // Set resource attributes before final flush + u.setResourceAttributes(resourceLogs.Resource().Attributes(), record) + return logs, nil + } + + return xstreamencoding.NewLogsDecoderAdapter(decoderF, offsetFunc), nil +} + +// fromCloudWatch handles CloudTrail logs from CloudWatch Logs subscription filter. +// Processes full record and Offset is from 0. +func (u *CloudTrailLogUnmarshaler) fromCloudWatch(reader *bufio.Reader) (encoding.LogsDecoder, error) { var cwLog events.CloudwatchLogsData if err := gojson.NewDecoder(reader).Decode(&cwLog); err != nil { - return plog.Logs{}, fmt.Errorf("failed to unmarshal data as cloudwatch logs event: %w", err) + return nil, fmt.Errorf("failed to unmarshal data as cloudwatch logs event: %w", err) } if len(cwLog.LogEvents) == 0 { - return plog.NewLogs(), nil + return xstreamencoding.NewLogsDecoderAdapter( + func() (plog.Logs, error) { + return plog.Logs{}, io.EOF + }, + func() int64 { + return 0 + }), + nil } logs, resourceLogs, scopeLogs := u.createLogs() @@ -217,7 +360,7 @@ func (u *CloudTrailLogUnmarshaler) fromCloudWatch(reader *bufio.Reader) (plog.Lo // Parse message as a single CloudTrail record var record CloudTrailRecord if err := gojson.Unmarshal([]byte(event.Message), &record); err != nil { - return plog.Logs{}, fmt.Errorf("failed to unmarshal CloudTrail event from message: %w", err) + return nil, fmt.Errorf("failed to unmarshal CloudTrail event from message: %w", err) } // Set resource attributes from first record (region, account) @@ -228,59 +371,21 @@ func (u *CloudTrailLogUnmarshaler) fromCloudWatch(reader *bufio.Reader) (plog.Lo logRecord := logRecords.AppendEmpty() if err := u.setLogRecord(logRecord, &record); err != nil { - return plog.Logs{}, err + return nil, err } } - return logs, nil -} - -// processRecords is specialized in processing CloudTrail log 
records. -// Implementation works with a gojson.Decoder to efficiently stream through potentially large log files. -func (u *CloudTrailLogUnmarshaler) fromS3(decoder *gojson.Decoder) (plog.Logs, error) { - // Check opening bracket - if token, err := decoder.Token(); err != nil || token != gojson.Delim('{') { - return plog.Logs{}, fmt.Errorf("expected '{': %w", err) - } - - // Move to Records array - if _, err := decoder.Token(); err != nil { - return plog.Logs{}, fmt.Errorf("expected 'Records' key: %w", err) - } - - // Check for array opening - if token, err := decoder.Token(); err != nil || token != gojson.Delim('[') { - return plog.Logs{}, fmt.Errorf("expected '[': %w", err) - } - - logs, resourceLogs, scopeLogs := u.createLogs() - - logRecords := scopeLogs.LogRecords() - // Pre-allocate space for log records to improve performance - logRecords.EnsureCapacity(100) - - var record CloudTrailRecord - init := true - - for decoder.More() { - record = CloudTrailRecord{} - err := decoder.Decode(&record) - if err != nil { - return plog.Logs{}, err - } - - if init { - u.setResourceAttributes(resourceLogs.Resource().Attributes(), record) - init = false + isEOF := false + decoderF := func() (plog.Logs, error) { + if isEOF { + return plog.Logs{}, io.EOF } - logRecord := logRecords.AppendEmpty() - if err := u.setLogRecord(logRecord, &record); err != nil { - return plog.Logs{}, err - } + isEOF = true + return logs, nil } - return logs, nil + return xstreamencoding.NewLogsDecoderAdapter(decoderF, func() int64 { return 0 }), nil } // processDigestRecord is specialized in processing CloudTrail digest records diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/unmarshaler_test.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/unmarshaler_test.go index bdea1e6cf5e7b..83cfeff803b45 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/unmarshaler_test.go +++ 
b/extension/encoding/awslogsencodingextension/internal/unmarshaler/cloudtraillog/unmarshaler_test.go @@ -6,6 +6,7 @@ package cloudtraillog import ( "bytes" "errors" + "fmt" "io" "os" "path/filepath" @@ -13,7 +14,9 @@ import ( "github.com/stretchr/testify/require" "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/pdata/plog" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest/plogtest" ) @@ -139,6 +142,60 @@ func TestCloudtrailLogUnmarshaler_UnmarshalAWSDigest(t *testing.T) { } } +func TestNewLogsDecoder(t *testing.T) { + directory := "testdata/stream" + expectPattern := "cloudtrail_log_expect_%d.yaml" + + tests := []struct { + name string + offset int64 + }{ + { + name: "Normal streaming", + offset: 0, + }, + { + name: "Streaming with offset", + offset: 2, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + content := readLogFile(t, directory, "cloudtrail_log.json") + unmarshaler := NewCloudTrailLogUnmarshaler(component.BuildInfo{Version: "test-version"}, false) + + // flush each record and start with defined offset + streamer, err := unmarshaler.NewLogsDecoder(content, encoding.WithFlushItems(1), encoding.WithOffset(tt.offset)) + require.NoError(t, err) + + index := tt.offset + for { + index++ + var logs plog.Logs + logs, err = streamer.DecodeLogs() + if err != nil { + if errors.Is(err, io.EOF) { + break + } + + t.Errorf("failed to unmarshal log %d: %v", index, err) + } + + var expectedLogs plog.Logs + expectedLogs, err = golden.ReadLogs(filepath.Join(directory, fmt.Sprintf(expectPattern, index))) + require.NoError(t, err) + require.NoError(t, plogtest.CompareLogs(expectedLogs, logs, plogtest.IgnoreResourceLogsOrder())) + require.Equal(t, index, streamer.Offset()) + } + + // expect EOF after all logs are read + _, err = streamer.DecodeLogs() 
+ require.ErrorIs(t, err, io.EOF) + }) + } +} + func TestCloudTrailLogUnmarshaler_UnmarshalAWSLogs_InvalidJSON(t *testing.T) { t.Parallel() unmarshaler := NewCloudTrailLogUnmarshaler(component.BuildInfo{Version: "test-version"}, false) diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/benchmark_test.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/benchmark_test.go index c6aaa5f37d359..5a7bdb657ddc3 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/benchmark_test.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/benchmark_test.go @@ -55,6 +55,7 @@ func BenchmarkUnmarshalAWSLogs(b *testing.B) { buildInfo: component.BuildInfo{}, logger: zap.NewNop(), } + for _, bc := range elbCases { data := createELBAccessLogContent(b, bc.filename, bc.nLogs) b.Run(bc.name, func(b *testing.B) { diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/elb.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/elb.go index 5fc84c8a52bba..ccfdb85ec62bb 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/elb.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/elb.go @@ -13,10 +13,14 @@ import ( "time" ) +type logSyntaxType string + const ( - albAccessLogs = "alb_access_logs" - nlbAccessLogs = "nlb_access_logs" - clbAccessLogs = "clb_access_logs" + albAccessLogs logSyntaxType = "alb_access_logs" + nlbAccessLogs logSyntaxType = "nlb_access_logs" + clbAccessLogs logSyntaxType = "clb_access_logs" + controlMessage logSyntaxType = "control_message" + // any field can be set to - to indicate that the data was unknown // or unavailable, or that the field was not applicable to this request. 
unknownField = "-" @@ -373,7 +377,7 @@ func safeConvertStrToFloat(stringNum string) (float64, error) { // ALB supports http, https, h2, grpcs, ws, wss and NLB supports tls. // Only if those are not matched, it checks if the field is a valid timestamp (for CLB logs). // If none match, it returns an error. -func findLogSyntaxByField(field string) (string, error) { +func findLogSyntaxByField(field string) (logSyntaxType, error) { switch field { case "http", "https", "h2", "grpcs", "ws", "wss": return albAccessLogs, nil diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/alb_al_valid_logs.log b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/alb_al_valid_logs.log index 4f942b4e99010..a94bad0ac59cc 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/alb_al_valid_logs.log +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/alb_al_valid_logs.log @@ -1,3 +1,3 @@ https 2018-07-02T22:23:00.186641Z app/my-loadbalancer/50dc6c495c0c9188 192.168.131.39:2817 10.0.0.1:80 0.086 0.048 0.037 200 200 0 57 "GET https://www.example.com:443/ HTTP/1.1" "curl/7.46.0" ECDHE-RSA-AES128-GCM-SHA256 TLSv1.2 arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-targets/73e2d6bc24d8a067 "Root=1-58337281-1d84f3d73c47ec4e58577259" "www.example.com" "arn:aws:acm:us-east-2:123456789012:certificate/12345678-1234-1234-1234-123456789012" 1 2018-07-02T22:22:48.364000Z "authenticate,forward" "-" "-" "10.0.0.1:80" "200" "-" "-" TID_1234abcd5678ef90 https 2018-07-02T22:23:00.186641Z app/my-loadbalancer/50dc6c495c0c9188 [fe80::202:b3ff:fe1e:8329]:443 [2001:db8::1]:80 0.086 0.048 0.037 200 200 0 57 "GET https://www.example.com:443/ HTTP/1.1" "curl/7.46.0" ECDHE-RSA-AES128-GCM-SHA256 TLSv1.2 arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-targets/73e2d6bc24d8a067 
"Root=1-58337281-1d84f3d73c47ec4e58577259" "www.example.com" "arn:aws:acm:us-east-2:123456789012:certificate/12345678-1234-1234-1234-123456789012" 1 2018-07-02T22:22:48.364000Z "authenticate,forward" "-" "-" "10.0.0.1:80" "200" "-" "-" TID_1234abcd5678ef90 -https 2018-07-02T22:23:00.186641Z app/my-loadbalancer/50dc6c495c0c9188 192.168.131.39:2817 10.0.0.1:80 0.086 0.048 0.037 200 200 0 57 "GET https://www.example.com:443/ HTTP/1.1" "curl/7.46.0" ECDHE-RSA-AES128-GCM-SHA256 TLSv1.2 arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-targets/73e2d6bc24d8a067 "Root=1-58337281-1d84f3d73c47ec4e58577259" "www.example.com" "arn:aws:acm:us-east-2:123456789012:certificate/12345678-1234-1234-1234-123456789012" 1 2018-07-02T22:22:48.364000Z "authenticate,forward" "-" "-" "10.0.0.1:80" "200" "-" "-" TID_1234abcd5678ef90 "transformed.example.com" "https://transformed.example.com/index.html" "TransformSuccess" +https 2018-07-02T22:23:00.186641Z app/my-loadbalancer/50dc6c495c0c9188 192.168.131.39:2817 10.0.0.1:80 0.086 0.048 0.037 200 200 0 57 "GET https://www.example.com:443/ HTTP/1.1" "curl/7.46.0" ECDHE-RSA-AES128-GCM-SHA256 TLSv1.2 arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-targets/73e2d6bc24d8a067 "Root=1-58337281-1d84f3d73c47ec4e58577259" "www.example.com" "arn:aws:acm:us-east-2:123456789012:certificate/12345678-1234-1234-1234-123456789012" 1 2018-07-02T22:22:48.364000Z "authenticate,forward" "-" "-" "10.0.0.1:80" "200" "-" "-" TID_1234abcd5678ef90 "transformed.example.com" "https://transformed.example.com/index.html" "TransformSuccess" \ No newline at end of file diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs.log b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs.log new file mode 100644 index 0000000000000..4f942b4e99010 --- /dev/null +++ 
b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs.log @@ -0,0 +1,3 @@ +https 2018-07-02T22:23:00.186641Z app/my-loadbalancer/50dc6c495c0c9188 192.168.131.39:2817 10.0.0.1:80 0.086 0.048 0.037 200 200 0 57 "GET https://www.example.com:443/ HTTP/1.1" "curl/7.46.0" ECDHE-RSA-AES128-GCM-SHA256 TLSv1.2 arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-targets/73e2d6bc24d8a067 "Root=1-58337281-1d84f3d73c47ec4e58577259" "www.example.com" "arn:aws:acm:us-east-2:123456789012:certificate/12345678-1234-1234-1234-123456789012" 1 2018-07-02T22:22:48.364000Z "authenticate,forward" "-" "-" "10.0.0.1:80" "200" "-" "-" TID_1234abcd5678ef90 +https 2018-07-02T22:23:00.186641Z app/my-loadbalancer/50dc6c495c0c9188 [fe80::202:b3ff:fe1e:8329]:443 [2001:db8::1]:80 0.086 0.048 0.037 200 200 0 57 "GET https://www.example.com:443/ HTTP/1.1" "curl/7.46.0" ECDHE-RSA-AES128-GCM-SHA256 TLSv1.2 arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-targets/73e2d6bc24d8a067 "Root=1-58337281-1d84f3d73c47ec4e58577259" "www.example.com" "arn:aws:acm:us-east-2:123456789012:certificate/12345678-1234-1234-1234-123456789012" 1 2018-07-02T22:22:48.364000Z "authenticate,forward" "-" "-" "10.0.0.1:80" "200" "-" "-" TID_1234abcd5678ef90 +https 2018-07-02T22:23:00.186641Z app/my-loadbalancer/50dc6c495c0c9188 192.168.131.39:2817 10.0.0.1:80 0.086 0.048 0.037 200 200 0 57 "GET https://www.example.com:443/ HTTP/1.1" "curl/7.46.0" ECDHE-RSA-AES128-GCM-SHA256 TLSv1.2 arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-targets/73e2d6bc24d8a067 "Root=1-58337281-1d84f3d73c47ec4e58577259" "www.example.com" "arn:aws:acm:us-east-2:123456789012:certificate/12345678-1234-1234-1234-123456789012" 1 2018-07-02T22:22:48.364000Z "authenticate,forward" "-" "-" "10.0.0.1:80" "200" "-" "-" TID_1234abcd5678ef90 "transformed.example.com" "https://transformed.example.com/index.html" "TransformSuccess" diff --git 
a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_1.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_1.yaml new file mode 100644 index 0000000000000..6551010f57302 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_1.yaml @@ -0,0 +1,95 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: cloud.resource_id + value: + stringValue: app/my-loadbalancer/50dc6c495c0c9188 + scopeLogs: + - logRecords: + - attributes: + - key: network.protocol.name + value: + stringValue: https + - key: network.protocol.version + value: + stringValue: "1.1" + - key: client.address + value: + stringValue: 192.168.131.39 + - key: http.request.method + value: + stringValue: GET + - key: url.full + value: + stringValue: https://www.example.com:443/ + - key: client.port + value: + intValue: "2817" + - key: http.request.size + value: + intValue: "0" + - key: http.response.size + value: + intValue: "57" + - key: aws.elb.status.code + value: + intValue: "200" + - key: tls.protocol.version + value: + stringValue: tlsv1.2 + - key: tls.cipher + value: + stringValue: ECDHE-RSA-AES128-GCM-SHA256 + - key: user_agent.original + value: + stringValue: curl/7.46.0 + - key: url.domain + value: + stringValue: www.example.com + - key: destination.address + value: + stringValue: 10.0.0.1 + - key: destination.port + value: + intValue: "80" + - key: aws.elb.request_processing_time + value: + doubleValue: 0.086 + - key: aws.elb.target_processing_time + value: + doubleValue: 0.048 + - key: aws.elb.response_processing_time + value: + doubleValue: 0.037 + - key: aws.elb.aws_trace_id + value: + stringValue: Root=1-58337281-1d84f3d73c47ec4e58577259 + - key: aws.elb.backend.status.code + value: + intValue: "200" 
+ - key: aws.elb.target_group_arn + value: + stringValue: arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-targets/73e2d6bc24d8a067 + - key: aws.elb.chosen_cert_arn + value: + stringValue: arn:aws:acm:us-east-2:123456789012:certificate/12345678-1234-1234-1234-123456789012 + - key: aws.elb.actions_executed + value: + arrayValue: + values: + - stringValue: authenticate + - stringValue: forward + - key: aws.elb.connection_trace_id + value: + stringValue: TID_1234abcd5678ef90 + body: {} + timeUnixNano: "1530570180186641000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.elbaccess + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_2.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_2.yaml new file mode 100644 index 0000000000000..71519fbacf4a6 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_2.yaml @@ -0,0 +1,95 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: cloud.resource_id + value: + stringValue: app/my-loadbalancer/50dc6c495c0c9188 + scopeLogs: + - logRecords: + - attributes: + - key: network.protocol.name + value: + stringValue: https + - key: network.protocol.version + value: + stringValue: "1.1" + - key: client.address + value: + stringValue: fe80::202:b3ff:fe1e:8329 + - key: http.request.method + value: + stringValue: GET + - key: url.full + value: + stringValue: https://www.example.com:443/ + - key: client.port + value: + intValue: "443" + - key: http.request.size + value: + intValue: "0" + - key: http.response.size + value: + intValue: "57" + - key: aws.elb.status.code + value: + 
intValue: "200" + - key: tls.protocol.version + value: + stringValue: tlsv1.2 + - key: tls.cipher + value: + stringValue: ECDHE-RSA-AES128-GCM-SHA256 + - key: user_agent.original + value: + stringValue: curl/7.46.0 + - key: url.domain + value: + stringValue: www.example.com + - key: destination.address + value: + stringValue: 2001:db8::1 + - key: destination.port + value: + intValue: "80" + - key: aws.elb.request_processing_time + value: + doubleValue: 0.086 + - key: aws.elb.target_processing_time + value: + doubleValue: 0.048 + - key: aws.elb.response_processing_time + value: + doubleValue: 0.037 + - key: aws.elb.aws_trace_id + value: + stringValue: Root=1-58337281-1d84f3d73c47ec4e58577259 + - key: aws.elb.backend.status.code + value: + intValue: "200" + - key: aws.elb.target_group_arn + value: + stringValue: arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-targets/73e2d6bc24d8a067 + - key: aws.elb.chosen_cert_arn + value: + stringValue: arn:aws:acm:us-east-2:123456789012:certificate/12345678-1234-1234-1234-123456789012 + - key: aws.elb.actions_executed + value: + arrayValue: + values: + - stringValue: authenticate + - stringValue: forward + - key: aws.elb.connection_trace_id + value: + stringValue: TID_1234abcd5678ef90 + body: {} + timeUnixNano: "1530570180186641000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.elbaccess + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_3.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_3.yaml new file mode 100644 index 0000000000000..f6f9b5f4a8dea --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/testdata/stream_alb/alb_al_valid_logs_expected_3.yaml @@ 
-0,0 +1,104 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: cloud.resource_id + value: + stringValue: app/my-loadbalancer/50dc6c495c0c9188 + scopeLogs: + - logRecords: + - attributes: + - key: network.protocol.name + value: + stringValue: https + - key: network.protocol.version + value: + stringValue: "1.1" + - key: client.address + value: + stringValue: 192.168.131.39 + - key: http.request.method + value: + stringValue: GET + - key: url.full + value: + stringValue: https://www.example.com:443/ + - key: client.port + value: + intValue: "2817" + - key: http.request.size + value: + intValue: "0" + - key: http.response.size + value: + intValue: "57" + - key: aws.elb.status.code + value: + intValue: "200" + - key: tls.protocol.version + value: + stringValue: tlsv1.2 + - key: tls.cipher + value: + stringValue: ECDHE-RSA-AES128-GCM-SHA256 + - key: user_agent.original + value: + stringValue: curl/7.46.0 + - key: url.domain + value: + stringValue: www.example.com + - key: destination.address + value: + stringValue: 10.0.0.1 + - key: destination.port + value: + intValue: "80" + - key: aws.elb.request_processing_time + value: + doubleValue: 0.086 + - key: aws.elb.target_processing_time + value: + doubleValue: 0.048 + - key: aws.elb.response_processing_time + value: + doubleValue: 0.037 + - key: aws.elb.aws_trace_id + value: + stringValue: Root=1-58337281-1d84f3d73c47ec4e58577259 + - key: aws.elb.backend.status.code + value: + intValue: "200" + - key: aws.elb.target_group_arn + value: + stringValue: arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-targets/73e2d6bc24d8a067 + - key: aws.elb.chosen_cert_arn + value: + stringValue: arn:aws:acm:us-east-2:123456789012:certificate/12345678-1234-1234-1234-123456789012 + - key: aws.elb.actions_executed + value: + arrayValue: + values: + - stringValue: authenticate + - stringValue: forward + - key: aws.elb.connection_trace_id + value: + stringValue: 
TID_1234abcd5678ef90 + - key: aws.elb.transformed_host + value: + stringValue: transformed.example.com + - key: aws.elb.transformed_uri + value: + stringValue: https://transformed.example.com/index.html + - key: aws.elb.request_transform_status + value: + stringValue: TransformSuccess + body: {} + timeUnixNano: "1530570180186641000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.elbaccess + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/unmarshaler.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/unmarshaler.go index 00df076e7674d..6cffb08f037c5 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/unmarshaler.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/unmarshaler.go @@ -16,9 +16,11 @@ import ( conventions "go.opentelemetry.io/otel/semconv/v1.38.0" "go.uber.org/zap" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/constants" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/metadata" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/unmarshaler" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/xstreamencoding" ) type elbAccessLogUnmarshaler struct { @@ -37,85 +39,112 @@ type resourceAttributes struct { resourceID string } -// UnmarshalAWSLogs processes a file containing ELB access logs. 
+// UnmarshalAWSLogs processes all logs from the provided reader func (f *elbAccessLogUnmarshaler) UnmarshalAWSLogs(reader io.Reader) (plog.Logs, error) { - scanner := bufio.NewScanner(reader) - - logs, resourceLogs, scopeLogs := f.createLogs() - resourceAttr := &resourceAttributes{} + // Decode as a stream but flush all at once using flush options + streamDecoder, err := f.NewLogsDecoder(reader, encoding.WithFlushItems(0), encoding.WithFlushBytes(0)) + if err != nil { + return plog.Logs{}, err + } - var line string - var fields []string + logs, err := streamDecoder.DecodeLogs() + if err != nil { + //nolint:errorlint + if err == io.EOF { + // EOF indicates no logs were found, return any logs that's available + return logs, nil + } - // Read first line to determine format - if !scanner.Scan() { - return plog.Logs{}, errors.New("no log lines found") + return plog.Logs{}, err } - line = scanner.Text() - fields, err := extractFields(line) + return logs, nil +} + +// NewLogsDecoder returns a LogsDecoder that processes ELB access logs from the provided reader. +// Auto-detects the ELB Log type (ALB, NLB, CLB, or control message) using the first log line. 
+// Supported sub formats: +// - ALB/NLB/CLB access logs: Supports offset-based streaming; offset tracks bytes processed +// - Control message: Returns empty log; offset is always 0 +func (f *elbAccessLogUnmarshaler) NewLogsDecoder(reader io.Reader, options ...encoding.DecoderOption) (encoding.LogsDecoder, error) { + bufReader := bufio.NewReader(reader) + syntax, err := peekAndGetSyntax(bufReader) if err != nil { - return plog.Logs{}, fmt.Errorf("failed to parse log line: %w", err) - } - if len(fields) == 0 { - return plog.Logs{}, fmt.Errorf("log line has no fields: %s", line) + return nil, err } - // Check for control message - if fields[0] == EnableControlMessage { - f.logger.Info(fmt.Sprintf("Control message received: %s", line)) - return plog.NewLogs(), nil + if syntax == controlMessage { + f.logger.Info("ELB Control message received") + + // Emit control message as empty log message. + return xstreamencoding.NewLogsDecoderAdapter( + func() (plog.Logs, error) { + return plog.Logs{}, nil + }, + func() int64 { + return 0 + }, + ), nil } - // Determine syntax - syntax, err := findLogSyntaxByField(fields[0]) + scannerHelper, err := xstreamencoding.NewScannerHelper(bufReader, options...) 
if err != nil { - return plog.Logs{}, fmt.Errorf("unable to determine log syntax: %w", err) + return nil, fmt.Errorf("failed to create scanner helper: %w", err) } - for { - // Process lines based on determined syntax - switch syntax { - case albAccessLogs: - err = f.handleALBAccessLogs(fields, resourceAttr, scopeLogs) + + decodeF := func() (plog.Logs, error) { + logs, resourceLogs, scopeLogs := f.createLogs() + resourceAttr := &resourceAttributes{} + + for { + line, flush, err := scannerHelper.ScanString() if err != nil { - return plog.Logs{}, err + if !errors.Is(err, io.EOF) { + return plog.Logs{}, fmt.Errorf("error reading ELB access logs from stream: %w", err) + } + + if line == "" { + break + } } - case nlbAccessLogs: - err = f.handleNLBAccessLogs(fields, resourceAttr, scopeLogs) + + fields, err := extractFields(line) if err != nil { - return plog.Logs{}, err + return plog.Logs{}, fmt.Errorf("failed to parse log line: %w", err) + } + + if len(fields) == 0 { + return plog.Logs{}, fmt.Errorf("log line has no fields: %s", line) + } + + // Process line based on syntax + switch syntax { + case albAccessLogs: + err = f.handleALBAccessLogs(fields, resourceAttr, scopeLogs) + case nlbAccessLogs: + err = f.handleNLBAccessLogs(fields, resourceAttr, scopeLogs) + case clbAccessLogs: + err = f.handleCLBAccessLogs(fields, resourceAttr, scopeLogs) } - case clbAccessLogs: - err = f.handleCLBAccessLogs(fields, resourceAttr, scopeLogs) if err != nil { return plog.Logs{}, err } - default: - return plog.Logs{}, fmt.Errorf("unsupported log syntax: %s", syntax) - } - // Refill with next line until we reach the scanner end - if !scanner.Scan() { - break + if flush { + break + } } - line = scanner.Text() - fields, err = extractFields(line) - if err != nil { - return plog.Logs{}, fmt.Errorf("failed to parse log line: %w", err) - } - if len(fields) == 0 { - return plog.Logs{}, fmt.Errorf("log line has no fields: %s", line) + f.setResourceAttributes(resourceAttr, resourceLogs) + + if 
scopeLogs.LogRecords().Len() == 0 { + return plog.Logs{}, io.EOF } - } - // Handle potential scanner errors - if err := scanner.Err(); err != nil { - return plog.Logs{}, fmt.Errorf("error scanning log lines: %w", err) + return logs, nil } - f.setResourceAttributes(resourceAttr, resourceLogs) - return logs, nil + return xstreamencoding.NewLogsDecoderAdapter(decodeF, scannerHelper.Offset), nil } // createLogs with the expected fields for the scope logs @@ -414,3 +443,34 @@ func (f *elbAccessLogUnmarshaler) addToNLBAccessLogs(resourceAttr *resourceAttri rScope := scopeLogs.LogRecords().AppendEmpty() recordLog.MoveTo(rScope) } + +func peekAndGetSyntax(bufReader *bufio.Reader) (logSyntaxType, error) { + // 100 bytes should be enough to cover first sections of the log line + peekedData, err := bufReader.Peek(100) + if err != nil { + if !errors.Is(err, io.EOF) { + return "", fmt.Errorf("failed to peek log line: %w", err) + } + } + + fields, err := extractFields(string(peekedData)) + if err != nil { + return "", fmt.Errorf("failed to parse log line: %w", err) + } + if len(fields) == 0 { + return "", fmt.Errorf("invalid first ELB access log line part: %s", string(peekedData)) + } + + // Check for control message + if fields[0] == EnableControlMessage { + return controlMessage, nil + } + + // Determine syntax + syntax, err := findLogSyntaxByField(fields[0]) + if err != nil { + return "", fmt.Errorf("unable to determine log syntax: %w", err) + } + + return syntax, nil +} diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/unmarshaler_test.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/unmarshaler_test.go index e95a668ae252e..2b70929981305 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/unmarshaler_test.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/elb-access-log/unmarshaler_test.go @@ -4,8 +4,11 @@ package elbaccesslogs import 
( + "bufio" "bytes" "compress/gzip" + "errors" + "fmt" "io" "os" "path/filepath" @@ -14,8 +17,10 @@ import ( "github.com/stretchr/testify/require" "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/pdata/plog" "go.uber.org/zap/zaptest" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest/plogtest" ) @@ -65,7 +70,7 @@ func TestUnmarshallELBAccessLogs(t *testing.T) { }, "empty log file": { reader: readAndCompressLogFile(t, filesDirectory, "elb_empty_file.log"), - expectedErr: "no log lines found", + expectedErr: "invalid first ELB access log line part", }, "invalid syntax field": { reader: readAndCompressLogFile(t, filesDirectory, "alb_al_invalid_syntax.log"), @@ -122,3 +127,118 @@ func TestUnmarshallELBAccessLogs(t *testing.T) { }) } } + +func TestNewLogsDecoder(t *testing.T) { + directory := "testdata/stream_alb" + expectPattern := "alb_al_valid_logs_expected_%d.yaml" + + tests := []struct { + name string + offset int64 + index int + }{ + { + name: "Normal streaming", + offset: 0, + index: 0, + }, + { + name: "Stream with offset", + offset: 577, // skip first record + index: 1, // start from first index + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Input with 3 valid ALB logs + input := readAndCompressLogFile(t, directory, "alb_al_valid_logs.log") + + logger := zaptest.NewLogger(t) + elbUnmarshaler := NewELBAccessLogUnmarshaler(component.BuildInfo{}, logger) + + // Flush after every log for testing purposes & use set offset from test case + streamer, err := elbUnmarshaler.NewLogsDecoder(input, encoding.WithFlushItems(1), encoding.WithOffset(tt.offset)) + require.NoError(t, err) + + index := tt.index + for { + index++ + + var logs plog.Logs + logs, err = streamer.DecodeLogs() + if err != nil { + if errors.Is(err, io.EOF) { + break + } + 
+ t.Errorf("failed to unmarshal log for index %d: %v", index, err) + } + + // To check or update offset, uncomment offset below + // fmt.Println(streamer.Offset()) + + var expectedLogs plog.Logs + expectedLogs, err = golden.ReadLogs(filepath.Join(directory, fmt.Sprintf(expectPattern, index))) + require.NoError(t, err) + require.NoError(t, plogtest.CompareLogs(expectedLogs, logs, plogtest.IgnoreResourceLogsOrder())) + } + + // expect EOF after all logs are read + _, err = streamer.DecodeLogs() + require.ErrorIs(t, err, io.EOF) + }) + } +} + +func Test_peekAndGetSyntax(t *testing.T) { + tests := []struct { + name string + input []byte + logSyntax logSyntaxType + wantError string + }{ + { + name: "Enable message", + input: []byte("Enable ConnectionLog for ELB"), + logSyntax: controlMessage, + wantError: "", + }, + { + name: "ALB log", + input: []byte("http 2018-07-02T22:23:00.186641Z app/my-loadbalancer/50dc6c495c0c9188 192.168.131.39:2817 10.0.0.1:80 0.000 0.001 0.000 200 200 34 366 \"GET http://www.example.com:80/ HTTP/1.1\" \"curl/7.46.0\" - - arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-targets/73e2d6bc24d8a067 \"Root=1-58337262-36d228ad5d99923122bbe354\" \"-\" \"-\" 0 2018-07-02T22:22:48.364000Z \"forward\" \"-\" \"-\" \"10.0.0.1:80\" \"200\" \"-\" \"-\" TID_1234abcd5678ef90 \"-\" \"-\" \"-\""), + logSyntax: albAccessLogs, + wantError: "", + }, + { + name: "NLB log", + input: []byte("tls 2.0 2018-12-20T02:59:40 net/my-network-loadbalancer/c6e77e28c25b2234 g3d4b5e8bb8464cd 72.21.218.154:51341 172.100.100.185:443 5 2 98 246 - arn:aws:acm:us-east-2:671290407336:certificate/2a108f19-aded-46b0-8493-c63eb1ef4a99 - ECDHE-RSA-AES128-SHA tlsv12 - my-network-loadbalancer-c6e77e28c25b2234.elb.us-east-2.amazonaws.com - - - 2018-12-20T02:59:30\n"), + logSyntax: nlbAccessLogs, + wantError: "", + }, + { + name: "CLB log", + input: []byte("2015-05-13T23:39:43.945958Z my-loadbalancer 192.168.131.39:2817 10.0.0.1:80 0.000073 0.001048 0.000057 200 200 0 29 
\"GET http://www.example.com:80/ HTTP/1.1\" \"curl/7.38.0\" - -\n"), + logSyntax: clbAccessLogs, + wantError: "", + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + reader := bufio.NewReader(bytes.NewReader(test.input)) + + syntax, err := peekAndGetSyntax(reader) + if test.wantError != "" { + require.ErrorContains(t, err, test.wantError) + return + } + + require.NoError(t, err) + require.Equal(t, test.logSyntax, syntax) + + // reader should not have consumed any bytes + peekedBytes, _ := reader.ReadBytes('\n') + require.Equal(t, test.input, peekedBytes) + }) + } +} diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi.json b/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi.json new file mode 100644 index 0000000000000..07f9835f210a6 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi.json @@ -0,0 +1,2 @@ +{"firewall_name":"test-firewall","availability_zone":"us-east-1a","event_timestamp":"2025-10-20T10:30:45.123Z","event":{"event_type":"alert","flow_id":1234567890,"src_ip":"192.168.1.100","src_port":54321,"dest_ip":"10.0.1.50","dest_port":443,"proto":"TCP","alert":{"action":"allowed","signature":"ET MALWARE Suspicious TLS Connection","signature_id":2027758,"rev":3,"gid":1,"category":"Malware","severity":1}}} +{"firewall_name":"test-firewall","availability_zone":"us-east-1b","event_timestamp":"2025-10-20T10:30:45.123Z","event":{"timestamp":"2020-10-13T22:10:01.006481+0000","flow_id":1582438383425873,"event_type":"alert","src_ip":"203.0.113.4","src_port":55555,"dest_ip":"192.0.2.16","dest_port":111,"proto":"TCP","alert":{"action":"allowed","signature_id":5,"rev":0,"signature":"test_tcp","category":"","severity":1}}} \ No newline at end of file diff --git 
a/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi_1.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi_1.yaml new file mode 100644 index 0000000000000..e7fd3f17731ba --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi_1.yaml @@ -0,0 +1,65 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: aws.networkfirewall.name + value: + stringValue: test-firewall + - key: cloud.availability_zone + value: + stringValue: us-east-1a + scopeLogs: + - logRecords: + - attributes: + - key: aws.networkfirewall.event.type + value: + stringValue: alert + - key: aws.networkfirewall.flow_id + value: + intValue: "1234567890" + - key: source.address + value: + stringValue: 192.168.1.100 + - key: source.port + value: + intValue: "54321" + - key: destination.address + value: + stringValue: 10.0.1.50 + - key: destination.port + value: + intValue: "443" + - key: network.transport + value: + stringValue: TCP + - key: aws.networkfirewall.alert.action + value: + stringValue: allowed + - key: aws.networkfirewall.alert.signature + value: + stringValue: ET MALWARE Suspicious TLS Connection + - key: aws.networkfirewall.alert.signature_id + value: + intValue: "2027758" + - key: aws.networkfirewall.alert.rev + value: + intValue: "3" + - key: aws.networkfirewall.alert.category + value: + stringValue: Malware + - key: aws.networkfirewall.alert.severity + value: + intValue: "1" + - key: aws.networkfirewall.alert.gid + value: + intValue: "1" + body: {} + timeUnixNano: "1760956245123000000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.networkfirewall + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git 
a/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi_2.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi_2.yaml new file mode 100644 index 0000000000000..6454ef069ea4f --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/testdata/stream/alert_log_multi_2.yaml @@ -0,0 +1,56 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: aws.networkfirewall.name + value: + stringValue: test-firewall + - key: cloud.availability_zone + value: + stringValue: us-east-1b + scopeLogs: + - logRecords: + - attributes: + - key: aws.networkfirewall.event.type + value: + stringValue: alert + - key: aws.networkfirewall.flow_id + value: + intValue: "1582438383425873" + - key: source.address + value: + stringValue: 203.0.113.4 + - key: source.port + value: + intValue: "55555" + - key: destination.address + value: + stringValue: 192.0.2.16 + - key: destination.port + value: + intValue: "111" + - key: network.transport + value: + stringValue: TCP + - key: aws.networkfirewall.alert.action + value: + stringValue: allowed + - key: aws.networkfirewall.alert.signature + value: + stringValue: test_tcp + - key: aws.networkfirewall.alert.signature_id + value: + intValue: "5" + - key: aws.networkfirewall.alert.severity + value: + intValue: "1" + body: {} + timeUnixNano: "1760956245123000000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.networkfirewall + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/unmarshaler.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/unmarshaler.go index fef6f1eba7f15..7bc91c79c412f 100644 --- 
a/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/unmarshaler.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/unmarshaler.go @@ -4,7 +4,6 @@ package networkfirewall // import "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log" import ( - "bufio" "errors" "fmt" "io" @@ -16,9 +15,11 @@ import ( "go.opentelemetry.io/collector/pdata/plog" conventions "go.opentelemetry.io/otel/semconv/v1.38.0" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/constants" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/metadata" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/unmarshaler" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/xstreamencoding" ) type networkFirewallLogUnmarshaler struct { @@ -99,56 +100,105 @@ type networkFirewallLog struct { } func (n *networkFirewallLogUnmarshaler) UnmarshalAWSLogs(reader io.Reader) (plog.Logs, error) { - logs := plog.NewLogs() + // Decode as a stream but flush all at once using flush options + streamUnmarshaler, err := n.NewLogsDecoder(reader, encoding.WithFlushItems(0), encoding.WithFlushBytes(0)) + if err != nil { + return plog.Logs{}, err + } - resourceLogs := logs.ResourceLogs().AppendEmpty() - resourceLogs.Resource().Attributes().PutStr( - string(conventions.CloudProviderKey), - conventions.CloudProviderAWS.Value.AsString(), - ) + logs, err := streamUnmarshaler.DecodeLogs() + if err != nil { + //nolint:errorlint + if err == io.EOF { + // EOF indicates no logs were found, return any logs that's available + return logs, nil + } - scopeLogs := resourceLogs.ScopeLogs().AppendEmpty() - 
scopeLogs.Scope().SetName(metadata.ScopeName) - scopeLogs.Scope().SetVersion(n.buildInfo.Version) - scopeLogs.Scope().Attributes().PutStr(constants.FormatIdentificationTag, "aws."+constants.FormatNetworkFirewallLog) + return plog.Logs{}, err + } - firewallName := "" - availabilityZone := "" + return logs, nil +} - scanner := bufio.NewScanner(reader) - for scanner.Scan() { - logLine := scanner.Bytes() +// NewLogsDecoder returns a LogsDecoder that processes AWS Network Firewall logs from the provided reader. +// Supports offset-based streaming; offset tracks bytes processed. +func (n *networkFirewallLogUnmarshaler) NewLogsDecoder(reader io.Reader, options ...encoding.DecoderOption) (encoding.LogsDecoder, error) { + scannerHelper, err := xstreamencoding.NewScannerHelper(reader, options...) + if err != nil { + return nil, err + } - var log networkFirewallLog - if err := gojson.Unmarshal(logLine, &log); err != nil { - return plog.Logs{}, fmt.Errorf("failed to unmarshal Network Firewall log: %w", err) - } - if log.FirewallName == "" { - return plog.Logs{}, errors.New("invalid Network Firewall log: empty firewall_name field") + decodeF := func() (plog.Logs, error) { + logs := plog.NewLogs() + + resourceLogs := logs.ResourceLogs().AppendEmpty() + resourceLogs.Resource().Attributes().PutStr( + string(conventions.CloudProviderKey), + conventions.CloudProviderAWS.Value.AsString(), + ) + + scopeLogs := resourceLogs.ScopeLogs().AppendEmpty() + scopeLogs.Scope().SetName(metadata.ScopeName) + scopeLogs.Scope().SetVersion(n.buildInfo.Version) + scopeLogs.Scope().Attributes().PutStr(constants.FormatIdentificationTag, "aws."+constants.FormatNetworkFirewallLog) + + firewallName := "" + availabilityZone := "" + + for { + logBytes, flush, err := scannerHelper.ScanBytes() + if err != nil { + if !errors.Is(err, io.EOF) { + return plog.Logs{}, fmt.Errorf("failed to unmarshal Network Firewall log: %w", err) + } + + if len(logBytes) == 0 { + break + } + } + + var log networkFirewallLog + if 
err := gojson.Unmarshal(logBytes, &log); err != nil { + return plog.Logs{}, fmt.Errorf("failed to unmarshal Network Firewall log: %w", err) + } + if log.FirewallName == "" { + return plog.Logs{}, errors.New("invalid Network Firewall log: empty firewall_name field") + } + if firewallName == "" { + firewallName = log.FirewallName + availabilityZone = log.AvailabilityZone + } + if firewallName != log.FirewallName { + return plog.Logs{}, fmt.Errorf( + "unexpected: new firewall_name %q is different than previous one %q", + log.FirewallName, + firewallName, + ) + } + + record := scopeLogs.LogRecords().AppendEmpty() + if err := n.addNetworkFirewallLog(log, record); err != nil { + return plog.Logs{}, err + } + + if flush { + break + } } + if firewallName == "" { - firewallName = log.FirewallName - availabilityZone = log.AvailabilityZone - } - if firewallName != log.FirewallName { - return plog.Logs{}, fmt.Errorf( - "unexpected: new firewall_name %q is different than previous one %q", - log.FirewallName, - firewallName, - ) + // This means there is no log to process, return EOF to indicate no logs. 
+ return plog.Logs{}, io.EOF } - record := scopeLogs.LogRecords().AppendEmpty() - if err := n.addNetworkFirewallLog(log, record); err != nil { - return plog.Logs{}, err + if err := setResourceAttributes(resourceLogs, firewallName, availabilityZone); err != nil { + return plog.Logs{}, fmt.Errorf("failed to set resource attributes: %w", err) } - } - if err := setResourceAttributes(resourceLogs, firewallName, availabilityZone); err != nil { - return plog.Logs{}, fmt.Errorf("failed to set resource attributes: %w", err) + return logs, nil } - return logs, nil + return xstreamencoding.NewLogsDecoderAdapter(decodeF, scannerHelper.Offset), nil } func setResourceAttributes(resourceLogs plog.ResourceLogs, firewallName, availabilityZone string) error { diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/unmarshaler_test.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/unmarshaler_test.go index f44295aa83c8f..4e05d17bd645d 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/unmarshaler_test.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/network-firewall-log/unmarshaler_test.go @@ -5,6 +5,8 @@ package networkfirewall import ( "bytes" + "errors" + "fmt" "io" "os" "path/filepath" @@ -16,6 +18,7 @@ import ( "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/pdata/plog" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest/plogtest" ) @@ -105,6 +108,68 @@ func TestUnmarshalLogs(t *testing.T) { } } +func TestNewLogsDecoder(t *testing.T) { + directory := "testdata/stream" + expectPattern := "alert_log_multi_%d.yaml" + + tests := []struct { + name string + offset int64 + index int + }{ + { + name: "Normal streaming", + offset: 0, + 
index: 0, + }, + { + name: "Stream with offset", + offset: 411, // skip first record + index: 1, // start from first index + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + data, err := os.ReadFile(filepath.Join(directory, "alert_log_multi.json")) + require.NoError(t, err) + reader := compressToGZIPReader(t, data) + + networkUnmarshal := NewNetworkFirewallLogUnmarshaler(component.BuildInfo{}) + + // Flush after every log for testing purposes & set offset + streamer, err := networkUnmarshal.NewLogsDecoder(reader, encoding.WithFlushItems(1), encoding.WithOffset(tt.offset)) + require.NoError(t, err) + + index := tt.index + for { + index++ + + var logs plog.Logs + logs, err = streamer.DecodeLogs() + if err != nil { + if errors.Is(err, io.EOF) { + break + } + + t.Errorf("failed to unmarshal log for index %d: %v", index, err) + } + + // To check or update offset, uncomment offset below + // fmt.Println(streamer.Offset()) + var expectedLogs plog.Logs + expectedLogs, err = golden.ReadLogs(filepath.Join(directory, fmt.Sprintf(expectPattern, index))) + require.NoError(t, err) + require.NoError(t, plogtest.CompareLogs(expectedLogs, logs, plogtest.IgnoreResourceLogsOrder())) + } + + // expect EOF after all logs are read + _, err = streamer.DecodeLogs() + require.ErrorIs(t, err, io.EOF) + }) + } +} + func TestSetResourceAttributes(t *testing.T) { t.Parallel() diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_log_multi.log b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_log_multi.log new file mode 100644 index 0000000000000..3311565d3dec6 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_log_multi.log @@ -0,0 +1,3 @@ +79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be amzn-s3-demo-bucket1 [06/Feb/2019:00:00:38 +0000] 192.0.2.3 
79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be 3E57427F3EXAMPLE REST.GET.VERSIONING - "GET /amzn-s3-demo-bucket1?versioning HTTP/1.1" 200 - 113 - 7 - "-" "S3Console/0.4" - s9lzHYrFp76ZVxRcpX9+5cjAnEH2ROuNkd2BHfIa6UkFVdtjf5mKR3/eTPFvsiP/XV/VLi31234= SigV4 ECDHE-RSA-AES128-GCM-SHA256 AuthHeader amzn-s3-demo-bucket1.s3.us-west-1.amazonaws.com TLSV1.2 arn:aws:s3:us-west-1:123456789012:accesspoint/example-AP Yes +79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be amzn-s3-demo-bucket1 [06/Feb/2019:00:00:38 +0000] 192.0.2.3 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be 891CE47D2EXAMPLE REST.GET.LOGGING_STATUS - "GET /amzn-s3-demo-bucket1?logging HTTP/1.1" 200 - 242 - 11 - "-" "S3Console/0.4" - 9vKBE6vMhrNiWHZmb2L0mXOcqPGzQOI5XLnCtZNPxev+Hf+7tpT6sxDwDty4LHBUOZJG96N1234= SigV4 ECDHE-RSA-AES128-GCM-SHA256 AuthHeader amzn-s3-demo-bucket1.s3.us-west-1.amazonaws.com TLSV1.2 - - +79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be amzn-s3-demo-bucket1 [06/Feb/2019:00:00:38 +0000] 192.0.2.3 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be A1206F460EXAMPLE REST.GET.BUCKETPOLICY - "GET /amzn-s3-demo-bucket1?policy HTTP/1.1" 404 NoSuchBucketPolicy 297 - 38 - "-" "S3Console/0.4" - BNaBsXZQQDbssi6xMBdBU2sLt+Yf5kZDmeBUP35sFoKa3sLLeMC78iwEIWxs99CRUrbS4n11234= SigV4 ECDHE-RSA-AES128-GCM-SHA256 AuthHeader amzn-s3-demo-bucket1.s3.us-west-1.amazonaws.com TLSV1.2 - Yes \ No newline at end of file diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_1.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_1.yaml new file mode 100644 index 0000000000000..e0829a6be5100 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_1.yaml @@ -0,0 +1,86 @@ +resourceLogs: + - resource: + attributes: + - 
key: cloud.provider + value: + stringValue: aws + - key: aws.s3.bucket + value: + stringValue: amzn-s3-demo-bucket1 + - key: aws.s3.owner + value: + stringValue: 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be + scopeLogs: + - logRecords: + - attributes: + - key: source.address + value: + stringValue: 192.0.2.3 + - key: user.id + value: + stringValue: 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be + - key: aws.request_id + value: + stringValue: 3E57427F3EXAMPLE + - key: rpc.method + value: + stringValue: REST.GET.VERSIONING + - key: http.request.method + value: + stringValue: GET + - key: url.path + value: + stringValue: /amzn-s3-demo-bucket1 + - key: url.query + value: + stringValue: versioning + - key: network.protocol.name + value: + stringValue: http + - key: network.protocol.version + value: + stringValue: "1.1" + - key: http.response.status_code + value: + intValue: "200" + - key: http.response.body.size + value: + intValue: "113" + - key: duration + value: + intValue: "7" + - key: user_agent.original + value: + stringValue: S3Console/0.4 + - key: aws.extended_request_id + value: + stringValue: s9lzHYrFp76ZVxRcpX9+5cjAnEH2ROuNkd2BHfIa6UkFVdtjf5mKR3/eTPFvsiP/XV/VLi31234= + - key: aws.signature.version + value: + stringValue: SigV4 + - key: tls.cipher + value: + stringValue: ECDHE-RSA-AES128-GCM-SHA256 + - key: aws.s3.auth_type + value: + stringValue: AuthHeader + - key: http.request.header.host + value: + stringValue: amzn-s3-demo-bucket1.s3.us-west-1.amazonaws.com + - key: tls.protocol.version + value: + stringValue: "1.2" + - key: aws.s3.access_point.arn + value: + stringValue: arn:aws:s3:us-west-1:123456789012:accesspoint/example-AP + - key: aws.s3.acl_required + value: + boolValue: true + body: {} + timeUnixNano: "1549411238000000000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.s3access + name: 
github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_2.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_2.yaml new file mode 100644 index 0000000000000..eb50160615d3e --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_2.yaml @@ -0,0 +1,83 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: aws.s3.bucket + value: + stringValue: amzn-s3-demo-bucket1 + - key: aws.s3.owner + value: + stringValue: 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be + scopeLogs: + - logRecords: + - attributes: + - key: source.address + value: + stringValue: 192.0.2.3 + - key: user.id + value: + stringValue: 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be + - key: aws.request_id + value: + stringValue: 891CE47D2EXAMPLE + - key: rpc.method + value: + stringValue: REST.GET.LOGGING_STATUS + - key: http.request.method + value: + stringValue: GET + - key: url.path + value: + stringValue: /amzn-s3-demo-bucket1 + - key: url.query + value: + stringValue: logging + - key: network.protocol.name + value: + stringValue: http + - key: network.protocol.version + value: + stringValue: "1.1" + - key: http.response.status_code + value: + intValue: "200" + - key: http.response.body.size + value: + intValue: "242" + - key: duration + value: + intValue: "11" + - key: user_agent.original + value: + stringValue: S3Console/0.4 + - key: aws.extended_request_id + value: + stringValue: 9vKBE6vMhrNiWHZmb2L0mXOcqPGzQOI5XLnCtZNPxev+Hf+7tpT6sxDwDty4LHBUOZJG96N1234= + - key: aws.signature.version + value: + stringValue: SigV4 + - key: tls.cipher + value: + stringValue: ECDHE-RSA-AES128-GCM-SHA256 + - 
key: aws.s3.auth_type + value: + stringValue: AuthHeader + - key: http.request.header.host + value: + stringValue: amzn-s3-demo-bucket1.s3.us-west-1.amazonaws.com + - key: tls.protocol.version + value: + stringValue: "1.2" + - key: aws.s3.acl_required + value: + boolValue: false + body: {} + timeUnixNano: "1549411238000000000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.s3access + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_3.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_3.yaml new file mode 100644 index 0000000000000..6c188bb7b0df9 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/testdata/stream/valid_s3_access_multi_3.yaml @@ -0,0 +1,86 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: aws.s3.bucket + value: + stringValue: amzn-s3-demo-bucket1 + - key: aws.s3.owner + value: + stringValue: 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be + scopeLogs: + - logRecords: + - attributes: + - key: source.address + value: + stringValue: 192.0.2.3 + - key: user.id + value: + stringValue: 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be + - key: aws.request_id + value: + stringValue: A1206F460EXAMPLE + - key: rpc.method + value: + stringValue: REST.GET.BUCKETPOLICY + - key: http.request.method + value: + stringValue: GET + - key: url.path + value: + stringValue: /amzn-s3-demo-bucket1 + - key: url.query + value: + stringValue: policy + - key: network.protocol.name + value: + stringValue: http + - key: network.protocol.version + value: + stringValue: "1.1" + - key: http.response.status_code + value: + intValue: "404" + - key: error.type + 
value: + stringValue: NoSuchBucketPolicy + - key: http.response.body.size + value: + intValue: "297" + - key: duration + value: + intValue: "38" + - key: user_agent.original + value: + stringValue: S3Console/0.4 + - key: aws.extended_request_id + value: + stringValue: BNaBsXZQQDbssi6xMBdBU2sLt+Yf5kZDmeBUP35sFoKa3sLLeMC78iwEIWxs99CRUrbS4n11234= + - key: aws.signature.version + value: + stringValue: SigV4 + - key: tls.cipher + value: + stringValue: ECDHE-RSA-AES128-GCM-SHA256 + - key: aws.s3.auth_type + value: + stringValue: AuthHeader + - key: http.request.header.host + value: + stringValue: amzn-s3-demo-bucket1.s3.us-west-1.amazonaws.com + - key: tls.protocol.version + value: + stringValue: "1.2" + - key: aws.s3.acl_required + value: + boolValue: true + body: {} + timeUnixNano: "1549411238000000000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.s3access + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/unmarshaler.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/unmarshaler.go index 8a2fe9691efee..80c1dba03cf30 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/unmarshaler.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/unmarshaler.go @@ -4,7 +4,6 @@ package s3accesslog // import "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log" import ( - "bufio" "errors" "fmt" "io" @@ -18,9 +17,11 @@ import ( "go.opentelemetry.io/collector/pdata/plog" conventions "go.opentelemetry.io/otel/semconv/v1.38.0" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" 
"github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/constants" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/metadata" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/unmarshaler" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/xstreamencoding" ) const ( @@ -47,23 +48,68 @@ type resourceAttributes struct { } func (s *s3AccessLogUnmarshaler) UnmarshalAWSLogs(reader io.Reader) (plog.Logs, error) { - scanner := bufio.NewScanner(reader) - - logs, resourceLogs, scopeLogs := s.createLogs() - resourceAttr := &resourceAttributes{} - for scanner.Scan() { - log := scanner.Text() - if err := handleLog(resourceAttr, scopeLogs, log); err != nil { - return plog.Logs{}, err + // Decode as a stream but flush all at once using flush options + streamUnmarshaler, err := s.NewLogsDecoder(reader, encoding.WithFlushItems(0), encoding.WithFlushBytes(0)) + if err != nil { + return plog.Logs{}, err + } + logs, err := streamUnmarshaler.DecodeLogs() + if err != nil { + //nolint:errorlint + if err == io.EOF { + // EOF indicates no logs were found, return any logs that's available + return logs, nil } + + return logs, err } + return logs, nil +} - if err := scanner.Err(); err != nil { - return plog.Logs{}, fmt.Errorf("error reading log line: %w", err) +// NewLogsDecoder returns a LogsDecoder that processes S3 access logs from the provided reader. +// Parses space-delimited log lines following the S3 server access log format. +// Supports offset-based streaming; offset tracks bytes processed +func (s *s3AccessLogUnmarshaler) NewLogsDecoder(reader io.Reader, options ...encoding.DecoderOption) (encoding.LogsDecoder, error) { + scannerHelper, err := xstreamencoding.NewScannerHelper(reader, options...) 
+ if err != nil { + return nil, err } - s.setResourceAttributes(resourceAttr, resourceLogs) - return logs, nil + decoderF := func() (plog.Logs, error) { + logs, resourceLogs, scopeLogs := s.createLogs() + resourceAttr := &resourceAttributes{} + + for { + log, flush, err := scannerHelper.ScanString() + if err != nil { + if !errors.Is(err, io.EOF) { + return plog.Logs{}, fmt.Errorf("error reading S3 access logs from stream:: %w", err) + } + + if log == "" { + break + } + } + + if err := handleLog(resourceAttr, scopeLogs, log); err != nil { + return plog.Logs{}, err + } + + if flush { + break + } + } + + s.setResourceAttributes(resourceAttr, resourceLogs) + + if scopeLogs.LogRecords().Len() == 0 { + return logs, io.EOF + } + + return logs, nil + } + + return xstreamencoding.NewLogsDecoderAdapter(decoderF, scannerHelper.Offset), nil } // createLogs with the expected fields for the scope logs diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/unmarshaler_test.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/unmarshaler_test.go index e124542d7f189..bef885f76180d 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/unmarshaler_test.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/s3-access-log/unmarshaler_test.go @@ -5,6 +5,8 @@ package s3accesslog import ( "bytes" + "errors" + "fmt" "io" "os" "path/filepath" @@ -14,6 +16,7 @@ import ( "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/pdata/plog" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest/plogtest" ) @@ -225,3 +228,65 @@ func TestUnmarshalLogs(t *testing.T) { }) } } + +func TestNewLogsDecoder(t *testing.T) { + directory := "testdata/stream" + expectPattern := 
"valid_s3_access_multi_%d.yaml" + + tests := []struct { + name string + offset int64 + index int + }{ + { + name: "Normal streaming", + offset: 0, + index: 0, + }, + { + name: "Stream with offset", + offset: 554, // skip first record + index: 1, // start from first index + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s3Unmarshaler := s3AccessLogUnmarshaler{buildInfo: component.BuildInfo{}} + + data, err := os.ReadFile(filepath.Join(directory, "valid_log_multi.log")) + require.NoError(t, err) + + // Flush after every log for testing purposes & set offset + streamer, err := s3Unmarshaler.NewLogsDecoder(bytes.NewReader(data), encoding.WithFlushItems(1), encoding.WithOffset(tt.offset)) + require.NoError(t, err) + + index := tt.index + for { + index++ + + var logs plog.Logs + logs, err = streamer.DecodeLogs() + if err != nil { + if errors.Is(err, io.EOF) { + break + } + + t.Errorf("failed to unmarshal log %d: %v", index, err) + } + + // To check or update offset, uncomment offset below + // fmt.Println(streamer.Offset()) + + var expectedLogs plog.Logs + expectedLogs, err = golden.ReadLogs(filepath.Join(directory, fmt.Sprintf(expectPattern, index))) + require.NoError(t, err) + require.NoError(t, plogtest.CompareLogs(expectedLogs, logs, plogtest.IgnoreResourceLogsOrder())) + } + + // expect EOF after all logs are read + _, err = streamer.DecodeLogs() + require.ErrorIs(t, err, io.EOF) + }) + } +} diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/subscription-filter/unmarshaler.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/subscription-filter/unmarshaler.go index 73c9dad739d5d..42442363499ee 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/subscription-filter/unmarshaler.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/subscription-filter/unmarshaler.go @@ -15,9 +15,11 @@ import ( "go.opentelemetry.io/collector/pdata/plog" conventions 
"go.opentelemetry.io/otel/semconv/v1.38.0" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/constants" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/metadata" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/unmarshaler" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/xstreamencoding" ) var ( @@ -62,26 +64,59 @@ func NewSubscriptionFilterUnmarshaler(buildInfo component.BuildInfo) unmarshaler // Logs are assumed to be gzip-compressed as specified at // https://docs.aws.amazon.com/firehose/latest/dev/writing-with-cloudwatch-logs.html. func (f *subscriptionFilterUnmarshaler) UnmarshalAWSLogs(reader io.Reader) (plog.Logs, error) { - logs := plog.NewLogs() - resourceLogsByKey := make(map[resourceGroupKey]plog.LogRecordSlice) - - decoder := gojson.NewDecoder(reader) - for decoder.More() { - var cwLog cloudwatchLogsData - if err := decoder.Decode(&cwLog); err != nil { - return plog.Logs{}, fmt.Errorf("failed to decode decompressed reader: %w", err) + // Decode as a stream but flush all at once using flush options + streamUnmarshaler, err := f.NewLogsDecoder(reader, encoding.WithFlushItems(0), encoding.WithFlushBytes(0)) + if err != nil { + return plog.Logs{}, err + } + logs, err := streamUnmarshaler.DecodeLogs() + if err != nil { + //nolint:errorlint + if err == io.EOF { + // EOF indicates no logs were found, return any logs that's available + return logs, nil } - if cwLog.MessageType == "CONTROL_MESSAGE" { - continue - } + return plog.Logs{}, err + } - if err := validateLog(cwLog); err != nil { - return plog.Logs{}, fmt.Errorf("invalid cloudwatch log: %w", err) - } + return logs, nil +} - f.appendLogs(logs, resourceLogsByKey, cwLog) - } +// NewLogsDecoder returns a LogsDecoder that 
processes CloudWatch Logs subscription filter events. +// Supported sub formats: +// - DATA_MESSAGE: Returns logs grouped by owner, log group, and stream; offset is always 0 +// - CONTROL_MESSAGE: Returns empty logs; offset is always 0 +func (f *subscriptionFilterUnmarshaler) NewLogsDecoder(reader io.Reader, _ ...encoding.DecoderOption) (encoding.LogsDecoder, error) { + // Note - no real streaming as CloudWatch Logs subscription filter events are small in size + var isEOF bool + return xstreamencoding.NewLogsDecoderAdapter( + func() (plog.Logs, error) { + if isEOF { + return plog.NewLogs(), io.EOF + } + + var cwLog cloudwatchLogsData + decoder := gojson.NewDecoder(reader) + if err := decoder.Decode(&cwLog); err != nil { + return plog.Logs{}, fmt.Errorf("failed to decode decompressed reader: %w", err) + } + + if cwLog.MessageType == "CONTROL_MESSAGE" { + return plog.NewLogs(), nil + } + + if err := validateLog(cwLog); err != nil { + return plog.Logs{}, fmt.Errorf("invalid cloudwatch log: %w", err) + } + + isEOF = true + return f.createLogs(cwLog), nil + }, func() int64 { + return int64(0) + }, + ), nil +} return logs, nil } diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/unmarshaler.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/unmarshaler.go index d80ce78686680..6c980e7a6367c 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/unmarshaler.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/unmarshaler.go @@ -7,8 +7,11 @@ import ( "io" "go.opentelemetry.io/collector/pdata/plog" + + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" ) type AWSUnmarshaler interface { UnmarshalAWSLogs(reader io.Reader) (plog.Logs, error) + NewLogsDecoder(reader io.Reader, options ...encoding.DecoderOption) (encoding.LogsDecoder, error) } diff --git 
a/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi.log b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi.log new file mode 100644 index 0000000000000..b6031374e478f --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi.log @@ -0,0 +1,3 @@ +version account-id interface-id srcaddr dstaddr srcport dstport protocol packets bytes start end action log-status +2 123456789010 eni-1235b8ca123456789 172.31.16.139 172.31.16.21 20641 22 6 20 4249 1418530010 1418530070 ACCEPT OK +2 123456789010 eni-1235b8ca123456789 172.31.9.69 172.31.9.12 49761 3389 6 20 4249 1418530010 1418530070 REJECT OK \ No newline at end of file diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi_1.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi_1.yaml new file mode 100644 index 0000000000000..fa2f36b5649d5 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi_1.yaml @@ -0,0 +1,56 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: cloud.account.id + value: + stringValue: "123456789010" + scopeLogs: + - logRecords: + - attributes: + - key: aws.vpc.flow.log.version + value: + intValue: "2" + - key: network.interface.name + value: + stringValue: eni-1235b8ca123456789 + - key: source.port + value: + intValue: "20641" + - key: destination.port + value: + intValue: "22" + - key: network.protocol.name + value: + stringValue: tcp + - key: aws.vpc.flow.packets + value: + intValue: "20" + - key: aws.vpc.flow.bytes + value: + intValue: "4249" + - key: aws.vpc.flow.start + value: + intValue: 
"1418530010" + - key: aws.vpc.flow.action + value: + stringValue: ACCEPT + - key: aws.vpc.flow.status + value: + stringValue: OK + - key: source.address + value: + stringValue: 172.31.16.139 + - key: destination.address + value: + stringValue: 172.31.16.21 + body: {} + timeUnixNano: "1418530070000000000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.vpcflow + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi_2.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi_2.yaml new file mode 100644 index 0000000000000..004bcb35a50c6 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/testdata/stream/valid_vpc_flow_log_multi_2.yaml @@ -0,0 +1,56 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: cloud.account.id + value: + stringValue: "123456789010" + scopeLogs: + - logRecords: + - attributes: + - key: aws.vpc.flow.log.version + value: + intValue: "2" + - key: network.interface.name + value: + stringValue: eni-1235b8ca123456789 + - key: source.port + value: + intValue: "49761" + - key: destination.port + value: + intValue: "3389" + - key: network.protocol.name + value: + stringValue: tcp + - key: aws.vpc.flow.packets + value: + intValue: "20" + - key: aws.vpc.flow.bytes + value: + intValue: "4249" + - key: aws.vpc.flow.start + value: + intValue: "1418530010" + - key: aws.vpc.flow.action + value: + stringValue: REJECT + - key: aws.vpc.flow.status + value: + stringValue: OK + - key: source.address + value: + stringValue: 172.31.9.69 + - key: destination.address + value: + stringValue: 172.31.9.12 + body: {} + timeUnixNano: "1418530070000000000" + scope: + attributes: + - key: 
encoding.format + value: + stringValue: aws.vpcflow + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler.go index 89e4d7eda5ab7..0e6e32d6ce089 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler.go @@ -20,9 +20,11 @@ import ( conventions "go.opentelemetry.io/otel/semconv/v1.38.0" "go.uber.org/zap" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/constants" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/metadata" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/unmarshaler" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/xstreamencoding" ) var ( @@ -92,7 +94,23 @@ func NewVPCFlowLogUnmarshaler( func (v *vpcFlowLogUnmarshaler) UnmarshalAWSLogs(reader io.Reader) (plog.Logs, error) { switch v.cfg.FileFormat { case constants.FileFormatPlainText: - return v.unmarshalPlainTextLogs(reader) + // Decode as a stream but flush all at once using flush options + streamUnmarshaler, err := v.NewLogsDecoder(reader, encoding.WithFlushItems(0), encoding.WithFlushBytes(0)) + if err != nil { + return plog.Logs{}, err + } + logs, err := streamUnmarshaler.DecodeLogs() + if err != nil { + //nolint:errorlint + if err == io.EOF { + // EOF indicates no logs were found, return any logs that's available + return logs, nil + } + + return plog.Logs{}, err + } + + return logs, nil case constants.FileFormatParquet: 
// TODO return plog.Logs{}, errors.New("still needs to be implemented") @@ -102,6 +120,117 @@ func (v *vpcFlowLogUnmarshaler) UnmarshalAWSLogs(reader io.Reader) (plog.Logs, e } } +// NewLogsDecoder returns a LogsDecoder that processes VPC flow logs from the provided reader. +// Auto-detects the source format (S3 plain text or CloudWatch subscription filter) from the first byte. +// Supported sub formats: +// - S3 plain text logs: Supports offset-based streaming; offset tracks bytes processed +// - CloudWatch subscription filter: Processes full payload; offset is always 0 +// - Parquet format: Not yet implemented +func (v *vpcFlowLogUnmarshaler) NewLogsDecoder(reader io.Reader, options ...encoding.DecoderOption) (encoding.LogsDecoder, error) { + if v.cfg.FileFormat == constants.FileFormatParquet { + return nil, errors.New("streaming parquet VPC flow logs is not yet implemented") + } + + // use buffered reader for efficiency and to avoid any size restrictions + bufReader := bufio.NewReader(reader) + + var err error + firstByte, err := bufReader.Peek(1) + if err != nil { + return nil, fmt.Errorf("failed to read first byte: %w", err) + } + + if firstByte[0] == '{' { + // Dealing with a JSON log message, so check for CloudWatch bound trigger + var cwLogs plog.Logs + cwLogs, err = v.fromCloudWatch(v.cfg.parsedFormat, bufReader) + if err != nil { + return nil, err + } + + var isEOF bool + return xstreamencoding.NewLogsDecoderAdapter( + func() (plog.Logs, error) { + if isEOF { + return plog.Logs{}, io.EOF + } + + isEOF = true + return cwLogs, nil + }, + func() int64 { + return 0 + }, + ), nil + } + + var offset int64 + line, err := bufReader.ReadString('\n') + if err != nil { + return nil, fmt.Errorf("failed to read first line of VPC logs from S3: %w", err) + } + + offset += int64(len(line)) + + fields := strings.Fields(line) + batchHelper := xstreamencoding.NewBatchHelper(options...) 
+ + if batchHelper.Options().Offset > 0 { + // discard ignoring the first line + var discarded int + discarded, err = bufReader.Discard(int(batchHelper.Options().Offset - offset)) + if err != nil { + if errors.Is(err, io.EOF) { + return nil, fmt.Errorf("EOF reached before offset %d records were discarded", batchHelper.Options().Offset) + } + return nil, err + } + offset += int64(discarded) + } + + offsetF := func() int64 { + return offset + } + + decodeF := func() (plog.Logs, error) { + logs, resourceLogs, scopeLogs := v.createLogs() + for { + line, err = bufReader.ReadString('\n') + if err != nil { + if !errors.Is(err, io.EOF) { + return plog.Logs{}, fmt.Errorf("error reading VPC logs: %w", err) + } + + if line == "" { + break + } + } + batchHelper.IncrementBytes(int64(len(line))) + batchHelper.IncrementItems(1) + offset += int64(len(line)) + + // Trim spaces and new lines + line = strings.TrimSpace(line) + if err := v.addToLogs(resourceLogs, scopeLogs, fields, line); err != nil { + return plog.Logs{}, err + } + + if batchHelper.ShouldFlush() { + batchHelper.Reset() + break + } + } + + if scopeLogs.LogRecords().Len() == 0 { + return logs, io.EOF + } + + return logs, nil + } + + return xstreamencoding.NewLogsDecoderAdapter(decodeF, offsetF), nil +} + func (v *vpcFlowLogUnmarshaler) unmarshalPlainTextLogs(reader io.Reader) (plog.Logs, error) { // use buffered reader for efficiency and to avoid any size restrictions bufReader := bufio.NewReader(reader) diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler_test.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler_test.go index 166017598e5fb..516c48673f2eb 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler_test.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler_test.go @@ -5,6 +5,8 @@ package vpcflowlog import ( "bytes" + "errors" 
+ "fmt" "io" "os" "path/filepath" @@ -16,6 +18,7 @@ import ( "go.opentelemetry.io/collector/pdata/plog" "go.uber.org/zap" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/constants" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest/plogtest" @@ -135,6 +138,73 @@ func TestUnmarshalLogs_PlainText(t *testing.T) { } } +func TestNewLogsDecoder(t *testing.T) { + directory := "testdata/stream" + expectPattern := "valid_vpc_flow_log_multi_%d.yaml" + + tests := []struct { + name string + offset int64 + index int + }{ + { + name: "Normal streaming", + offset: 0, + index: 0, + }, + { + name: "Stream with offset", + offset: 230, // skip first record + index: 1, // start from first index + }, + } + + config := Config{ + FileFormat: constants.FileFormatPlainText, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + vpcUnmarshal, err := NewVPCFlowLogUnmarshaler(config, component.BuildInfo{}, zap.NewNop(), false) + require.NoError(t, err) + + data, err := os.ReadFile(filepath.Join(directory, "valid_vpc_flow_log_multi.log")) + require.NoError(t, err) + + // Flush after every log for testing purposes & set offset + streamer, err := vpcUnmarshal.NewLogsDecoder(bytes.NewReader(data), encoding.WithFlushItems(1), encoding.WithOffset(tt.offset)) + require.NoError(t, err) + + index := tt.index + for { + index++ + + var logs plog.Logs + logs, err = streamer.DecodeLogs() + if err != nil { + if errors.Is(err, io.EOF) { + break + } + + t.Errorf("failed to unmarshal log %d: %v", index, err) + } + + // To check or update offset, uncomment offset below + // fmt.Println(streamer.Offset()) + + var expectedLogs plog.Logs + expectedLogs, err = golden.ReadLogs(filepath.Join(directory, fmt.Sprintf(expectPattern, index))) + require.NoError(t, err) 
+ require.NoError(t, plogtest.CompareLogs(expectedLogs, logs, plogtest.IgnoreResourceLogsOrder())) + } + + // expect EOF after all logs are read + _, err = streamer.DecodeLogs() + require.ErrorIs(t, err, io.EOF) + }) + } +} + func TestHandleAddresses(t *testing.T) { t.Parallel() diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/missing_webaclid_log.json b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/missing_webaclid_log.json index 48d31d524cfb8..feef745e039ab 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/missing_webaclid_log.json +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/missing_webaclid_log.json @@ -1,100 +1 @@ -{ - "timestamp":1748208718574, - "formatVersion":1, - "terminatingRuleId":"Default_Action", - "terminatingRuleType":"REGULAR", - "action":"ALLOW", - "terminatingRuleMatchDetails":[ ], - "httpSourceName":"CF", - "httpSourceId":"E3DTJP8YLL6OBQ", - "ruleGroupList":[ ], - "rateBasedRuleList":[ - { - "rateBasedRuleId":"arn:aws:wafv2:us-east-1:123456789101_MANAGED:global/ipset/e3132a63-134d-4da9-a0c4-b166ddd6de6c_77ce5c35-14fa-4731-9710-86216d568f12_IPV4/77ce5c35-14fa-4731-9710-86216d568f12", - "rateBasedRuleName":"rule-1", - "limitKey":"IP", - "maxRateAllowed":10000, - "evaluationWindowSec":300, - "limitValue":"178.84.204.171" - } - ], - "nonTerminatingMatchingRules":[ ], - "requestHeadersInserted":null, - "responseCodeSent":null, - "httpRequest":{ - "clientIp":"178.84.204.171", - "country":"NL", - "headers":[ - { - "name":"host", - "value":"dsx1234tsajqz63.cloudfront.net" - }, - { - "name":"user-agent", - "value":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) Gecko/20100101 Firefox/138.0" - }, - { - "name":"accept", - "value":"image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5" - }, - { - "name":"accept-language", - "value":"en-US,en;q=0.5" - }, - { - "name":"accept-encoding", - 
"value":"gzip, deflate, br, zstd" - }, - { - "name":"referer", - "value":"https://dsx88tsajqz63.cloudfront.net/" - }, - { - "name":"sec-fetch-dest", - "value":"image" - }, - { - "name":"sec-fetch-mode", - "value":"no-cors" - }, - { - "name":"sec-fetch-site", - "value":"same-origin" - }, - { - "name":"dnt", - "value":"1" - }, - { - "name":"sec-gpc", - "value":"1" - }, - { - "name":"priority", - "value":"u=6" - }, - { - "name":"te", - "value":"trailers" - } - ], - "uri":"/favicon.ico", - "args":"", - "httpVersion":"HTTP/2.0", - "httpMethod":"GET", - "requestId":"n6LHLPqblIh_4qRsVj0940K9LxKyrkiUUE7lyMol1eTptabtlhHiXQ==", - "fragment":"", - "scheme":"https", - "host":"dsx88tsajqz63.cloudfront.net" - }, - "labels":[ - { - "name":"awswaf:clientip:geo:country:NL" - }, - { - "name":"awswaf:clientip:geo:region:NL-NH" - } - ], - "ja3Fingerprint":"6f7889b9fb1a62a9577e685c1fcfa919", - "ja4Fingerprint":"t13d1717h2_5b57614c22b0_3cbfd9057e0d" -} \ No newline at end of file +{"timestamp":1748208718574,"formatVersion":1,"terminatingRuleId":"Default_Action","terminatingRuleType":"REGULAR","action":"ALLOW","terminatingRuleMatchDetails":[],"httpSourceName":"CF","httpSourceId":"E3DTJP8YLL6OBQ","ruleGroupList":[],"rateBasedRuleList":[{"rateBasedRuleId":"arn:aws:wafv2:us-east-1:123456789101_MANAGED:global/ipset/e3132a63-134d-4da9-a0c4-b166ddd6de6c_77ce5c35-14fa-4731-9710-86216d568f12_IPV4/77ce5c35-14fa-4731-9710-86216d568f12","rateBasedRuleName":"rule-1","limitKey":"IP","maxRateAllowed":10000,"evaluationWindowSec":300,"limitValue":"178.84.204.171"}],"nonTerminatingMatchingRules":[],"requestHeadersInserted":null,"responseCodeSent":null,"httpRequest":{"clientIp":"178.84.204.171","country":"NL","headers":[{"name":"host","value":"dsx1234tsajqz63.cloudfront.net"},{"name":"user-agent","value":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) Gecko/20100101 
Firefox/138.0"},{"name":"accept","value":"image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5"},{"name":"accept-language","value":"en-US,en;q=0.5"},{"name":"accept-encoding","value":"gzip, deflate, br, zstd"},{"name":"referer","value":"https://dsx88tsajqz63.cloudfront.net/"},{"name":"sec-fetch-dest","value":"image"},{"name":"sec-fetch-mode","value":"no-cors"},{"name":"sec-fetch-site","value":"same-origin"},{"name":"dnt","value":"1"},{"name":"sec-gpc","value":"1"},{"name":"priority","value":"u=6"},{"name":"te","value":"trailers"}],"uri":"/favicon.ico","args":"","httpVersion":"HTTP/2.0","httpMethod":"GET","requestId":"n6LHLPqblIh_4qRsVj0940K9LxKyrkiUUE7lyMol1eTptabtlhHiXQ==","fragment":"","scheme":"https","host":"dsx88tsajqz63.cloudfront.net"},"labels":[{"name":"awswaf:clientip:geo:country:NL"},{"name":"awswaf:clientip:geo:region:NL-NH"}],"ja3Fingerprint":"6f7889b9fb1a62a9577e685c1fcfa919","ja4Fingerprint":"t13d1717h2_5b57614c22b0_3cbfd9057e0d"} \ No newline at end of file diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log.json b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log.json index 2b90e20093d0a..b5607a91c292e 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log.json +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log.json @@ -1,101 +1 @@ -{ - "timestamp":1748208718574, - "formatVersion":1, - "webaclId":"arn:aws:wafv2:us-east-1:123456789101:global/webacl/open-telemetry-waf/e3132a63-134d-4da9-a0c4-b166ddd6de6c", - "terminatingRuleId":"Default_Action", - "terminatingRuleType":"REGULAR", - "action":"ALLOW", - "terminatingRuleMatchDetails":[ ], - "httpSourceName":"CF", - "httpSourceId":"E3DTJP8YLL6OBQ", - "ruleGroupList":[ ], - "rateBasedRuleList":[ - { - 
"rateBasedRuleId":"arn:aws:wafv2:us-east-1:123456789101_MANAGED:global/ipset/e3132a63-134d-4da9-a0c4-b166ddd6de6c_77ce5c35-14fa-4731-9710-86216d568f12_IPV4/77ce5c35-14fa-4731-9710-86216d568f12", - "rateBasedRuleName":"rule-1", - "limitKey":"IP", - "maxRateAllowed":10000, - "evaluationWindowSec":300, - "limitValue":"178.84.204.171" - } - ], - "nonTerminatingMatchingRules":[ ], - "requestHeadersInserted":null, - "responseCodeSent":null, - "httpRequest":{ - "clientIp":"178.84.204.171", - "country":"NL", - "headers":[ - { - "name":"host", - "value":"dsx1234tsajqz63.cloudfront.net" - }, - { - "name":"user-agent", - "value":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) Gecko/20100101 Firefox/138.0" - }, - { - "name":"accept", - "value":"image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5" - }, - { - "name":"accept-language", - "value":"en-US,en;q=0.5" - }, - { - "name":"accept-encoding", - "value":"gzip, deflate, br, zstd" - }, - { - "name":"referer", - "value":"https://dsx88tsajqz63.cloudfront.net/" - }, - { - "name":"sec-fetch-dest", - "value":"image" - }, - { - "name":"sec-fetch-mode", - "value":"no-cors" - }, - { - "name":"sec-fetch-site", - "value":"same-origin" - }, - { - "name":"dnt", - "value":"1" - }, - { - "name":"sec-gpc", - "value":"1" - }, - { - "name":"priority", - "value":"u=6" - }, - { - "name":"te", - "value":"trailers" - } - ], - "uri":"/favicon.ico", - "args":"", - "httpVersion":"HTTP/2.0", - "httpMethod":"GET", - "requestId":"n6LHLPqblIh_4qRsVj0940K9LxKyrkiUUE7lyMol1eTptabtlhHiXQ==", - "fragment":"", - "scheme":"https", - "host":"dsx88tsajqz63.cloudfront.net" - }, - "labels":[ - { - "name":"awswaf:clientip:geo:country:NL" - }, - { - "name":"awswaf:clientip:geo:region:NL-NH" - } - ], - "ja3Fingerprint":"6f7889b9fb1a62a9577e685c1fcfa919", - "ja4Fingerprint":"t13d1717h2_5b57614c22b0_3cbfd9057e0d" -} \ No newline at end of file 
+{"timestamp":1748208718574,"formatVersion":1,"webaclId":"arn:aws:wafv2:us-east-1:123456789101:global/webacl/open-telemetry-waf/e3132a63-134d-4da9-a0c4-b166ddd6de6c","terminatingRuleId":"Default_Action","terminatingRuleType":"REGULAR","action":"ALLOW","terminatingRuleMatchDetails":[],"httpSourceName":"CF","httpSourceId":"E3DTJP8YLL6OBQ","ruleGroupList":[],"rateBasedRuleList":[{"rateBasedRuleId":"arn:aws:wafv2:us-east-1:123456789101_MANAGED:global/ipset/e3132a63-134d-4da9-a0c4-b166ddd6de6c_77ce5c35-14fa-4731-9710-86216d568f12_IPV4/77ce5c35-14fa-4731-9710-86216d568f12","rateBasedRuleName":"rule-1","limitKey":"IP","maxRateAllowed":10000,"evaluationWindowSec":300,"limitValue":"178.84.204.171"}],"nonTerminatingMatchingRules":[],"requestHeadersInserted":null,"responseCodeSent":null,"httpRequest":{"clientIp":"178.84.204.171","country":"NL","headers":[{"name":"host","value":"dsx1234tsajqz63.cloudfront.net"},{"name":"user-agent","value":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) Gecko/20100101 Firefox/138.0"},{"name":"accept","value":"image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5"},{"name":"accept-language","value":"en-US,en;q=0.5"},{"name":"accept-encoding","value":"gzip, deflate, br, zstd"},{"name":"referer","value":"https://dsx88tsajqz63.cloudfront.net/"},{"name":"sec-fetch-dest","value":"image"},{"name":"sec-fetch-mode","value":"no-cors"},{"name":"sec-fetch-site","value":"same-origin"},{"name":"dnt","value":"1"},{"name":"sec-gpc","value":"1"},{"name":"priority","value":"u=6"},{"name":"te","value":"trailers"}],"uri":"/favicon.ico","args":"","httpVersion":"HTTP/2.0","httpMethod":"GET","requestId":"n6LHLPqblIh_4qRsVj0940K9LxKyrkiUUE7lyMol1eTptabtlhHiXQ==","fragment":"","scheme":"https","host":"dsx88tsajqz63.cloudfront.net"},"labels":[{"name":"awswaf:clientip:geo:country:NL"},{"name":"awswaf:clientip:geo:region:NL-NH"}],"ja3Fingerprint":"6f7889b9fb1a62a9577e685c1fcfa919","ja4Fingerprint":"t13d1717h2_5b57614c22b0_3cbfd9057e0d"} \ No newline at 
end of file diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi.json b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi.json new file mode 100644 index 0000000000000..3db1d96732c18 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi.json @@ -0,0 +1,2 @@ +{"timestamp":1748208718574,"formatVersion":1,"webaclId":"arn:aws:wafv2:us-east-1:123456789101:global/webacl/open-telemetry-waf/e3132a63-134d-4da9-a0c4-b166ddd6de6c","terminatingRuleId":"Default_Action","terminatingRuleType":"REGULAR","action":"ALLOW","terminatingRuleMatchDetails":[],"httpSourceName":"CF","httpSourceId":"E3DTJP8YLL6OBQ","ruleGroupList":[],"rateBasedRuleList":[{"rateBasedRuleId":"arn:aws:wafv2:us-east-1:123456789101_MANAGED:global/ipset/e3132a63-134d-4da9-a0c4-b166ddd6de6c_77ce5c35-14fa-4731-9710-86216d568f12_IPV4/77ce5c35-14fa-4731-9710-86216d568f12","rateBasedRuleName":"rule-1","limitKey":"IP","maxRateAllowed":10000,"evaluationWindowSec":300,"limitValue":"178.84.204.171"}],"nonTerminatingMatchingRules":[],"requestHeadersInserted":null,"responseCodeSent":null,"httpRequest":{"clientIp":"178.84.204.171","country":"NL","headers":[{"name":"host","value":"dsx1234tsajqz63.cloudfront.net"},{"name":"user-agent","value":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) Gecko/20100101 Firefox/138.0"},{"name":"accept","value":"image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5"},{"name":"accept-language","value":"en-US,en;q=0.5"},{"name":"accept-encoding","value":"gzip, deflate, br, 
zstd"},{"name":"referer","value":"https://dsx88tsajqz63.cloudfront.net/"},{"name":"sec-fetch-dest","value":"image"},{"name":"sec-fetch-mode","value":"no-cors"},{"name":"sec-fetch-site","value":"same-origin"},{"name":"dnt","value":"1"},{"name":"sec-gpc","value":"1"},{"name":"priority","value":"u=6"},{"name":"te","value":"trailers"}],"uri":"/favicon.ico","args":"","httpVersion":"HTTP/2.0","httpMethod":"GET","requestId":"n6LHLPqblIh_4qRsVj0940K9LxKyrkiUUE7lyMol1eTptabtlhHiXQ==","fragment":"","scheme":"https","host":"dsx88tsajqz63.cloudfront.net"},"labels":[{"name":"awswaf:clientip:geo:country:NL"},{"name":"awswaf:clientip:geo:region:NL-NH"}],"ja3Fingerprint":"6f7889b9fb1a62a9577e685c1fcfa919","ja4Fingerprint":"t13d1717h2_5b57614c22b0_3cbfd9057e0d"} +{"timestamp":1683355579981,"formatVersion":1,"webaclId":"arn:aws:wafv2:us-east-1:123456789101:global/webacl/open-telemetry-waf/e3132a63-134d-4da9-a0c4-b166ddd6de6c","terminatingRuleId":"RateBasedRule","terminatingRuleType":"RATE_BASED","action":"BLOCK","terminatingRuleMatchDetails":[],"httpSourceName":"APIGW","httpSourceId":"EXAMPLE11:rjvegx5guh:CanaryTest","ruleGroupList":[],"rateBasedRuleList":[{"rateBasedRuleId":"arn:aws:wafv2:us-east-1:123456789101_MANAGED:global/ipset/e3132a63-134d-4da9-a0c4-b166ddd6de6c_77ce5c35-14fa-4731-9710-86216d568f12_IPV4/77ce5c35-14fa-4731-9710-86216d568f12","rateBasedRuleName":"RateBasedRule","limitKey":"CUSTOMKEYS","maxRateAllowed":100,"evaluationWindowSec":"120","customValues":[{"key":"HEADER","name":"dogname","value":"ella"}]}],"nonTerminatingMatchingRules":[],"requestHeadersInserted":null,"responseCodeSent":null,"httpRequest":{"clientIp":"52.46.82.45","country":"FR","headers":[{"name":"X-Forwarded-For","value":"52.46.82.45"},{"name":"X-Forwarded-Proto","value":"https"},{"name":"X-Forwarded-Port","value":"443"},{"name":"Host","value":"rjvegx5guh.execute-api.eu-west-3.amazonaws.com"},{"name":"X-Amzn-Trace-Id","value":"Root=1-645566cf-7cb058b04d9bb3ee01dc4036"},{"name":"dogname","value":"ella
"},{"name":"User-Agent","value":"RateBasedRuleTestKoipOneKeyModulePV2"},{"name":"Accept-Encoding","value":"gzip,deflate"}],"uri":"/CanaryTest","args":"","httpVersion":"HTTP/1.1","httpMethod":"GET","requestId":"Ed0AiHF_CGYF-DA="}} diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi_1.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi_1.yaml new file mode 100644 index 0000000000000..c29e6ac631be3 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi_1.yaml @@ -0,0 +1,113 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + stringValue: aws + - key: cloud.region + value: + stringValue: us-east-1 + - key: cloud.account.id + value: + stringValue: "123456789101" + - key: cloud.resource_id + value: + stringValue: arn:aws:wafv2:us-east-1:123456789101:global/webacl/open-telemetry-waf/e3132a63-134d-4da9-a0c4-b166ddd6de6c + scopeLogs: + - logRecords: + - attributes: + - key: network.protocol.name + value: + stringValue: http + - key: network.protocol.version + value: + stringValue: "2.0" + - key: aws.waf.terminating_rule.type + value: + stringValue: REGULAR + - key: aws.waf.terminating_rule.id + value: + stringValue: Default_Action + - key: aws.waf.action + value: + stringValue: ALLOW + - key: aws.waf.source.id + value: + stringValue: E3DTJP8YLL6OBQ + - key: aws.waf.source.name + value: + stringValue: CF + - key: http.request.header.host + value: + stringValue: dsx1234tsajqz63.cloudfront.net + - key: http.request.header.user-agent + value: + stringValue: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) Gecko/20100101 Firefox/138.0 + - key: http.request.header.accept + value: + stringValue: image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5 + - key: http.request.header.accept-language + value: + stringValue: en-US,en;q=0.5 + - key: http.request.header.accept-encoding 
+ value: + stringValue: gzip, deflate, br, zstd + - key: http.request.header.referer + value: + stringValue: https://dsx88tsajqz63.cloudfront.net/ + - key: http.request.header.sec-fetch-dest + value: + stringValue: image + - key: http.request.header.sec-fetch-mode + value: + stringValue: no-cors + - key: http.request.header.sec-fetch-site + value: + stringValue: same-origin + - key: http.request.header.dnt + value: + stringValue: "1" + - key: http.request.header.sec-gpc + value: + stringValue: "1" + - key: http.request.header.priority + value: + stringValue: u=6 + - key: http.request.header.te + value: + stringValue: trailers + - key: client.address + value: + stringValue: 178.84.204.171 + - key: server.address + value: + stringValue: dsx88tsajqz63.cloudfront.net + - key: url.path + value: + stringValue: /favicon.ico + - key: http.request.method + value: + stringValue: GET + - key: aws.request_id + value: + stringValue: n6LHLPqblIh_4qRsVj0940K9LxKyrkiUUE7lyMol1eTptabtlhHiXQ== + - key: url.scheme + value: + stringValue: https + - key: geo.country.iso_code + value: + stringValue: NL + - key: tls.client.ja3 + value: + stringValue: 6f7889b9fb1a62a9577e685c1fcfa919 + - key: tls.client.ja4 + value: + stringValue: t13d1717h2_5b57614c22b0_3cbfd9057e0d + body: {} + timeUnixNano: "1748208718574000000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.waf + name: github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi_2.yaml b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi_2.yaml new file mode 100644 index 0000000000000..2f36813ea7e02 --- /dev/null +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/testdata/valid_log_multi_2.yaml @@ -0,0 +1,86 @@ +resourceLogs: + - resource: + attributes: + - key: cloud.provider + value: + 
stringValue: aws + - key: cloud.region + value: + stringValue: us-east-1 + - key: cloud.account.id + value: + stringValue: "123456789101" + - key: cloud.resource_id + value: + stringValue: arn:aws:wafv2:us-east-1:123456789101:global/webacl/open-telemetry-waf/e3132a63-134d-4da9-a0c4-b166ddd6de6c + scopeLogs: + - logRecords: + - attributes: + - key: network.protocol.name + value: + stringValue: http + - key: network.protocol.version + value: + stringValue: "1.1" + - key: aws.waf.terminating_rule.type + value: + stringValue: RATE_BASED + - key: aws.waf.terminating_rule.id + value: + stringValue: RateBasedRule + - key: aws.waf.action + value: + stringValue: BLOCK + - key: aws.waf.source.id + value: + stringValue: EXAMPLE11:rjvegx5guh:CanaryTest + - key: aws.waf.source.name + value: + stringValue: APIGW + - key: http.request.header.X-Forwarded-For + value: + stringValue: 52.46.82.45 + - key: http.request.header.X-Forwarded-Proto + value: + stringValue: https + - key: http.request.header.X-Forwarded-Port + value: + stringValue: "443" + - key: http.request.header.Host + value: + stringValue: rjvegx5guh.execute-api.eu-west-3.amazonaws.com + - key: http.request.header.X-Amzn-Trace-Id + value: + stringValue: Root=1-645566cf-7cb058b04d9bb3ee01dc4036 + - key: http.request.header.dogname + value: + stringValue: ella + - key: http.request.header.User-Agent + value: + stringValue: RateBasedRuleTestKoipOneKeyModulePV2 + - key: http.request.header.Accept-Encoding + value: + stringValue: gzip,deflate + - key: client.address + value: + stringValue: 52.46.82.45 + - key: url.path + value: + stringValue: /CanaryTest + - key: http.request.method + value: + stringValue: GET + - key: aws.request_id + value: + stringValue: Ed0AiHF_CGYF-DA= + - key: geo.country.iso_code + value: + stringValue: FR + body: {} + timeUnixNano: "1683355579981000000" + scope: + attributes: + - key: encoding.format + value: + stringValue: aws.waf + name: 
github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/unmarshaler.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/unmarshaler.go index 1c52666b7b690..135e6b94ccd5b 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/unmarshaler.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/unmarshaler.go @@ -4,7 +4,6 @@ package waf // import "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf" import ( - "bufio" "errors" "fmt" "io" @@ -16,21 +15,13 @@ import ( "go.opentelemetry.io/collector/pdata/plog" conventions "go.opentelemetry.io/otel/semconv/v1.38.0" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/constants" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/metadata" "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding/awslogsencodingextension/internal/unmarshaler" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/xstreamencoding" ) -type wafLogUnmarshaler struct { - buildInfo component.BuildInfo -} - -func NewWAFLogUnmarshaler(buildInfo component.BuildInfo) unmarshaler.AWSUnmarshaler { - return &wafLogUnmarshaler{ - buildInfo: buildInfo, - } -} - // See log fields: https://docs.aws.amazon.com/waf/latest/developerguide/logging-fields.html. 
type wafLog struct { Timestamp int64 `json:"timestamp"` @@ -61,83 +52,110 @@ type wafLog struct { Ja4Fingerprint string `json:"ja4Fingerprint"` } -func (w *wafLogUnmarshaler) UnmarshalAWSLogs(reader io.Reader) (plog.Logs, error) { - logs := plog.NewLogs() - - resourceLogs := logs.ResourceLogs().AppendEmpty() - resourceLogs.Resource().Attributes().PutStr( - string(conventions.CloudProviderKey), - conventions.CloudProviderAWS.Value.AsString(), - ) - - scopeLogs := resourceLogs.ScopeLogs().AppendEmpty() - scopeLogs.Scope().SetName(metadata.ScopeName) - scopeLogs.Scope().SetVersion(w.buildInfo.Version) - scopeLogs.Scope().Attributes().PutStr(constants.FormatIdentificationTag, "aws."+constants.FormatWAFLog) - - scanner := bufio.NewScanner(reader) - webACLID := "" - for scanner.Scan() { - logLine := scanner.Bytes() - - var log wafLog - if err := gojson.Unmarshal(logLine, &log); err != nil { - return plog.Logs{}, fmt.Errorf("failed to unmarshal WAF log: %w", err) - } - if log.WebACLID == "" { - return plog.Logs{}, errors.New("invalid WAF log: empty webaclId field") - } - if webACLID != "" && log.WebACLID != webACLID { - return plog.Logs{}, fmt.Errorf( - "unexpected: new webaclId %q is different than previous one %q", - webACLID, - log.WebACLID, - ) - } - webACLID = log.WebACLID +type wafLogUnmarshaler struct { + buildInfo component.BuildInfo +} - record := scopeLogs.LogRecords().AppendEmpty() - if err := w.addWAFLog(log, record); err != nil { - return plog.Logs{}, err - } +func NewWAFLogUnmarshaler(buildInfo component.BuildInfo) unmarshaler.AWSUnmarshaler { + return &wafLogUnmarshaler{ + buildInfo: buildInfo, + } +} + +func (w *wafLogUnmarshaler) UnmarshalAWSLogs(reader io.Reader) (plog.Logs, error) { + // Decode as a stream but flush all at once using flush options + streamUnmarshaler, err := w.NewLogsDecoder(reader, encoding.WithFlushItems(0), encoding.WithFlushBytes(0)) + if err != nil { + return plog.Logs{}, err } - if err := setResourceAttributes(resourceLogs, 
webACLID); err != nil { - return plog.Logs{}, fmt.Errorf("failed to get resource attributes: %w", err) + logs, err := streamUnmarshaler.DecodeLogs() + if err != nil { + //nolint:errorlint + if err == io.EOF { + // EOF indicates no logs were found, return any logs that's available + return logs, nil + } + return plog.Logs{}, err } return logs, nil } -// setResourceAttributes based on the web ACL ID -func setResourceAttributes(resourceLogs plog.ResourceLogs, webACLID string) error { - expectedFormat := "arn:aws:wafv2:::/webacl//" - value, remaining, _ := strings.Cut(webACLID, "arn:aws:wafv2:") - if value != "" { - return fmt.Errorf("webaclId %q does not have expected prefix %q", webACLID, "arn:aws:wafv2:") - } - if remaining == "" { - return fmt.Errorf("webaclId %q contains no data after expected prefix %q", webACLID, "arn:aws:wafv2:") +// NewLogsDecoder returns a LogsDecoder that processes AWS WAF logs from the provided reader. +// Parses JSON-formatted logs containing WAF events (web ACL evaluations, actions, HTTP request details). +// Supports offset-based streaming; offset tracks bytes processed +func (w *wafLogUnmarshaler) NewLogsDecoder(reader io.Reader, options ...encoding.DecoderOption) (encoding.LogsDecoder, error) { + scannerHelper, err := xstreamencoding.NewScannerHelper(reader, options...) 
+ if err != nil { + return nil, err } - value, remaining, _ = strings.Cut(remaining, ":") - if value == "" { - return fmt.Errorf("could not find region in webaclId %q", webACLID) - } - resourceLogs.Resource().Attributes().PutStr(string(conventions.CloudRegionKey), value) + var sharedWebACLID string + + decodeF := func() (plog.Logs, error) { + logs := plog.NewLogs() + + resourceLogs := logs.ResourceLogs().AppendEmpty() + resourceLogs.Resource().Attributes().PutStr( + string(conventions.CloudProviderKey), + conventions.CloudProviderAWS.Value.AsString(), + ) + + scopeLogs := resourceLogs.ScopeLogs().AppendEmpty() + scopeLogs.Scope().SetName(metadata.ScopeName) + scopeLogs.Scope().SetVersion(w.buildInfo.Version) + scopeLogs.Scope().Attributes().PutStr(constants.FormatIdentificationTag, "aws."+constants.FormatWAFLog) + + for { + logLine, flush, err := scannerHelper.ScanBytes() + if err != nil { + if !errors.Is(err, io.EOF) { + return plog.Logs{}, fmt.Errorf("error reading WAF logs from stream: %w", err) + } + + if len(logLine) == 0 { + break + } + } + + var log wafLog + if err := gojson.Unmarshal(logLine, &log); err != nil { + return plog.Logs{}, fmt.Errorf("failed to unmarshal WAF log: %w", err) + } + if log.WebACLID == "" { + return plog.Logs{}, errors.New("invalid WAF log: empty webaclId field") + } + if sharedWebACLID != "" && log.WebACLID != sharedWebACLID { + return plog.Logs{}, fmt.Errorf( + "unexpected: new webaclId %q is different than previous one %q", + log.WebACLID, + sharedWebACLID, + ) + } + sharedWebACLID = log.WebACLID + record := scopeLogs.LogRecords().AppendEmpty() + if err := w.addWAFLog(log, record); err != nil { + return plog.Logs{}, err + } + + if flush { + break + } + } - value, remaining, _ = strings.Cut(remaining, ":") - if value == "" { - return fmt.Errorf("could not find account in webaclId %q", webACLID) - } - resourceLogs.Resource().Attributes().PutStr(string(conventions.CloudAccountIDKey), value) + if err :=
setResourceAttributes(resourceLogs, sharedWebACLID); err != nil { + return plog.Logs{}, fmt.Errorf("failed to get resource attributes: %w", err) + } - if remaining == "" { - return fmt.Errorf("webaclId %q does not have expected format %q", webACLID, expectedFormat) + if scopeLogs.LogRecords().Len() == 0 { + return logs, io.EOF + } + + return logs, nil } - resourceLogs.Resource().Attributes().PutStr(string(conventions.CloudResourceIDKey), webACLID) - return nil + return xstreamencoding.NewLogsDecoderAdapter(decodeF, scannerHelper.Offset), nil } func (*wafLogUnmarshaler) addWAFLog(log wafLog, record plog.LogRecord) error { @@ -193,3 +211,34 @@ func (*wafLogUnmarshaler) addWAFLog(log wafLog, record plog.LogRecord) error { return nil } + +// setResourceAttributes based on the web ACL ID +func setResourceAttributes(resourceLogs plog.ResourceLogs, webACLID string) error { + expectedFormat := "arn:aws:wafv2:::/webacl//" + value, remaining, _ := strings.Cut(webACLID, "arn:aws:wafv2:") + if value != "" { + return fmt.Errorf("webaclId %q does not have expected prefix %q", webACLID, "arn:aws:wafv2:") + } + if remaining == "" { + return fmt.Errorf("webaclId %q contains no data after expected prefix %q", webACLID, "arn:aws:wafv2:") + } + + value, remaining, _ = strings.Cut(remaining, ":") + if value == "" { + return fmt.Errorf("could not find region in webaclId %q", webACLID) + } + resourceLogs.Resource().Attributes().PutStr(string(conventions.CloudRegionKey), value) + + value, remaining, _ = strings.Cut(remaining, ":") + if value == "" { + return fmt.Errorf("could not find account in webaclId %q", webACLID) + } + resourceLogs.Resource().Attributes().PutStr(string(conventions.CloudAccountIDKey), value) + + if remaining == "" { + return fmt.Errorf("webaclId %q does not have expected format %q", webACLID, expectedFormat) + } + + resourceLogs.Resource().Attributes().PutStr(string(conventions.CloudResourceIDKey), webACLID) + return nil +} diff --git 
a/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/unmarshaler_test.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/unmarshaler_test.go index c2aba338cc6ff..0db78f47421ed 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/unmarshaler_test.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/waf/unmarshaler_test.go @@ -5,17 +5,19 @@ package waf import ( "bytes" + "errors" + "fmt" "io" "os" "path/filepath" "testing" - gojson "github.com/goccy/go-json" "github.com/klauspost/compress/gzip" "github.com/stretchr/testify/require" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/pdata/plog" + "github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest/plogtest" ) @@ -38,10 +40,8 @@ func compressToGZIPReader(t *testing.T, buf []byte) io.Reader { func readAndCompressLogFile(t *testing.T, dir, file string) io.Reader { data, err := os.ReadFile(filepath.Join(dir, file)) require.NoError(t, err) - compacted := bytes.NewBuffer([]byte{}) - err = gojson.Compact(compacted, data) require.NoError(t, err) - return compressToGZIPReader(t, compacted.Bytes()) + return compressToGZIPReader(t, data) } func TestUnmarshalLogs(t *testing.T) { @@ -85,6 +85,66 @@ func TestUnmarshalLogs(t *testing.T) { } } +func TestNewLogsDecoder(t *testing.T) { + directory := "testdata" + expectPattern := "valid_log_multi_%d.yaml" + + tests := []struct { + name string + offset int64 + index int + }{ + { + name: "Normal streaming", + offset: 0, + index: 0, + }, + { + name: "Stream with offset", + offset: 1983, // skip first record + index: 1, // start from first index + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + input := readAndCompressLogFile(t, directory, "valid_log_multi.json") + + wafUnmarshaler 
:= NewWAFLogUnmarshaler(component.BuildInfo{}) + // Flush after every log for testing purposes & set offset + streamer, err := wafUnmarshaler.NewLogsDecoder(input, encoding.WithFlushItems(1), encoding.WithOffset(tt.offset)) + require.NoError(t, err) + + index := tt.index + for { + index++ + + var logs plog.Logs + logs, err = streamer.DecodeLogs() + if err != nil { + if errors.Is(err, io.EOF) { + break + } + + t.Errorf("failed to unmarshal log %d: %v", index, err) + } + + // To check or update offset, uncomment offset below + // fmt.Println(streamer.Offset()) + + var expectedLogs plog.Logs + expectedLogs, err = golden.ReadLogs(filepath.Join(directory, fmt.Sprintf(expectPattern, index))) + require.NoError(t, err) + require.NoError(t, plogtest.CompareLogs(expectedLogs, logs, plogtest.IgnoreResourceLogsOrder())) + } + + // expect EOF after all logs are read + _, err = streamer.DecodeLogs() + require.ErrorIs(t, err, io.EOF) + }) + } +} + func TestSetKeyAttributes(t *testing.T) { t.Parallel() diff --git a/internal/tidylist/tidylist.txt b/internal/tidylist/tidylist.txt index 1037aa3ac2d23..242ba397093db 100644 --- a/internal/tidylist/tidylist.txt +++ b/internal/tidylist/tidylist.txt @@ -149,6 +149,7 @@ extension/cgroupruntimeextension extension/datadogextension extension/encoding/avrologencodingextension extension/encoding/awscloudwatchmetricstreamsencodingextension +pkg/xstreamencoding extension/encoding/awslogsencodingextension extension/encoding/azureencodingextension extension/encoding/googlecloudlogentryencodingextension @@ -156,7 +157,6 @@ extension/encoding/jaegerencodingextension extension/encoding/jsonlogencodingextension pkg/translator/skywalking extension/encoding/skywalkingencodingextension -pkg/xstreamencoding extension/encoding/textencodingextension extension/encoding/zipkinencodingextension extension/googleclientauthextension From 9497a7b7c9321b19afd16254489c386f993f041e Mon Sep 17 00:00:00 2001 From: Kavindu Dodanduwa Date: Tue, 17 Feb 2026 11:57:14 -0800 
Subject: [PATCH 2/2] review changes: remove non api bound method Signed-off-by: Kavindu Dodanduwa --- .../encoding/awslogsencodingextension/go.mod | 3 + .../subscription-filter/unmarshaler.go | 32 ++++++----- .../vpc-flow-log/benchmark_test.go | 2 +- .../unmarshaler/vpc-flow-log/unmarshaler.go | 56 ------------------- 4 files changed, 22 insertions(+), 71 deletions(-) diff --git a/extension/encoding/awslogsencodingextension/go.mod b/extension/encoding/awslogsencodingextension/go.mod index f0f000ca198f2..89fa6a1165e61 100644 --- a/extension/encoding/awslogsencodingextension/go.mod +++ b/extension/encoding/awslogsencodingextension/go.mod @@ -9,6 +9,7 @@ require ( github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding v0.146.0 github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden v0.146.0 github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest v0.146.0 + github.com/open-telemetry/opentelemetry-collector-contrib/pkg/xstreamencoding v0.0.0-20260218150750-8f6cb4673d5f github.com/stretchr/testify v1.11.1 go.opentelemetry.io/collector/component v1.52.0 go.opentelemetry.io/collector/component/componenttest v0.146.1 @@ -62,3 +63,5 @@ replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest => ../../../pkg/pdatatest replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden => ../../../pkg/golden + +replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/xstreamencoding => ../../../pkg/xstreamencoding diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/subscription-filter/unmarshaler.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/subscription-filter/unmarshaler.go index 42442363499ee..e76d0e2dc7942 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/subscription-filter/unmarshaler.go +++ 
b/extension/encoding/awslogsencodingextension/internal/unmarshaler/subscription-filter/unmarshaler.go @@ -92,35 +92,39 @@ func (f *subscriptionFilterUnmarshaler) NewLogsDecoder(reader io.Reader, _ ...en var isEOF bool return xstreamencoding.NewLogsDecoderAdapter( func() (plog.Logs, error) { + logs := plog.NewLogs() + resourceLogsByKey := make(map[resourceGroupKey]plog.LogRecordSlice) + if isEOF { - return plog.NewLogs(), io.EOF + return logs, io.EOF } - var cwLog cloudwatchLogsData decoder := gojson.NewDecoder(reader) - if err := decoder.Decode(&cwLog); err != nil { - return plog.Logs{}, fmt.Errorf("failed to decode decompressed reader: %w", err) - } + for decoder.More() { + var cwLog cloudwatchLogsData + if err := decoder.Decode(&cwLog); err != nil { + return plog.Logs{}, fmt.Errorf("failed to decode decompressed reader: %w", err) + } - if cwLog.MessageType == "CONTROL_MESSAGE" { - return plog.NewLogs(), nil - } + if cwLog.MessageType == "CONTROL_MESSAGE" { + continue + } + + if err := validateLog(cwLog); err != nil { + return plog.Logs{}, fmt.Errorf("invalid cloudwatch log: %w", err) + } - if err := validateLog(cwLog); err != nil { - return plog.Logs{}, fmt.Errorf("invalid cloudwatch log: %w", err) + f.appendLogs(logs, resourceLogsByKey, cwLog) } isEOF = true - return f.createLogs(cwLog), nil + return logs, nil }, func() int64 { return int64(0) }, ), nil } - return logs, nil -} - // appendLogs appends log records from cwLog into the given plog.Logs, reusing // existing ResourceLogs entries tracked by resourceLogsByKey when possible. 
// Events are grouped by their extracted fields (account ID + region) and diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/benchmark_test.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/benchmark_test.go index 52f96c01dc2c1..238fc79063c13 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/benchmark_test.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/benchmark_test.go @@ -61,7 +61,7 @@ func BenchmarkUnmarshalUnmarshalPlainTextLogs(b *testing.B) { b.Run(name, func(b *testing.B) { b.ReportAllocs() for b.Loop() { - _, err := u.unmarshalPlainTextLogs(bytes.NewReader(data)) + _, err := u.UnmarshalAWSLogs(bytes.NewReader(data)) require.NoError(b, err) } }) diff --git a/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler.go b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler.go index 0e6e32d6ce089..e91581cb02a7c 100644 --- a/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler.go +++ b/extension/encoding/awslogsencodingextension/internal/unmarshaler/vpc-flow-log/unmarshaler.go @@ -231,62 +231,6 @@ func (v *vpcFlowLogUnmarshaler) NewLogsDecoder(reader io.Reader, options ...enco return xstreamencoding.NewLogsDecoderAdapter(decodeF, offsetF), nil } -func (v *vpcFlowLogUnmarshaler) unmarshalPlainTextLogs(reader io.Reader) (plog.Logs, error) { - // use buffered reader for efficiency and to avoid any size restrictions - bufReader := bufio.NewReader(reader) - - b, err := bufReader.ReadByte() - if err != nil { - return plog.Logs{}, fmt.Errorf("failed to read first byte: %w", err) - } - - err = bufReader.UnreadByte() - if err != nil { - return plog.Logs{}, fmt.Errorf("failed to unread first byte: %w", err) - } - - if b == '{' { - // Dealing with a JSON logs, so check for CW bound trigger - return 
v.fromCloudWatch(v.cfg.parsedFormat, bufReader) - } - - // This is S3 bound data, so use fromS3 - return v.fromS3(bufReader) -} - -// fromS3 expects VPC logs from S3 in plain text format -func (v *vpcFlowLogUnmarshaler) fromS3(reader *bufio.Reader) (plog.Logs, error) { - var err error - line, err := reader.ReadString('\n') - if err != nil { - return plog.Logs{}, fmt.Errorf("failed to read first line of VPC logs from S3: %w", err) - } - - fields := strings.Fields(line) - logs, resourceLogs, scopeLogs := v.createLogs() - for { - line, err = reader.ReadString('\n') - if err != nil { - if errors.Is(err, io.EOF) { - // Reached the end of the file, add the last line and exit - // EOF is ignored as we have already processed all log lines - if addLogErr := v.addToLogs(resourceLogs, scopeLogs, fields, strings.TrimSpace(line)); addLogErr != nil { - return plog.Logs{}, addLogErr - } - break - } - - return plog.Logs{}, fmt.Errorf("error reading VPC logs: %w", err) - } - - if err := v.addToLogs(resourceLogs, scopeLogs, fields, strings.TrimSpace(line)); err != nil { - return plog.Logs{}, err - } - } - - return logs, nil -} - // fromCloudWatch expects VPC logs from CloudWatch Logs subscription filter trigger func (v *vpcFlowLogUnmarshaler) fromCloudWatch(fields []string, reader *bufio.Reader) (plog.Logs, error) { var cwLog events.CloudwatchLogsData