Commit 6cf5aad

add drain processor for log template annotation [#47235]
Adds processor/drain, which applies the Drain log clustering algorithm to annotate log records with a derived template string and numeric cluster ID:

- log.record.template (e.g. "user <*> logged in from <*>")
- log.record.template.id (numeric cluster ID)

Key features:

- Configurable parse tree depth, similarity threshold, max clusters (LRU)
- Optional body_field extraction for structured map bodies
- Seeding via seed_templates or seed_logs for stable IDs across restarts
- passthrough warmup mode (default) and buffer warmup mode

Assisted-by: Claude Sonnet 4.6
1 parent 3783e9b commit 6cf5aad

22 files changed: +1716 additions, 0 deletions

.chloggen/config.yaml

Lines changed: 1 addition & 0 deletions

```diff
@@ -188,6 +188,7 @@ components:
 - processor/cumulativetodelta
 - processor/deltatocumulative
 - processor/deltatorate
+- processor/drain
 - processor/digitaloceandetector
 - processor/dnslookup
 - processor/dynatracedetector
```
Lines changed: 36 additions & 0 deletions

```yaml
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: new_component

# The name of the component, or a single word describing the area of concern, (e.g. receiver/filelog)
component: processor/drain

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Add drain processor that applies the Drain log clustering algorithm to annotate log records with a derived template string and cluster ID.

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [47235]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
  The processor sets `log.record.template` (e.g. `"user <*> logged in from <*>"`) and
  `log.record.template.id` on each log record. Downstream processors such as the filter
  processor can then act on those attributes to, for example, drop entire classes of
  noisy logs by template string.

  Key features:
  - Configurable Drain parse tree parameters (depth, similarity threshold, max clusters with LRU eviction)
  - Optional seeding via known template strings or example log lines for stable IDs across restarts
  - `passthrough` warmup mode (default) and `buffer` warmup mode that holds records until the tree has stabilised

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: [user]
```

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions

```diff
@@ -199,6 +199,7 @@ processor/cumulativetodeltaprocessor/ @open-telemetry
 processor/deltatocumulativeprocessor/ @open-telemetry/collector-contrib-approvers @RichieSams
 processor/deltatorateprocessor/ @open-telemetry/collector-contrib-approvers @Aneurysm9
 processor/dnslookupprocessor/ @open-telemetry/collector-contrib-approvers @andrzej-stencel @kaisecheng @edmocosta
+processor/drainprocessor/ @open-telemetry/collector-contrib-approvers @MikeGoldsmith
 processor/filterprocessor/ @open-telemetry/collector-contrib-approvers @TylerHelmuth @evan-bradley @edmocosta @bogdandrutu
 processor/geoipprocessor/ @open-telemetry/collector-contrib-approvers @andrzej-stencel @michalpristas @rogercoll
 processor/groupbyattrsprocessor/ @open-telemetry/collector-contrib-approvers @rnishtala-sumo @amdprophet
```

cmd/otelcontribcol/builder-config.yaml

Lines changed: 1 addition & 0 deletions

```diff
@@ -119,6 +119,7 @@ processors:
 - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/coralogixprocessor v0.148.0
 - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatocumulativeprocessor v0.148.0
 - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.148.0
+- gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/drainprocessor v0.148.0
 - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.148.0
 - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/geoipprocessor v0.148.0
 - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.148.0
```

internal/tidylist/tidylist.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -215,6 +215,7 @@ processor/coralogixprocessor
 processor/cumulativetodeltaprocessor
 processor/deltatorateprocessor
 processor/dnslookupprocessor
+processor/drainprocessor
 processor/filterprocessor
 processor/geoipprocessor
 processor/groupbyattrsprocessor
```

processor/drainprocessor/Makefile

Lines changed: 1 addition & 0 deletions

```makefile
include ../../Makefile.Common
```

processor/drainprocessor/README.md

Lines changed: 201 additions & 0 deletions

# Drain Processor

| Status | |
| ------------- |-----------|
| Stability | [development]: logs |
| Distributions | [contrib] |
| Issues | [![Open issues](https://img.shields.io/github/issues-search/open-telemetry/opentelemetry-collector-contrib?query=is%3Aissue%20is%3Aopen%20label%3Aprocessor%2Fdrain%20&label=open&color=orange&logo=opentelemetry)](https://github.com/open-telemetry/opentelemetry-collector-contrib/issues?q=is%3Aopen+is%3Aissue+label%3Aprocessor%2Fdrain) [![Closed issues](https://img.shields.io/github/issues-search/open-telemetry/opentelemetry-collector-contrib?query=is%3Aissue%20is%3Aclosed%20label%3Aprocessor%2Fdrain%20&label=closed&color=blue&logo=opentelemetry)](https://github.com/open-telemetry/opentelemetry-collector-contrib/issues?q=is%3Aclosed+is%3Aissue+label%3Aprocessor%2Fdrain) |

[development]: https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/component-stability.md#development
[contrib]: https://github.com/open-telemetry/opentelemetry-collector-releases/tree/main/distributions/otelcol-contrib

The drain processor applies the [Drain log clustering algorithm](https://jiemingzhu.github.io/pub/pjhe_icws2017.pdf) to log records as they pass through the pipeline. For each record it derives a template string (e.g. `"user <*> logged in from <*>"`) and a numeric cluster ID, then attaches both as attributes on the record.

This processor **annotates**; it does not filter. Use the [filter processor](../filterprocessor/README.md) downstream to act on the `log.record.template` attribute, for example to drop entire classes of noisy logs by pattern.

## How it works

Drain builds a parse tree from the token structure of log lines. Lines with similar structure are grouped into a **cluster**, and a **template** is derived by replacing variable tokens with `<*>` wildcards. As more logs arrive, the templates become more accurate and stable.

Template IDs are numeric and local to each collector instance. They are not stable across restarts unless the tree is pre-seeded with known templates (see [Seeding](#seeding)). Use the template **string** (not the ID) for persistent filtering rules.
## Configuration

```yaml
processors:
  drain:
    # Drain parse tree parameters
    log_cluster_depth: 4      # default: 4 (minimum: 3)
    sim_threshold: 0.4        # default: 0.4, range [0.0, 1.0]
    max_children: 100         # default: 100
    max_clusters: 0           # default: 0 (unlimited, LRU eviction when > 0)
    extra_delimiters: []      # default: [] (extra token delimiters beyond whitespace)

    # Body extraction
    body_field: ""            # default: "" (use full body string)

    # Output attribute names
    template_attribute: "log.record.template"        # default
    template_id_attribute: "log.record.template.id"  # default

    # Seeding (optional)
    seed_templates: []
    seed_logs: []

    # Warmup mode
    warmup_mode: passthrough       # default: "passthrough" | "buffer"
    warmup_min_clusters: 10        # default: 10 (only used when warmup_mode: buffer)
    warmup_buffer_max_logs: 10000  # default: 10000 (only used when warmup_mode: buffer)
```
### Parameters

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `log_cluster_depth` | int | `4` | Max depth of the Drain parse tree. Higher values produce more specific templates. Minimum: 3. |
| `sim_threshold` | float | `0.4` | Similarity threshold in [0.0, 1.0]. Lines below this threshold create a new cluster rather than merging with an existing one. |
| `max_children` | int | `100` | Maximum children per parse tree node. |
| `max_clusters` | int | `0` | Maximum clusters tracked. When exceeded, the least-recently-used cluster is evicted. `0` means unlimited. |
| `extra_delimiters` | []string | `[]` | Additional token delimiters beyond whitespace (e.g. `[",", ":"]`). |
| `body_field` | string | `""` | If set, and the log body is a structured map, the value of this top-level key is used as the text to template instead of the full body. |
| `template_attribute` | string | `"log.record.template"` | Attribute key written with the derived template string. |
| `template_id_attribute` | string | `"log.record.template.id"` | Attribute key written with the numeric cluster ID. |
| `seed_templates` | []string | `[]` | Template strings to pre-load at startup (see [Seeding](#seeding)). |
| `seed_logs` | []string | `[]` | Raw example log lines to train on at startup (see [Seeding](#seeding)). |
| `warmup_mode` | string | `"passthrough"` | Controls behavior during the warmup period. `"passthrough"` (default) or `"buffer"` (see [Warmup mode](#warmup-mode)). |
| `warmup_min_clusters` | int | `10` | Minimum distinct clusters before warmup ends. Only used when `warmup_mode: buffer`. |
| `warmup_buffer_max_logs` | int | `10000` | Maximum records to buffer before flushing regardless of cluster count. Only used when `warmup_mode: buffer`. Must be > 0. |
## Seeding

Seeding pre-populates the Drain tree before any live logs arrive. This is the primary mechanism for stable template IDs across restarts.

### `seed_templates`

Provide known template strings directly. The processor trains on each entry at startup, establishing clusters for those patterns immediately.

```yaml
processors:
  drain:
    seed_templates:
      - "user <*> logged in from <*>"
      - "connected to <*>"
      - "heartbeat ping <*>"
```

### `seed_logs`

Provide raw example log lines. The processor trains on them at startup, letting Drain derive the templates itself. Useful when exact template strings are not known in advance.

```yaml
processors:
  drain:
    seed_logs:
      - "user alice logged in from 10.0.0.1"
      - "user bob logged in from 192.168.1.1"
      - "connected to 10.0.0.1"
```

Empty and whitespace-only entries in both lists are silently skipped.

> **Note on multi-instance deployments**: Each collector instance maintains its own independent Drain tree. Template IDs will differ between instances. Providing identical `seed_templates` across all instances produces consistent template **strings** (though IDs may still differ). Filtering rules should always match on the template string, not the ID.
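The skip rule for blank seed entries can be illustrated with a small Go sketch. The `train` callback here is a hypothetical stand-in for feeding one line into the Drain tree, not the processor's real API:

```go
package main

import (
	"fmt"
	"strings"
)

// trainSeeds feeds non-blank entries to a training callback, mirroring
// the documented rule that empty and whitespace-only entries are
// silently skipped. It returns how many entries were actually trained.
func trainSeeds(entries []string, train func(string)) int {
	n := 0
	for _, e := range entries {
		if strings.TrimSpace(e) == "" {
			continue // empty and whitespace-only entries are skipped
		}
		train(e)
		n++
	}
	return n
}

func main() {
	seeds := []string{"user <*> logged in from <*>", "   ", "", "connected to <*>"}
	trained := trainSeeds(seeds, func(s string) { fmt.Println("seeded:", s) })
	fmt.Println("trained", trained, "of", len(seeds)) // trained 2 of 4
}
```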
## Warmup mode

### `passthrough` (default)

Records are annotated and forwarded immediately from the first record. Early templates may be unstable (exact log lines rather than abstracted patterns) until enough similar lines have been observed.

### `buffer`

Records are held in memory until `warmup_min_clusters` distinct templates have been observed, at which point the buffer is flushed with annotations applied using the now-stable templates. If `warmup_buffer_max_logs` is reached before the cluster threshold, the buffer is flushed anyway.

Use buffer mode when downstream consumers (e.g. a filter processor) must act on stable, wildcard-abstracted templates from the very first record.

```yaml
processors:
  drain:
    warmup_mode: buffer
    warmup_min_clusters: 20
    warmup_buffer_max_logs: 5000
```

> **Memory note**: in buffer mode, all records are held in memory until flush. Size the buffer with `warmup_buffer_max_logs` according to your available memory and expected log volume during startup.
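The buffer-mode flush condition described above boils down to a single predicate: warmup ends once enough distinct clusters have been seen, or the buffer hits its size cap, whichever comes first. A sketch (the function name is illustrative, not the processor's internal API):

```go
package main

import "fmt"

// shouldFlush reports whether buffer-mode warmup should end: either the
// cluster threshold (warmup_min_clusters) has been met, or the buffer
// has reached its cap (warmup_buffer_max_logs).
func shouldFlush(distinctClusters, buffered, minClusters, maxLogs int) bool {
	return distinctClusters >= minClusters || buffered >= maxLogs
}

func main() {
	fmt.Println(shouldFlush(5, 100, 20, 5000))  // false: still warming up
	fmt.Println(shouldFlush(20, 100, 20, 5000)) // true: cluster threshold met
	fmt.Println(shouldFlush(5, 5000, 20, 5000)) // true: buffer cap reached
}
```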
## Output attributes

By default the processor sets two attributes on each log record:

| Attribute | Type | Example | Description |
|-----------|------|---------|-------------|
| `log.record.template` | string | `"user <*> logged in from <*>"` | The Drain-derived template string. Stable within an instance once the tree has warmed up. Use this for filtering rules. |
| `log.record.template.id` | int | `3` | Numeric cluster ID. Unstable across restarts unless seeding is used. |

Both attribute names are configurable via `template_attribute` and `template_id_attribute`.

> **Semantic conventions**: `log.record.template` aligns with the proposed OTel attribute in [open-telemetry/semantic-conventions#1283](https://github.com/open-telemetry/semantic-conventions/issues/1283) and [#2064](https://github.com/open-telemetry/semantic-conventions/issues/2064). These names may be updated if a convention is formally adopted.

## Example pipeline

The following pipeline annotates logs with Drain templates and then drops known noisy patterns using the filter processor:

```yaml
processors:
  drain:
    log_cluster_depth: 4
    sim_threshold: 0.4
    max_clusters: 500
    seed_templates:
      - "user <*> logged in from <*>"
      - "connected to <*>"
      - "heartbeat ping <*>"
    warmup_mode: buffer
    warmup_min_clusters: 20
    warmup_buffer_max_logs: 5000

  filter/drop_noisy:
    error_mode: ignore
    logs:
      log_record:
        - attributes["log.record.template"] == "heartbeat ping <*>"
        - attributes["log.record.template"] == "connected to <*>"

service:
  pipelines:
    logs:
      receivers: [otlp]
      processors: [drain, filter/drop_noisy]
      exporters: [otlp]
```
## `body_field`

`body_field` is a convenience for pipelines where the log body is a structured map and you do not have full control over how upstream processors shape it.

If you **do** control the pipeline, the preferred approach is a `move` operator in the filelog receiver (or equivalent) to promote the message field back to a plain string body before the drain processor sees the record:

```yaml
operators:
  - type: json_parser
  - type: move
    from: body.message
    to: body
```

If you **cannot** do that, for example because logs arrive via OTLP already structured, set `body_field` to the map key whose value should be fed to Drain:

```yaml
processors:
  drain:
    body_field: "message"
```

Given a log body `{"level": "info", "message": "user alice logged in from 10.0.0.1"}`, only the `message` value is fed to Drain. The full body is used unchanged if the field is absent or the body is not a map.

> **Note**: `body_field` only supports a single top-level key. Full OTTL path expressions (e.g. `body["event"]["message"]`) are not supported and are noted as a future extension.
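The extraction rule above (use the named top-level field when present, otherwise fall back to the full body) can be sketched in plain Go. This uses a `map[string]any` as a stand-in for the collector's structured body type, so it illustrates the documented behavior rather than the processor's real pdata handling:

```go
package main

import "fmt"

// extractBody returns the text that would be fed to Drain: the value of
// the configured top-level field when the body is a map containing it,
// otherwise the string representation of the full body.
func extractBody(body any, field string) string {
	if field != "" {
		if m, ok := body.(map[string]any); ok {
			if v, ok := m[field]; ok {
				return fmt.Sprint(v)
			}
		}
	}
	return fmt.Sprint(body) // field unset, absent, or body not a map
}

func main() {
	body := map[string]any{"level": "info", "message": "user alice logged in from 10.0.0.1"}
	fmt.Println(extractBody(body, "message"))        // user alice logged in from 10.0.0.1
	fmt.Println(extractBody("plain body", "message")) // plain body
}
```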
## Future extensions

- **Snapshot persistence**: save and restore the Drain tree state across restarts, eliminating the need for seeding. This requires serialization support and is tracked as a future improvement.
- **OTTL body extraction**: support full OTTL path expressions for `body_field` instead of a single top-level key name.
- **Multi-instance synchronisation**: optional shared snapshot file or gossip-based tree merging for consistent templates across horizontally scaled deployments.

processor/drainprocessor/config.go

Lines changed: 98 additions & 0 deletions

```go
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package drainprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/drainprocessor"

import (
	"errors"
	"fmt"
)

// Config defines configuration for the drain processor.
type Config struct {
	// LogClusterDepth is the max depth of the Drain parse tree.
	// Higher values produce more specific templates. Default: 4. Minimum: 3.
	LogClusterDepth int `mapstructure:"log_cluster_depth"`

	// SimThreshold is the similarity threshold (0.0–1.0) below which a new
	// cluster is created rather than merged with an existing one. Default: 0.4.
	SimThreshold float64 `mapstructure:"sim_threshold"`

	// MaxChildren is the maximum number of children per parse tree node.
	// Default: 100.
	MaxChildren int `mapstructure:"max_children"`

	// MaxClusters is the maximum number of clusters tracked. When the limit is
	// reached, the least recently used cluster is evicted. 0 means unlimited.
	// Default: 0.
	MaxClusters int `mapstructure:"max_clusters"`

	// ExtraDelimiters are additional token delimiters beyond whitespace.
	ExtraDelimiters []string `mapstructure:"extra_delimiters"`

	// BodyField optionally specifies a top-level key to extract from a
	// structured (map) log body before feeding the value to Drain. If empty,
	// the full body string representation is used. This is a convenience for
	// pipelines where the body is a parsed map (e.g. after json_parser) and
	// the user does not have a move operator to promote the message field back
	// to a plain string body. Pipelines that do have that control should use a
	// move operator instead and leave this unset.
	BodyField string `mapstructure:"body_field"`

	// TemplateAttribute is the log record attribute key to write the derived
	// template string to. Default: "log.record.template".
	TemplateAttribute string `mapstructure:"template_attribute"`

	// TemplateIDAttribute is the log record attribute key to write the numeric
	// cluster ID to. Default: "log.record.template.id".
	TemplateIDAttribute string `mapstructure:"template_id_attribute"`

	// SeedTemplates is a list of pre-known template strings to train on at
	// startup before any live logs arrive. Improves template stability across
	// restarts for known log patterns.
	SeedTemplates []string `mapstructure:"seed_templates"`

	// SeedLogs is a list of raw example log lines to train on at startup.
	// Drain derives templates from these lines itself.
	SeedLogs []string `mapstructure:"seed_logs"`

	// WarmupMode controls processor behavior during the initial period before
	// the Drain tree has stabilized. Valid values: "passthrough" (default),
	// "buffer".
	WarmupMode string `mapstructure:"warmup_mode"`

	// WarmupMinClusters is the number of distinct clusters that must be
	// observed before warmup ends. Only used when WarmupMode is "buffer".
	// Default: 10.
	WarmupMinClusters int `mapstructure:"warmup_min_clusters"`

	// WarmupBufferMaxLogs is the maximum number of log records to buffer
	// during warmup before flushing regardless of cluster count. Only used
	// when WarmupMode is "buffer". Must be > 0. Default: 10000.
	WarmupBufferMaxLogs int `mapstructure:"warmup_buffer_max_logs"`
}

const (
	warmupModePassthrough = "passthrough"
	warmupModeBuffer      = "buffer"
)

// Validate checks the Config for invalid values.
func (cfg *Config) Validate() error {
	if cfg.LogClusterDepth < 3 {
		return fmt.Errorf("log_cluster_depth must be >= 3, got %d", cfg.LogClusterDepth)
	}
	if cfg.SimThreshold < 0.0 || cfg.SimThreshold > 1.0 {
		return fmt.Errorf("sim_threshold must be in [0.0, 1.0], got %f", cfg.SimThreshold)
	}
	if cfg.WarmupMode != warmupModePassthrough && cfg.WarmupMode != warmupModeBuffer {
		return fmt.Errorf("warmup_mode must be %q or %q, got %q", warmupModePassthrough, warmupModeBuffer, cfg.WarmupMode)
	}
	if cfg.WarmupMode == warmupModeBuffer && cfg.WarmupMinClusters <= 0 {
		return errors.New("warmup_min_clusters must be > 0 when warmup_mode is \"buffer\"")
	}
	if cfg.WarmupMode == warmupModeBuffer && cfg.WarmupBufferMaxLogs <= 0 {
		return errors.New("warmup_buffer_max_logs must be > 0 when warmup_mode is \"buffer\"")
	}
	return nil
}
```
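To see the validation rules in action, here is a small self-contained sketch. It uses a trimmed stand-in struct with the same three core rules shown above (depth, threshold, warmup mode) rather than importing the processor package, so it is illustrative only:

```go
package main

import (
	"errors"
	"fmt"
)

// config is a minimal stand-in for the processor's Config, carrying just
// the fields needed to exercise the validation rules shown above.
type config struct {
	LogClusterDepth int
	SimThreshold    float64
	WarmupMode      string
}

// validate mirrors the corresponding checks from Config.Validate.
func (c config) validate() error {
	if c.LogClusterDepth < 3 {
		return fmt.Errorf("log_cluster_depth must be >= 3, got %d", c.LogClusterDepth)
	}
	if c.SimThreshold < 0.0 || c.SimThreshold > 1.0 {
		return fmt.Errorf("sim_threshold must be in [0.0, 1.0], got %f", c.SimThreshold)
	}
	if c.WarmupMode != "passthrough" && c.WarmupMode != "buffer" {
		return errors.New(`warmup_mode must be "passthrough" or "buffer"`)
	}
	return nil
}

func main() {
	good := config{LogClusterDepth: 4, SimThreshold: 0.4, WarmupMode: "passthrough"}
	bad := config{LogClusterDepth: 2, SimThreshold: 0.4, WarmupMode: "passthrough"}
	fmt.Println(good.validate()) // <nil>
	fmt.Println(bad.validate())  // log_cluster_depth must be >= 3, got 2
}
```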
