# Source: ManoManoTech/bucket-scrapper — sample-config.yaml (branch: main)
---
# bucket-scrapper configuration
#
# Each bucket entry describes an S3 bucket and how its objects are organized.
# The `path` array builds the S3 key prefix used to list objects for each
# date/hour in the requested time range.
#
# Path components:
#   - static_path: a literal prefix segment (e.g. "logs", "prod")
#   - datefmt:     a date-templated segment. Two conventions are recognized:
#
#       Hive-style partitioning:
#         datefmt: "dt=20060102/hour=15"
#         Produces keys like:  logs/dt=20240315/hour=09/service-a.json.zst
#
#       Go reference-time layout (YYYY/MM/DD/HH):
#         datefmt: "2006/01/02/15"
#         Produces keys like:  logs/2024/03/15/09/service-a.json.gz
#
#       The placeholder values (2006, 01, 02, 15) come from Go's reference
#       date "Mon Jan 2 15:04:05 MST 2006" and are replaced at runtime with
#       the actual year, month, day, and hour being searched.
#
# Object filtering:
#   only_prefix_patterns is an optional list of regexes matched against the
#   part of the object key that follows the generated prefix (the suffix).
#   Only objects whose suffix matches at least one pattern are downloaded.
#
# File-level sampling:
#   sample_files (per-bucket) is an optional float in (0.0, 1.0] applied
#   *after* key filtering; e.g. 0.1 keeps roughly 10% of files. This is the
#   coarsest of the work-shedding mechanisms: it sheds whole files, so for
#   sources with high per-file size variance the resulting line-volume
#   sample can be noisy. The global `--sample-files` CLI flag is the
#   fallback when the per-bucket field is absent. Set `sampling_seed`
#   (top-level) to make the selection reproducible across runs.

buckets:
  # Hive-partitioned source: objects are listed under
  # application-logs/prod/dt=YYYYMMDD/hour=HH.
  - bucket: my-logs-bucket
    path:
      - static_path: application-logs/prod
      - datefmt: 'dt=20060102/hour=15'
    # Single-quoted so the regex backslashes are literal (no YAML escaping).
    only_prefix_patterns:
      - '^service-a.*\.json\.zst$'
      - '^service-b.*\.json\.gz$'
    # Optional: uncomment to keep ~10% of files after filtering.
    # sample_files: 0.1

  # Plain date-path source: objects are listed under raw/YYYY/MM/DD/HH.
  - bucket: my-other-logs-bucket
    path:
      - static_path: raw
      - datefmt: '2006/01/02/15'

# Default AWS region; the --region CLI flag overrides this value when given.
region: eu-west-3

# Optional RNG seed for the file-level sampler. Omit for fresh entropy each
# run; set for reproducible sampling across runs.
# sampling_seed: 42

# ─── Output ────────────────────────────────────────────────────────────────
#
# `outputs:` is a list of one entry today; multi-output fan-out is reserved
# for a future release. Pick exactly one of the four types below.
#
# When `outputs:` is present, the CLI per-output flags (--output, --output-dir,
# --http-*, --s3-output-*) must NOT be passed — mixing CLI and config is a
# hard error. Drop `outputs:` and use the CLI flags instead, or keep config
# and stop passing those flags.
#
# String fields support `${VAR}` and `${VAR:-default}` interpolation so
# secrets stay out of the YAML.

outputs:
  # ── File output: per-prefix files under `dir` ────────────────────────────
  #
  # `compression`: { format: zstd|gzip|none, level: <int> }. Omit the block
  # to default to zstd:3. `level` ranges: zstd 1–22, gzip 0–9. `level` must
  # be left unset when format=none.
  #
  # `path_template` is the per-prefix filename (joined with `dir`).
  # Placeholders: `{prefix}`, `{prefix_hash}`, `{run_id}`, `{ext}` (codec
  # extension — `zst`/`gz`/empty). Default `{prefix}.{ext}` matches the
  # historic layout. The template MUST contain `{prefix}` or
  # `{prefix_hash}`; otherwise distinct source prefixes collide and the
  # second one fails with a fatal error (two encoders cannot share a file).
  # NOTE(review): with format=none `{ext}` is empty, so the default template
  # presumably renders with a trailing dot — confirm how that is handled.
  - type: file
    # Destination directory for the per-prefix files. Relative paths
    # presumably resolve against the working directory — TODO confirm.
    dir: ./scrapper-output
    # Optional overrides, shown here with their default values:
    # path_template: "{prefix}.{ext}"
    # compression:
    #   format: zstd
    #   level: 3

  # ── HTTP output: NDJSON POSTs with AIMD throttle ─────────────────────────
  #
  # `Content-Encoding` follows `compression.format` automatically (zstd /
  # gzip), or is omitted entirely when format=none.
  # - type: http
  #   url: https://logs.example.com/api/v1/logs
  #   bearer_auth: "${HTTP_BEARER_AUTH}"
  #   timeout_secs: 30
  #   batch_max_mb: 2
  #   compressor_tasks: null     # null = auto (cpu_count / 8, min 1)
  #   upload_tasks: null         # null = 4 × compressor_tasks
  #   upload_channel_size: 4
  #   line_channel_size: 1000
  #   compression:
  #     format: zstd
  #     level: 3
  #   max_retries: 3
  #   max_upload_rate_mbps: 0    # 0 = unlimited
  #   aimd:
  #     decrease_factor: 0.15
  #     increase_mbps: 1.0
  #     max_submission_time_s: 4.0   # 0 = AIMD disabled

  # ── S3 output: per-prefix objects ────────────────────────────────────────
  #
  # Batching model. The s3 sink uploads "batches"; one batch is one
  # PutObject call. Every batch carries lines from a single source
  # prefix — no cross-prefix mixing. The configurable axis is how many
  # batches per source prefix.
  #
  # Default mode (no `batch_max_mb`). One encoder per source prefix,
  # finalized once at end-of-run → exactly one PutObject per prefix.
  # `{seq}` is always 00000.
  #
  # Batched mode (`batch_max_mb` set). After every ingested line the
  # sink reads the per-prefix encoder's compressed output buffer; when
  # it crosses the threshold, that encoder is finalized, the resulting
  # frame is rendered into a key (with `{seq}` substituted) and pushed
  # onto the upload queue, and a fresh encoder is started for the same
  # prefix with `{seq}` += 1. End-of-run runs one final flush per
  # prefix to capture the trailing partial. So a prefix that crosses
  # the threshold N times produces N+1 batches (00000..N).
  #
  # Caveats:
  #   - The threshold is checked against compressed bytes, not plaintext,
  #     and only after a line has been ingested, so a batch is cut once it
  #     has already crossed `batch_max_mb`: the actual upload size sits a
  #     little above it.
  #   - zstd/gzip buffer internally and only flush blocks periodically.
  #     A small `batch_max_mb` paired with highly-compressible input may
  #     never trigger a mid-run flush — every line gets buffered inside
  #     the codec and end-of-run produces a single batch. Use
  #     `compression.format: none` if you want size-driven batching with
  #     predictable thresholds.
  #   - Batches at or above `multipart_threshold_mb` (default 5 MiB) are
  #     uploaded via S3 multipart with parts of `multipart_part_mb`
  #     (default 5 MiB). Smaller batches use a single PutObject.
  #     `multipart_concurrency` (omit for auto, or set a positive int)
  #     bounds parts in flight across all in-flight batches.
  #
  # When `batch_max_mb` is set, `key_template` MUST contain `{seq}`; a
  # template without it is rejected at startup, because every batch within
  # a prefix would otherwise render to the same key and silently overwrite
  # the previous one.
  #
  # `key_template` placeholders: `{prefix}`, `{prefix_hash}`, `{seq}`,
  # `{run_id}`, `{ext}` (codec extension). The template MUST contain
  # `{prefix}` or `{prefix_hash}` — otherwise distinct source prefixes
  # write to the same destination key. A template with neither is rejected
  # at startup; a runtime warning also fires as defence in depth (e.g. if
  # two prefixes happen to share a `{prefix_hash}`). `{run_id}` is unique
  # per process invocation; use it to disambiguate reruns, since `{seq}`
  # restarts at 00000 on every run.
  # - type: s3
  #   bucket: my-results-bucket
  #   region: eu-west-3                                           # optional
  #   endpoint_url: null                                          # optional
  #   key_template: "results/{prefix}/{run_id}-{seq}.ndjson.{ext}"
  #   # batch_max_mb: 16             # optional; omit for one object per prefix
  #   compression:
  #     format: zstd
  #     level: 3
  #   multipart_threshold_mb: 5      # batches >= this go multipart; AWS min 5 MiB
  #   multipart_part_mb: 5           # part size; AWS min 5 MiB, max 5000 MiB
  #   multipart_concurrency: null    # null = transfer manager auto-tuning
  #   upload_tasks: null             # null = auto (whole-batch concurrency)

  # ── Void output: drop all matches (benchmarking only) ────────────────────
  # - type: void