connectors/materialize-snowflake/stream.go at ebd3be73efcd6beef566249e04c53b8a68b737c4 · estuary/connectors · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
package main

import (
	"context"
	"errors"
	"fmt"
	"path"
	"strconv"
	"strings"
	"time"

	"github.com/aws/aws-sdk-go-v2/credentials"
	"github.com/estuary/connectors/go/blob"
	sql "github.com/estuary/connectors/materialize-sql"
	log "github.com/sirupsen/logrus"
	"golang.org/x/exp/maps"
	"golang.org/x/oauth2"
	"google.golang.org/api/option"
)

// channelName builds the name of the Snowpipe streaming channel for a
// materialization shard: the sanitized-and-hashed materialization name, an
// underscore, and the shard's keyBegin rendered as 8 zero-padded hex digits.
//
// The result is meant to be readable yet globally unique per materialization
// and shard. Channels are scoped to a table, so neither the database nor the
// schema needs to appear in the name, and names may be at least 1000
// characters long. The name must stay stable over time: data consistency
// depends on always re-opening the same channel.
func channelName(materialization string, keyBegin uint32) string {
	shardSuffix := fmt.Sprintf("%08x", keyBegin)
	return sanitizeAndAppendHash(materialization) + "_" + shardSuffix
}

// tableStream pairs the columns of a materialized binding with the Snowpipe
// streaming channel that was opened for that binding's table.
type tableStream struct {
	// mappedColumns are the binding's columns, taken from the sql.Table that
	// was registered through addBinding.
	mappedColumns []*sql.Column
	// channel is the streaming channel returned by openChannel for the table.
	channel       *channel
}

// streamManager is a high-level orchestrator for Snowpipe streaming operations,
// which mostly entails writing rows of data for bindings and registering the
// resulting blobs to tables.
type streamManager struct {
	// c is the client used to open channels, fetch the stage configuration,
	// register blobs, and wait for offset tokens to be persisted.
	c            *streamClient
	// tableStreams holds one entry per binding added via addBinding, keyed by
	// the binding index.
	tableStreams map[int]*tableStream
	// keyBegin identifies the shard; it is part of both the channel name and
	// every generated blob file name.
	keyBegin     uint32
	// channelName is computed once at construction and used for every channel
	// this manager opens.
	channelName  string

	// Returned from the "configure" API.
	prefix       string
	deploymentId int
	bucketPath   string

	// For writing blob data to object storage.
	//
	// bucketExpiresAt is the time after which maybeInitializeBucket will
	// re-run the configure call and rebuild the bucket. bdecWriter is the
	// writer for the blob currently being written, or nil if there is none.
	// counter is incremented for every blob file name that is generated.
	bucket          blob.Bucket
	bucketExpiresAt time.Time
	bdecWriter      *bdecWriter
	counter         int
	lastBinding     int // bookkeeping for when to flush the writer when a new binding starts writing rows

	// blobStats accumulates, per binding, the stats of blobs that have been
	// finished since the last flush.
	blobStats map[int][]*blobStatsTracker
}

// newStreamManager creates a streamManager for the shard identified by
// keyBegin. It builds the underlying stream client from cfg and account, and
// derives the channel name from the materialization name and keyBegin.
//
// The returned manager has no bindings registered and no blob in progress;
// lastBinding starts at -1 to mark that no row has been written yet. An error
// is returned only if the stream client cannot be created.
func newStreamManager(cfg *config, materialization string, account string, keyBegin uint32) (*streamManager, error) {
	client, err := newStreamClient(cfg, account)
	if err != nil {
		return nil, fmt.Errorf("newStreamClient: %w", err)
	}

	manager := &streamManager{
		c:            client,
		keyBegin:     keyBegin,
		channelName:  channelName(materialization, keyBegin),
		tableStreams: make(map[int]*tableStream),
		blobStats:    make(map[int][]*blobStatsTracker),
		lastBinding:  -1,
	}

	return manager, nil
}

// addBinding opens this manager's streaming channel on the given schema and
// table, and registers it under the binding index of target.
//
// Every column the channel reports for the table is run through
// makeSchemaElement first; the first column that fails is returned as an
// error and the binding is not registered.
func (sm *streamManager) addBinding(ctx context.Context, schema string, table string, target sql.Table) error {
	opened, err := sm.c.openChannel(ctx, schema, table, sm.channelName)
	if err != nil {
		return fmt.Errorf("openChannel: %w", err)
	}

	// Reject tables having columns that can't be written with Snowpipe
	// streaming before anything is registered.
	for _, column := range opened.TableColumns {
		_, err := makeSchemaElement(column)
		if err != nil {
			return err
		}
	}

	stream := &tableStream{
		mappedColumns: target.Columns(),
		channel:       opened,
	}
	sm.tableStreams[target.Binding] = stream

	fields := log.Fields{
		"schema":      schema,
		"table":       table,
		"offsetToken": opened.OffsetToken,
		"clientSeq":   opened.ClientSequencer,
		"rowSeq":      opened.RowSequencer,
	}
	log.WithFields(fields).Info("opened streaming channel")

	return nil
}

// writeRow appends row to the blob currently being written for binding,
// starting a new blob if none is in progress.
//
// The in-progress blob is finished before the row is written when the
// previous row belonged to a different binding. It is finished after the row
// is written when the writer reports it is done, or when the bucket
// credentials have passed their expiry time.
func (sm *streamManager) writeRow(ctx context.Context, binding int, row []any) error {
	bindingChanged := sm.lastBinding != -1 && sm.lastBinding != binding
	if bindingChanged {
		if err := sm.finishBlob(); err != nil {
			return fmt.Errorf("finishBlob: %w", err)
		}
	}
	sm.lastBinding = binding

	if sm.bdecWriter == nil {
		if err := sm.startNewBlob(ctx, binding); err != nil {
			return fmt.Errorf("startNewBlob: %w", err)
		}
	}

	if err := sm.bdecWriter.writeRow(row); err != nil {
		return fmt.Errorf("bdecWriter writeRow: %w", err)
	}

	// Keep appending to the current blob unless it has reached its size
	// limit or the blob storage bucket credentials have expired.
	if !sm.bdecWriter.done && !time.Now().After(sm.bucketExpiresAt) {
		return nil
	}

	if err := sm.finishBlob(); err != nil {
		return fmt.Errorf("finishBlob: %w", err)
	}

	return nil
}

// finishBlob closes the blob writer that is in progress, if any, and records
// its stats under the binding that most recently wrote a row. It is a no-op
// when no blob is being written. If closing fails the writer is left in place
// and the error is returned.
func (sm *streamManager) finishBlob() error {
	writer := sm.bdecWriter
	if writer == nil {
		return nil
	}

	if err := writer.close(); err != nil {
		return fmt.Errorf("bdecWriter close: %w", err)
	}

	binding := sm.lastBinding
	sm.blobStats[binding] = append(sm.blobStats[binding], writer.blobStats)
	sm.bdecWriter = nil

	return nil
}

// startNewBlob begins a new blob for binding and installs its writer as
// sm.bdecWriter.
//
// The binding must have been registered with addBinding; an unknown binding
// yields an error rather than a nil pointer dereference. The bucket is
// (re)initialized if needed, a fresh file name is generated, and the blob is
// written under the configured bucket path with that name.
func (sm *streamManager) startNewBlob(ctx context.Context, binding int) error {
	// Look the binding up before creating any writer, so that a bad binding
	// fails cleanly without leaving an object writer behind.
	ts := sm.tableStreams[binding]
	if ts == nil {
		return fmt.Errorf("internal error: no table stream registered for binding %d", binding)
	}

	if err := sm.maybeInitializeBucket(ctx); err != nil {
		return fmt.Errorf("initializing bucket: %w", err)
	}

	fName := sm.getNextFileName(time.Now(), fmt.Sprintf("%s_%d", sm.prefix, sm.deploymentId))
	w := sm.bucket.NewWriter(ctx, path.Join(sm.bucketPath, string(fName)), sm.objMetadata())

	bdecWriter, err := newBdecWriter(w, ts.mappedColumns, ts.channel.TableColumns, ts.channel.EncryptionKey, fName)
	if err != nil {
		return fmt.Errorf("newBdecWriter: %w", err)
	}
	sm.bdecWriter = bdecWriter

	return nil
}

// objMetadata returns the writer option that attaches object metadata to
// every blob written by this manager: a fixed client name and the prefix
// obtained from the configure response as the client key.
func (sm *streamManager) objMetadata() blob.WriterOption {
	metadata := map[string]string{
		"ingestclientname": "EstuaryFlow",
		"ingestclientkey":  sm.prefix,
	}

	return blob.WithObjectMetadata(metadata)
}

// flush finishes any blob still being written and returns the metadata for
// every blob finished since the previous flush, grouped by binding.
//
// Within a binding, blobs are given offset tokens built from baseToken and
// their position in the order they were finished, starting at 0. The tracked
// blob stats are cleared before returning.
func (sm *streamManager) flush(baseToken string) (map[int][]*blobMetadata, error) {
	// finishBlob is a no-op when there is no writer in progress.
	if err := sm.finishBlob(); err != nil {
		return nil, fmt.Errorf("finishBlob: %w", err)
	}

	result := make(map[int][]*blobMetadata)
	for binding, tracked := range sm.blobStats {
		for n := range tracked {
			metadata := generateBlobMetadata(
				tracked[n],
				sm.tableStreams[binding].channel,
				blobToken(baseToken, n),
			)
			result[binding] = append(result[binding], metadata)
		}
	}
	maps.Clear(sm.blobStats)

	return result, nil
}

// write registers a series of blobs to the table. All the blobs must be for the
// same table. An empty list of blobs is a no-op.
//
// Exactly-once semantics are achieved using the offset tokens for the blobs:
// The offset token consists of the "base" token which is a string, and for each
// blob it is appended with a sequential counter, starting at 0. So if there are
// multiple blobs to append, they are organized with offset tokens like
// "basetoken:0", "basetoken:1", "basetoken:2" etc.
//
// Blobs are registered in this order, and it's possible that only some of the
// blobs get registered in a single Acknowledge before the connector fails for
// some reason, or all of the blobs get registered but the Acknowledge response
// is not persisted to the runtime before the connector restarts. The Snowpipe
// channel itself persists the last registered token, and this allows us to
// filter out blobs that had previously been registered and don't need to be
// registered again on a re-attempt of an Acknowledge.
//
// An error is returned if the blobs fail validation, if their channel is not
// one this manager has opened, if a registration fails, or if waiting for the
// final token to be persisted fails.
func (sm *streamManager) write(ctx context.Context, blobs []*blobMetadata) error {
	// Nothing to register. validWriteBlobs accepts an empty list, so without
	// this guard the blobs[0] accesses below would panic.
	if len(blobs) == 0 {
		return nil
	}

	if err := validWriteBlobs(blobs); err != nil {
		return fmt.Errorf("validWriteBlobs: %w", err)
	}

	// validWriteBlobs guarantees that every blob has exactly one chunk with
	// exactly one channel, and that all blobs agree on these values.
	var schema = blobs[0].Chunks[0].Schema
	var table = blobs[0].Chunks[0].Table
	var chanName = blobs[0].Chunks[0].Channels[0].Channel

	var thisChannel *channel
	for _, v := range sm.tableStreams {
		matches := v.channel.Schema == schema && v.channel.Table == table && v.channel.Channel == chanName
		if matches && thisChannel != nil {
			return fmt.Errorf("internal error: found duplicate channel %s in tableStreams", chanName)
		} else if matches {
			thisChannel = v.channel
		}
	}
	if thisChannel == nil {
		return fmt.Errorf("unknown channel %s", chanName)
	}

	for _, blobMeta := range blobs {
		// Declared per iteration: its address is stored as the channel's
		// offset token below.
		blobToken := blobMeta.Chunks[0].Channels[0].OffsetToken
		currentChannelToken := thisChannel.OffsetToken
		log.WithFields(log.Fields{
			"schema":              schema,
			"table":               table,
			"currentChannelToken": currentChannelToken,
			"blobToken":           blobToken,
		}).Info("evaluating blob registration")
		if shouldWrite, err := shouldWriteNextToken(blobToken, currentChannelToken); err != nil {
			return fmt.Errorf("shouldWriteNextToken: %w", err)
		} else if !shouldWrite {
			continue
		}

		blobMeta.Chunks[0].Channels[0].ClientSequencer = thisChannel.ClientSequencer
		blobMeta.Chunks[0].Channels[0].RowSequencer = thisChannel.RowSequencer + 1

		if err := sm.c.write(ctx, blobMeta); err != nil {
			var apiError *streamingApiError
			if !errors.As(err, &apiError) || apiError.Code != 38 {
				return fmt.Errorf("write: %w", err)
			}

			// The "blob has wrong format or extension" error occurs if the
			// blob was written not-so-recently; apparently anything older
			// than an hour or so is rejected by what seems to be a
			// server-side check that examines the name of the file, which
			// contains the timestamp it was written.
			//
			// In these cases, which may arise from re-enabling a disabled
			// binding / materialization, or extended outages, we have to
			// download the file and re-upload it with an up-to-date name.
			//
			// This should be quite rare, even more rare than one may think,
			// since blob registration tokens are persisted in Snowflake
			// rather than exclusively managed by our Acknowledge
			// checkpoints. But it is still technically possible and so it
			// is handled with this.
			if err := sm.maybeInitializeBucket(ctx); err != nil {
				return fmt.Errorf("initializing bucket to rename: %w", err)
			}
			nextName := sm.getNextFileName(time.Now(), fmt.Sprintf("%s_%d", sm.prefix, sm.deploymentId))

			ll := log.WithFields(log.Fields{
				"oldName": blobMeta.Path,
				"newName": nextName,
				"token":   blobToken,
			})
			ll.Info("attempting to register blob with malformed name by renaming")

			if err := sm.renameBlob(ctx, blobMeta, thisChannel.EncryptionKey, nextName); err != nil {
				return fmt.Errorf("renameBlob: %w", err)
			}

			if err := sm.c.write(ctx, blobMeta); err != nil {
				log.WithField("blob", blobMeta).Warn("blob metadata")
				return fmt.Errorf("failed to write renamed blob: %w", err)
			}
			ll.Info("successfully registered renamed blob")
		}

		thisChannel.RowSequencer++
		thisChannel.OffsetToken = &blobToken
	}

	// We don't need to wait for each individual token to be persisted, but need
	// to wait for the final one to be persisted for our idempotency strategy to
	// work.
	if err := sm.c.waitForTokenPersisted(
		ctx,
		*thisChannel.OffsetToken,
		thisChannel.ClientSequencer,
		schema,
		table,
		chanName,
	); err != nil {
		return fmt.Errorf("waitForTokenPersisted: %w", err)
	}

	return nil
}

// renameBlob copies an existing blob to a new object named newName by reading
// the object at blob.Path and passing it through reencrypt into a writer for
// the new path, both relative to the configured bucket path.
//
// The reader is closed on every path once it has been opened. The writer is
// only closed after reencrypt succeeds; it is deliberately not closed on
// failure, on the assumption that closing finalizes the upload — TODO
// confirm against the blob.Bucket writer contract.
func (sm *streamManager) renameBlob(ctx context.Context, blob *blobMetadata, encryptionKey string, newName blobFileName) error {
	r, err := sm.bucket.NewReader(ctx, path.Join(sm.bucketPath, blob.Path))
	if err != nil {
		return fmt.Errorf("NewReader: %w", err)
	}
	w := sm.bucket.NewWriter(ctx, path.Join(sm.bucketPath, string(newName)), sm.objMetadata())

	if err := reencrypt(r, w, blob, encryptionKey, newName); err != nil {
		// Best-effort close so the reader isn't leaked; the reencrypt error
		// is the one worth reporting.
		_ = r.Close()
		return fmt.Errorf("reencrypt: %w", err)
	}

	if err := r.Close(); err != nil {
		return fmt.Errorf("closing r: %w", err)
	}
	if err := w.Close(); err != nil {
		return fmt.Errorf("closing w: %w", err)
	}

	return nil
}

// maybeInitializeBucket makes sure a usable blob storage bucket is available.
// While the previously recorded expiry time is still in the future it does
// nothing. Otherwise it calls the configure API, records the returned prefix,
// deployment ID and bucket path, builds the bucket for the reported stage
// location type (S3, GCS or AZURE), and sets a new expiry 15 minutes out so
// that this isn't repeated too frequently.
//
// The stage location is split on "/": the first segment is the bucket name
// and the remaining segments form the path within the bucket. An unrecognized
// location type is an error.
func (sm *streamManager) maybeInitializeBucket(ctx context.Context) error {
	const refreshInterval = 15 * time.Minute

	if sm.bucketExpiresAt.After(time.Now()) {
		return nil
	}

	cfg, err := sm.c.configure(ctx)
	if err != nil {
		return fmt.Errorf("configuring channel: %w", err)
	}

	segments := strings.Split(cfg.StageLocation.Location, "/")
	bucketName, keySegments := segments[0], segments[1:]
	sm.bucketPath = path.Join(keySegments...)
	sm.prefix = cfg.Prefix
	sm.deploymentId = cfg.DeploymentID

	locationType := cfg.StageLocation.LocationType
	switch locationType {
	case "S3":
		awsCreds := credentials.NewStaticCredentialsProvider(
			cfg.StageLocation.Creds.AwsKeyId,
			cfg.StageLocation.Creds.AwsSecretKey,
			cfg.StageLocation.Creds.AwsToken,
		)
		sm.bucket, err = blob.NewS3Bucket(ctx, bucketName, awsCreds)
		if err != nil {
			return fmt.Errorf("new s3 blob bucket: %w", err)
		}
	case "GCS":
		token := &oauth2.Token{AccessToken: cfg.StageLocation.Creds.GcsAccessToken}
		gcsAuth := option.WithTokenSource(oauth2.StaticTokenSource(token))
		sm.bucket, err = blob.NewGCSBucket(ctx, bucketName, gcsAuth)
		if err != nil {
			return fmt.Errorf("new gcs blob bucket: %w", err)
		}
	case "AZURE":
		sasAuth := blob.WithAzureSasToken(cfg.StageLocation.Creds.AzureSasToken)
		endpoint := blob.WithAzureEndpoint(cfg.StageLocation.Endpoint)
		sm.bucket, err = blob.NewAzureBlobBucket(ctx, bucketName, cfg.StageLocation.StorageAccount, sasAuth, endpoint)
		if err != nil {
			return fmt.Errorf("new azure blob bucket: %w", err)
		}
	default:
		return fmt.Errorf("unknown stage location type %q", locationType)
	}

	sm.bucketExpiresAt = time.Now().Add(refreshInterval)

	fields := log.Fields{
		"locationType": locationType,
		"location":     cfg.StageLocation.Location,
		"prefix":       cfg.Prefix,
		"deploymentId": cfg.DeploymentID,
	}
	log.WithFields(fields).Info("configured bucket for Snowpipe streaming")

	return nil
}

// blobFileName is the file name for a blob, which is part of the file key. It
// is also used as the "diversifier" for encryption. It's just a string, but
// this custom type helps keep its usage comprehensible in both of these
// capacities.
type blobFileName string

// getNextFileName returns the next blob file name, where "next" is relative
// to the manager's tracked counter, which is incremented by this call.
//
// The name has the form
//
//	<year>/<month>/<day>/<hour>/<minute>/<ts>_<clientPrefix>_<keyBegin>_<counter>.<ext>
//
// where the date parts are the UTC components of calendar without zero
// padding, ts is the Unix timestamp in base 36, and ext is
// BLOB_EXTENSION_TYPE. The keyBegin slot is where the Java SDK puts its
// thread ID, for which any int value is expected to work.
//
// These names need to be globally unique even though the timestamp parts only
// have second resolution. The client prefix includes a random nonce from the
// stream configure response that appears to change every time, so that should
// be sufficient.
//
// The construction deliberately mirrors the Java SDK piece by piece, since
// the file names must be built in exactly the same way.
//
// Ref:
// https://github.com/snowflakedb/snowflake-ingest-java/blob/3cbaebfe26f59dc3a8b8e973649e3f1a1014438c/src/main/java/net/snowflake/ingest/streaming/internal/InternalStageManager.java#L161-L206
func (sm *streamManager) getNextFileName(calendar time.Time, clientPrefix string) blobFileName {
	utc := calendar.UTC()

	directory := strconv.Itoa(utc.Year()) + "/" +
		strconv.Itoa(int(utc.Month())) + "/" +
		strconv.Itoa(utc.Day()) + "/" +
		strconv.Itoa(utc.Hour()) + "/" +
		strconv.Itoa(utc.Minute()) + "/"

	shortName := strconv.FormatInt(utc.Unix(), 36) +
		"_" + clientPrefix +
		"_" + strconv.Itoa(int(sm.keyBegin)) +
		"_" + strconv.Itoa(sm.getAndIncrementCounter()) +
		"." + BLOB_EXTENSION_TYPE

	return blobFileName(directory + shortName)
}

// getAndIncrementCounter returns the current value of the file name counter
// and advances the counter by one.
func (sm *streamManager) getAndIncrementCounter() int {
	sm.counter++
	return sm.counter - 1
}

// blobToken builds the offset token for a blob as "<baseToken>:<n>".
//
// `baseToken` is per-transaction, and `n` counts the blobs written for a
// specific binding within that transaction. Streaming workloads typically
// produce a single blob per transaction, but larger backfills may produce
// several because there is a limit on how large a single blob can be.
func blobToken(baseToken string, n int) string {
	return baseToken + fmt.Sprintf(":%d", n)
}

// shouldWriteNextToken reports whether the blob carrying the `next` token
// should be registered, given the `current` token persisted by the channel.
//
// The outcomes are:
//   - no `current` token: register.
//   - different base tokens: register only if `next` starts a new sequence
//     (its number is 0); anything else is an error.
//   - same base token, number not greater than `current`: already persisted,
//     so skip.
//   - same base token, number exactly one greater: register.
//   - same base token, any larger gap: error.
//
// An error is also returned if either token cannot be parsed.
func shouldWriteNextToken(next string, current *string) (bool, error) {
	if current == nil {
		// Maybe this should be more strict and error out unless `next` is the
		// first one in the sequence, but that would block cases where a user
		// has manually dropped a table for some reason.
		log.WithField("nextToken", next).Info("no current token persisted; blob will be registered")
		return true, nil
	}
	persisted := *current

	nextBase, nextN, err := splitToken(next)
	if err != nil {
		return false, err
	}
	currentBase, currentN, err := splitToken(persisted)
	if err != nil {
		return false, err
	}

	entry := log.WithFields(log.Fields{
		"nextToken":    next,
		"currentToken": persisted,
		"currentBase":  currentBase,
		"currentN":     currentN,
		"nextBase":     nextBase,
		"nextN":        nextN,
	})

	sameSequence := nextBase == currentBase
	switch {
	case !sameSequence && nextN != 0:
		return false, fmt.Errorf("expected blob token %s to start a new sequence (current: %s)", next, persisted)
	case sameSequence && nextN <= currentN:
		entry.Info("skipping already persisted blob")
		return false, nil
	case sameSequence && nextN != currentN+1:
		return false, fmt.Errorf("expected blob token %s to be written immediately after %s", next, persisted)
	}

	entry.Info("blob will be registered")
	return true, nil
}

// splitToken is the inverse of blobToken: it splits a token of the form
// "<base>:<n>" into its base token and its integer counter.
//
// The split is made at the LAST ':' because blobToken appends ":<n>" to the
// end of the base token; this way a base token that itself contains ':' still
// round-trips. An error is returned if the token has no ':', has an empty
// base, has nothing after the final ':', or if the part after the final ':'
// is not an integer.
func splitToken(token string) (string, int, error) {
	idx := strings.LastIndex(token, ":")
	switch {
	case idx == -1:
		return "", 0, fmt.Errorf("invalid token %q: no ':' found", token)
	case idx == 0:
		return "", 0, fmt.Errorf("invalid token %q: no base token found", token)
	case idx == len(token)-1:
		return "", 0, fmt.Errorf("invalid token %q: no number found", token)
	}

	baseToken, nStr := token[:idx], token[idx+1:]
	n, err := strconv.Atoi(nStr)
	if err != nil {
		return "", 0, fmt.Errorf("invalid token %q: parsing number: %w", token, err)
	}

	return baseToken, n, nil
}

// validWriteBlobs sanity checks that a series of blobs is valid to write per
// our invariants: every blob has exactly one chunk with exactly one channel,
// every offset token parses, all blobs share the same base token, channel,
// schema, table and database, and the token counters increase by exactly one
// from each blob to the next.
//
// Theoretically this shouldn't be needed, but it is a nice guard against
// hypothetical bugs that would otherwise be more difficult to troubleshoot.
func validWriteBlobs(blobs []*blobMetadata) error {
	var (
		wantBase, wantChannel, wantSchema, wantTable, wantDatabase string
		prevN                                                      int
	)

	for _, b := range blobs {
		if got := len(b.Chunks); got != 1 {
			return fmt.Errorf("internal error: expected chunks to have length 1 but was %d", got)
		}
		if got := len(b.Chunks[0].Channels); got != 1 {
			return fmt.Errorf("internal error: expected chunk channel to have length 1 but was %d", got)
		}

		offsetToken := b.Chunks[0].Channels[0].OffsetToken
		base, thisN, err := splitToken(offsetToken)
		if err != nil {
			return fmt.Errorf("invalid token %q: %w", offsetToken, err)
		}

		// The first blob establishes the values that all the others must
		// match. A successfully parsed base token is never empty, so an empty
		// expected base means nothing has been recorded yet.
		if wantBase == "" {
			wantBase = base
			wantChannel = b.Chunks[0].Channels[0].Channel
			wantSchema = b.Chunks[0].Schema
			wantTable = b.Chunks[0].Table
			wantDatabase = b.Chunks[0].Database
			prevN = thisN
			continue
		}

		switch {
		case wantBase != base:
			return fmt.Errorf("expected all blobs to have the same base token %q but got %q", wantBase, base)
		case wantChannel != b.Chunks[0].Channels[0].Channel:
			return fmt.Errorf("expected all blobs to have the same channel %q but got %q", wantChannel, b.Chunks[0].Channels[0].Channel)
		case wantSchema != b.Chunks[0].Schema:
			return fmt.Errorf("expected all blobs to have the same schema %q but got %q", wantSchema, b.Chunks[0].Schema)
		case wantTable != b.Chunks[0].Table:
			return fmt.Errorf("expected all blobs to have the same table %q but got %q", wantTable, b.Chunks[0].Table)
		case wantDatabase != b.Chunks[0].Database:
			return fmt.Errorf("expected all blobs to have the same database %q but got %q", wantDatabase, b.Chunks[0].Database)
		case prevN+1 != thisN:
			return fmt.Errorf("expected blob tokens to be in ascending order but got %d vs %d", thisN, prevN)
		}

		prevN = thisN
	}

	return nil
}