feat: add configurable file size and row limits for Parquet writer #586
base: staging
Changes from all commits: 9364100, d018e8c, 0ea1b0e, 5da95d8
New test file:

@@ -0,0 +1,38 @@
+package parquet
+
+import "testing"
+
+func TestConfigValidateDefaults(t *testing.T) {
+	config := &Config{}
+
+	if err := config.Validate(); err != nil {
+		t.Fatalf("Validate() failed: %v", err)
+	}
+
+	if config.MaxFileSizeMB != DefaultMaxFileSizeMB {
+		t.Errorf("Expected MaxFileSizeMB to be %d, got %d", DefaultMaxFileSizeMB, config.MaxFileSizeMB)
+	}
+
+	if config.MaxRowsPerFile != DefaultMaxRowsPerFile {
+		t.Errorf("Expected MaxRowsPerFile to be %d, got %d", DefaultMaxRowsPerFile, config.MaxRowsPerFile)
+	}
+}
+
+func TestConfigValidateCustomValues(t *testing.T) {
+	config := &Config{
+		MaxFileSizeMB:  256,
+		MaxRowsPerFile: 500000,
+	}
+
+	if err := config.Validate(); err != nil {
+		t.Fatalf("Validate() failed: %v", err)
+	}
+
+	if config.MaxFileSizeMB != 256 {
+		t.Errorf("Expected MaxFileSizeMB to be 256, got %d", config.MaxFileSizeMB)
+	}
+
+	if config.MaxRowsPerFile != 500000 {
+		t.Errorf("Expected MaxRowsPerFile to be 500000, got %d", config.MaxRowsPerFile)
+	}
+}
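For orientation, the two tests above only exercise the new limit fields and the defaulting done by Validate(). A minimal sketch of the shape they assume is below; the constant values, the JSON tags, and the reduction of Config to two fields are illustrative assumptions, not taken from this diff.

```go
package parquet

// Illustrative defaults only: the PR defines DefaultMaxFileSizeMB and
// DefaultMaxRowsPerFile, but their real values are not visible in this diff.
const (
	DefaultMaxFileSizeMB  = 512
	DefaultMaxRowsPerFile = 1_000_000
)

// Config is trimmed to the two new limit fields; the actual writer config also
// carries the local path, S3 bucket, prefix, and so on.
type Config struct {
	MaxFileSizeMB  int `json:"max_file_size_mb"`
	MaxRowsPerFile int `json:"max_rows_per_file"`
}

// Validate back-fills zero values with defaults, which is why the first test
// can start from an empty Config and still observe the default limits.
func (c *Config) Validate() error {
	if c.MaxFileSizeMB <= 0 {
		c.MaxFileSizeMB = DefaultMaxFileSizeMB
	}
	if c.MaxRowsPerFile <= 0 {
		c.MaxRowsPerFile = DefaultMaxRowsPerFile
	}
	return nil
}
```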
Second changed file (the Parquet destination writer):

@@ -28,9 +28,11 @@
 )

 type FileMetadata struct {
-	fileName string
-	writer   any
-	file     source.ParquetFile
+	fileName           string
+	writer             any
+	file               source.ParquetFile
+	rowCount           int64
+	estimatedSizeBytes int64
 }

 // Parquet destination writes Parquet files to a local path and optionally uploads them to S3.
@@ -105,15 +107,97 @@
 	}()

 	p.partitionedFiles[basePath] = &FileMetadata{
-		fileName: fileName,
-		file:     pqFile,
-		writer:   writer,
+		fileName:           fileName,
+		file:               pqFile,
+		writer:             writer,
+		rowCount:           0,
+		estimatedSizeBytes: 0,
 	}

 	logger.Infof("Thread[%s]: created new partition file[%s]", p.options.ThreadID, filePath)
 	return nil
 }

+func estimateRecordSize(data map[string]any) int64 {
+	var size int64
+	for key, value := range data {
+		size += int64(len(key))
+		switch v := value.(type) {
+		case string:
+			size += int64(len(v))
+		case []byte:
+			size += int64(len(v))
+		case int, int32, int64, uint, uint32, uint64, float32, float64:
+			size += 8
+		case bool:
+			size += 1
+		case nil:
+			size += 0
+		default:
+			size += 100
+		}
+	}
+	return size
+}

Review thread on lines +121 to +140 (the estimateRecordSize helper):

Reviewer (vaibhav-datazip): Calculating file size based on the variables used might cause problems, as there is no consideration of the compression ratio here. Instead, can you research whether the parquet Go library provides a function that might be helpful to us?

Author: Hi @vaibhav-datazip, so I initially tried to calculate the actual file size using … New suggestion: we can periodically check the actual file size every 1000 records (or some configurable interval) by calling Flush() and then using os.Stat() to get the real compressed file size on disk. This way we measure the actual bytes after compression, not estimates. If the file exceeds the limit, we rotate to a new file. Tested with Docker integration tests across 1 MB, 5 MB, and 10 MB limits, and the resulting file sizes come very close to those limits.

Author: Thanks for pointing out the File().Size() method! From what I understand, it seems designed for use after the file is closed, not during active writing.
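A rough sketch of the flush-and-stat approach described in that reply follows. It is not code from this PR: the helper name, the 1000-row check interval, and the assumption that the writer exposes a Flush() that pushes buffered rows to the local file (as parquet-go's GenericWriter does) are all illustrative.

```go
package parquet

import (
	"fmt"
	"os"
)

// flusher abstracts the writer so this sketch does not depend on the generic
// type parameter used by the real code.
type flusher interface {
	Flush() error
}

// sizeCheckInterval: how many rows to write between on-disk size checks.
const sizeCheckInterval = 1000

// exceedsSizeLimit flushes buffered rows and then stats the local file, so the
// comparison uses the real compressed size rather than a per-record estimate.
func exceedsSizeLimit(w flusher, filePath string, rowCount, maxBytes int64) (bool, error) {
	if rowCount%sizeCheckInterval != 0 {
		return false, nil // only pay the flush cost periodically
	}
	if err := w.Flush(); err != nil {
		return false, fmt.Errorf("failed to flush writer: %s", err)
	}
	info, err := os.Stat(filePath)
	if err != nil {
		return false, fmt.Errorf("failed to stat file: %s", err)
	}
	return info.Size() >= maxBytes, nil
}
```

Each Flush() closes the current row group, so checking too often would produce many tiny row groups; that trade-off is presumably why the reply suggests checking only every 1000 records.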
+}
+
+func (p *Parquet) shouldRotateFile(partitionFile *FileMetadata) bool {
+	if partitionFile.rowCount >= int64(p.config.MaxRowsPerFile) {
+		return true
+	}
+	estimatedSizeMB := partitionFile.estimatedSizeBytes / (1024 * 1024)
+	return estimatedSizeMB >= int64(p.config.MaxFileSizeMB)
+}
+
+func (p *Parquet) closeAndUploadFile(basePath string, partitionFile *FileMetadata) error {
+	filePath := filepath.Join(p.config.Path, basePath, partitionFile.fileName)
+
+	var err error
+	if p.stream.NormalizationEnabled() {
+		err = partitionFile.writer.(*pqgo.GenericWriter[any]).Close()
+	} else {
+		err = partitionFile.writer.(*pqgo.GenericWriter[types.RawRecord]).Close()
+	}
+	if err != nil {
+		return fmt.Errorf("failed to close writer: %s", err)
+	}
+
+	if err := partitionFile.file.Close(); err != nil {
+		return fmt.Errorf("failed to close file: %s", err)
+	}
+
+	logger.Infof("Thread[%s]: Finished writing file [%s] with %d rows", p.options.ThreadID, filePath, partitionFile.rowCount)
+
+	if p.s3Client != nil {
+		file, err := os.Open(filePath)
+		if err != nil {
+			return fmt.Errorf("failed to open file: %s", err)
+		}
+		defer file.Close()
+
+		s3KeyPath := basePath
+		if p.config.Prefix != "" {
+			s3KeyPath = filepath.Join(p.config.Prefix, s3KeyPath)
+		}
+		s3KeyPath = filepath.Join(s3KeyPath, partitionFile.fileName)
+
+		_, err = p.s3Client.PutObject(&s3.PutObjectInput{
+			Bucket: aws.String(p.config.Bucket),
+			Key:    aws.String(s3KeyPath),
+			Body:   file,
+		})
+		if err != nil {
+			return fmt.Errorf("failed to put object into s3: %s", err)
+		}
+
+		if err := os.Remove(filePath); err != nil {
+			logger.Warnf("Thread[%s]: Failed to delete file [%s]: %s", p.options.ThreadID, filePath, err)
+		}
+		logger.Infof("Thread[%s]: uploaded file to S3: s3://%s/%s", p.options.ThreadID, p.config.Bucket, s3KeyPath)
+	}
+
+	return nil
+}
+
 // Setup configures the parquet writer, including local paths, file names, and optional S3 setup.
 func (p *Parquet) Setup(_ context.Context, stream types.StreamInterface, schema any, options *destination.Options) (any, error) {
 	p.options = options
@@ -170,6 +254,8 @@
 		return fmt.Errorf("failed to create partition file for path[%s]", partitionedPath)
 	}

+	partitionFile.estimatedSizeBytes += estimateRecordSize(record.Data)
+
 	var err error
 	if p.stream.NormalizationEnabled() {
 		_, err = partitionFile.writer.(*pqgo.GenericWriter[any]).Write([]any{record.Data})
@@ -179,6 +265,15 @@
 	if err != nil {
 		return fmt.Errorf("failed to write in parquet file: %s", err)
 	}

+	partitionFile.rowCount++
+
+	if p.shouldRotateFile(partitionFile) {
+		if err := p.closeAndUploadFile(partitionedPath, partitionFile); err != nil {
+			return fmt.Errorf("failed to rotate file: %s", err)
+		}
+		delete(p.partitionedFiles, partitionedPath)
+	}
 }

 return nil
@@ -233,68 +328,11 @@
 }

 func (p *Parquet) closePqFiles() error {
-	removeLocalFile := func(filePath, reason string) {
-		err := os.Remove(filePath)
-		if err != nil {
-			logger.Warnf("Thread[%s]: Failed to delete file [%s], reason (%s): %s", p.options.ThreadID, filePath, reason, err)
-			return
-		}
-		logger.Debugf("Thread[%s]: Deleted file [%s], reason (%s).", p.options.ThreadID, filePath, reason)
-	}
-
 	for basePath, parquetFile := range p.partitionedFiles {
-		// construct full file path
-		filePath := filepath.Join(p.config.Path, basePath, parquetFile.fileName)
-
-		// Close writers
-		var err error
-		if p.stream.NormalizationEnabled() {
-			err = parquetFile.writer.(*pqgo.GenericWriter[any]).Close()
-		} else {
-			err = parquetFile.writer.(*pqgo.GenericWriter[types.RawRecord]).Close()
-		}
-		if err != nil {
-			return fmt.Errorf("failed to close writer: %s", err)
-		}
-
-		// Close file
-		if err := parquetFile.file.Close(); err != nil {
-			return fmt.Errorf("failed to close file: %s", err)
-		}
-
-		logger.Infof("Thread[%s]: Finished writing file [%s].", p.options.ThreadID, filePath)
-
-		if p.s3Client != nil {
-			// Open file for S3 upload
-			file, err := os.Open(filePath)
-			if err != nil {
-				return fmt.Errorf("failed to open file: %s", err)
-			}
-			defer file.Close()
-
-			// Construct S3 key path
-			s3KeyPath := basePath
-			if p.config.Prefix != "" {
-				s3KeyPath = filepath.Join(p.config.Prefix, s3KeyPath)
-			}
-			s3KeyPath = filepath.Join(s3KeyPath, parquetFile.fileName)
-
-			// Upload to S3
-			_, err = p.s3Client.PutObject(&s3.PutObjectInput{
-				Bucket: aws.String(p.config.Bucket),
-				Key:    aws.String(s3KeyPath),
-				Body:   file,
-			})
-			if err != nil {
-				return fmt.Errorf("failed to put object into s3: %s", err)
-			}
-
-			// Remove local file after successful upload
-			removeLocalFile(filePath, "uploaded to S3")
-			logger.Infof("Thread[%s]: successfully uploaded file to S3: s3://%s/%s", p.options.ThreadID, p.config.Bucket, s3KeyPath)
-		}
+		if err := p.closeAndUploadFile(basePath, parquetFile); err != nil {
+			return err
+		}
 	}
 	// make map empty
 	p.partitionedFiles = make(map[string]*FileMetadata)
 	return nil
 }
Reviewer comment: we don't need to add a row-based size limit.