Skip to content

Assignment 1: Completed #974

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Assignment2-IngestionTool
Submodule Assignment2-IngestionTool added at 882ac1
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,12 @@ Cask is a trademark of Cask Data, Inc. All rights reserved.

Apache, Apache HBase, and HBase are trademarks of The Apache Software Foundation. Used with
permission. No endorsement by The Apache Software Foundation is implied by the use of these marks.
## ByteSize and TimeDuration Parsers

Wrangler now supports parsing data size and time duration units in recipes.

**Supported Units**:
- ByteSize: KB, MB, GB, TB (binary multiples, i.e. 1 KB = 1024 bytes)
- TimeDuration: ms (milliseconds), s (seconds), m (minutes), h (hours)

### New Directive: `aggregate-stats`
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Copyright © 2025 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.cdap.wrangler.api.parser;

import com.google.gson.JsonElement;
import com.google.gson.JsonPrimitive;

/**
 * Token holding a byte-size literal, such as {@code 10KB} or {@code 1.5mb}.
 *
 * <p>A literal is a number followed by one of the units {@code KB}, {@code MB}, {@code GB} or
 * {@code TB}, in any letter case. Units are binary multiples: 1 KB is 1024 bytes and every
 * following unit is 1024 times the previous one. The size is stored as a number of bytes.
 */
public class ByteSize implements Token {
  /** Supported unit suffixes in ascending order; each one is 1024 times the one before it. */
  private static final String[] UNITS = {"KB", "MB", "GB", "TB"};

  /** Factor between two consecutive units, and between a byte and a kilobyte. */
  private static final double UNIT_STEP = 1024d;

  /** The parsed size, expressed in bytes. */
  private final double valueInBytes;

  /** The literal as supplied by the caller, with surrounding whitespace removed. */
  private final String rawValue;

  /**
   * Parses a byte-size literal.
   *
   * @param value the literal to parse, for example {@code "10KB"}
   * @throws IllegalArgumentException if {@code value} is null or blank, does not end with a
   *     supported unit, or its numeric part is not a valid number (reported as a
   *     {@link NumberFormatException})
   */
  public ByteSize(String value) {
    if (value == null || value.trim().isEmpty()) {
      throw new IllegalArgumentException("Byte size value cannot be null or empty");
    }
    this.rawValue = value.trim();
    this.valueInBytes = parse(this.rawValue);
  }

  /**
   * Converts a byte-size literal into a number of bytes.
   *
   * @param value the non-null literal to convert
   * @return the size in bytes
   * @throws IllegalArgumentException if the unit is unsupported or the number is malformed
   */
  private static double parse(String value) {
    // Fold case independently of the JVM's default locale so unit matching is stable everywhere.
    String normalized = value.trim().toUpperCase(java.util.Locale.ROOT);

    double bytesPerUnit = 1d;
    for (String unit : UNITS) {
      bytesPerUnit *= UNIT_STEP;
      if (normalized.endsWith(unit)) {
        // Strip only the trailing unit. Removing every occurrence would let malformed input such
        // as "1KBKB" through as if it were "1KB".
        String number = normalized.substring(0, normalized.length() - unit.length());
        return Double.parseDouble(number) * bytesPerUnit;
      }
    }

    throw new IllegalArgumentException("Unsupported byte size unit in value: " + normalized);
  }

  /**
   * @return the size in bytes
   */
  public double getBytes() {
    return valueInBytes;
  }

  /**
   * @return the size in bytes, boxed as a {@code Double}
   */
  @Override
  public Object value() {
    return valueInBytes;
  }

  /**
   * @return {@link TokenType#BYTE_SIZE}
   */
  @Override
  public TokenType type() {
    return TokenType.BYTE_SIZE;
  }

  /**
   * @return a JSON number primitive holding the size in bytes
   */
  @Override
  public JsonElement toJson() {
    return new JsonPrimitive(valueInBytes);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Copyright © 2025 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.cdap.wrangler.api.parser;

import com.google.gson.JsonElement;
import com.google.gson.JsonPrimitive;

/**
 * Token holding a time-duration literal, such as {@code 250ms}, {@code 1.5s} or {@code 2h}.
 *
 * <p>A literal is a number followed by one of the units {@code ms} (milliseconds), {@code s}
 * (seconds), {@code m} (minutes) or {@code h} (hours), in any letter case. The duration is stored
 * as a whole number of milliseconds, rounded to the nearest millisecond.
 */
public class TimeDuration implements Token {
  private static final long MILLIS_PER_SECOND = 1000L;
  private static final long MILLIS_PER_MINUTE = 60L * MILLIS_PER_SECOND;
  private static final long MILLIS_PER_HOUR = 60L * MILLIS_PER_MINUTE;

  /** The parsed duration, expressed in milliseconds. */
  private final long valueInMillis;

  /** The literal as supplied by the caller, with surrounding whitespace removed. */
  private final String rawValue;

  /**
   * Parses a time-duration literal.
   *
   * @param value the literal to parse, for example {@code "5s"}
   * @throws IllegalArgumentException if {@code value} is null or blank, does not end with a
   *     supported unit, or its numeric part is not a valid number (reported as a
   *     {@link NumberFormatException})
   */
  public TimeDuration(String value) {
    if (value == null || value.trim().isEmpty()) {
      throw new IllegalArgumentException("Time duration value cannot be null or empty");
    }
    this.rawValue = value.trim();
    this.valueInMillis = parse(this.rawValue);
  }

  /**
   * Converts a time-duration literal into milliseconds.
   *
   * @param value the non-null literal to convert
   * @return the duration in milliseconds, rounded to the nearest millisecond
   * @throws IllegalArgumentException if the unit is unsupported or the number is malformed
   */
  private static long parse(String value) {
    // Fold case independently of the JVM's default locale so unit matching is stable everywhere.
    String normalized = value.trim().toLowerCase(java.util.Locale.ROOT);

    final long millisPerUnit;
    final int unitLength;
    // "ms" has to be tested before "s": every millisecond literal also ends with "s".
    if (normalized.endsWith("ms")) {
      millisPerUnit = 1L;
      unitLength = 2;
    } else if (normalized.endsWith("s")) {
      millisPerUnit = MILLIS_PER_SECOND;
      unitLength = 1;
    } else if (normalized.endsWith("m")) {
      millisPerUnit = MILLIS_PER_MINUTE;
      unitLength = 1;
    } else if (normalized.endsWith("h")) {
      millisPerUnit = MILLIS_PER_HOUR;
      unitLength = 1;
    } else {
      throw new IllegalArgumentException("Unsupported time duration unit in value: " + normalized);
    }

    // Strip only the trailing unit. Removing every occurrence would let malformed input such as
    // "1ss" through as if it were "1s".
    String number = normalized.substring(0, normalized.length() - unitLength);

    // Round instead of truncating: binary floating point turns 1.005 * 1000 into
    // 1004.9999999999999, which a plain cast to long would cut down to 1004 ms.
    return Math.round(Double.parseDouble(number) * millisPerUnit);
  }

  /**
   * @return the duration in milliseconds
   */
  public long getMillis() {
    return valueInMillis;
  }

  /**
   * @return the duration in milliseconds, boxed as a {@code Long}
   */
  @Override
  public Object value() {
    return valueInMillis;
  }

  /**
   * @return {@link TokenType#TIME_DURATION}
   */
  @Override
  public TokenType type() {
    return TokenType.TIME_DURATION;
  }

  /**
   * @return a JSON number primitive holding the duration in milliseconds
   */
  @Override
  public JsonElement toJson() {
    return new JsonPrimitive(valueInMillis);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,5 @@ public interface Token extends Serializable {
* @return {@code JsonElement} object containing members of implementing class.
*/
JsonElement toJson();

}
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ public enum TokenType implements Serializable {
*/
RANGES,

/**
 * Represents the enumerated type for the object of type {@code ByteSize}, which holds a
 * byte-size literal such as {@code 10KB} converted to a number of bytes.
 */
BYTE_SIZE,

/**
 * Represents the enumerated type for the object of type {@code TimeDuration}, which holds a
 * time-duration literal such as {@code 5s} converted to a number of milliseconds.
 */
TIME_DURATION,
/**
* Represents the enumerated type for the object of type {@code String} with restrictions
* on characters that can be present in a string.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,12 @@ numberRange
;

value
: String | Number | Column | Bool
: String | Number | Column | Bool | BYTE_SIZE | TIME_DURATION
;
// Parser rule that matches exactly one BYTE_SIZE lexer token.
byteSize
: BYTE_SIZE;

// Parser rule that matches exactly one TIME_DURATION lexer token.
timeDuration: TIME_DURATION;

ecommand
: '!' Identifier
Expand Down Expand Up @@ -183,6 +187,7 @@ numberList
: Number (',' Number)+
;


boolList
: Bool (',' Bool)+
;
Expand Down Expand Up @@ -246,6 +251,12 @@ Pipe : '|';
BackSlash: '\\';
Dollar : '$';
Tilde : '~';
// Byte-size literal: one or more DIGITs, an optional '.' plus more DIGITs, then a byte unit.
// No sign and no whitespace between number and unit are allowed by this rule.
BYTE_SIZE: DIGIT+ ('.' DIGIT+)? BYTE_UNIT;
// Time-duration literal: same number shape as BYTE_SIZE, followed by a time unit.
TIME_DURATION: DIGIT+ ('.' DIGIT+)? TIME_UNIT;

// Two-letter byte units KB, MB, GB and TB; each letter is accepted in upper or lower case.
fragment BYTE_UNIT: [KkMmGgTt][Bb];
// Time units ms, s, m and h, lower case only.
// NOTE(review): unlike BYTE_UNIT this is case-sensitive, while the TimeDuration class folds case
// before matching units — confirm whether upper-case time units should be lexed as well.
fragment TIME_UNIT: 'ms' | 's' | 'm' | 'h';



Bool
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import io.cdap.wrangler.api.Triplet;
import io.cdap.wrangler.api.parser.Bool;
import io.cdap.wrangler.api.parser.BoolList;
import io.cdap.wrangler.api.parser.ByteSize;
import io.cdap.wrangler.api.parser.ColumnName;
import io.cdap.wrangler.api.parser.ColumnNameList;
import io.cdap.wrangler.api.parser.DirectiveName;
Expand All @@ -33,6 +34,7 @@
import io.cdap.wrangler.api.parser.Ranges;
import io.cdap.wrangler.api.parser.Text;
import io.cdap.wrangler.api.parser.TextList;
import io.cdap.wrangler.api.parser.TimeDuration;
import io.cdap.wrangler.api.parser.Token;
import org.antlr.v4.runtime.ParserRuleContext;
import org.antlr.v4.runtime.misc.Interval;
Expand Down Expand Up @@ -228,7 +230,7 @@ public RecipeSymbol.Builder visitBool(DirectivesParser.BoolContext ctx) {
builder.addToken(new Bool(Boolean.valueOf(ctx.Bool().getText())));
return builder;
}

/**
* A Directive can include a expression or a condition to be evaluated. When
* such a token type is found, the visitor extracts the expression and generates
Expand Down
Loading