Skip to content

Native Grok Reader Implementation #25205

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ public static KeyValue convert(String key, Object value, Grok grok)
{
String[] spec = key.split(";|:", 3);
try {
// process situations with fieldid [and datatype]
// process situations with field id [and datatype]
if (spec.length <= 2) {
String pattern = grok.getGrokPatternPatterns().get(key); // actual pattern name
String defaultDataType = grok.getGrokPatternDefaultDatatype().get(pattern); // default datatype of the pattern
Expand Down Expand Up @@ -112,17 +112,17 @@ else if (spec.length == 1) {
// if in strict mode, never do automatic data type conversion
defaultDataType = null;
}
// process situations with only fieldid (check default datatype, except date and datetime)
// process situations with only field id (check default datatype, except date and datetime)
return new KeyValue(spec[0],
defaultDataType == null ? String.valueOf(value) : getConverter(defaultDataType).convert(String.valueOf(value)));
}
else {
// process situations with fieldid and datatype (except date and datetime)
// process situations with field id and datatype (except date and datetime)
return new KeyValue(spec[0], getConverter(spec[1]).convert(String.valueOf(value)));
}
}
else if (spec.length == 3) {
// process situations with fieldid, datatype and datatype arguments
// process situations with field id, datatype and datatype arguments
return new KeyValue(spec[0], getConverter(spec[1]).convert(String.valueOf(value), spec[2]));
}
else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ public String discover(String text)

Map<String, Grok> groks = new TreeMap<String, Grok>();
Map<String, String> gPatterns = grok.getPatterns();
// Boolean done = false;
String texte = text;

// Compile the pattern
Expand All @@ -124,24 +123,21 @@ public String discover(String text)
String key = pairs.getKey().toString();
Grok g = new Grok();

// g.patterns.putAll(gPatterns);
try {
g.copyPatterns(gPatterns);
g.setSaved_pattern(key);
g.compile("%{" + key + "}");
groks.put(key, g);
}
catch (GrokException e) {
// Add logger
// TODO: Add logger
continue;
}
}

// Sort patterns by complexity
Map<String, Grok> patterns = this.sort(groks);

// while (!done){
// done = true;
Iterator<Entry<String, Grok>> pit = patterns.entrySet().iterator();
while (pit.hasNext()) {
@SuppressWarnings("rawtypes")
Expand Down Expand Up @@ -178,7 +174,6 @@ public String discover(String text)
}
texte = StringUtils.replace(texte, part, "%{" + key + "}");
}
// }

return texte;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public Garbage()
{
toRemove = new ArrayList<String>();
toRename = new TreeMap<String, Object>();
/** this is a default value to remove */
// this is a default value to remove
toRemove.add("UNWANTED");
}

Expand All @@ -62,7 +62,7 @@ public void addToRename(String origin, Object value)
}

/**
* Set a field to be remove when exporting the final output.
* Set a field to be removed when exporting the final output.
*
* @param name of the field to remove
*/
Expand All @@ -78,7 +78,7 @@ public void addToRemove(String name)
}

/**
* Set a list of field name to be remove when exporting the final output.
* Set a list of field names to be removed when exporting the final output.
*
* @param lst list of elem to remove
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ public Grok()
strictMode = false;
}

// grok creators
/**
* Create a {@code Grok} instance using the default pattern, datatype and dateformat file.
*
Expand Down Expand Up @@ -573,10 +572,10 @@ public void compile(String pattern, boolean namedOnly)
throw new GrokException("{pattern} should not be empty");
}
namedRegexCollection.clear(); // when the grok object compiles the second format, named patterns in last format should not influence the current one.
namedRegex = pattern;
namedRegex = removeUnderscores(pattern);
originalGrokPattern = pattern;
int index = 0;
/** flag for infinite recurtion */
// flag for infinite recursion
int iterationLeft = 1000;
Boolean continueIteration = true;

Expand Down Expand Up @@ -623,12 +622,28 @@ public void compile(String pattern, boolean namedOnly)
}

if (namedRegex.isEmpty()) {
throw new GrokException("Pattern not fount");
throw new GrokException("Pattern not found");
}
// Compile the regex
compiledNamedRegex = Pattern.compile(namedRegex);
}

/**
 * Strips underscores from the names of named capturing groups in a regular expression.
 *
 * <p>{@link Pattern#compile(String)} only accepts letters and digits in a group name, so every
 * {@code (?<some_name>} opener is rewritten to {@code (?<somename>}. Only openers whose name
 * consists solely of letters, digits and underscores are rewritten. Lookbehind assertions
 * ({@code (?<=...)} and {@code (?<!...)}) share the {@code (?<} prefix but are left untouched,
 * as is all text outside the group name.
 *
 * @param namedRegex the regular expression (or grok pattern) to rewrite
 * @return the expression with underscores removed from named-group names
 */
public String removeUnderscores(String namedRegex)
{
    // Restricting the name to word characters keeps "(?<=" / "(?<!" lookbehinds from being
    // mistaken for a group opener that extends to the next '>' somewhere later in the expression.
    Pattern groupPattern = Pattern.compile("\\(\\?<(\\w+)>");
    Matcher groupMatcher = groupPattern.matcher(namedRegex);
    StringBuilder result = new StringBuilder();

    while (groupMatcher.find()) {
        String cleanName = groupMatcher.group(1).replace("_", "");
        // appendReplacement gives '$' and '\' special meaning, so pass the text as a literal
        groupMatcher.appendReplacement(result, Matcher.quoteReplacement("(?<" + cleanName + ">"));
    }
    groupMatcher.appendTail(result);
    return result.toString();
}

/**
* {@code Grok} will try to find the best expression that will match your input.
* {@link Discovery}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public class GrokDeserializer
implements LineDeserializer
{
private final List<Column> columns;
private Grok grokPattern;
private final Grok grokPattern;
private final List<Type> types;

public GrokDeserializer(List<Column> columns, String inputFormat, String inputGrokCustomPatterns)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
// Copyright 2014 Anthony Corbacho, and contributors.
abstract class IConverter<T>
{
public T convert(String value, String informat)
public T convert(String value, String format)
throws Exception
{
return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ public void setSubject(String text)
}

/**
* Retrurn the single line of log.
* Return the single line of log.
*
* @return the single line of log
*/
Expand All @@ -160,9 +160,6 @@ public void captures()
}
capture.clear();

// _capture.put("LINE", this.line);
// _capture.put("LENGTH", this.line.length() +"");

Map<String, String> mappedw = GrokUtils.namedGroups(this.match, this.subject);
Iterator<Entry<String, String>> it = mappedw.entrySet().iterator();
while (it.hasNext()) {
Expand Down Expand Up @@ -283,7 +280,7 @@ public Map<String, Object> toMap()
}

/**
* Remove and rename the unwanted elelents in the matched map.
* Remove and rename the unwanted elements in the matched map.
*/
private void cleanMap()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
@Deprecated
public class Pile
{
// Private
private List<Grok> groks;
private Map<String, String> patterns;
private List<String> patternFiles;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,7 @@ public void test001_httpd_access()

BufferedReader br = Files.newBufferedReader(Path.of(ResourceManager.ACCESS_LOG));
String line;
System.out.println("Starting test with httpd log");
while ((line = br.readLine()) != null) {
//System.out.println(line);
Match gm = g.match(line);
gm.captures();
assertThat(gm.toJson()).isNotNull();
Expand All @@ -52,14 +50,12 @@ public void test002_nasa_httpd_access()
throws GrokException, IOException
{
Grok g = Grok.create(ResourceManager.PATTERNS, "%{COMMONAPACHELOG}");
System.out.println("Starting test with nasa log -- may take a while");
BufferedReader br;
String line;
File dir = new File(ResourceManager.NASA);
for (File child : dir.listFiles()) {
br = Files.newBufferedReader(Path.of(ResourceManager.NASA + child.getName()));
while ((line = br.readLine()) != null) {
//System.out.println(child.getName() + " " +line);
Match gm = g.match(line);
gm.captures();
assertThat(gm.toJson()).isNotNull();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import io.trino.hive.formats.line.grok.exception.GrokException;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;
Expand All @@ -36,11 +35,10 @@ public class TestApacheDataType

@Test
public void test002_httpd_access_semi()
throws GrokException, IOException, ParseException
throws GrokException, ParseException
{
Grok g = Grok.create(ResourceManager.PATTERNS, "%{IPORHOST:clientip} %{USER:ident;boolean} %{USER:auth} \\[%{HTTPDATE:timestamp;date;dd/MMM/yyyy:HH:mm:ss Z}\\] \"(?:%{WORD:verb;string} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion;float})?|%{DATA:rawrequest})\" %{NUMBER:response;int} (?:%{NUMBER:bytes;long}|-)");

System.out.println(line);
Match gm = g.match(line);
gm.captures();

Expand All @@ -57,11 +55,10 @@ public void test002_httpd_access_semi()

@Test
public void test002_httpd_access_colon()
throws GrokException, IOException, ParseException
throws GrokException, ParseException
{
Grok g = Grok.create(ResourceManager.PATTERNS, "%{IPORHOST:clientip} %{USER:ident:boolean} %{USER:auth} \\[%{HTTPDATE:timestamp:date:dd/MMM/yyyy:HH:mm:ss Z}\\] \"(?:%{WORD:verb:string} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion:float})?|%{DATA:rawrequest})\" %{NUMBER:response:int} (?:%{NUMBER:bytes:long}|-)");

System.out.println(line);
Match gm = g.match(line);
gm.captures();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public void test001_compileFailOnInvalidExpression()

boolean thrown = false;

/** This should always throw */
// This should always throw
for (String regx : badRegxp) {
try {
g.compile(regx);
Expand Down Expand Up @@ -87,7 +87,7 @@ public void test003_samePattern()
}

@Test
public void test004_sameExpantedPatern()
public void test004_sameExpandedPattern()
throws GrokException
{
Grok g = Grok.create();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public static void setUp()
}

@Test
public void test001_captureMathod()
public void test001_captureMethod()
throws GrokException
{
grok.addPattern("foo", ".*");
Expand All @@ -48,7 +48,7 @@ public void test001_captureMathod()
}

@Test
public void test002_captureMathodMulti()
public void test002_captureMethodMulti()
throws GrokException
{
grok.addPattern("foo", ".*");
Expand All @@ -65,7 +65,7 @@ public void test002_captureMathodMulti()
}

@Test
public void test003_captureMathodNasted()
public void test003_captureMethodNested()
throws GrokException
{
grok.addPattern("foo", "\\w+ %{bar}");
Expand All @@ -82,7 +82,7 @@ public void test003_captureMathodNasted()
}

@Test
public void test004_captureNastedRecustion()
public void test004_captureNestedRecursion()
throws GrokException
{
grok.addPattern("foo", "%{foo}");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@ public class TestGrok
*
* g.addPatternFromFile("patterns/base"); g.compile("%{APACHE}"); Match gm =
* g.match("127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] \"GET /apache_pb.gif HTTP/1.0\" 200 2326"
* ); //Match gm = g.match("10.192.1.47"); gm.captures(); //See the result
* System.out.println(gm.toJson());
* ); //Match gm = g.match("10.192.1.47"); gm.captures(); //See the result gm.toJson()
*
* }
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,18 +118,9 @@ public void testPatternDoesNotExist()
assertError(columns, log, Optional.of("%{NONEXISTENT}"), Optional.empty(), "Grok compilation failure: Pattern NONEXISTENT is not defined.", Optional.of(GrokException.class));
}

// TODO: If pattern already exists, it just replaces the pattern, so this works (should we be able to do this though)
// @Test
// public void testPatternAlreadyExists()
// {
// String log = "abc123";
// List<Column> columns = ImmutableList.of(new Column("abc123", VARCHAR, 0));
//
// assertError(columns, log, Optional.of("%{POSINT}"), Optional.of("YEAR \\S+"), "<some message about pattern already exists>", Optional.of(GrokException.class));
// }
// TODO: If a pattern already exists, it is simply replaced; should we allow this to happen?

// TODO: bug input.format = "%WORD}" with a missing opening brace should throw exception, but compiles fine right now
// testing against input.format is really testing against regex pattern, not grok, so did not include these type of tests

@Test
public void testNoMatches()
Expand All @@ -147,6 +138,49 @@ public void testNoMatches()
Arrays.asList(null, null));
}

@Test
public void testUnderscoreInNamedGroups()
        throws IOException
{
    // Single named group whose name has interior and trailing underscores
    List<Column> singleColumn = ImmutableList.of(new Column("abc", VARCHAR, 0));
    assertLine(
            singleColumn,
            "abc",
            "(?<name_underscore__>\\S+)",
            Optional.empty(),
            Arrays.asList("abc"));

    // Eight named groups with underscores in leading, interior and trailing positions
    // (plus one name without any underscore); captured values are expected in group order
    String multiGroupFormat = "(?<na_me0>\\S+) (?<nam__e1>\\S+) \\[(?<__name2>([^\\]]*))\\] \"?(?<n_am_e3>([^\"]*))\"? (?<name4>\\S+) \"?(?<name5____>([^\"]*))\"? \"?(?<n_a_m_e_6>([^\"]*))\"? (?<__name__7__>\\S+)";
    String multiGroupLine = "192.168.1.1 GET [/api/users] \"Mozilla/5.0 Browser\" 200 \"Success Message\" \"Additional Details\" 1234ms";
    List<Column> eightColumns = ImmutableList.of(
            new Column("0", VARCHAR, 0),
            new Column("1", VARCHAR, 1),
            new Column("2", VARCHAR, 2),
            new Column("3", VARCHAR, 3),
            new Column("4", VARCHAR, 4),
            new Column("5", VARCHAR, 5),
            new Column("6", VARCHAR, 6),
            new Column("7", VARCHAR, 7));

    assertLine(
            eightColumns,
            multiGroupLine,
            multiGroupFormat,
            Optional.empty(),
            Arrays.asList(
                    "192.168.1.1",
                    "GET",
                    "/api/users",
                    "Mozilla/5.0 Browser",
                    "200",
                    "Success Message",
                    "Additional Details",
                    "1234ms"));
}

@Test
public void testExampleOneAthenaDocumentation()
throws IOException
Expand Down