
Commit 2c25fcd

Author: Doug Schmidt
Merge pull request #300 from DougSchmidt-AI/feature/PF-1405-PointZillaLogPointTimeRange
PF-1405 - Added /CsvHeaderStartsWith support for CSV and Excel files
2 parents fad3031 + 4b4d700 commit 2c25fcd

File tree

4 files changed (+123 −12 lines)


TimeSeries/PublicApis/SdkExamples/PointZilla/Context.cs

+1
@@ -91,6 +91,7 @@ public class Context
         public int CsvSkipRows { get; set; }
         public bool CsvHasHeaderRow { get; set; }
         public bool CsvIgnoreInvalidRows { get; set; }
+        public string CsvHeaderStartsWith { get; set; }
         public bool CsvRealign { get; set; }
         public bool CsvRemoveDuplicatePoints { get; set; } = true;
         public string CsvDelimiter { get; set; } = ",";

TimeSeries/PublicApis/SdkExamples/PointZilla/PointReaders/CsvReader.cs

+83 −11

@@ -17,6 +17,7 @@ namespace PointZilla.PointReaders
 {
     public class CsvReader : CsvReaderBase, IPointReader
     {
+        // ReSharper disable once PossibleNullReferenceException
        private static readonly ILog Log = LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);

        public CsvReader(Context context)
@@ -98,7 +99,7 @@ private List<TimeSeriesPoint> LoadPoints(string path)
                }
            }

-            Log.Info($"Loaded {PointSummarizer.Summarize(points, "point")} from '{path}'.");
+            Log.Info($"Loaded {PointSummarizer.Summarize(points)} from '{path}'.");

            return points;
        }
@@ -144,14 +145,7 @@ private List<TimeSeriesPoint> LoadPoints(IExcelDataReader excelReader)
                ConfigureDataTable = tableReader => new ExcelDataTableConfiguration
                {
                    UseHeaderRow = Context.CsvHasHeaderRow,
-
-                    ReadHeaderRow = rowReader =>
-                    {
-                        for (; skipRows > 0; --skipRows)
-                        {
-                            rowReader.Read();
-                        }
-                    }
+                    ReadHeaderRow = rowReader => ReadHeaderRow(rowReader, ref skipRows)
                }
            });

@@ -168,7 +162,7 @@ private List<TimeSeriesPoint> LoadPoints(IExcelDataReader excelReader)
            ValidateHeaderFields(table
                .Columns
                .Cast<DataColumn>()
-                .Select(c => c.ColumnName)
+                .Select(c => c.ColumnName.Trim())
                .ToArray());

            return table
@@ -180,6 +174,80 @@ private List<TimeSeriesPoint> LoadPoints(IExcelDataReader excelReader)
                .ToList();
        }

+        private void ReadHeaderRow(IExcelDataReader rowReader, ref int skipRows)
+        {
+            var startingHeaderColumns = GetStartingHeaderColumns();
+
+            for (; skipRows > 0; --skipRows)
+            {
+                if (!rowReader.Read())
+                    return;
+            }
+
+            if (!startingHeaderColumns.Any())
+                return;
+
+            while (true)
+            {
+                if (IsHeaderRowMatched(GetFields(rowReader), startingHeaderColumns))
+                    break;
+
+                if (!rowReader.Read())
+                    return;
+            }
+        }
+
+        private List<string> GetFields(IExcelDataReader rowReader)
+        {
+            var fieldCount = rowReader.FieldCount;
+
+            var fields = new List<string>();
+
+            for (var i = 0; i < fieldCount; ++i)
+            {
+                var field = rowReader.IsDBNull(i)
+                    ? string.Empty
+                    : Convert.ToString(rowReader.GetValue(i)).Trim();
+
+                fields.Add(field);
+            }
+
+            return fields;
+        }
+
+        private List<string> GetStartingHeaderColumns()
+        {
+            return (Context.CsvHeaderStartsWith ?? string.Empty)
+                .Split(',')
+                .Select(s => s.Trim())
+                .ToList();
+        }
+
+        private bool IsHeaderRowMatched(IReadOnlyList<string> fields, IReadOnlyList<string> startingHeaderColumns)
+        {
+            if (!startingHeaderColumns.Any())
+                return false;
+
+            if (!startingHeaderColumns.Any(string.IsNullOrEmpty))
+            {
+                // When the expected columns don't contain a blank column, we only match against non-empty fields
+                fields = fields
+                    .Where(s => !string.IsNullOrEmpty(s))
+                    .ToList();
+            }
+
+            for (var i = 0; i < startingHeaderColumns.Count; ++i)
+            {
+                if (i >= fields.Count)
+                    return false;
+
+                if (!startingHeaderColumns[i].Equals(fields[i], StringComparison.InvariantCultureIgnoreCase))
+                    return false;
+            }
+
+            return true;
+        }
+
        private TimeSeriesPoint ParseExcelRow(DataRow row)
        {
            Instant? time = null;
@@ -306,7 +374,7 @@ private List<TimeSeriesPoint> LoadCsvPoints(string path)
            }

            var skipCount = Context.CsvSkipRows;
-
+            var startingHeaderColumns = GetStartingHeaderColumns();
            var parseHeaderRow = Context.CsvHasHeaderRow;

            while (!parser.EndOfData)
@@ -322,6 +390,9 @@ private List<TimeSeriesPoint> LoadCsvPoints(string path)
                    continue;
                }

+                if (parseHeaderRow && startingHeaderColumns.Any() && !IsHeaderRowMatched(fields, startingHeaderColumns))
+                    continue;
+
                if (parseHeaderRow)
                {
                    ValidateHeaderFields(fields);
@@ -425,6 +496,7 @@ private TimeSeriesPoint ParsePoint(string[] fields)
            ParseField(fields, Context.CsvNotesField?.ColumnIndex, text =>
            {
                if (time.HasValue)
+                    // ReSharper disable once PossibleInvalidOperationException
                    AddRowNote(time.Value, text);
            });
        }
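The matching rules introduced above are easiest to see with concrete inputs. Below is a minimal standalone C# sketch (not part of the commit) that mirrors the `GetStartingHeaderColumns()` / `IsHeaderRowMatched()` behaviour and applies it to the two example header rows used in the Readme change further down; the class and helper names are illustrative only.

```csharp
// Standalone sketch of the header-matching rule: trimmed, case-insensitive prefix matching,
// with blank header fields ignored unless the hints themselves contain a blank entry.
using System;
using System.Collections.Generic;
using System.Linq;

internal static class HeaderMatchSketch
{
    // Mirrors GetStartingHeaderColumns(): split the /CsvHeaderStartsWith value on commas and trim each hint
    private static List<string> ParseHints(string csvHeaderStartsWith) =>
        (csvHeaderStartsWith ?? string.Empty)
            .Split(',')
            .Select(s => s.Trim())
            .ToList();

    // Mirrors GetFields() for a CSV row: split and trim each field
    private static List<string> SplitRow(string row) =>
        row.Split(',').Select(s => s.Trim()).ToList();

    // Mirrors IsHeaderRowMatched()
    private static bool IsHeaderRowMatched(IReadOnlyList<string> fields, IReadOnlyList<string> hints)
    {
        if (!hints.Any())
            return false;

        if (!hints.Any(string.IsNullOrEmpty))
        {
            // No blank hints => only non-empty header fields are considered
            fields = fields.Where(s => !string.IsNullOrEmpty(s)).ToList();
        }

        for (var i = 0; i < hints.Count; ++i)
        {
            if (i >= fields.Count || !hints[i].Equals(fields[i], StringComparison.InvariantCultureIgnoreCase))
                return false;
        }

        return true;
    }

    private static void Main()
    {
        var plainHeader = SplitRow("Date, Time, Value, Grade, Status, Note");
        var blankColumnHeader = SplitRow("Date,,,,Time,Value,Grade,Status,Note");

        var hints = ParseHints("Date, Time, Value, Grade");            // no blank hints
        var blankHints = ParseHints("Date, , , , Time, Value, Grade"); // blank hints must match blank columns

        Console.WriteLine(IsHeaderRowMatched(plainHeader, hints));            // True
        Console.WriteLine(IsHeaderRowMatched(blankColumnHeader, hints));      // True (blank columns ignored)
        Console.WriteLine(IsHeaderRowMatched(plainHeader, blankHints));       // False (blank hint vs. "Time")
        Console.WriteLine(IsHeaderRowMatched(blankColumnHeader, blankHints)); // True
    }
}
```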

TimeSeries/PublicApis/SdkExamples/PointZilla/Program.cs

+1 −0

@@ -189,6 +189,7 @@ private static Context ParseArgs(string[] args)
                new Option {Key = nameof(context.CsvComment), Setter = value => context.CsvComment = value, Getter = () => context.CsvComment, Description = "CSV comment lines begin with this prefix"},
                new Option {Key = nameof(context.CsvSkipRows), Setter = value => context.CsvSkipRows = int.Parse(value), Getter = () => context.CsvSkipRows.ToString(), Description = "Number of CSV rows to skip before parsing"},
                new Option {Key = nameof(context.CsvHasHeaderRow), Setter = value => context.CsvHasHeaderRow = bool.Parse(value), Getter = () => string.Empty, Description = "Does the CSV have a header row naming the columns. [default: true if any columns are referenced by name]"},
+                new Option {Key = nameof(context.CsvHeaderStartsWith), Setter = value => context.CsvHeaderStartsWith = value, Getter = () => context.CsvHeaderStartsWith, Description = "A comma separated list of the first expected header column names"},
                new Option {Key = nameof(context.CsvIgnoreInvalidRows), Setter = value => context.CsvIgnoreInvalidRows = bool.Parse(value), Getter = () => context.CsvIgnoreInvalidRows.ToString(), Description = "Ignore CSV rows that can't be parsed"},
                new Option {Key = nameof(context.CsvRealign), Setter = value => context.CsvRealign = bool.Parse(value), Getter = () => context.CsvRealign.ToString(), Description = $"Realign imported CSV points to the /{nameof(context.StartTime)} value"},
                new Option {Key = nameof(context.CsvRemoveDuplicatePoints), Setter = value => context.CsvRemoveDuplicatePoints = bool.Parse(value), Getter = () => context.CsvRemoveDuplicatePoints.ToString(), Description = "Remove duplicate points in the CSV before appending."},

TimeSeries/PublicApis/SdkExamples/PointZilla/Readme.md

+38 −1

@@ -9,6 +9,7 @@ Points can be specified from:
 - Signal generators: linear, saw-tooth, square-wave, or sine-wave signals. Useful for just getting *something* into a time-series
 - CSV files (including CSV exports from AQTS Springboard)
 - Points retrieved live from other AQTS systems, including from legacy 3.X systems.
+- The results of a database query (via direct support for SqlServer, Postgres, and MySql. ODBC connections are supported too, but require configuration)
 - `CMD.EXE`, `PowerShell` or `bash`: `PointZilla` works well from within any shell.

 Basic time-series will append time/value pairs. Reflected time-series also support setting grade codes and/or qualifiers to each point.
@@ -166,10 +167,45 @@ $ ./PointZilla.exe -server=myserver Stage.Label@MyLocation Downloads/ExportedFro

 ### Use column names or 1-based column indexes to reference a column from your CSV

-You can reference a column either by a name (eg. `-CsvDateTimeField="ISO 8601 UTC"`) or by a 1-based column index (eg. `-CsvDateTimeField=1`). When at least one field has a column name, the `-CsvHasHeaderRow=true` option is assumed.
+You can reference a column either by a header name (eg. `-CsvDateTimeField="ISO 8601 UTC"`) or by a 1-based column index (eg. `-CsvDateTimeField=1`). When at least one field has a column name, the `-CsvHasHeaderRow=true` option is assumed.
+
+Referencing columns by name has some nice benefits:
+- Columns can appear in any order in the header line.
+- Column name matching is case-insensitive.

 Referencing columns by name is usually more robust, but you may not have control over the format of the CSV file being consumed.

+### When your data isn't at the start of your CSV
+
+Some data files have extra rows at the start. PointZilla has a few options to help locate the start of the data to extract:
+
+The `/CsvComment={prefix}` option tells the CSV parser to skip over any lines that begin with the given prefix.
+
+The `/CsvSkipRows={integer}` option tells the CSV parser to skip over the specified number of lines before parsing the data. Lines matching the `/CsvComment=` test are not counted as skipped rows.
+
+The `/CsvHeaderStartsWith={hint1, hint2, ..., hintN}` option gives the CSV parser a header-row detection hint: a comma-separated list of expected column names.
+
+- Each hint is trimmed of leading/trailing whitespace.
+- Column name matching is case-insensitive.
+- If none of the expected column hints are empty, then the match is performed against non-empty fields from the header row. This is usually what you want.
+- If any of the expected column hints are empty, then the match is performed column-by-column and blank hints must match blank columns in the header row.
+
+So `/CsvHeaderStartsWith="Date, Time, Value, Grade"` and `/CsvHeaderStartsWith=Date,Time,Value,Grade` will both match:
+
+```csv
+Date, Time, Value, Grade, Status, Note
+2021-Oct-12, 12:56, 4.5, Good, Normal, Things are fine
+```
+
+And will also match this CSV with 3 blank columns between the `Date` and `Time` columns:
+
+```csv
+Date,,,,Time,Value,Grade,Status,Note
+2021-Oct-12,,,,12:56,4.5,Good,Normal,Things are fine
+```
+
+Adding empty hint columns like `/CsvHeaderStartsWith="Date, , , , Time, Value, Grade"` or `/CsvHeaderStartsWith=Date,,,,Time,Value,Grade` will only match the second CSV.
+
 ### Reading timestamps from CSV files

 Timestamps can be extracted in a few ways:
@@ -597,6 +633,7 @@ Supported -option=value settings (/option=value works too):
   -CsvComment                  CSV comment lines begin with this prefix
   -CsvSkipRows                 Number of CSV rows to skip before parsing [default: 0]
   -CsvHasHeaderRow             Does the CSV have a header row naming the columns. [default: true if any columns are referenced by name]
+  -CsvHeaderStartsWith         A comma separated list of the first expected header column names
   -CsvIgnoreInvalidRows        Ignore CSV rows that can't be parsed [default: False]
   -CsvRealign                  Realign imported CSV points to the /StartTime value [default: False]
   -CsvRemoveDuplicatePoints    Remove duplicate points in the CSV before appending. [default: True]
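For context, a usage sketch (not part of the commit): assuming a file named StageData.csv whose header row sits below a preamble of unknown length, the new option lets the parser find the header by its leading column names, in the same invocation style as the Readme's existing examples:

```sh
$ ./PointZilla.exe -server=myserver Stage.Label@MyLocation StageData.csv -CsvHeaderStartsWith="Date, Time, Value, Grade"
```

Here `myserver`, `Stage.Label@MyLocation`, and `StageData.csv` are placeholders; only `-CsvHeaderStartsWith` is the behaviour added by this commit.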
