
Commit 2c25fcd

Author: Doug Schmidt
Merge pull request #300 from DougSchmidt-AI/feature/PF-1405-PointZillaLogPointTimeRange
PF-1405 - Added /CsvHeaderStartsWith support for CSV and Excel files
2 parents fad3031 + 4b4d700 commit 2c25fcd

File tree

4 files changed (+123 −12 lines)


TimeSeries/PublicApis/SdkExamples/PointZilla/Context.cs

+1
@@ -91,6 +91,7 @@ public class Context
         public int CsvSkipRows { get; set; }
         public bool CsvHasHeaderRow { get; set; }
         public bool CsvIgnoreInvalidRows { get; set; }
+        public string CsvHeaderStartsWith { get; set; }
         public bool CsvRealign { get; set; }
         public bool CsvRemoveDuplicatePoints { get; set; } = true;
         public string CsvDelimiter { get; set; } = ",";

TimeSeries/PublicApis/SdkExamples/PointZilla/PointReaders/CsvReader.cs

+83 −11

@@ -17,6 +17,7 @@ namespace PointZilla.PointReaders
 {
     public class CsvReader : CsvReaderBase, IPointReader
     {
+        // ReSharper disable once PossibleNullReferenceException
        private static readonly ILog Log = LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);

        public CsvReader(Context context)
@@ -98,7 +99,7 @@ private List<TimeSeriesPoint> LoadPoints(string path)
                }
            }

-            Log.Info($"Loaded {PointSummarizer.Summarize(points, "point")} from '{path}'.");
+            Log.Info($"Loaded {PointSummarizer.Summarize(points)} from '{path}'.");

            return points;
        }
@@ -144,14 +145,7 @@ private List<TimeSeriesPoint> LoadPoints(IExcelDataReader excelReader)
                ConfigureDataTable = tableReader => new ExcelDataTableConfiguration
                {
                    UseHeaderRow = Context.CsvHasHeaderRow,
-
-                    ReadHeaderRow = rowReader =>
-                    {
-                        for (; skipRows > 0; --skipRows)
-                        {
-                            rowReader.Read();
-                        }
-                    }
+                    ReadHeaderRow = rowReader => ReadHeaderRow(rowReader, ref skipRows)
                }
            });

@@ -168,7 +162,7 @@ private List<TimeSeriesPoint> LoadPoints(IExcelDataReader excelReader)
            ValidateHeaderFields(table
                .Columns
                .Cast<DataColumn>()
-                .Select(c => c.ColumnName)
+                .Select(c => c.ColumnName.Trim())
                .ToArray());

            return table
@@ -180,6 +174,80 @@ private List<TimeSeriesPoint> LoadPoints(IExcelDataReader excelReader)
                .ToList();
        }

+        private void ReadHeaderRow(IExcelDataReader rowReader, ref int skipRows)
+        {
+            var startingHeaderColumns = GetStartingHeaderColumns();
+
+            for (; skipRows > 0; --skipRows)
+            {
+                if (!rowReader.Read())
+                    return;
+            }
+
+            if (!startingHeaderColumns.Any())
+                return;
+
+            while (true)
+            {
+                if (IsHeaderRowMatched(GetFields(rowReader), startingHeaderColumns))
+                    break;
+
+                if (!rowReader.Read())
+                    return;
+            }
+        }
+
+        private List<string> GetFields(IExcelDataReader rowReader)
+        {
+            var fieldCount = rowReader.FieldCount;
+
+            var fields = new List<string>();
+
+            for (var i = 0; i < fieldCount; ++i)
+            {
+                var field = rowReader.IsDBNull(i)
+                    ? string.Empty
+                    : Convert.ToString(rowReader.GetValue(i)).Trim();
+
+                fields.Add(field);
+            }
+
+            return fields;
+        }
+
+        private List<string> GetStartingHeaderColumns()
+        {
+            return (Context.CsvHeaderStartsWith ?? string.Empty)
+                .Split(',')
+                .Select(s => s.Trim())
+                .ToList();
+        }
+
+        private bool IsHeaderRowMatched(IReadOnlyList<string> fields, IReadOnlyList<string> startingHeaderColumns)
+        {
+            if (!startingHeaderColumns.Any())
+                return false;
+
+            if (!startingHeaderColumns.Any(string.IsNullOrEmpty))
+            {
+                // When the expected columns don't contain a blank column, we only match against non-empty fields
+                fields = fields
+                    .Where(s => !string.IsNullOrEmpty(s))
+                    .ToList();
+            }
+
+            for (var i = 0; i < startingHeaderColumns.Count; ++i)
+            {
+                if (i >= fields.Count)
+                    return false;
+
+                if (!startingHeaderColumns[i].Equals(fields[i], StringComparison.InvariantCultureIgnoreCase))
+                    return false;
+            }
+
+            return true;
+        }
+
        private TimeSeriesPoint ParseExcelRow(DataRow row)
        {
            Instant? time = null;
@@ -306,7 +374,7 @@ private List<TimeSeriesPoint> LoadCsvPoints(string path)
            }

            var skipCount = Context.CsvSkipRows;
-
+            var startingHeaderColumns = GetStartingHeaderColumns();
            var parseHeaderRow = Context.CsvHasHeaderRow;

            while (!parser.EndOfData)
@@ -322,6 +390,9 @@ private List<TimeSeriesPoint> LoadCsvPoints(string path)
                    continue;
                }

+                if (parseHeaderRow && startingHeaderColumns.Any() && !IsHeaderRowMatched(fields, startingHeaderColumns))
+                    continue;
+
                if (parseHeaderRow)
                {
                    ValidateHeaderFields(fields);
@@ -425,6 +496,7 @@ private TimeSeriesPoint ParsePoint(string[] fields)
            ParseField(fields, Context.CsvNotesField?.ColumnIndex, text =>
            {
                if (time.HasValue)
+                    // ReSharper disable once PossibleInvalidOperationException
                    AddRowNote(time.Value, text);
            });
        }
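The matching rules introduced above are easiest to see with concrete inputs. Below is a minimal standalone C# sketch (not part of the commit) that mirrors the `GetStartingHeaderColumns()` / `IsHeaderRowMatched()` behaviour and applies it to the two example header rows used in the Readme change further down; the class and helper names are illustrative only.

```csharp
// Standalone sketch of the header-matching rule: trimmed, case-insensitive prefix matching,
// with blank header fields ignored unless the hints themselves contain a blank entry.
using System;
using System.Collections.Generic;
using System.Linq;

internal static class HeaderMatchSketch
{
    // Mirrors GetStartingHeaderColumns(): split the /CsvHeaderStartsWith value on commas and trim each hint
    private static List<string> ParseHints(string csvHeaderStartsWith) =>
        (csvHeaderStartsWith ?? string.Empty)
            .Split(',')
            .Select(s => s.Trim())
            .ToList();

    // Mirrors GetFields() for a CSV row: split and trim each field
    private static List<string> SplitRow(string row) =>
        row.Split(',').Select(s => s.Trim()).ToList();

    // Mirrors IsHeaderRowMatched()
    private static bool IsHeaderRowMatched(IReadOnlyList<string> fields, IReadOnlyList<string> hints)
    {
        if (!hints.Any())
            return false;

        if (!hints.Any(string.IsNullOrEmpty))
        {
            // No blank hints => only non-empty header fields are considered
            fields = fields.Where(s => !string.IsNullOrEmpty(s)).ToList();
        }

        for (var i = 0; i < hints.Count; ++i)
        {
            if (i >= fields.Count || !hints[i].Equals(fields[i], StringComparison.InvariantCultureIgnoreCase))
                return false;
        }

        return true;
    }

    private static void Main()
    {
        var plainHeader = SplitRow("Date, Time, Value, Grade, Status, Note");
        var blankColumnHeader = SplitRow("Date,,,,Time,Value,Grade,Status,Note");

        var hints = ParseHints("Date, Time, Value, Grade");            // no blank hints
        var blankHints = ParseHints("Date, , , , Time, Value, Grade"); // blank hints must match blank columns

        Console.WriteLine(IsHeaderRowMatched(plainHeader, hints));            // True
        Console.WriteLine(IsHeaderRowMatched(blankColumnHeader, hints));      // True (blank columns ignored)
        Console.WriteLine(IsHeaderRowMatched(plainHeader, blankHints));       // False (blank hint vs. "Time")
        Console.WriteLine(IsHeaderRowMatched(blankColumnHeader, blankHints)); // True
    }
}
```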

TimeSeries/PublicApis/SdkExamples/PointZilla/Program.cs

+1 −0

@@ -189,6 +189,7 @@ private static Context ParseArgs(string[] args)
                new Option {Key = nameof(context.CsvComment), Setter = value => context.CsvComment = value, Getter = () => context.CsvComment, Description = "CSV comment lines begin with this prefix"},
                new Option {Key = nameof(context.CsvSkipRows), Setter = value => context.CsvSkipRows = int.Parse(value), Getter = () => context.CsvSkipRows.ToString(), Description = "Number of CSV rows to skip before parsing"},
                new Option {Key = nameof(context.CsvHasHeaderRow), Setter = value => context.CsvHasHeaderRow = bool.Parse(value), Getter = () => string.Empty, Description = "Does the CSV have a header row naming the columns. [default: true if any columns are referenced by name]"},
+                new Option {Key = nameof(context.CsvHeaderStartsWith), Setter = value => context.CsvHeaderStartsWith = value, Getter = () => context.CsvHeaderStartsWith, Description = "A comma separated list of the first expected header column names"},
                new Option {Key = nameof(context.CsvIgnoreInvalidRows), Setter = value => context.CsvIgnoreInvalidRows = bool.Parse(value), Getter = () => context.CsvIgnoreInvalidRows.ToString(), Description = "Ignore CSV rows that can't be parsed"},
                new Option {Key = nameof(context.CsvRealign), Setter = value => context.CsvRealign = bool.Parse(value), Getter = () => context.CsvRealign.ToString(), Description = $"Realign imported CSV points to the /{nameof(context.StartTime)} value"},
                new Option {Key = nameof(context.CsvRemoveDuplicatePoints), Setter = value => context.CsvRemoveDuplicatePoints = bool.Parse(value), Getter = () => context.CsvRemoveDuplicatePoints.ToString(), Description = "Remove duplicate points in the CSV before appending."},

TimeSeries/PublicApis/SdkExamples/PointZilla/Readme.md

+38 −1

@@ -9,6 +9,7 @@ Points can be specified from:
 - Signal generators: linear, saw-tooth, square-wave, or sine-wave signals. Useful for just getting *something* into a time-series
 - CSV files (including CSV exports from AQTS Springboard)
 - Points retrieved live from other AQTS systems, including from legacy 3.X systems.
+- The results of a database query (via direct support for SqlServer, Postgres, and MySql. ODBC connections are supported too, but require configuration)
 - `CMD.EXE`, `PowerShell` or `bash`: `PointZilla` works well from within any shell.

 Basic time-series will append time/value pairs. Reflected time-series also support setting grade codes and/or qualifiers to each point.
@@ -166,10 +167,45 @@ $ ./PointZilla.exe -server=myserver Stage.Label@MyLocation Downloads/ExportedFro

 ### Use column names or 1-based column indexes to reference a column from your CSV

-You can reference a column either by a name (eg. `-CsvDateTimeField="ISO 8601 UTC"`) or by a 1-based column index (eg. `-CsvDateTimeField=1`). When at least one field has a column name, the `-CsvHasHeaderRow=true` option is assumed.
+You can reference a column either by a header name (eg. `-CsvDateTimeField="ISO 8601 UTC"`) or by a 1-based column index (eg. `-CsvDateTimeField=1`). When at least one field has a column name, the `-CsvHasHeaderRow=true` option is assumed.
+
+Referencing columns by name has some nice benefits:
+- Columns can appear in any order in the header line.
+- Column name matching is case-insensitive.

 Referencing columns by name is usually more robust, but you may not have control over the format of the CSV file being consumed.

+### When your data isn't at the start of your CSV
+
+Some data files have extra rows at the start. PointZilla has a few options to help locate the start of the data to extract:
+
+The `/CsvComment={prefix}` option tells the CSV parser to skip over any lines that begin with the given prefix.
+
+The `/CsvSkipRows={integer}` option tells the CSV parser to skip over the specified number of lines before parsing the data. Lines matching the `/CsvComment=` test are not counted as skipped rows.
+
+The `/CsvHeaderStartsWith={hint1, hint2, ..., hintN}` option gives the CSV parser a header-row detection hint: a comma-separated list of expected column names.
+
+- Each hint is trimmed of leading/trailing whitespace.
+- Column name matching is case-insensitive.
+- If none of the expected column hints are empty, then the match is performed against non-empty fields from the header row. This is usually what you want.
+- If any of the expected column hints are empty, then the match is performed column-by-column and blank hints must match blank columns in the header row.
+
+So `/CsvHeaderStartsWith="Date, Time, Value, Grade"` and `/CsvHeaderStartsWith=Date,Time,Value,Grade` will both match:
+
+```csv
+Date, Time, Value, Grade, Status, Note
+2021-Oct-12, 12:56, 4.5, Good, Normal, Things are fine
+```
+
+And will also match this CSV with 3 blank columns between the `Date` and `Time` columns:
+
+```csv
+Date,,,,Time,Value,Grade,Status,Note
+2021-Oct-12,,,,12:56,4.5,Good,Normal,Things are fine
+```
+
+Adding empty hint columns like `/CsvHeaderStartsWith="Date, , , , Time, Value, Grade"` or `/CsvHeaderStartsWith=Date,,,,Time,Value,Grade` will only match the second CSV.
+
 ### Reading timestamps from CSV files

 Timestamps can be extracted in a few ways:
@@ -597,6 +633,7 @@ Supported -option=value settings (/option=value works too):
   -CsvComment                  CSV comment lines begin with this prefix
   -CsvSkipRows                 Number of CSV rows to skip before parsing [default: 0]
   -CsvHasHeaderRow             Does the CSV have a header row naming the columns. [default: true if any columns are referenced by name]
+  -CsvHeaderStartsWith         A comma separated list of the first expected header column names
   -CsvIgnoreInvalidRows        Ignore CSV rows that can't be parsed [default: False]
   -CsvRealign                  Realign imported CSV points to the /StartTime value [default: False]
   -CsvRemoveDuplicatePoints    Remove duplicate points in the CSV before appending. [default: True]
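For context, a usage sketch (not part of the commit): assuming a file named StageData.csv whose header row sits below a preamble of unknown length, the new option lets the parser find the header by its leading column names, in the same invocation style as the Readme's existing examples:

```sh
$ ./PointZilla.exe -server=myserver Stage.Label@MyLocation StageData.csv -CsvHeaderStartsWith="Date, Time, Value, Grade"
```

Here `myserver`, `Stage.Label@MyLocation`, and `StageData.csv` are placeholders; only `-CsvHeaderStartsWith` is the behaviour added by this commit.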
