Skip to content

Commit 976db12

Browse files
author
Doug Schmidt
authored
Merge pull request #316 from DougSchmidt-AI/feature/PF-1480-PointZillaImproveDuplicatePointRemoval
Feature/pf 1480 point zilla improve duplicate point removal
2 parents 12c5107 + d6351a1 commit 976db12

File tree

5 files changed

+61
-31
lines changed

5 files changed

+61
-31
lines changed

TimeSeries/PublicApis/SdkExamples/PointZilla/Context.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ public class Context
100100
public string CsvHeaderStartsWith { get; set; }
101101
public bool CsvRealign { get; set; }
102102
public bool CsvRemoveDuplicatePoints { get; set; } = true;
103+
public bool CsvWarnDuplicatePoints { get; set; } = true;
103104
public string CsvDelimiter { get; set; } = ",";
104105
public string CsvNanValue { get; set; }
105106
public int? ExcelSheetNumber { get; set; }

TimeSeries/PublicApis/SdkExamples/PointZilla/PointReaders/CsvReader.cs

Lines changed: 50 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -56,34 +56,9 @@ private List<TimeSeriesPoint> LoadPoints(string path)
5656

5757
var anyGapPoints = points.Any(p => p.Type == PointType.Gap);
5858

59-
if (Context.CsvRemoveDuplicatePoints && !anyGapPoints)
59+
if (points.Any() && Context.CsvRemoveDuplicatePoints && !anyGapPoints)
6060
{
61-
points = points
62-
.OrderBy(p => p.Time)
63-
.ToList();
64-
65-
var duplicatePointCount = 0;
66-
67-
for (var i = 1; i < points.Count; ++i)
68-
{
69-
var prevPoint = points[i - 1];
70-
var point = points[i];
71-
72-
if (point.Time != prevPoint.Time)
73-
continue;
74-
75-
++duplicatePointCount;
76-
77-
Log.Warn($"Discarding duplicate CSV point at {point.Time} with value {point.Value}");
78-
points.RemoveAt(i);
79-
80-
--i;
81-
}
82-
83-
if (duplicatePointCount > 0)
84-
{
85-
Log.Warn($"Removed {duplicatePointCount} duplicate CSV points.");
86-
}
61+
points = RemoveDuplicatePoints(points);
8762
}
8863

8964
if (Context.CsvRealign && !anyGapPoints)
@@ -110,6 +85,54 @@ private List<TimeSeriesPoint> LoadPoints(string path)
11085
return points;
11186
}
11287

88+
/// <summary>
/// Orders the points by time and discards every point whose timestamp equals
/// the timestamp of the point immediately before it in that ordering.
/// </summary>
/// <remarks>
/// Enumerable.OrderBy performs a stable sort, so within a run of equal timestamps
/// the point that appeared first in <paramref name="points"/> is the one retained.
/// </remarks>
/// <param name="points">The points to de-duplicate. The list passed in is not modified.</param>
/// <returns>
/// A new list ordered by time. When duplicates were found, it holds only the first
/// point of each timestamp; otherwise it is simply the sorted copy of the input.
/// </returns>
private List<TimeSeriesPoint> RemoveDuplicatePoints(List<TimeSeriesPoint> points)
{
    var sortedPoints = points
        .OrderBy(p => p.Time)
        .ToList();

    var duplicatePointCount = 0;

    // Allocated only when the first duplicate is seen, so duplicate-free input
    // (the common case) never pays for a second full-size list.
    List<TimeSeriesPoint> trimmedPoints = null;

    for (var i = 1; i < sortedPoints.Count; ++i)
    {
        var prevPoint = sortedPoints[i - 1];
        var point = sortedPoints[i];

        if (point.Time != prevPoint.Time)
        {
            // A unique timestamp only needs copying once the trimmed list is being built.
            trimmedPoints?.Add(point);

            continue;
        }

        if (trimmedPoints == null)
        {
            // We have just detected our first duplicate point, so collect all the leading points
            trimmedPoints = new List<TimeSeriesPoint>(sortedPoints.Count);
            trimmedPoints.AddRange(sortedPoints.Take(i));

            if (sortedPoints.Count > 2000)
            {
                Log.Warn($"Removing duplicate timestamps from {sortedPoints.Count} points (this may take a while) ...");
            }
        }

        ++duplicatePointCount;

        if (Context.CsvWarnDuplicatePoints)
        {
            Log.Warn($"Discarding duplicate CSV point at {point.Time} with value {point.Value}");
        }
    }

    if (trimmedPoints == null)
    {
        // No duplicates were found: the sorted copy is already the answer.
        return sortedPoints;
    }

    Log.Warn($"Removed {duplicatePointCount} duplicate CSV points.");

    return trimmedPoints;
}
135+
113136
private List<TimeSeriesPoint> LoadExcelPoints(string path)
114137
{
115138
using (var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))

TimeSeries/PublicApis/SdkExamples/PointZilla/PointReaders/ExternalPointsReader.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,8 @@ private List<TimeSeriesPoint> LoadPointsFromNg(IAquariusClient client)
8989
{
9090
Time = Instant.FromDateTimeOffset(p.Timestamp.DateTimeOffset),
9191
Value = p.Value.Numeric,
92-
GradeCode = gradesLookup.GetFirstMetadata(p.Timestamp.DateTimeOffset, g => int.Parse(g.GradeCode)),
93-
Qualifiers = qualifiersLookup.GetManyMetadata(p.Timestamp.DateTimeOffset, q => q.Identifier).ToList()
92+
GradeCode = gradesLookup.ResolveSingleMetadata(p.Timestamp.DateTimeOffset, g => int.Parse(g.GradeCode)),
93+
Qualifiers = qualifiersLookup.ResolveOverlappingMetadata(p.Timestamp.DateTimeOffset, q => q.Identifier).ToList()
9494
})
9595
.ToList();
9696

TimeSeries/PublicApis/SdkExamples/PointZilla/PointReaders/MetadataLookup.cs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,15 +82,20 @@ public IEnumerable<TMetadata> GetMany(DateTimeOffset timestamp)
8282

8383
public static class MetadataExtensions
8484
{
85-
public static T GetFirstMetadata<TMetadata, T>(this MetadataLookup<TMetadata> lookup, DateTimeOffset time, Func<TMetadata, T> func)
85+
/// <summary>
/// Looks up the metadata item applicable at <paramref name="time"/> and projects it with <paramref name="func"/>.
/// </summary>
/// <remarks>
/// Use for: Grade, Approval, Method, GapTolerance, InterpolationType.
/// Each of these metadata types can have exactly one or no applicable metadata at any given time.
/// InterpolationType is fixed for the lifetime of the series. Only one InterpolationType will exist.
/// </remarks>
/// <returns>
/// The projected value, or the default value of <typeparamref name="T"/> when the lookup finds nothing.
/// </returns>
public static T ResolveSingleMetadata<TMetadata, T>(this MetadataLookup<TMetadata> lookup, DateTimeOffset time, Func<TMetadata, T> func)
    where TMetadata : TimeRange
{
    var applicable = lookup.FirstOrDefault(time);

    if (applicable == null)
    {
        return default;
    }

    return func(applicable);
}
9295

93-
public static IEnumerable<T> GetManyMetadata<TMetadata, T>(this MetadataLookup<TMetadata> lookup, DateTimeOffset time, Func<TMetadata, T> func)
96+
// Use for: Qualifier, Note, CorrectionOperation
97+
// Each of these metadata types can have zero or more applicable items at any given time
98+
public static IEnumerable<T> ResolveOverlappingMetadata<TMetadata, T>(this MetadataLookup<TMetadata> lookup, DateTimeOffset time, Func<TMetadata, T> func)
9499
where TMetadata : TimeRange
95100
{
96101
return lookup

TimeSeries/PublicApis/SdkExamples/PointZilla/Program.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ private static Context ParseArgs(string[] args)
199199
new Option {Key = nameof(context.CsvIgnoreInvalidRows), Setter = value => context.CsvIgnoreInvalidRows = bool.Parse(value), Getter = () => context.CsvIgnoreInvalidRows.ToString(), Description = "Ignore CSV rows that can't be parsed"},
200200
new Option {Key = nameof(context.CsvRealign), Setter = value => context.CsvRealign = bool.Parse(value), Getter = () => context.CsvRealign.ToString(), Description = $"Realign imported CSV points to the /{nameof(context.StartTime)} value"},
201201
new Option {Key = nameof(context.CsvRemoveDuplicatePoints), Setter = value => context.CsvRemoveDuplicatePoints = bool.Parse(value), Getter = () => context.CsvRemoveDuplicatePoints.ToString(), Description = "Remove duplicate points in the CSV before appending."},
202+
new Option {Key = nameof(context.CsvWarnDuplicatePoints), Setter = value => context.CsvWarnDuplicatePoints = bool.Parse(value), Getter = () => context.CsvWarnDuplicatePoints.ToString(), Description = "Log a warning for every duplicate point removed."},
202203
new Option {Key = nameof(context.CsvDelimiter), Setter = value => context.CsvDelimiter = HttpUtility.UrlDecode(value), Getter = () => context.CsvDelimiter, Description = "Delimiter between CSV fields. (use %20 for space or %09 for tab)"},
203204
new Option {Key = nameof(context.CsvNanValue), Setter = value => context.CsvNanValue = value, Getter = () => context.CsvNanValue, Description = "Special value text used to represent NaN values"},
204205
new Option {Key = "CsvFormat", Setter = value => Formats.SetFormat(context, value), Description = Formats.Description },

0 commit comments

Comments
 (0)