SS/removeDuplicates.m at main · magnetotellurics/SS · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
function [dupl,seq,Data,GPS,tBox,tHead,stat,duplData] = removeDuplicates(seq,Data,GPS,tBox,tHead,stat,HZ)

% [dupl,seq,Data,GPS,tBox,tHead,stat,duplData] =
% removeDuplicates(seq,Data,GPS,tBox,tHead,stat,HZ)
%
% optionally, run dupl = removeDuplicates(seq) to find duplicate blocks
%
% Inputs:
%   seq    - per-block sequence numbers; a jump of exactly 256 between
%            consecutive entries marks a candidate duplicate pair
%   Data   - channels x samples array with HZ samples (columns) per block;
%            rows 1:3 are used for the duplicate check (the magnetics)
%   GPS, tBox, tHead, stat - per-block auxiliary streams, indexed like seq
%   HZ     - optional integer number of samples per block (default 1)
%
% Outputs:
%   dupl     - indices of the confirmed duplicates, expressed in the NEW
%              (shortened) seq array, i.e. pointing at the retained block
%              of each pair. In the one-argument form (or when no jump of
%              256 exists) these are the raw candidate indices instead.
%   seq,Data,GPS,tBox,tHead,stat - inputs with the first block of every
%              confirmed duplicate pair removed; seq is also lowered by
%              256 after each removed block
%   duplData - the Data columns that were removed (only filled when the
%              8th output is requested)
%
% Last edited by Anna Kelbert on Nov 2, 2015 to allow usage with 8 Hz data.
% The original version resulted in completely erroneous results: it not
% only wasn't looking for duplicates in the right places for 8 Hz, but also
% cut the data off at 1/8th of the size. 8 Hz data contains just as many
% blocks as similar 1 Hz data, but 8 times the data stream... This has now
% been fixed and can be used with HZ equal to any integer number.
%
% Strangely, for 8 Hz data, only the first measurement of 8 in the
% "duplicate" data blocks is in fact duplicate. I will still remove the
% second of the two blocks to be consistent with the GPS stream.
%
% Edited again by Anna Kelbert and Paul Bedrosian on Mar 25, 2016 to remove
% the first of the duplicate data blocks (not the second!) because that is
% the one where the E-fields tend to get corrupted, resulting in spikes.
% Additionally, we now locate the duplicate by checking that all the
% magnetics are identical, not the electrics. Finally, we now update the
% sequence numbers when we are done, to no longer count these (deleted)
% duplicates as data gaps.
%
% The magnetics check used to compare a data slice with itself, which is
% always true, so the data never took part in the verification. It now
% compares the first sample of block k with the first sample of block k+1.
%
% Optional output duplData are the data that have been removed on output.
% E.g., for 8 Hz data, Data(:,8*(dupl(4)-1)+1:8*dupl(4)) will contain the
% block of data that remains in the data stream while duplData(:,8*3+1:8*4)
% will contain the corresponding deleted data block.

% candidate duplicates: positions where seq jumps by exactly 256
dupl = find(diff(seq)==256);
duplData = [];

% one-argument form only reports the candidates; nothing to do if none
if nargin == 1 || isempty(dupl)
    return
end

% by default, assume 1 Hz data
if nargin < 7
    HZ = 1;
end

% verify that these correspond to duplicate blocks
% AND happen while the GPS lock is being acquired
status = false(1,length(dupl));
for i = 1:length(dupl)
    k = dupl(i);

    % first sample (column) of block k and of block k+1; block k occupies
    % columns HZ*(k-1)+1 : HZ*k of Data
    first_this = HZ*(k-1) + 1;
    first_next = HZ*k + 1;

    % Compare the magnetics (rows 1:3) of the two blocks. Only the first
    % sample of each block is compared: for 1 Hz data that is the whole
    % block, and for higher rates only the first measurement is reported
    % to be duplicated (see the header notes).
    % NOTE(review): confirm against real 8 Hz data that the first sample
    % of block k+1 is the one that repeats the first sample of block k.
    same_mag = isequal(Data(1:3,first_this), Data(1:3,first_next));

    status(i) = same_mag ...
        && (GPS(k) == GPS(k+1)) && (GPS(k) == 199) ...
        && (tBox(k) == tBox(k+1)) ...
        && (tHead(k) == tHead(k+1)) ...
        && (stat(k) == stat(k+1));
end

% count true duplicates and keep only those
ndupl = sum(status);
dupl = dupl(status);

% find the same duplicates in the data stream: all HZ columns of each
% duplicate block, flattened into one row of column indices
data_dupl = zeros(HZ,length(dupl));
for i = 1:length(dupl)
    data_dupl(:,i) = (HZ*(dupl(i)-1)+1:HZ*dupl(i));
end
data_dupl = reshape(data_dupl,1,HZ*length(dupl));

% an optional output used for keeping track of duplicate Data values
if nargout >= 8
    duplData = Data(:,data_dupl);
end

% remove them from the data
if ndupl > 0
    % undo the jump of 256 after every removed block so that the deleted
    % duplicates are not counted as data gaps later on
    for i = 1:length(dupl)
        seq(dupl(i)+1:end) = seq(dupl(i)+1:end) - 256;
    end

    % block indices to keep
    ind = 1:length(seq);
    ind = setdiff(ind,dupl);

    % sample (column) indices to keep
    data_ind = 1:(HZ*length(seq));
    data_ind = setdiff(data_ind,data_dupl);

    Data = Data(:,data_ind);
    GPS = GPS(ind);
    tBox = tBox(ind);
    tHead = tHead(ind);
    stat = stat(ind);
    seq = seq(ind);

    disp(['Deleted ' num2str(ndupl) ' duplicate blocks from ' num2str(HZ) ' Hz data']);
end

% update duplicate indices to the new seq array: i-1 blocks have been
% removed ahead of the i-th duplicate, so its retained partner (formerly
% block dupl(i)+1) now sits at dupl(i)-i+1
for i = 1:length(dupl)
    dupl(i) = dupl(i) - i + 1;
end