-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathstatic_sequencesSniffer.Rmd
More file actions
242 lines (166 loc) · 7.89 KB
/
static_sequencesSniffer.Rmd
File metadata and controls
242 lines (166 loc) · 7.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
---
title: "static sequenceSniffer"
author: "Anne Rutten"
date: "February 1, 2020"
output: html_document
---
static version of the duplicated-n-gram-flagging example. Easier to copy functionality out of.
## aim:
flag data points that are part of a sequence that is present at least twice in a given data set.
### `findDuplicates()`: identify duplicated sequences in a dataset
* works on data vectors (e.g., a dataframe column in 'long' format)
* function generates n-grams for given vector, iteratively increasing the sequence length from the specified `min_length` until no more duplicate n-grams are present in the data
* data points that are part of an n-gram that is present in the data more than once are marked with an identifier specific to the sequence
#### please note:
* the function does not yet check for overlapping sequences within a specific n-gram length, i.e. a sequence "A A B A A B A" will count and mark n-gram "A A B A" as duplicated
* longer sequences will overwrite shorter sequences that they overlap with, i.e., in the above example, 3-gram "A B A" will be overwritten by 4-gram "A A B A".
### usage:
* either just copy the `findDuplicates` function and run it inside your script, or
1. change the constants below and
2. knit this document to generate a report for a specific datafile
#### dependencies
```{r, message=FALSE }
library(tidyverse)
library(ngram)
library(shiny)
library(DT)
library(colorspace)
library(ggplot2)
```
#### controls
```{r, message=FALSE }
# CSV file that holds the data to be screened
fn <- "/home/anne/Pardosa_mesocosm_activity_P_R.csv"

# first and last column of the focal data (both ends are included)
fromColumn <- 3
toColumn <- 10

# shortest sequence (n-gram) length that is looked for
minLength <- 4

# EXPERIMENTAL: fuzzy match ("A A A A" will also match "A A B A")
# NOTE(review): not referenced by any other chunk visible in this document
fuzzyMatch <- FALSE

# skip n-grams that consist of one single repeated value (an attempt to
# ignore default values without having to name the actual value)
ignoreAllEqual <- FALSE

# variable(s) within which data points are reshuffled when the distribution
# of 'number of datapoints in duplicated sequences' is simulated.
# 'key' is the variable that holds the names of the focal columns once they
# have been stacked into 'long' format.
groupingVars <- c("Treatment", "key")
```
#### ducttape-and-tiewraps wrapper around ngram::ngram()
* `somevector`: vector of values that needs to be tested for repeat sequences
* `min_length`: the minimum sequence length
* `sep`: in case this is a vector of strings, some separator that is not present in the data itself
* `ignoreAllEqual`: ignore n-grams that are repeats of one single value (trying to ignore defaults/censored data/etc)
```{r}
# Flag the values of a vector that are part of a sequence (n-gram) which
# occurs more than once in that vector.
#
# The n-gram length starts at `min_length` and is increased by one until no
# duplicated n-gram is left. Because longer n-grams are handled later, their
# labels overwrite those of the shorter n-grams they overlap with.
#
# Arguments:
#   somevector      vector of values to screen (compared as character).
#   min_length      shortest n-gram length to consider.
#   ignoreAllEqual  if TRUE, n-grams that repeat one single value are skipped.
#   sep             separator used to paste the values into one string for
#                   ngram::ngram(); it must not occur inside the values.
#
# Returns a character vector with one element per element of `somevector`:
# NA for values that are not part of a duplicated n-gram, otherwise a label
# of the form "n<length>Seq<k>". An input shorter than `min_length`
# (including an empty one) yields all NA.
findDuplicates <- function(somevector, min_length, ignoreAllEqual = FALSE, sep = " ") {
  # matching is done on the character representation of the values
  tokens <- as.character(somevector)
  n_tokens <- length(tokens)
  seq_id <- rep(NA_character_, n_tokens)

  # paste all values into one string for n-gram calculation
  values_as_string <- paste(somevector, collapse = sep)

  n <- min_length
  # ngram() refuses to build n-grams that are longer than its input, so the
  # search is bounded by the number of values.
  while (n <= n_tokens) {
    phrases <- ngram::get.phrasetable(ngram::ngram(values_as_string, n, sep))
    # ngram() makes " " separated ngrams.
    duplicates <- strsplit(phrases$ngrams[phrases$freq > 1], " ", fixed = TRUE)

    if (ignoreAllEqual) {
      # judge every n-gram on its own values, not only the first one
      is_varied <- vapply(
        duplicates,
        function(ng) length(unique(ng)) > 1L,
        logical(1)
      )
      duplicates <- duplicates[is_varied]
    }
    if (length(duplicates) == 0L) {
      break
    }

    # an n-gram cannot start later than this without running off the end
    last_start <- n_tokens - n + 1L
    for (k in seq_along(duplicates)) {
      ng <- duplicates[[k]]
      candidates <- which(tokens == ng[1])
      candidates <- candidates[candidates <= last_start]
      # isTRUE() turns comparisons against NA values into "no match"
      is_match <- vapply(
        candidates,
        function(j) isTRUE(all(tokens[j + seq_along(ng) - 1L] == ng)),
        logical(1)
      )
      starts <- candidates[is_match]
      # expand every start position to the n positions the n-gram covers
      seq_id[rep(starts, each = n) + (seq_len(n) - 1L)] <- paste0("n", n, "Seq", k)
    }
    n <- n + 1
  }
  seq_id
}
```
#### raw data:
duplicated rows in focal columns marked in blue. This may mean nothing.
```{r, echo=FALSE}
# Read the data file and flag every row whose focal columns
# (fromColumn..toColumn, inclusive) are identical to those of another row.
d <- read.csv(fn)
# drop = FALSE keeps a data frame even if the focal range is a single column
focalData <- d[, fromColumn:toColumn, drop = FALSE]
# duplicated() alone skips the first copy of a repeated row; scanning from
# the end as well makes sure every copy is flagged
d$duplicatedRow <- duplicated(focalData) | duplicated(focalData, fromLast = TRUE)
# show the raw data with the focal cells of flagged rows in light blue
datatable(d) %>%
  formatStyle(fromColumn:toColumn, "duplicatedRow",
    backgroundColor = styleEqual(TRUE, "lightblue"))
```
```{r, echo=FALSE}
# Flag duplicated n-grams in the focal columns.

# A row identifier is needed to get from `long` back to `wide` later on.
# An existing column called 'originalRowID' is overwritten without a check.
d$originalRowID <- row.names(d)

# wide -> long: the focal column names go to 'key', their contents to 'value'
longd <- d %>% gather(key, value, fromColumn:toColumn)

# label every value that is part of a duplicated n-gram
longd$ngramID <- findDuplicates(
  somevector = longd$value,
  min_length = minLength,
  ignoreAllEqual = ignoreAllEqual
)

# long -> wide: one 'value_<key>' and one 'ngramID_<key>' column per focal column
duplicateNgramsWide <- pivot_wider(
  longd,
  names_from = key,
  values_from = c(value, ngramID)
)
```
#### data summary:
* `key`: column within specified column range
* `cardinality`: number of unique data values (less meaningful for `double` type, but who knows)
* `max_rle`: the maximum length of a single repeated value in a column
* `max_rle_at_level`: (if `max_rle`>1), the value(s) that has/have the `max_rle` (e.g. default values, censored data: many duplicate n-grams expected there)
-> many duplicates in a variable with low `cardinality` are to be expected.
-> many duplicates in a variable with a high `max_rle` are to be expected
```{r, echo=FALSE}
# Per focal column ('key'): how varied the values are and how many of them
# were flagged as part of a duplicated n-gram.
dataSummary <- longd %>%
  group_by(key) %>%
  summarise(
    # number of distinct values
    cardinality = n_distinct(value),
    # length of the longest run of one repeated value
    max_rle = max(rle(value)$lengths),
    # the value(s) that form a run of that length; NA if nothing is repeated
    max_rle_at_level = if (max_rle > 1) {
      list(unique(rle(value)$values[rle(value)$lengths == max_rle]))
    } else {
      list(NA)
    },
    # flagged values, all values, and the share of flagged values
    n_duplicates = sum(!is.na(ngramID)),
    n_total = n(),
    fraction = round(n_duplicates / n_total, 2)
  )
datatable(dataSummary)
```
#### detected sequences:
* light grey: row is not unique
* colour: sequence in column is not unique
```{r, echo=FALSE}
# Build a datatable in which duplicated rows and duplicated sequences are
# highlighted through the cell background colour.

# one colour per sequence label
ids <- longd$ngramID %>% .[!is.na(.)] %>% unique()
cols <- rainbow_hcl(length(ids))

# columns to colour ('value') and the columns that hold their labels ('ngramID')
varnames <- colnames(duplicateNgramsWide)
varsToFormat <- grep("value", varnames, value = TRUE)
varsRef <- grep("ngramID", varnames, value = TRUE)

markedTable <- duplicateNgramsWide %>%
  datatable() %>%
  # light grey: the row is not unique
  formatStyle(
    columns = varsToFormat,
    valueColumns = "duplicatedRow",
    backgroundColor = styleEqual(TRUE, "lightgrey")
  ) %>%
  # colour: the value belongs to a duplicated sequence
  formatStyle(
    columns = varsToFormat,
    valueColumns = varsRef,
    backgroundColor = styleEqual(ids, cols)
  )
markedTable
```
### expected datapoints-in-duplicated-sequences distribution for this dataset:
* randomly reorder datapoints within `r paste(groupingVars, collapse=", ")` (`key` contains the names of the column range specified by `fromColumn` and `toColumn`)
* calculate number of datapoints in recurring sequences
* repeat 1000 times
```{r, echo=FALSE}
# Simulate the expected number of flagged data points: shuffle the values
# within every combination of the grouping variables, flag duplicated
# n-grams again, and count the flagged values per focal column.
groupedD <- group_by_at(longd, groupingVars)

# one summary table per run; allocated up front instead of grown in the loop
res <- vector("list", 1000)
for (i in seq_along(res)) {
  # a random sort key per value, drawn within each group, gives a new order
  # inside the groups while the groups themselves stay together
  newd <- groupedD %>%
    mutate(newOrder = runif(n())) %>%
    arrange_at(c(groupingVars, "newOrder"))
  newd$newDupes <- findDuplicates(newd$value, minLength, ignoreAllEqual)
  res[[i]] <- group_by(newd, key) %>%
    summarise(n_duplicates = sum(!is.na(newDupes)))
}

# simulated counts, followed by the counts observed in the actual data
resdf <- bind_rows(res) %>%
  mutate(datasource = "simulated") %>%
  bind_rows(select(dataSummary, key, n_duplicates) %>%
    mutate(datasource = "actual data"))

# number of focal columns (one plot facet each); the range is inclusive,
# hence the + 1
nfacets <- toColumn - fromColumn + 1
```
```{r, echo=FALSE, fig.height=nfacets*2}
# Bar chart, one facet per focal column, of the number of flagged data
# points in the simulated runs and in the actual data.
resdf %>%
  as.data.frame() %>%
  ggplot(aes(x = n_duplicates, colour = datasource, fill = datasource)) +
  geom_bar() +
  facet_wrap(~key) +
  labs(
    title = "number of datapoints that are part of a duplicate sequence",
    subtitle = paste(
      "data reordered within:",
      paste(groupingVars, collapse = ", "),
      "\nn_runs=1000; minimum n-gram length =",
      minLength
    )
  ) +
  theme_bw() +
  theme(legend.position = "top")
```