-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgetPairs.R
More file actions
77 lines (59 loc) · 1.81 KB
/
getPairs.R
File metadata and controls
77 lines (59 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#####
# GET PAIRS
####
library(RecordLinkage)
# ---- Load the example data ----
# RLdata500 is an example data set provided by the RecordLinkage package.
data("RLdata500")

# ---- Build the comparison patterns ----
# The result is a list with the components
# "data", "pairs", "frequencies" and "type".
# NOTE(review): the variable name `data` masks utils::data() as a value only;
# calls such as data(...) still resolve to the function.
data <- compare.dedup(
  dataset = RLdata500,
  # Blocking: restricts which candidate pairs are generated. Per the package
  # documentation each list element is one blocking pass (columns 1, 3, and
  # 5 to 7 taken together) -- TODO confirm against ?compare.dedup.
  blockfld = list(1, 3, 5:7),
  # Columns 2, 3 and 4 are compared with a string metric instead of
  # exact equality; the metric used is the Levenshtein similarity.
  strcmp = c(2, 3, 4),
  strcmpfun = levenshteinSim
)

# Overview of the generated record pairs.
summary(data)
# ---- Weight calculation ----
# Open the help page of epiWeights(); its result carries one additional
# component (Wdata, see below).
help("epiWeights")

# Replace `data` by the same pairs object extended with the weights,
# which are stored in the component `Wdata`.
data <- epiWeights(rpairs = data)

# `data` is a list; compare the number of pairs with the number of weights.
nrow(data$pairs)    # number of record pairs (one row per pair)
length(data$Wdata)  # number of weights

# First few weights of the record pairs.
head(data$Wdata)
###########
### get pairs
##########
# Open the help page of getPairs().
?getPairs

# Build a data frame with the record pairs contained in `data`.
rpairs <- getPairs(data, single.rows = FALSE)

# This is NOT the number of pairs: with single.rows = FALSE
# every pair occupies three rows of the result.
length(rpairs$id)

# First rows of the record pairs.
head(rpairs)

# Record pairs with weights between 0.5 and 0.6.
# The filtered result is computed once and reused below instead of calling
# getPairs() three times with identical arguments.
pairs_mid_weight <- getPairs(data, min.weight = 0.5, max.weight = 0.6)

# First and last rows of the filtered pairs.
head(pairs_mid_weight)
tail(pairs_mid_weight)

# Show all record pairs with weights between 0.5 and 0.6.
pairs_mid_weight
# ---- Classification ----
# Open the help page of epiClassify().
help("epiClassify")

# The classified object carries two additional components:
# `prediction` and `threshold`.
# Required arguments: the weighted pairs object and threshold.upper (0.5
# here). threshold.lower falls back to threshold.upper when not supplied.
cldata <- epiClassify(rpairs = data, threshold.upper = 0.5)

# With a single threshold only non-links and links are predicted; no pair is
# classified as "possible" because both thresholds coincide.
summary(cldata$prediction)

# Two distinct thresholds: upper = 0.6, lower = 0.4.
cldata <- epiClassify(
  rpairs = data,
  threshold.upper = 0.6,
  threshold.lower = 0.4
)
summary(cldata$prediction)

# The classified pairs can now be filtered via `show`, which accepts
# "links", "nonlinks", "possible" or "all".
getPairs(cldata, show = "links", single.rows = FALSE)