# milestone.R
# install.packages(c("quanteda", "LaF", "RColorBrewer")) # parallel ships with base R
# For reproducibility's sake
set.seed(19394399)
download.maybe <- function(url, refetch = FALSE, path = ".") {
  dest <- file.path(path, basename(url))
  if (refetch || !file.exists(dest))
    download.file(url, dest)
  dest
}
download.maybe("http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip")
# unzip file if not previously done
if (!file.exists("final")) { # final dir contains files
  unzip("Coursera-SwiftKey.zip")
}
# Download profanity list
download.maybe("https://raw.githubusercontent.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en")
# Words to be excluded from the text, e.g. profanity
excluded <- scan("en",
                 sep = "\n",
                 what = character())
# Filter out phrases, just use words
excluded <- excluded[grepl("^\\w+$", excluded, perl = TRUE)]
# Only replace whole words
excluded <- paste0("\\b", excluded, "\\b", collapse = "|")
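# Illustrative check of the pattern above (a sketch; the example only behaves as
# described if the word actually appears on the downloaded list): thanks to the
# \b anchors, only standalone occurrences are replaced, so something like
#   gsub(excluded, "_UNK_", "an ass and an assistant", perl = TRUE)
# would tag the first word while leaving "assistant" untouched.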
library(LaF) # for sample_lines
library(quanteda)
library(parallel)
library(RColorBrewer)
nLinesRatio <- .6 # proportion of lines to sample from each file
nTwitterLines <- 2360148 # can use determine_nlines("final/en_US/en_US.twitter.txt")
nBlogLines <- 899288 # can use determine_nlines("final/en_US/en_US.blogs.txt")
nNewsLines <- 1010242 # can use determine_nlines("final/en_US/en_US.news.txt")
nTotalLines <- nTwitterLines + nBlogLines + nNewsLines
tweets <- sample_lines("final/en_US/en_US.twitter.txt", round(nTwitterLines * nLinesRatio), nTwitterLines)
blogs <- sample_lines("final/en_US/en_US.blogs.txt", round(nBlogLines * nLinesRatio), nBlogLines)
news <- sample_lines("final/en_US/en_US.news.txt", round(nNewsLines * nLinesRatio), nNewsLines)
tweets <- tokenize(tweets, what = "sentence", simplify = TRUE)
blogs <- tokenize(blogs, what = "sentence", simplify = TRUE)
news <- tokenize(news, what = "sentence", simplify = TRUE)
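# For illustration (the exact splitting depends on the quanteda version):
#   tokenize("Good morning. How are you?", what = "sentence", simplify = TRUE)
# yields roughly c("Good morning.", "How are you?"), so each downstream
# "document" is a single sentence.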
# Replace words to be excluded with the _UNK_ placeholder
removeExcluded <- function(txt) {
  # replace profanity with UNK tag
  # Convert to lower case to ease comparisons
  txt <- toLower(txt)
  ptm <- proc.time()
  txt <- parSapply(cl, txt, function(x) gsub(excluded, "_UNK_", x, perl = TRUE))
  print(proc.time() - ptm) # report elapsed time of the parallel substitution
  txt
}
# Initiate cluster for parallel operations
cl <- makeCluster(detectCores() / 2, type="FORK") # FORK will not work on Windows
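# Rough Windows-friendly alternative (untested sketch): a PSOCK cluster does not
# inherit the workspace, so the regex would have to be exported explicitly:
#   cl <- makeCluster(detectCores() / 2, type = "PSOCK")
#   clusterExport(cl, "excluded")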
tweets <- removeExcluded(tweets)
blogs <- removeExcluded(blogs)
news <- removeExcluded(news)
# Stop cluster as we're done with it
stopCluster(cl)
makeDfm <- function(txt, ngramCount) {
  if (ngramCount > 1) {
    # Pad each sentence with _S_ start/end markers so boundary ngrams are captured
    txt <- unname(sapply(txt, function(x) paste(paste(rep("_S_", ngramCount - 1), collapse = " "),
                                                x,
                                                paste(rep("_S_", ngramCount - 1), collapse = " "))))
  }
  txtDfm <- dfm(txt,
                removePunct = TRUE,
                toLower = FALSE,
                removeNumbers = TRUE,
                removeTwitter = TRUE,
                ngrams = ngramCount)
  # trim(txtDfm, minCount = 2) # Remove features with fewer than 2 counts
  txtDfm
}
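# Illustration of the padding above: with ngramCount = 3 the sentence
# "i love cake" becomes "_S_ _S_ i love cake _S_ _S_", so the trigram features
# include boundary-aware entries built from (_S_, _S_, i) and (cake, _S_, _S_)
# as well as (i, love, cake); the exact feature names depend on the ngram
# concatenator used by this quanteda version.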
# Build unigram, bigram and trigram DFMs for each corpus
tweetsUnigram <- makeDfm(tweets, 1)
tweetsBigram <- makeDfm(tweets, 2)
tweetsTrigram <- makeDfm(tweets, 3)
blogsUnigram <- makeDfm(blogs, 1)
blogsBigram <- makeDfm(blogs, 2)
blogsTrigram <- makeDfm(blogs, 3)
newsUnigram <- makeDfm(news, 1)
newsBigram <- makeDfm(news, 2)
newsTrigram <- makeDfm(news, 3)
# Extract features with counts from each DFM
makeDfmFreq <- function(ngramDfm) {
  topfeatures(ngramDfm, nfeature(ngramDfm))
}
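# makeDfmFreq() returns a named numeric vector sorted by decreasing count, e.g.
# (made-up numbers) head(tweetsUnigramFreq, 3) might look like
#     the     to      i
#   51234  39876  37654
# which is a convenient shape for the frequency tables and plots in the report.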
# Ngram frequency tables for each corpus
tweetsUnigramFreq <- makeDfmFreq(tweetsUnigram)
tweetsBigramFreq <- makeDfmFreq(tweetsBigram)
tweetsTrigramFreq <- makeDfmFreq(tweetsTrigram)
blogsUnigramFreq <- makeDfmFreq(blogsUnigram)
blogsBigramFreq <- makeDfmFreq(blogsBigram)
blogsTrigramFreq <- makeDfmFreq(blogsTrigram)
newsUnigramFreq <- makeDfmFreq(newsUnigram)
newsBigramFreq <- makeDfmFreq(newsBigram)
newsTrigramFreq <- makeDfmFreq(newsTrigram)
# Basic counts (each document in the DFM is one sentence)
tweetsSentences <- ndoc(tweetsUnigram)
tweetsTokens <- sum(ntoken(tweetsUnigram))
tweetsTypes <- nfeature(tweetsUnigram) # distinct word types in the corpus
blogsSentences <- ndoc(blogsUnigram)
blogsTokens <- sum(ntoken(blogsUnigram))
blogsTypes <- nfeature(blogsUnigram)
newsSentences <- ndoc(newsUnigram)
newsTokens <- sum(ntoken(newsUnigram))
newsTypes <- nfeature(newsUnigram)
totalSentences <- tweetsSentences + blogsSentences + newsSentences
totalTokens <- tweetsTokens + blogsTokens + newsTokens
totalTypes <- tweetsTypes + blogsTypes + newsTypes # types shared across corpora are counted once per corpus
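# Optional helper (a sketch, not used elsewhere in this script): gather the
# per-corpus counts into a single data frame for the corpus-size table in the
# milestone report.
corpusSummary <- data.frame(
  corpus    = c("twitter", "blogs", "news", "total"),
  sentences = c(tweetsSentences, blogsSentences, newsSentences, totalSentences),
  tokens    = c(tweetsTokens, blogsTokens, newsTokens, totalTokens),
  types     = c(tweetsTypes, blogsTypes, newsTypes, totalTypes)
)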
# Word clouds of the most frequent unigrams in each corpus
old.par <- par(mfrow=c(1, 3))
plot(tweetsUnigram, max.words=100, colors = brewer.pal(6, "Dark2"), scale=c(8, .5))
tweetsUnigramWC <- recordPlot()
plot.new() ## clean up device
tweetsUnigramWC # redraw
plot(blogsUnigram, max.words=100, colors = brewer.pal(6, "Dark2"), scale=c(8, .5))
blogsUnigramWC <- recordPlot()
plot.new() ## clean up device
blogsUnigramWC # redraw
plot(newsUnigram, max.words=100, colors = brewer.pal(6, "Dark2"), scale=c(8, .5))
newsUnigramWC <- recordPlot()
plot.new() ## clean up device
newsUnigramWC # redraw
par(old.par)
# Save everything the milestone report needs; ls(pattern = ) expects a regular
# expression, so anchor the suffixes instead of using glob-style "*" wildcards
save(list = ls(pattern = paste0(c("Freq$",
                                  "Sentences$",
                                  "Tokens$",
                                  "Types$",
                                  "Lines",
                                  "UnigramWC$"), collapse = "|")),
     file = "all.RData")
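# The saved objects can then be pulled into the report with, e.g.,
#   load("all.RData")
# (assuming the milestone report R Markdown file lives in the same directory).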