---
title: "NBA Topic Mining"
author: "Shanshan Bradford"
output:
  word_document: default
  pdf_document: default
  html_document: default
---
Section 1
```{r setup, include = TRUE, error=TRUE, fig.dim=c(2, 0.5)}
options(tinytex.verbose = TRUE)
#clear up memory, set working directory and seeds
rm(list = ls())
setwd("/Users/syu/Library/CloudStorage/OneDrive-St.JudeChildren'sResearchHospital/UDrive/Documents_syu_Backup/Github_deposit/TextMining")
#Load twitter package
library(twitteR)
library(bitops)
library(RCurl)
library(ROAuth)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```
# 1. Retrieve tweets from Twitter with the hashtag #nba
# Assign the Twitter consumer key/secret and access token/secret
consumer_key <- "QLmWyDb3OmiMi7kkWta68F5rd"
consumer_secret <- "7YmvWUwlj6VwqA1B5P1TDetEelfJJnaqOyqGjIKisgvKFvXeib"
access.token <- "913200731829698560-N8rWlg3JqjK473rMWpqpEcvI8nHWC1B"
access.secret <- "Ka7zH3edvoOULrJC4CoudLZmqJiTxoI9JlkGOxBNJbGw2"
# Connect to Twitter and search for tweets tagged #nba
setup_twitter_oauth(consumer_key, consumer_secret, access.token, access.secret)
nba.tweets <- searchTwitter("#nba", n = 320, lang = "en")
#strip retweets and check the number of tweets afterward
nba.nort <- strip_retweets(nba.tweets, strip_manual = T, strip_mt = T)
length(nba.nort)
#convert the tweets to dataframe and check out associated attributes
nba.df <- twListToDF(nba.nort)
colnames(nba.df)
#save tweets to csv
write.csv(nba.df, file = "NBA_tweets.csv", na = "NA")
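The Twitter API endpoints that twitteR depends on have since been retired, so the retrieval code above may no longer authenticate. As a hedged alternative (a sketch only, not part of the original analysis and not evaluated here), the rtweet package offers an equivalent search; `search_tweets()` and its `include_rts` argument are the only calls assumed:
```{r rtweet-alternative, eval = FALSE}
# Sketch only: assumes rtweet has been authenticated with a valid developer token
library(rtweet)
nba.tweets.alt <- search_tweets("#nba lang:en", n = 320, include_rts = FALSE)
```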
Section 2
```{r, include = TRUE, error=TRUE}
options(tinytex.verbose = TRUE)
# 2. Clean up tweets
tweet.tb <- read.csv("NBA_tweets.csv", header = T, sep = ",", as.is = T)
nba.text <- as.character(tweet.tb$text)  # plain character vector of tweet text
# Replace @UserName with one space
# One space replacement is to avoid words being glued together
nba.modify <- gsub("@\\w{1,20}", " ", nba.text)
# Replace control character "\n" and "\n\n" with one space
nba.modify <- gsub("[[:cntrl:]]{1,10}", " ", nba.modify)
# Replace https links with one space
nba.modify <- gsub("(https)(://)(.*)[/]\\w+", " ", nba.modify)
# Replace punctuation with one space
nba.modify <- gsub("[[:punct:]]{1,20}", " ", nba.modify)
# Replace non graphical character with space
nba.modify <- gsub("[^[:graph:]]", " ", nba.modify)
# Replace tab and extra space introduced early with one space
nba.modify <- gsub("[ |\t]{2,}", " ", nba.modify)
nba.modify <- gsub("\\s+", " ", nba.modify)
# Remove extra blank space at the beginning and the end
nba.modify <- gsub("^ +", "", nba.modify)
nba.modify <- gsub(" $+", "", nba.modify)
# 3. Preprocess tweets further for analysis
library(NLP)
library("tm")
library(RColorBrewer)
library(wordcloud)
library("SnowballC")
library("lsa")
# generate corpus for the cleaned nba tweets and check out the corpus length
nba.corpus <- VCorpus(VectorSource(nba.modify))
length(nba.corpus)
# transform the corpus to lower case, remove punctuation and numbers and randomly check sample
trans.nbacorp <- tm_map(nba.corpus, content_transformer(tolower)) #convert to lower cases
trans.nbacorp <- tm_map(trans.nbacorp, removePunctuation) # remove punctuation
trans.nbacorp <- tm_map(trans.nbacorp, removeNumbers) # remove numbers
trans.nbacorp <- tm_map(trans.nbacorp, stripWhitespace)
# remove stop words
input.nbacorp <- tm_map(trans.nbacorp, removeWords,
c(stopwords("english"), "can", "don", "just", "nba", "via",
"e", "s", "y"))
# Inspect an entry that is empty after cleaning (entry 4)
inspect(input.nbacorp[[4]])
# 4. Generate a word cloud with the NBA tweets
set.seed(1234)
#generate document-term matrix in order to remove empty documents
nbacorp.dtm <- DocumentTermMatrix(input.nbacorp)
row.total <- apply(nbacorp.dtm, 1, sum)
#Correspondingly, remove the same empty entries from the corpus and document-term matrix
input.nbacorp.noemp <- input.nbacorp[which(row.total > 0)]
nbacorp.docterm <- DocumentTermMatrix(input.nbacorp.noemp)
# or the following code will generate the same doc-term matrix with empty entries removed
nbacorp.docterm <- nbacorp.dtm[which(row.total > 0),]
# The positional index of the matrix shifts accordingly, but the document names keep their original entry numbers
inspect(nbacorp.docterm[15:16,])
# generate the term-document Matrix from the cleaned document-term matrix
nbacorp.terdoc <- t(nbacorp.docterm)
inspect(nbacorp.terdoc)
# Find terms in the term-doc matrix that appear at least 3 times
findFreqTerms(nbacorp.terdoc, lowfreq = 3)
# Sort the terms by frequency and plot terms appearing at least 3 times in a word cloud
nbacorp.terdoc.matrix <- as.matrix(nbacorp.terdoc)
nbaterm.freqbydoc <- sort(rowSums(nbacorp.terdoc.matrix), decreasing = T, na.last = NA)
wordcloud(names(nbaterm.freqbydoc), nbaterm.freqbydoc, min.freq = 3, max.words = 100,
          colors = brewer.pal(8, "Dark2"))
# subset the terms with a frequency over 5 and generate a barplot
V.minfreq6 <- rowSums(nbacorp.terdoc.matrix) > 5
nbaterm.minfreq6 <- sort(rowSums(nbacorp.terdoc.matrix)[V.minfreq6], decreasing = T)
term.barplot <- barplot(nbaterm.minfreq6, horiz = F, col = rainbow(length(nbaterm.minfreq6)))
legend(20, 20.5, legend = names(nbaterm.minfreq6),fill = rainbow(length(nbaterm.minfreq6)),
cex = 0.75, ncol = 3, x.intersp = 0.2, y.intersp = 0.7, text.width = 9, bty = "n")
# 5. Identify the top three pairs of tweets and the most frequently used terms among these pairs
library(reshape2)
library(Matrix)
library("lsa")
# Create the cosine similarity matrix and check the result
nbacorp.cosi <- as.matrix(cosine(nbacorp.terdoc.matrix))
nbacorp.cosi[1:9, 1:9]
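# Illustrative sketch of what lsa::cosine() computes for each pair of tweet
# columns, using two tiny hypothetical term-count vectors:
#   cos(a, b) = sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
a <- c(2, 1, 0, 1)
b <- c(1, 1, 0, 0)
sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
cosine(a, b)  # the two lines should return the same value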
# Replace the diagonal values (1, or 0 for an all-zero document) with NA
diag.replace <- function(x){
for (i in 1: nrow(x)){
if (x[i, i] == 1 | x[i, i] == 0)
{ x[i,i] <- NA }
}
return(x)
}
nbacorp.cosmod <- diag.replace(nbacorp.cosi)
nbacorp.cosmod[1:6, 1:6]
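# Vectorized alternative (sketch): base R can blank the whole diagonal at once.
# The loop above also handles a 0 diagonal, which would only arise for an
# all-zero document; those were removed earlier, so this is a quick cross-check.
nbacorp.cosalt <- nbacorp.cosi
diag(nbacorp.cosalt) <- NA
all.equal(nbacorp.cosalt, nbacorp.cosmod)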
#convert the sparse matrix into a molten data frame and sort it based on the cosine value
nbacorp.cosmolten <- melt(nbacorp.cosmod, na.rm = T, c("m.row.doc", "m.col.doc"))
nbacorp.cosmolten <- nbacorp.cosmolten[order(nbacorp.cosmolten$value, decreasing = T),]
nbacorp.cosmolten[1:25,]
# Inspect tweet pairs with a cosine similarity of 1 --> these tweets appear to be reposts
inspect(input.nbacorp[[15]])
inspect(input.nbacorp[[16]])
inspect(input.nbacorp[[42]])
inspect(input.nbacorp[[61]])
inspect(input.nbacorp[[44]])
inspect(input.nbacorp[[57]])
# After empty entries are removed, positional indices in the matrix no longer match the original document numbers
typeof(row.names(nbacorp.docterm)) # document names in the doc-term matrix are characters
# Therefore, character values rather than numeric positions correctly index the original document entries
inspect(nbacorp.docterm[c("15","16","61","42","57","44"),])
#coerce the doc-term matrix to R matrix
nbacorp.docterm.matrix <- as.matrix(nbacorp.docterm)
#subset the matrix with reposted tweets
nbacorp.repost.matrix <- nbacorp.docterm.matrix[c("15","16","61","42","57","44"),]
nbacorp.repost.matrix[, 1:20] # although subsetted, the matrix still carries every term from all the tweets
# which() & apply() pick out the terms that actually occur in the reposted docs/tweets
term.inrepost <- names(which(apply(nbacorp.repost.matrix, 2, sum) > 0))
#The top 10 most used terms from all the tweets
top10.term <- names(nbaterm.freqbydoc[1:10])
# Write a function to check whether any of the top 10 terms appear among the terms of the subsetted similar tweets
identical.term <- function(x, y){
for (i in 1: length(x)){
if(length(grep(x[i], y)) > 0)
{print(c(x[i],grep(x[i], y, value = T)))}
}
}
top10.term
term.inrepost
identical.term(term.inrepost, top10.term)
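# Simpler built-in check (sketch): intersect() reports exact overlaps only,
# whereas identical.term() above also catches partial matches found by grep()
intersect(term.inrepost, top10.term)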
## Inspect tweet pairs with cosine similarity less than 1
inspect(input.nbacorp[[182]])
inspect(input.nbacorp[[200]])
inspect(input.nbacorp[[50]])
inspect(input.nbacorp[[44]])
inspect(input.nbacorp[[138]])
inspect(input.nbacorp[[131]])
# Subset the matrix with the docs from these high-similarity pairs
nbacorp.similar.matrix <- nbacorp.docterm.matrix[c("182", "200","50", "44", "138", "131"), ]
nbacorp.similar.matrix[, 1:20] # although subsetted, the matrix still carries every term from all the tweets
# which() & apply() pick out the terms that actually occur in these similar docs/tweets
term.insimilar <- names(which(apply(nbacorp.similar.matrix, 2, sum) > 0))
#Check whether any of the top 10 terms are included in the similar tweets
top10.term
term.insimilar
identical.term(term.insimilar, top10.term)
# 6. Identify terms with the highest weighted tf-idf among the top three pairs of tweets
# Calculate the tf-idf of the document-term matrix created in step 4
nbacorp.dttfidf <- weightTfIdf(nbacorp.docterm)
inspect(nbacorp.dttfidf[1:6,])
#convert the document-term matrix to numeric matrix and calculate a total tfidf of each document
nbacorp.dttfidf.matrix <- as.matrix(nbacorp.dttfidf)
nbadoc.countfidf <- sort(rowSums(nbacorp.dttfidf.matrix), decreasing = T)
nbadoc.countfidf[1:20]
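# Sketch of what weightTfIdf() computes per cell (assuming tm's default,
# normalized weighting): tf = term count / document length,
# idf = log2(number of documents / number of documents containing the term)
n.docs <- nrow(nbacorp.docterm.matrix)
doc.freq <- colSums(nbacorp.docterm.matrix > 0)
manual.tf <- nbacorp.docterm.matrix / rowSums(nbacorp.docterm.matrix)
manual.tfidf <- sweep(manual.tf, 2, log2(n.docs / doc.freq), `*`)
manual.tfidf[1:3, 1:5]  # should broadly match nbacorp.dttfidf.matrix[1:3, 1:5]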
# Write a function to find documents/tweets with identical tf-idf sums (likely duplicates)
same.tweets <- function(x) {
temp.x <- x
names(temp.x) <- NULL
for(i in 1:(length(temp.x) - 1))
{
if(identical(temp.x[i], temp.x[i+1]) == T)
{print(x[c(i,i+1)])}
}
}
same.tweets(nbadoc.countfidf)
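# Built-in alternative (sketch): duplicated() flags repeated tf-idf totals
# directly; checking both directions returns every member of each tied pair
tied <- duplicated(nbadoc.countfidf) | duplicated(nbadoc.countfidf, fromLast = TRUE)
nbadoc.countfidf[tied]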
# Inspect the content of the tweets with the highest scores
inspect(input.nbacorp[[3]])
inspect(input.nbacorp[[39]])
inspect(input.nbacorp[[49]])
inspect(input.nbacorp[[29]])
inspect(input.nbacorp[[38]])
inspect(input.nbacorp[[143]])
inspect(input.nbacorp[[148]])
#calculate tfidf of all the terms and convert results to R matrix
nbacorp.tertfidf.matrix <- as.matrix(weightTfIdf(nbacorp.terdoc, normalize = T))
#subset the matrix with 3 pairs of tweets having the highest tfidf sum
top3tweet.tfidf.matrix <- nbacorp.tertfidf.matrix[, c("3", "39", "29", "38", "143", "148")]
top3tweet.tfidf.matrix[1:10,] # although subsetted, the matrix still carries terms used only in other tweets
term.top3tweet <- names(which(apply(top3tweet.tfidf.matrix, 1, sum) > 0))
# Extract the top 10 terms with the highest tf-idf values
top10.tfidfterm <- sort(rowSums(nbacorp.tertfidf.matrix), decreasing = T)[1:10]
top10.tfidfterm <- names(top10.tfidfterm)
# Check for overlapping terms with the identical.term function
term.top3tweet
top10.tfidfterm
identical.term(term.top3tweet, top10.tfidfterm)
# 7. Determine the optimal number of clusters for the tweets
# Compute k-means for k = 2 to k = 15 and plot the within-cluster/between-cluster SS ratio
set.seed(2345)
k.max <- 15
tot.wss <- sapply(2:k.max, simplify = T,
function(k){kmeans(nbacorp.docterm.matrix, k, nstart = 50, iter.max = 100)$tot.withinss})
bet.ss <- sapply(2:k.max, simplify = T,
function(k){kmeans(nbacorp.docterm.matrix, k, nstart = 50, iter.max = 100)$betweenss})
tot.wss
bet.ss
plot(2:k.max, tot.wss/bet.ss,
type = "b", pch = 19, frame = T, lwd = 1, col= rainbow(k.max),
xlab = "Number of clusters K", ylab = "Ratio of total within-clusters to betweenss")
text(2:k.max, tot.wss/bet.ss, labels = 2:k.max, adj = c(-0.5, -0.5), cex = 0.75)
abline(v = 5, lwd = 2, lty = 4, col = "blue")
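# Numeric companion to the elbow plot (sketch): successive drops in the ratio
# flatten out near the visually chosen elbow at k = 5
elbow.ratio <- tot.wss / bet.ss
round(diff(elbow.ratio), 3)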
# 8. Identify the groups of tweets having similar characteristics
# Choose k = 5 clusters (near the elbow marked above)
set.seed(2345)
nbacorp.cluster <- kmeans(nbacorp.docterm.matrix, 5, nstart = 30, iter.max = 50)
nbacorp.cluster$cluster[1:25]
#use sapply to extract the text from corpus
inputcorp.text <- t(data.frame(sapply(input.nbacorp.noemp, "[", "content")))
# Match the non-empty corpus entries back to the original tweets,
# then combine the original tweet text, the cleaned text, and the cluster assignments
row.total.dataframe <- names(which(row.total > 0))
tweet.txtclust <- data.frame(tweet.tb[c(row.total.dataframe),]$text,
as.character(inputcorp.text),
nbacorp.cluster$cluster)
#change the column names and organize the table by clusters
names(tweet.txtclust) <- c("original tweets", "cleaned tweets", "K-clusters")
tweet.txtclust <- tweet.txtclust[order(tweet.txtclust$`K-clusters`, decreasing = F),]
#subset cleaned tweet text data by clusters.
tweet.txtK1 <- tweet.txtclust[tweet.txtclust$`K-clusters` == 1, ]$`cleaned tweets`
tweet.txtK2 <- tweet.txtclust[tweet.txtclust$`K-clusters` == 2, ]$`cleaned tweets`
tweet.txtK3 <- tweet.txtclust[tweet.txtclust$`K-clusters` == 3, ]$`cleaned tweets`
tweet.txtK4 <- tweet.txtclust[tweet.txtclust$`K-clusters` == 4, ]$`cleaned tweets`
tweet.txtK5 <- tweet.txtclust[tweet.txtclust$`K-clusters` == 5, ]$`cleaned tweets`
as.character(tweet.txtK1)[1:40]
as.character(tweet.txtK2)[1:5]
as.character(tweet.txtK3)[1:15]
as.character(tweet.txtK4)[1:20]
as.character(tweet.txtK5)[1:20]
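# Sketch: characterize each cluster by its five most frequent terms, using the
# same document-term matrix and the cluster assignments computed above
for (k in sort(unique(nbacorp.cluster$cluster))) {
  cluster.rows <- nbacorp.docterm.matrix[nbacorp.cluster$cluster == k, , drop = FALSE]
  cat("Cluster", k, ":", names(sort(colSums(cluster.rows), decreasing = TRUE))[1:5], "\n")
}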
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```