---
title: "NBA Topic Mining"
author: "Shanshan Bradford"
output:
  word_document: default
  pdf_document: default
  html_document: default
---
Section 1
```{r setup, include = TRUE, error=TRUE, fig.dim=c(2, 0.5)}
options(tinytex.verbose = TRUE)
#clear up memory, set working directory and seeds
rm(list = ls())
setwd("/Users/syu/Library/CloudStorage/OneDrive-St.JudeChildren'sResearchHospital/UDrive/Documents_syu_Backup/Github_deposit/TextMining")
#Load twitter package
library(twitteR)
library(bitops)
library(RCurl)
library(ROAuth)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```
# 1. Retrieve tweets from Twitter with the hashtag #nba
# Assign the Twitter consumer key/secret and access token/secret
consumer_key <- "QLmWyDb3OmiMi7kkWta68F5rd"
consumer_secret <- "7YmvWUwlj6VwqA1B5P1TDetEelfJJnaqOyqGjIKisgvKFvXeib"
access.token <- "913200731829698560-N8rWlg3JqjK473rMWpqpEcvI8nHWC1B"
access.secret <- "Ka7zH3edvoOULrJC4CoudLZmqJiTxoI9JlkGOxBNJbGw2"
# Connect to Twitter and search for tweets tagged #nba
setup_twitter_oauth(consumer_key, consumer_secret, access.token, access.secret)
nba.tweets <- searchTwitter("#nba", n = 320, lang = "en")
#strip retweets and check the number of tweets afterward
nba.nort <- strip_retweets(nba.tweets, strip_manual = T, strip_mt = T)
length(nba.nort)
#convert the tweets to dataframe and check out associated attributes
nba.df <- twListToDF(nba.nort)
colnames(nba.df)
#save tweets to csv
write.csv(nba.df, file = "NBA_tweets.csv", na = "NA")
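The Twitter API endpoints that twitteR depends on have since been retired, so the retrieval code above may no longer authenticate. As a hedged alternative (a sketch only, not part of the original analysis and not evaluated here), the rtweet package offers an equivalent search; `search_tweets()` and its `include_rts` argument are the only calls assumed:
```{r rtweet-alternative, eval = FALSE}
# Sketch only: assumes rtweet has been authenticated with a valid developer token
library(rtweet)
nba.tweets.alt <- search_tweets("#nba lang:en", n = 320, include_rts = FALSE)
```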
Section 2
```{r, include = TRUE, error=TRUE}
options(tinytex.verbose = TRUE)
# 2. Clean up tweets
tweet.tb <- read.csv("NBA_tweets.csv", header = T, sep = ",", as.is = T)
nba.text <- as.character(tweet.tb$text)  # plain character vector of tweet text
# Replace @UserName with one space
# One space replacement is to avoid words being glued together
nba.modify <- gsub("@\\w{1,20}", " ", nba.text)
# Replace control character "\n" and "\n\n" with one space
nba.modify <- gsub("[[:cntrl:]]{1,10}", " ", nba.modify)
# Replace https links with one space
nba.modify <- gsub("(https)(://)(.*)[/]\\w+", " ", nba.modify)
# Replace punctuation with one space
nba.modify <- gsub("[[:punct:]]{1,20}", " ", nba.modify)
# Replace non graphical character with space
nba.modify <- gsub("[^[:graph:]]", " ", nba.modify)
# Replace tab and extra space introduced early with one space
nba.modify <- gsub("[ |\t]{2,}", " ", nba.modify)
nba.modify <- gsub("\\s+", " ", nba.modify)
# Remove extra blank space at the beginning and the end
nba.modify <- gsub("^ +", "", nba.modify)
nba.modify <- gsub(" $+", "", nba.modify)
# 3. Preprocess tweets further for analysis
library(NLP)
library("tm")
library(RColorBrewer)
library(wordcloud)
library("SnowballC")
library("lsa")
# generate corpus for the cleaned nba tweets and check out the corpus length
nba.corpus <- VCorpus(VectorSource(nba.modify))
length(nba.corpus)
# transform the corpus to lower case, remove punctuation and numbers and randomly check sample
trans.nbacorp <- tm_map(nba.corpus, content_transformer(tolower)) #convert to lower cases
trans.nbacorp <- tm_map(trans.nbacorp, removePunctuation) # remove punctuation
trans.nbacorp <- tm_map(trans.nbacorp, removeNumbers) # remove numbers
trans.nbacorp <- tm_map(trans.nbacorp, stripWhitespace)
# remove stop words
input.nbacorp <- tm_map(trans.nbacorp, removeWords,
c(stopwords("english"), "can", "don", "just", "nba", "via",
"e", "s", "y"))
# Inspect an entry that is empty after cleaning (entry 4)
inspect(input.nbacorp[[4]])
# 4. Generate a word cloud with the NBA tweets
set.seed(1234)
#generate document-term matrix in order to remove empty documents
nbacorp.dtm <- DocumentTermMatrix(input.nbacorp)
row.total <- apply(nbacorp.dtm, 1, sum)
#Correspondingly, remove the same empty entries from the corpus and document-term matrix
input.nbacorp.noemp <- input.nbacorp[which(row.total > 0)]
nbacorp.docterm <- DocumentTermMatrix(input.nbacorp.noemp)
# or the following code will generate the same doc-term matrix with empty entries removed
nbacorp.docterm <- nbacorp.dtm[which(row.total > 0),]
# The positional index of the matrix shifts accordingly, but the document names keep their original entry numbers
inspect(nbacorp.docterm[15:16,])
# generate the term-document Matrix from the cleaned document-term matrix
nbacorp.terdoc <- t(nbacorp.docterm)
inspect(nbacorp.terdoc)
# Find terms in the term-doc matrix that appear at least 3 times
findFreqTerms(nbacorp.terdoc, lowfreq = 3)
# Sort the terms by frequency and plot terms appearing at least 3 times in a word cloud
nbacorp.terdoc.matrix <- as.matrix(nbacorp.terdoc)
nbaterm.freqbydoc <- sort(rowSums(nbacorp.terdoc.matrix), decreasing = T, na.last = NA)
wordcloud(names(nbaterm.freqbydoc), nbaterm.freqbydoc, min.freq = 3, max.words = 100,
          colors = brewer.pal(8, "Dark2"))
# subset the terms with a frequency over 5 and generate a barplot
V.minfreq6 <- rowSums(nbacorp.terdoc.matrix) > 5
nbaterm.minfreq6 <- sort(rowSums(nbacorp.terdoc.matrix)[V.minfreq6], decreasing = T)
term.barplot <- barplot(nbaterm.minfreq6, horiz = F, col = rainbow(length(nbaterm.minfreq6)))
legend(20, 20.5, legend = names(nbaterm.minfreq6),fill = rainbow(length(nbaterm.minfreq6)),
cex = 0.75, ncol = 3, x.intersp = 0.2, y.intersp = 0.7, text.width = 9, bty = "n")
# 5. Identify the top three pairs of tweets and the most frequently used terms among these pairs
library(reshape2)
library(Matrix)
library("lsa")
# Create the cosine similarity matrix and check the result
nbacorp.cosi <- as.matrix(cosine(nbacorp.terdoc.matrix))
nbacorp.cosi[1:9, 1:9]
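# Illustrative sketch of what lsa::cosine() computes for each pair of tweet
# columns, using two tiny hypothetical term-count vectors:
#   cos(a, b) = sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
a <- c(2, 1, 0, 1)
b <- c(1, 1, 0, 0)
sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
cosine(a, b)  # the two lines should return the same value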
# Replace the diagonal values (1, or 0 for an all-zero document) with NA
diag.replace <- function(x){
for (i in 1: nrow(x)){
if (x[i, i] == 1 | x[i, i] == 0)
{ x[i,i] <- NA }
}
return(x)
}
nbacorp.cosmod <- diag.replace(nbacorp.cosi)
nbacorp.cosmod[1:6, 1:6]
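# Vectorized alternative (sketch): base R can blank the whole diagonal at once.
# The loop above also handles a 0 diagonal, which would only arise for an
# all-zero document; those were removed earlier, so this is a quick cross-check.
nbacorp.cosalt <- nbacorp.cosi
diag(nbacorp.cosalt) <- NA
all.equal(nbacorp.cosalt, nbacorp.cosmod)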
#convert the sparse matrix into a molten data frame and sort it based on the cosine value
nbacorp.cosmolten <- melt(nbacorp.cosmod, na.rm = T, c("m.row.doc", "m.col.doc"))
nbacorp.cosmolten <- nbacorp.cosmolten[order(nbacorp.cosmolten$value, decreasing = T),]
nbacorp.cosmolten[1:25,]
# Inspect tweet pairs with a cosine similarity of 1 --> these tweets appear to be reposts
inspect(input.nbacorp[[15]])
inspect(input.nbacorp[[16]])
inspect(input.nbacorp[[42]])
inspect(input.nbacorp[[61]])
inspect(input.nbacorp[[44]])
inspect(input.nbacorp[[57]])
# After empty entries are removed, positional indices in the matrix no longer match the original document numbers
typeof(row.names(nbacorp.docterm)) # document names in the doc-term matrix are characters
# Therefore, character values rather than numeric positions correctly index the original document entries
inspect(nbacorp.docterm[c("15","16","61","42","57","44"),])
#coerce the doc-term matrix to R matrix
nbacorp.docterm.matrix <- as.matrix(nbacorp.docterm)
#subset the matrix with reposted tweets
nbacorp.repost.matrix <- nbacorp.docterm.matrix[c("15","16","61","42","57","44"),]
nbacorp.repost.matrix[, 1:20] # although subsetted, the matrix still carries every term from all the tweets
# which() & apply() pick out the terms that actually occur in the reposted docs/tweets
term.inrepost <- names(which(apply(nbacorp.repost.matrix, 2, sum) > 0))
#The top 10 most used terms from all the tweets
top10.term <- names(nbaterm.freqbydoc[1:10])
# Write a function to check whether any of the top 10 terms appear among the terms of the subsetted similar tweets
identical.term <- function(x, y){
for (i in 1: length(x)){
if(length(grep(x[i], y)) > 0)
{print(c(x[i],grep(x[i], y, value = T)))}
}
}
top10.term
term.inrepost
identical.term(term.inrepost, top10.term)
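# Simpler built-in check (sketch): intersect() reports exact overlaps only,
# whereas identical.term() above also catches partial matches found by grep()
intersect(term.inrepost, top10.term)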
## Inspect tweet pairs with cosine similarity less than 1
inspect(input.nbacorp[[182]])
inspect(input.nbacorp[[200]])
inspect(input.nbacorp[[50]])
inspect(input.nbacorp[[44]])
inspect(input.nbacorp[[138]])
inspect(input.nbacorp[[131]])
# Subset the matrix with the docs from these high-similarity pairs
nbacorp.similar.matrix <- nbacorp.docterm.matrix[c("182", "200","50", "44", "138", "131"), ]
nbacorp.similar.matrix[, 1:20] # although subsetted, the matrix still carries every term from all the tweets
# which() & apply() pick out the terms that actually occur in these similar docs/tweets
term.insimilar <- names(which(apply(nbacorp.similar.matrix, 2, sum) > 0))
#Check whether any of the top 10 terms are included in the similar tweets
top10.term
term.insimilar
identical.term(term.insimilar, top10.term)
# 6. Identify terms with the highest weighted tf-idf among the top three pairs of tweets
# Calculate the tf-idf of the document-term matrix created in step 4
nbacorp.dttfidf <- weightTfIdf(nbacorp.docterm)
inspect(nbacorp.dttfidf[1:6,])
#convert the document-term matrix to numeric matrix and calculate a total tfidf of each document
nbacorp.dttfidf.matrix <- as.matrix(nbacorp.dttfidf)
nbadoc.countfidf <- sort(rowSums(nbacorp.dttfidf.matrix), decreasing = T)
nbadoc.countfidf[1:20]
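# Sketch of what weightTfIdf() computes per cell (assuming tm's default,
# normalized weighting): tf = term count / document length,
# idf = log2(number of documents / number of documents containing the term)
n.docs <- nrow(nbacorp.docterm.matrix)
doc.freq <- colSums(nbacorp.docterm.matrix > 0)
manual.tf <- nbacorp.docterm.matrix / rowSums(nbacorp.docterm.matrix)
manual.tfidf <- sweep(manual.tf, 2, log2(n.docs / doc.freq), `*`)
manual.tfidf[1:3, 1:5]  # should broadly match nbacorp.dttfidf.matrix[1:3, 1:5]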
# Write a function to find documents/tweets with identical tf-idf sums (likely duplicates)
same.tweets <- function(x) {
temp.x <- x
names(temp.x) <- NULL
for(i in 1:(length(temp.x) - 1))
{
if(identical(temp.x[i], temp.x[i+1]) == T)
{print(x[c(i,i+1)])}
}
}
same.tweets(nbadoc.countfidf)
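# Built-in alternative (sketch): duplicated() flags repeated tf-idf totals
# directly; checking both directions returns every member of each tied pair
tied <- duplicated(nbadoc.countfidf) | duplicated(nbadoc.countfidf, fromLast = TRUE)
nbadoc.countfidf[tied]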
# Inspect the content of the tweets with the highest scores
inspect(input.nbacorp[[3]])
inspect(input.nbacorp[[39]])
inspect(input.nbacorp[[49]])
inspect(input.nbacorp[[29]])
inspect(input.nbacorp[[38]])
inspect(input.nbacorp[[143]])
inspect(input.nbacorp[[148]])
#calculate tfidf of all the terms and convert results to R matrix
nbacorp.tertfidf.matrix <- as.matrix(weightTfIdf(nbacorp.terdoc, normalize = T))
#subset the matrix with 3 pairs of tweets having the highest tfidf sum
top3tweet.tfidf.matrix <- nbacorp.tertfidf.matrix[, c("3", "39", "29", "38", "143", "148")]
top3tweet.tfidf.matrix[1:10,] # although subsetted, the matrix still carries terms used only in other tweets
term.top3tweet <- names(which(apply(top3tweet.tfidf.matrix, 1, sum) > 0))
# Extract the top 10 terms with the highest tf-idf values
top10.tfidfterm <- sort(rowSums(nbacorp.tertfidf.matrix), decreasing = T)[1:10]
top10.tfidfterm <- names(top10.tfidfterm)
# Check for overlapping terms with the identical.term function
term.top3tweet
top10.tfidfterm
identical.term(term.top3tweet, top10.tfidfterm)
# 7. Determine the optimal number of clusters for the tweets
# Compute k-means for k = 2 to k = 15 and plot the within-cluster/between-cluster SS ratio
set.seed(2345)
k.max <- 15
tot.wss <- sapply(2:k.max, simplify = T,
function(k){kmeans(nbacorp.docterm.matrix, k, nstart = 50, iter.max = 100)$tot.withinss})
bet.ss <- sapply(2:k.max, simplify = T,
function(k){kmeans(nbacorp.docterm.matrix, k, nstart = 50, iter.max = 100)$betweenss})
tot.wss
bet.ss
plot(2:k.max, tot.wss/bet.ss,
type = "b", pch = 19, frame = T, lwd = 1, col= rainbow(k.max),
xlab = "Number of clusters K", ylab = "Ratio of total within-clusters to betweenss")
text(2:k.max, tot.wss/bet.ss, labels = 2:k.max, adj = c(-0.5, -0.5), cex = 0.75)
abline(v = 5, lwd = 2, lty = 4, col = "blue")
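# Numeric companion to the elbow plot (sketch): successive drops in the ratio
# flatten out near the visually chosen elbow at k = 5
elbow.ratio <- tot.wss / bet.ss
round(diff(elbow.ratio), 3)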
# 8. Identify the groups of tweets having similar characteristics
# Choose k = 5 clusters (near the elbow marked above)
set.seed(2345)
nbacorp.cluster <- kmeans(nbacorp.docterm.matrix, 5, nstart = 30, iter.max = 50)
nbacorp.cluster$cluster[1:25]
#use sapply to extract the text from corpus
inputcorp.text <- t(data.frame(sapply(input.nbacorp.noemp, "[", "content")))
# Match the non-empty corpus entries back to the original tweets,
# then combine the original tweet text, the cleaned text, and the cluster assignments
row.total.dataframe <- names(which(row.total > 0))
tweet.txtclust <- data.frame(tweet.tb[c(row.total.dataframe),]$text,
as.character(inputcorp.text),
nbacorp.cluster$cluster)
#change the column names and organize the table by clusters
names(tweet.txtclust) <- c("original tweets", "cleaned tweets", "K-clusters")
tweet.txtclust <- tweet.txtclust[order(tweet.txtclust$`K-clusters`, decreasing = F),]
#subset cleaned tweet text data by clusters.
tweet.txtK1 <- tweet.txtclust[tweet.txtclust$`K-clusters` == 1, ]$`cleaned tweets`
tweet.txtK2 <- tweet.txtclust[tweet.txtclust$`K-clusters` == 2, ]$`cleaned tweets`
tweet.txtK3 <- tweet.txtclust[tweet.txtclust$`K-clusters` == 3, ]$`cleaned tweets`
tweet.txtK4 <- tweet.txtclust[tweet.txtclust$`K-clusters` == 4, ]$`cleaned tweets`
tweet.txtK5 <- tweet.txtclust[tweet.txtclust$`K-clusters` == 5, ]$`cleaned tweets`
as.character(tweet.txtK1)[1:40]
as.character(tweet.txtK2)[1:5]
as.character(tweet.txtK3)[1:15]
as.character(tweet.txtK4)[1:20]
as.character(tweet.txtK5)[1:20]
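# Sketch: characterize each cluster by its five most frequent terms, using the
# same document-term matrix and the cluster assignments computed above
for (k in sort(unique(nbacorp.cluster$cluster))) {
  cluster.rows <- nbacorp.docterm.matrix[nbacorp.cluster$cluster == k, , drop = FALSE]
  cat("Cluster", k, ":", names(sort(colSums(cluster.rows), decreasing = TRUE))[1:5], "\n")
}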
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```