# LexiconBasedAnalysis.R
#Lexicon-Based Sentiment Analysis: a list of opinion words is compared against your scraped text
#These words come from Hu and Liu's opinion lexicon (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html)
#The lexicon acts as a dictionary of positive and negative words (approximately 6800 words)
#Set your working directory
setwd("")
#Download the text files and place it in your directory
#Import the positive and negative words
pos = readLines("Positive-Words.txt")
neg = readLines("Negative-Words.txt")
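#The lexicon files as distributed include a header block of comment lines starting with ';'
#A minimal cleanup sketch, assuming that header format, drops those lines and any blanks:
pos = pos[!grepl("^;", pos) & pos != ""]
neg = neg[!grepl("^;", neg) & neg != ""]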
#Example (the score.sentiment function used here is defined further below - run that section first):
mytest = c("I love John Cena", "Cute cat!", "That was a bad night", "She loves bad boys")
testsentiment = score.sentiment(mytest, pos, neg)
testsentiment
#Output:
# text score
#1 I love John Cena 1
#2 Cute cat! 1
#3 That was a bad night -1
#4 She loves bad boys 0
#Installing the sentimentr package (an off-the-shelf alternative; the manual approach below does not depend on it)
#install.packages("sentimentr")
library("sentimentr")
#Let's do the whole process: writing the function and scraping - approach after Jeffrey Breen
library("stringr")
library("plyr")
#We follow Jeffrey Breen's approach and create the score.sentiment function
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
#Parameters
#sentences: a vector of text
#pos.words: a vector of words of positive sentiment
#neg.words: a vector of words of negative sentiment
#.progress: passed to laply() to control the progress bar shown in the console/terminal
#Creating an array of scores using laply
scores = laply(sentences, function(sentence, pos.words, neg.words)
{
#Removing punctuation using global substitute
sentence = gsub("[[:punct:]]", "", sentence)
#Removing control characters
sentence = gsub("[[:cntrl:]]", "", sentence)
#Removing digits
sentence = gsub('\\d+', '', sentence)
#Error handling while trying to get the text in lower case
tryTolower = function(x)
{
#Create missing value
y = NA
#Try Catch error
try_error = tryCatch(tolower(x), error=function(e) e)
#If not an error
if (!inherits(try_error, "error"))
y = tolower(x)
#Return the result
return(y)
}
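#e.g. tryTolower("I Love R") returns "i love r"; if tolower() throws an error
#(such as on a malformed multibyte string), tryTolower returns NA instead of stopping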
#Use this tryTolower function in sapply
sentence = sapply(sentence, tryTolower)
#Split sentences into words with str_split (stringr package)
word.list = str_split(sentence, "\\s+")
#Unlist produces a vector which contains all atomic components in word.list
words = unlist(word.list)
#Compare these words to the dictionaries of positive & negative words
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
#Example: if a sentence is "he is a good boy",
#pos.matches returns [NA, NA, NA, *some number*, NA]: the number is the position of "good" in pos.words
#neg.matches returns [NA, NA, NA, NA, NA]
#match() gives the index of the first match in the dictionary, or NA if there is none
#We just want a TRUE/FALSE value for pos.matches & neg.matches
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
#This returns [F, F, F, T, F]: TRUE wherever the word matched the dictionary
#The TRUE or FALSE values are treated as 1/0
#To get the final score of the sentence:
score = sum(pos.matches) - sum(neg.matches)
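#Worked example: for "she loves bad boys", pos.matches is c(F, T, F, F)
#and neg.matches is c(F, F, T, F), so score = 1 - 1 = 0,
#consistent with the example output shown at the top of this script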
return(score)
}, pos.words, neg.words, .progress=.progress )
#Now the scores are put in a dataframe and returned
scores.df = data.frame(text=sentences, score=scores)
return(scores.df)
}
#Testing our 'mytest' sentences
testsentiment = score.sentiment(mytest, pos, neg)
class(testsentiment)
testsentiment$score
testsentiment
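#Since sentimentr is loaded above, here is a minimal sketch of the packaged
#alternative; sentimentr accounts for valence shifters (e.g. negation),
#so its scores will differ from the raw lexicon counts
sentiment_by(mytest)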
#Setting up Twitter:
#Install the required packages
#install.packages("twitteR")
#twitteR acts as an interface between R and Twitter and helps to scrape Twitter data
library("twitteR")
library("httr")
library("tm")
library("SnowballC")
#Create a Developer account on Twitter: https://apps.twitter.com
#Create a new App
#Get the key, secret, access token and access token secret from the Keys and Tokens tab
key = ""
secret = ""
accessToken = ""
accessTokenSecret = ""
#Download the cacert.pem file and store it in your working directory
#Authenticate the handshake: when prompted, get the PIN from the browser and enter it in R
#This completes the handshake between R and Twitter
setup_twitter_oauth(key, secret, accessToken, accessTokenSecret)
#Now R is connected to Twitter. Start mining!
#Note that Twitter rate-limits searches to about 15 requests per 15-minute window
#The Twitter Search API searches against a sampling of recent Tweets published in the past 7 days.
#So only tweets in the last 7 days can be mined
#Compare tweets of countries:
#Get the tweets of USA, India, China and Syria
#usatweets = searchTwitter("usa", n=1000, lang="en", since = "2017-12-10", until = "2017-12-10")
usatweets = searchTwitter("#USA", n=1000, lang="en")
indiatweets = searchTwitter("#India", n=1000, lang="en")
chinatweets = searchTwitter("#China", n=1000, lang="en")
syriatweets = searchTwitter("#Syria", n=1000, lang="en")
#Since it is Christmas today, getting the #MerryChristmas tweets :D
christmastweets = searchTwitter("#MerryChristmas", n=1000, lang="en")
#Get only texts from the tweets
usa_txt = sapply(usatweets, function(x) x$getText())
india_txt = sapply(indiatweets, function(x) x$getText())
china_txt = sapply(chinatweets, function(x) x$getText())
syria_txt = sapply(syriatweets, function(x) x$getText())
christmas_txt = sapply(christmastweets, function(x) x$getText())
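#Alternatively, twitteR's twListToDF() flattens a list of status objects into
#a data frame with a 'text' column, e.g.:
#usa_df = twListToDF(usatweets); usa_txt = usa_df$text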
#Counting the number of tweets for each country
count = c(length(usa_txt), length(india_txt), length(china_txt), length(syria_txt), length(christmas_txt))
#Joining texts
country = c(usa_txt, india_txt, china_txt, syria_txt, christmas_txt)
#Use the score.sentiment function
scores = score.sentiment(country, pos, neg, .progress='text')
#Adding variables to 'scores' dataframe
scores$country = factor(rep(c("USA", "India", "China", "Syria", "Merry Christmas"), count))
#Calculating the number of very positive and very negative tweets
scores$very.pos = as.numeric(scores$score >= 2)
scores$very.neg = as.numeric(scores$score <= -2)
numpos = sum(scores$very.pos)
numneg = sum(scores$very.neg)
#Calculating the global sentiment score
global_score = round( 100 * numpos / (numpos + numneg) )
head(scores)
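#A per-group summary sketch using plyr (already loaded above):
ddply(scores, .(country), summarise,
      mean_score = mean(score),
      very_pos = sum(very.pos),
      very_neg = sum(very.neg))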
#Plotting a Box Plot of these countries
#We observe that USA, India and #MerryChristmas have a positive sentiment score, while Syria's sentiment score is negative!
boxplot(score~country, data=scores, col = c("red", "grey"))
#Importing the 'lattice' package for data visualization
library("lattice")
#Plotting a histogram
histogram(~score|country, data=scores, main="Sentiment Analysis: 4 Countries and #MerryChristmas", col=c("red", "grey"), xlab="", sub="Sentiment Score")
#Writing the Word Cloud function
constructWordcloud = function(tweettext){
#Converting them to UTF-8
tweettext=iconv(tweettext, to= "utf-8", sub="")
#Putting them in a Corpus
mycorpus = Corpus(VectorSource(tweettext))
#Data Cleaning processes
mycorpus <- tm_map(mycorpus,content_transformer(tolower))
mycorpus <- tm_map(mycorpus, removePunctuation)
mycorpus <- tm_map(mycorpus,removeWords,stopwords("english"))
mycorpus <- tm_map(mycorpus, removeNumbers)
#Converting the corpus to a DocumentTermMatrix
#(the TermDocumentMatrix is just its transpose and is not needed below)
dtm = DocumentTermMatrix(mycorpus)
dtmMatrix = as.matrix(dtm)
#Getting the frequency of the words
frequency=colSums(dtmMatrix)
frequency=sort(frequency,decreasing = TRUE)
#Importing the 'wordcloud' & 'RColorBrewer' packages
library("wordcloud")
library("RColorBrewer")
#Get the names of the words
words=names(frequency)
#Basic Word Cloud
wordcloud(words[1:100], frequency[1:100])
#Advanced Word Cloud
col <- brewer.pal(5,"Dark2")
wordcloud(words[1:100], frequency[1:100], scale = c(4,1), rot.per = 0, colors = col, random.color = FALSE, max.words = 100, random.order = FALSE)
}
#Constructing Word Clouds for the countries
constructWordcloud(india_txt)
constructWordcloud(usa_txt)
constructWordcloud(china_txt)
constructWordcloud(syria_txt)
#Constructing Word Cloud for '#MerryChristmas'
constructWordcloud(christmas_txt)
#If you want to remove specific words from a character vector:
mytest = c("Su", "Cena", "love")
myindex = which(mytest == "love")
mytest[-myindex]
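#Within a tm corpus (as in constructWordcloud above), the equivalent would be
#removeWords with a custom vector - hypothetical words shown:
#mycorpus <- tm_map(mycorpus, removeWords, c("cena", "love"))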