-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtopicmodel.R
105 lines (82 loc) · 2.95 KB
/
topicmodel.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
########################################
# Analyze text data
# LDA Topic modeling
# Yuxiao Luo
library(tidyverse)
library(quanteda)
library(topicmodels)
lyrics <- read_csv("https://raw.githubusercontent.com/YuxiaoLuo/r_analysis_dri_2022/main/data/genius.csv")
lyrics %>% select(artist_name) %>% unique()
lyrics %>% filter(artist_name == 'Full of Hell') %>% select(song_name) %>% unique()
# idea: we want to compare lyrics between 2 artists
# see what topics each one foucses on
# remove unrelevant columns
lyrics <- lyrics %>% select(-section_name, -section_artist, -line_number)
# combine lines of lyrics for 1 song for each artist
lyrics <- lyrics %>% group_by(song_name) %>%
mutate(txt = str_c(line,collapse = ' '), .before = line) %>%
select(-line)
# remove repetitive observations
lyrics <- distinct(lyrics)
BM <- lyrics %>% filter(artist_name == 'Buck Meek')
FH <- lyrics %>% filter(artist_name == 'Full of Hell')
# LDA topic model (ML) Latent Dirichlet allocation
# unsupervised learning, exract features/variables from the dataset
library(quanteda) # text analysis, preprocessing techniques, doc-feature-matrix
library(tm) # text mining, text analysis, doc-term-matrix
# check Buck Meek's songs first
song <- BM
# check Full of Hell's songs
song <- FH
txt_corpus <- corpus(song, text_field = "txt")
# create tokens without preprocessing
txt_corpus <- tokens(txt_corpus)
# create tokens with preprocessing jobs (cleaning)
txt_corpus <- tokens(txt_corpus,
remove_punct = TRUE,
remove_numbers = TRUE,
remove_separators = TRUE)
# remove stopwords
library(stopwords)
stopwords::stopwords('en')
txt_corpus <- tokens_select(txt_corpus, stopwords::stopwords('en', source = 'snowball'),
selection = 'remove')
# create DFM
DFM <- dfm(txt_corpus, tolower = TRUE)
DFM
# remove the songs where all words are stopwords
dfm_subset(DFM, ntoken(DFM)>0)
# selection process (optional)
#dfm_trim(DFM, min_docfreq = 20)
#dfm_trim(DFM, max_docfreq = 0.99, docfreq_type = 'prop')
# remove the text files that all features are 0
DFM <- dfm_subset(DFM, ntoken(DFM)>0)
# convert dfm to dtm for topic model
DTM <- convert(DFM, to = 'tm')
inspect(DTM)
# LDA topic model
library(topicmodels)
# Gibbs sampling
result <- LDA(x = DTM, k = 3, method = 'Gibbs', control = list(seed = 1234))
result
library(tidytext)
topics <- tidy(result, matrix = 'beta')
topics %>% head
# visualize
top_terms <- topics %>% group_by(topic) %>% top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
# graph for Buck Meek's songs
BM_topic <- top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free_y") +
coord_flip()
# graph for Full of Hell's songs
FH_topic <- top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free_y") +
coord_flip()