-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path01-search-DemRes.R
179 lines (149 loc) · 5.4 KB
/
01-search-DemRes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
## --------------------------------------------------------- ##
##
## FILE 01: estimate share of articles with open software
## in relevant publications of Demographic Research
## during the years 2010-2023 (all sharable Open Access publications)
##
## sessionInfo() details:
##
## R version 4.2.3 (2023-03-15 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows Server x64 (build 17763)
##
## attached base packages:
## stats graphics grDevices utils datasets
## methods base
##
## other attached packages:
## strex_2.0.0 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
## dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## tibble_3.2.1 ggplot2_3.4.4 tidyverse_2.0.0 pdftools_3.4.0
##
## --------------------------------------------------------- ##
## cleaning the workspace
rm(list=ls(all=TRUE))
## set up the directory where .R is saved (R-studio command)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
## packages
library(pdftools)
library(tidyverse)
library(strex)
## loading list of keywords
source("funs/keywords.R")
## input directory (where Demographic Research papers are stored)
inputDIR <- paste0(getwd(),"/data/input/DemographicResearch")
## output directory
outDIR <- paste0(getwd(),"/data/output")
## temporary directory
tempDIR <- paste0(getwd(),"/data/temp")
## delete temporary folder if present
unlink(tempDIR,recursive = T)
##----- starting the text search -----
setwd(inputDIR)
## number of volumes
n.vol <- length(dir())
## starting a for loop to analyse each volume separately
vol <- 1
for (vol in 1:n.vol){
## unzipping
setwd(inputDIR)
zipF <- dir()[vol]
unzip(zipF,exdir=tempDIR)
## loading all files
file_list <- list.files(path=tempDIR,pattern="*.pdf")
setwd(tempDIR)
all_files <- lapply(file_list, FUN = function(files) {
pdftools::pdf_text(files)
})
## delete temporary folder
unlink(tempDIR,recursive = T)
## extract volume number and year
volume <- str_nth_number(zipF, n = 1)
years <- rep(seq(2010, 2023, by = 1), each = 2)
year <- years[vol]
cat("Analysing volume",volume,"year",year,"\n")
## number of articles in this volume
n <- length(all_files)
## create long dataframe for storing results
DF.long.temp <- tibble(Year=rep(year,n),Volume=rep(volume,n),
Issue=rep(NA,n),Article=rep(NA,n),
Name=rep(NA,n),OpenAccess=rep(1,n),OpenMaterials=rep(0,n),
OMpage=rep(NA,n),OMkeyword=rep(NA,n),
OMrequest=rep(0,n),Journal=rep("Dem Res",n))
## analyse each paper
i <- 1
for (i in 1:n){
## save article number and name
DF.long.temp$Article[i] <- i
DF.long.temp$Name[i] <- file_list[i]
## transom to text
paper <- all_files[i]
text<-tolower(unlist(paper))
## transformed text to remove character sometimes imported by pdftools
text2 <-gsub("", "", tolower(unlist(paper)))
## find References page
any_ref_page <- any(str_detect(text2,"\\breferences\\b"))
## exclude references (keep first page of references as sometimes it
## coincides with data availability statement)
if (any_ref_page){
text2 <- text2[1:max(which(str_detect(text2,"\\breferences\\b")))]
}
## find Open Access mention
OAmention <- any(str_detect(text2,paste("creative commons",collapse = '|')))
if (OAmention){
DF.long.temp$OpenAccess[i] <- 1
}
## find Open Materials mention and page
OMmention <- any(str_detect(text2,paste(paste0("\\b",keywords,"\\b"),collapse = '|')))
whi <- which(str_detect(text2,paste(paste0("\\b",keywords,"\\b"),collapse = '|')))
## flag if this is available upon request (and check later)
requestTRUE <- any(str_detect(text2,paste(paste0("\\b",keywordsREQUEST,"\\b"),collapse = '|')))
if (OMmention){
DF.long.temp$OpenMaterials[i] <- 1
DF.long.temp$OMpage[i] <- paste(whi, collapse = "; ")
if (requestTRUE) DF.long.temp$OMrequest[i] <- 1
## extract specific keywords detected
keytemp <- list()
k <- 1
j <- 1
for (j in 1:length(keywords)){
whi <- which(str_detect(text2,paste0("\\b",keywords[j],"\\b")))
if (length(whi)>0){
keytemp[[k]] <- keywords[j]
k <- k+1
}
}
DF.long.temp$OMkeyword[i] <- paste(unlist(keytemp), collapse = "; ")
}
}
if (vol==1){
DF.long <- DF.long.temp
}else{
DF.long <- DF.long %>%
bind_rows(DF.long.temp)
}
}
## save data
setwd(outDIR)
save(DF.long,file="01-DemRes.Rdata")
##---- plotting
DF.long %>%
group_by(Year) %>%
summarise(Articles=n(),OpenAccess=sum(OpenAccess),
OpenMaterials=sum(OpenMaterials)) %>%
mutate(Year=as.integer(Year),frac_OA=OpenAccess/Articles,
frac_OM=OpenMaterials/Articles) %>%
pivot_longer(c(frac_OA,frac_OM)) %>%
mutate(name=recode(name,frac_OA="Open Access",
frac_OM="Open Study Materials")) %>%
ggplot(aes(x=Year,y=value, group=1)) +
geom_point(size=2) + geom_line() +
facet_wrap(.~name)+
scale_shape_manual(values = c(15)) +
labs(y="Percentage",color="Journal") +
theme_bw(base_size = 14) +
theme(legend.position="bottom")+
scale_y_continuous(labels = scales::percent_format(accuracy = 1),
limits = c(0,1))+
scale_x_continuous(breaks= seq(min(DF.long$Year),max(DF.long$Year),2))
## END