forked from ubasellini/OpenScience-DemographicResearch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01-search-Demography.R
184 lines (153 loc) · 5.58 KB
/
01-search-Demography.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
## --------------------------------------------------------- ##
##
## FILE 01: estimate share of articles with Open Access and
## open software in relevant publications of Demography
## during the years 2021-2023 (all sharable Open Access publications)
##
## sessionInfo() details:
##
## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows Server x64 (build 17763)
##
## attached base packages:
## stats graphics grDevices utils datasets
## methods base
##
## other attached packages:
## strex_2.0.0 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
## dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## tibble_3.2.1 ggplot2_3.4.4 tidyverse_2.0.0 pdftools_3.4.0
##
## --------------------------------------------------------- ##
## cleaning the workspace
rm(list=ls(all=TRUE))
## set up the directory where .R is saved (R-studio command)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
## packages
library(pdftools)
library(tidyverse)
library(strex)
## loading list of keywords
source("funs/keywords.R")
## input directory (where Demography papers are stored)
inputDIR <- paste0(getwd(),"/data/input/Demography")
## output directory
outDIR <- paste0(getwd(),"/data/output")
## temporary directory
tempDIR <- paste0(getwd(),"/data/temp")
## delete temporary folder if present
unlink(tempDIR,recursive = T)
##----- starting the text search -----
setwd(inputDIR)
## number of volumes
n.vol <- length(dir())
## starting a for loop to analyse each volume separately
vol <- 1
for (vol in 1:n.vol){
## unzipping
setwd(inputDIR)
zipF <- dir()[vol]
unzip(zipF,exdir=tempDIR)
## loading all files
file_list <- list.files(path=tempDIR,pattern="*.pdf")
setwd(tempDIR)
all_files <- lapply(file_list, FUN = function(files) {
# message(files)
pdftools::pdf_text(files)
})
## delete temporary folder
unlink(tempDIR,recursive = T)
## extract volume number, issue number and year (from zipped folder)
volume <- str_nth_number(zipF, n = 1)
issue <- str_nth_number(zipF, n = 2)
year <- str_nth_number(zipF, n = 3)
cat("Analysing volume",volume,
"issue", issue,"year",year,"\n")
## number of articles in this volume
n <- length(all_files)
## create long dataframe for storing individual-level results (i.e. for each paper)
DF.long.temp <- tibble(Year=rep(year,n),Volume=rep(volume,n),
Issue=rep(issue,n),Article=rep(NA,n),Name=rep(NA,n),
OpenAccess=rep(0,n),OpenMaterials=rep(0,n),
OMpage=rep(NA,n),OMkeyword=rep(NA,n),
OMrequest=rep(0,n),
Journal=rep("Demography",n))
## analyse each individual paper
i <- 1
for (i in 1:n){
## save article number and name
DF.long.temp$Article[i] <- i
DF.long.temp$Name[i] <- file_list[i]
## transform paper to text
paper <- all_files[i]
text <- tolower(unlist(paper))
## transformed text to remove character sometimes imported by pdftools
text2 <- gsub("", "", tolower(unlist(paper)))
## find References page
any_ref_page <- any(str_detect(text2,"\\breferences\\b"))
## exclude references (keep first page of references as sometimes it
## coincides with data availability statement)
if (any_ref_page){
text2 <- text2[1:max(which(str_detect(text2,"\\breferences\\b")))]
}
## find Open Access mention
OAmention <- any(str_detect(text2,paste("creative commons",collapse = '|')))
if (OAmention){
DF.long.temp$OpenAccess[i] <- 1
}
## find Open Materials mention and page
OMmention <- any(str_detect(text2,paste(paste0("\\b",keywords,"\\b"),collapse = '|')))
whi <- which(str_detect(text2,paste(paste0("\\b",keywords,"\\b"),collapse = '|')))
## flag if this is available upon request (and check later)
requestTRUE <- any(str_detect(text2,paste(paste0("\\b",keywordsREQUEST,"\\b"),collapse = '|')))
if (OMmention){
DF.long.temp$OpenMaterials[i] <- 1
DF.long.temp$OMpage[i] <- paste(whi, collapse = "; ")
if (requestTRUE) DF.long.temp$OMrequest[i] <- 1
## extract specific keywords detected
keytemp <- list()
k <- 1
j <- 1
for (j in 1:length(keywords)){
whi <- which(str_detect(text2,paste0("\\b",keywords[j],"\\b")))
if (length(whi)>0){
keytemp[[k]] <- keywords[j]
k <- k+1
}
}
DF.long.temp$OMkeyword[i] <- paste(unlist(keytemp), collapse = "; ")
}
}
## combine long dataframe
if (vol==1){
DF.long <- DF.long.temp
}else{
DF.long <- DF.long %>%
bind_rows(DF.long.temp)
}
}
## save data
setwd(outDIR)
save(DF.long,file="01-Demography.Rdata")
##---- plotting
DF.long %>%
group_by(Year) %>%
summarise(Articles=n(),OpenAccess=sum(OpenAccess),
OpenMaterials=sum(OpenMaterials)) %>%
mutate(Year=as.integer(Year),frac_OA=OpenAccess/Articles,
frac_OM=OpenMaterials/Articles) %>%
pivot_longer(c(frac_OA,frac_OM)) %>%
mutate(name=recode(name,frac_OA="Open Access",
frac_OM="Open Study Materials")) %>%
ggplot(aes(x=Year,y=value, group=1)) +
geom_point(size=2) + geom_line() +
facet_wrap(.~name)+
scale_shape_manual(values = c(15)) +
labs(y="Percentage",color="Journal") +
theme_bw(base_size = 14) +
theme(legend.position="bottom")+
scale_y_continuous(labels = scales::percent_format(accuracy = 1),
limits = c(0,1))+
scale_x_continuous(breaks= seq(min(DF.long$Year),max(DF.long$Year),2))
## END