1
+ # ' Retrieve filtered statements from a database
2
+ # '
3
+ # ' Function \code{get_filtered_statements} reads filtered statements from a database.
4
+ # '
5
+ # ' @details
6
+ # ' Function \code{get_filtered_statements} reads filtered statements from a database.
7
+ ? # ' The result of this function is an invisible data frame with statements' data.
8
+ # '
9
+ # ' Possible filters:
10
+ # ' \enumerate{
11
+ # ' \item terms_of_office - range of terms of office's numbers. This filter is a integer
12
+ # ' vector with two elements, where the first describes a left boundary of range
13
+ # ' and the second a right boundary. It is possible to choose only one term of office,
14
+ # ' just try the same number as first and second element of vector.
15
+ # ' \item deputies - full names of deputies. This filter is a character vector with full
16
+ # ' names of deputies in format: 'surname first_name second_name'. If you are not sure
17
+ # ' if the deputy you were thinking about has second name, try 'surname first_name' or
18
+ # ' just 'surname'. There is high probability that proper deputy will be chosen.
19
+ # ' It is possible to choose more than one deputy.
20
+ # ' \item dates - period of time. This filter is a character vector with two elements
21
+ # ' in date format 'YYYY-MM-DD', where the first describes left boundary of period and
22
+ # ' the second right boundary. It is possible to choose only one day, just try the same
23
+ # ' date as first and second element of vector.
24
+ # ' \item topics - text patterns. This filter is a character vector with text patterns of
25
+ # ' topics in order points. Note that the order points are written like
26
+ # ' sentences, so remember about case inflection of nouns and adjectives and use stems of
27
+ # ' words as patterns. For example if you want to find order points about education (in Polish:
28
+ # ' szkolnictwo) try 'szkolnictw'. It is possible to choose more than one pattern.}
29
+ # ' \item content - text patterns. This filter is a character vector with text patterns
30
+ # ' in statements. Note that strings with statements are sentences, so remember about case
31
+ # ' inflection of nouns and adjectives and use stems of words as patterns.
32
+ # ' For example if you want to find order points about education (in Polish:
33
+ # ' szkolnictwo) try 'szkolnictw'. It is possible to choose more than one pattern.}
34
+ # '
35
+ # ' If you did not choose any filter, the whole database will be downloaded.
36
+ # ' Note that, due to data size (<= ~150 MB) it may take few seconds / minutes
37
+ # ' to download all statements.
38
+ # '
39
+ # ' Because of encoding issue on Windows operation system, you also need to select
40
+ # ' if you use Windows.
41
+ # '
42
+ # ' @usage get_filtered_statements(dbname = 'sejmrp', user = 'reader',
43
+ # ' password = 'qux94874', host = 'services.mini.pw.edu.pl',
44
+ # ' windows = .Platform$OS.type == 'windows', terms_of_office = integer(0),
45
+ # ' deputies = character(0), dates = character(0), topics = character(0))
46
+ # ' content = integer(0)
47
+ # '
48
+ # ' @param dbname name of database; default: 'sejmrp'
49
+ # ' @param user name of user; default: 'reader'
50
+ # ' @param password password of database; default: 'qux94874'
51
+ # ' @param host name of host; default: 'services.mini.pw.edu.pl'
52
+ # ' @param windows information of used operation system; default: .Platform$OS.type == 'windows'
53
+ # ' @param terms_of_office range of terms of office's numbers that will be taken to filter data
54
+ # ' from database; default: integer(0)
55
+ # ' @param deputies full names of deputies that will be taken to filter data from database;
56
+ # ' default: character(0)
57
+ # ' @param dates period of time that will be taken to filter data from database;
58
+ # ' default: character(0)
59
+ # ' @param topics text patterns that will be taken to filter data from database;
60
+ # ' default: character(0)
61
+ # ' @param content text patterns that will be taken to filter data from database;
62
+ # ' default: character(0)
63
+ # ' @param max_rows maximum number of rows to download
64
+ # '
65
+ # ' @return data frame with NULL
66
+ # '
67
+ # ' @examples
68
+ # ' \dontrun{
69
+ # ' filtered_statements <- get_filtered_statements()
70
+ # ' dim(filtered_statements)
71
+ # ' # [1] 234483 6
72
+ ?# ' names(filtered_statements)
73
+ ? # ' [1] 'id_statement' 'nr_term_of_office' 'surname_name' 'date_statement'
74
+ ? # ' [5] 'titles_order_points' 'statement'
75
+ ?# ' object.size(filtered_statements)
76
+ ?# ' # 148694336 bytes}
77
+ # '
78
+ # ' @note
79
+ # ' Default parameters use privilages of 'reader'. It can only SELECT data from database.
80
+ # '
81
+ # ' All information is stored in PostgreSQL database.
82
+ # '
83
+ # ' @author Tomasz Mikolajczyk, Piotr Smuda
84
+ # '
85
+ # ' @export
86
+ # '
87
+ # ' @importFrom dplyr src_postgres
88
+ # ' @importFrom dplyr tbl
89
+ # ' @importFrom dplyr sql
90
+ # ' @importFrom dplyr filter
91
+ # ' @importFrom dplyr between
92
+ # ' @importFrom dplyr mutate
93
+ # ' @importFrom dplyr collect
94
+ # '
95
+
96
+ get_filtered_statements <- function (dbname = " sejmrp" , user = " reader" , password = " qux94874" ,
97
+ host = " services.mini.pw.edu.pl" , windows = .Platform $ OS.type == " windows" ,
98
+ terms_of_office = integer(0 ), deputies = character (0 ),
99
+ dates = character (0 ), topics = character (0 ), content = character (0 ),
100
+ max_rows = Inf ) {
101
+ stopifnot(is.numeric(max_rows ), is.character(dbname ), is.character(user ), is.character(password ),
102
+ is.character(host ), is.logical(windows ), is.numeric(terms_of_office ), is.character(deputies ),
103
+ is.character(dates ), is.character(topics ), is.character(content ),
104
+ all(c(terms_of_office )%% 1 == 0 ))
105
+
106
+ length_terms_of_office <- length(terms_of_office )
107
+ length_deputies <- length(deputies )
108
+ length_dates <- length(dates )
109
+ length_topics <- length(topics )
110
+ length_content <- length(content )
111
+
112
+ stopifnot(length_terms_of_office > = 0 , length_deputies > = 0 , length_dates == 0 | length_dates == 2 ,
113
+ length_topics > = 0 , length_content > = 0 )
114
+
115
+ # connecting to database with dplyr to get statements
116
+ drv <- dbDriver(" PostgreSQL" )
117
+ database_diet <- dbConnect(drv , dbname = dbname , user = user , password = password , host = host )
118
+
119
+ # add information about new SELECT to the counter table
120
+ dbSendQuery(database_diet , paste0(" INSERT INTO counter (what, date) VALUES ('filt_statements','" , Sys.Date(), " ')" ))
121
+
122
+ suppressWarnings(dbDisconnect(database_diet ))
123
+
124
+ # fake variables in order to pass CRAN CHECK
125
+ nr_term_of_office <- NULL
126
+ surname_name <- NULL
127
+ date_statement <- NULL
128
+ titles_order_points <- NULL
129
+ statement <- NULL
130
+ `%SIMILAR TO%` <- NULL
131
+
132
+ # connecting to database with dplyr to get statements
133
+ database_diet <- src_postgres(dbname = dbname , user = user , password = password , host = host )
134
+
135
+ # read data dodac potem
136
+ statements <- tbl(database_diet , sql(" SELECT * FROM statements" ))
137
+
138
+ # terms_of_office filter
139
+ if (length_terms_of_office == 1 ) {
140
+ statements <- filter(statements , between(nr_term_of_office , terms_of_office [1 ]))
141
+ } else if (length_terms_of_office == 2 ) {
142
+ statements <- filter(statements , between(nr_term_of_office , terms_of_office [1 ], terms_of_office [2 ]))
143
+ }
144
+
145
+ # deputies filter
146
+ if (length_deputies > 0 ) {
147
+ # changing polish characters for any character
148
+ deputies <- stri_replace_all_regex(deputies , " [^a-zA-Z %]" , " _" )
149
+ deputies <- paste0(" (%" , deputies , " %)" )
150
+ deputies <- paste0(deputies , collapse = " |" )
151
+ statements <- filter(statements , surname_name %SIMILAR TO % deputies )
152
+ }
153
+
154
+ # dates filter
155
+ if (length_dates == 2 ) {
156
+ statements <- filter(statements , between(date_statement , dates [1 ], dates [2 ]))
157
+ }
158
+
159
+ # topics filter
160
+ if (length_topics > 0 ) {
161
+ # changing polish characters for any character
162
+ topics <- stri_replace_all_regex(topics , " [^a-zA-Z %]" , " _" )
163
+ topics <- paste0(" (%" , topics , " %)" )
164
+ topics <- paste0(topics , collapse = " |" )
165
+ statements <- filter(statements , titles_order_points %SIMILAR TO % topics )
166
+ }
167
+
168
+ # topics filter
169
+ if (length_content > 0 ) {
170
+ # changing polish characters for any character
171
+ content <- stri_replace_all_regex(content , " [^a-zA-Z %]" , " _" )
172
+ content <- paste0(" (%" , content , " %)" )
173
+ content <- paste0(content , collapse = " |" )
174
+ statements <- filter(statements , statement %SIMILAR TO % content )
175
+ }
176
+
177
+ # reading data
178
+ statements <- as.data.frame(collect(statements , stringsAsFactors = FALSE , n = max_rows ))
179
+
180
+ # if empty result of query
181
+ if (nrow(statements ) == 0 ) {
182
+ suppressWarnings(dbDisconnect(database_diet $ con ))
183
+ return (statements )
184
+ }
185
+
186
+ # encoding for windows
187
+ if (windows ) {
188
+ statements [, 3 ] <- iconv(statements [, 3 ], from = " UTF-8" , to = " Windows-1250" )
189
+ statements [, 5 ] <- iconv(statements [, 5 ], from = " UTF-8" , to = " Windows-1250" )
190
+ statements [, 6 ] <- iconv(statements [, 6 ], from = " UTF-8" , to = " Windows-1250" )
191
+ }
192
+
193
+ suppressWarnings(dbDisconnect(database_diet $ con ))
194
+ return (invisible (statements ))
195
+
196
+ }
0 commit comments