Skip to content

Commit c014a84

Browse files
author
mikolajjj
committed
filtrowanie wypowiedzi
1 parent 1c99d5b commit c014a84

File tree

1 file changed

+196
-0
lines changed

1 file changed

+196
-0
lines changed

sejmRP/R/get_filtered_statements.R

+196
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
#' Retrieve filtered statements from a database
2+
#'
3+
#' Function \code{get_filtered_statements} reads filtered statements from a database.
4+
#'
5+
#' @details
6+
#' Function \code{get_filtered_statements} reads filtered statements from a database.
7+
? #' The result of this function is an invisible data frame with statements' data.
8+
#'
9+
#' Possible filters:
10+
#' \enumerate{
11+
#' \item terms_of_office - range of terms of office's numbers. This filter is a integer
12+
#' vector with two elements, where the first describes a left boundary of range
13+
#' and the second a right boundary. It is possible to choose only one term of office,
14+
#' just try the same number as first and second element of vector.
15+
#' \item deputies - full names of deputies. This filter is a character vector with full
16+
#' names of deputies in format: 'surname first_name second_name'. If you are not sure
17+
#' if the deputy you were thinking about has second name, try 'surname first_name' or
18+
#' just 'surname'. There is high probability that proper deputy will be chosen.
19+
#' It is possible to choose more than one deputy.
20+
#' \item dates - period of time. This filter is a character vector with two elements
21+
#' in date format 'YYYY-MM-DD', where the first describes left boundary of period and
22+
#' the second right boundary. It is possible to choose only one day, just try the same
23+
#' date as first and second element of vector.
24+
#' \item topics - text patterns. This filter is a character vector with text patterns of
25+
#' topics in order points. Note that the order points are written like
26+
#' sentences, so remember about case inflection of nouns and adjectives and use stems of
27+
#' words as patterns. For example if you want to find order points about education (in Polish:
28+
#' szkolnictwo) try 'szkolnictw'. It is possible to choose more than one pattern.}
29+
#' \item content - text patterns. This filter is a character vector with text patterns
30+
#' in statements. Note that strings with statements are sentences, so remember about case
31+
#' inflection of nouns and adjectives and use stems of words as patterns.
32+
#' For example if you want to find order points about education (in Polish:
33+
#' szkolnictwo) try 'szkolnictw'. It is possible to choose more than one pattern.}
34+
#'
35+
#' If you did not choose any filter, the whole database will be downloaded.
36+
#' Note that, due to data size (<= ~150 MB) it may take few seconds / minutes
37+
#' to download all statements.
38+
#'
39+
#' Because of encoding issue on Windows operation system, you also need to select
40+
#' if you use Windows.
41+
#'
42+
#' @usage get_filtered_statements(dbname = 'sejmrp', user = 'reader',
43+
#' password = 'qux94874', host = 'services.mini.pw.edu.pl',
44+
#' windows = .Platform$OS.type == 'windows', terms_of_office = integer(0),
45+
#' deputies = character(0), dates = character(0), topics = character(0))
46+
#' content = integer(0)
47+
#'
48+
#' @param dbname name of database; default: 'sejmrp'
49+
#' @param user name of user; default: 'reader'
50+
#' @param password password of database; default: 'qux94874'
51+
#' @param host name of host; default: 'services.mini.pw.edu.pl'
52+
#' @param windows information of used operation system; default: .Platform$OS.type == 'windows'
53+
#' @param terms_of_office range of terms of office's numbers that will be taken to filter data
54+
#' from database; default: integer(0)
55+
#' @param deputies full names of deputies that will be taken to filter data from database;
56+
#' default: character(0)
57+
#' @param dates period of time that will be taken to filter data from database;
58+
#' default: character(0)
59+
#' @param topics text patterns that will be taken to filter data from database;
60+
#' default: character(0)
61+
#' @param content text patterns that will be taken to filter data from database;
62+
#' default: character(0)
63+
#' @param max_rows maximum number of rows to download
64+
#'
65+
#' @return data frame with NULL
66+
#'
67+
#' @examples
68+
#' \dontrun{
69+
#' filtered_statements <- get_filtered_statements()
70+
#' dim(filtered_statements)
71+
#' # [1] 234483 6
72+
?#' names(filtered_statements)
73+
? #' [1] 'id_statement' 'nr_term_of_office' 'surname_name' 'date_statement'
74+
? #' [5] 'titles_order_points' 'statement'
75+
?#' object.size(filtered_statements)
76+
?#' # 148694336 bytes}
77+
#'
78+
#' @note
79+
#' Default parameters use privilages of 'reader'. It can only SELECT data from database.
80+
#'
81+
#' All information is stored in PostgreSQL database.
82+
#'
83+
#' @author Tomasz Mikolajczyk, Piotr Smuda
84+
#'
85+
#' @export
86+
#'
87+
#' @importFrom dplyr src_postgres
88+
#' @importFrom dplyr tbl
89+
#' @importFrom dplyr sql
90+
#' @importFrom dplyr filter
91+
#' @importFrom dplyr between
92+
#' @importFrom dplyr mutate
93+
#' @importFrom dplyr collect
94+
#'
95+
96+
get_filtered_statements <- function(dbname = "sejmrp", user = "reader", password = "qux94874",
97+
host = "services.mini.pw.edu.pl", windows = .Platform$OS.type == "windows",
98+
terms_of_office = integer(0), deputies = character(0),
99+
dates = character(0), topics = character(0), content = character(0),
100+
max_rows=Inf) {
101+
stopifnot(is.numeric(max_rows), is.character(dbname), is.character(user), is.character(password),
102+
is.character(host), is.logical(windows), is.numeric(terms_of_office), is.character(deputies),
103+
is.character(dates), is.character(topics), is.character(content),
104+
all(c(terms_of_office)%%1 == 0))
105+
106+
length_terms_of_office <- length(terms_of_office)
107+
length_deputies <- length(deputies)
108+
length_dates <- length(dates)
109+
length_topics <- length(topics)
110+
length_content <- length(content)
111+
112+
stopifnot(length_terms_of_office >= 0, length_deputies >= 0, length_dates == 0 | length_dates == 2,
113+
length_topics >= 0, length_content >= 0)
114+
115+
# connecting to database with dplyr to get statements
116+
drv <- dbDriver("PostgreSQL")
117+
database_diet <- dbConnect(drv, dbname = dbname, user = user, password = password, host = host)
118+
119+
# add information about new SELECT to the counter table
120+
dbSendQuery(database_diet, paste0("INSERT INTO counter (what, date) VALUES ('filt_statements','", Sys.Date(), "')"))
121+
122+
suppressWarnings(dbDisconnect(database_diet))
123+
124+
# fake variables in order to pass CRAN CHECK
125+
nr_term_of_office <- NULL
126+
surname_name <- NULL
127+
date_statement <- NULL
128+
titles_order_points <- NULL
129+
statement <- NULL
130+
`%SIMILAR TO%` <- NULL
131+
132+
# connecting to database with dplyr to get statements
133+
database_diet <- src_postgres(dbname = dbname, user = user, password = password, host = host)
134+
135+
# read data dodac potem
136+
statements <- tbl(database_diet, sql("SELECT * FROM statements"))
137+
138+
# terms_of_office filter
139+
if (length_terms_of_office == 1) {
140+
statements <- filter(statements, between(nr_term_of_office, terms_of_office[1]))
141+
} else if (length_terms_of_office == 2) {
142+
statements <- filter(statements, between(nr_term_of_office, terms_of_office[1], terms_of_office[2]))
143+
}
144+
145+
# deputies filter
146+
if (length_deputies > 0) {
147+
# changing polish characters for any character
148+
deputies <- stri_replace_all_regex(deputies, "[^a-zA-Z %]", "_")
149+
deputies <- paste0("(%", deputies, "%)")
150+
deputies <- paste0(deputies, collapse = "|")
151+
statements <- filter(statements, surname_name %SIMILAR TO% deputies)
152+
}
153+
154+
# dates filter
155+
if (length_dates == 2) {
156+
statements <- filter(statements, between(date_statement, dates[1], dates[2]))
157+
}
158+
159+
# topics filter
160+
if (length_topics > 0) {
161+
# changing polish characters for any character
162+
topics <- stri_replace_all_regex(topics, "[^a-zA-Z %]", "_")
163+
topics <- paste0("(%", topics, "%)")
164+
topics <- paste0(topics, collapse = "|")
165+
statements <- filter(statements, titles_order_points %SIMILAR TO% topics)
166+
}
167+
168+
# topics filter
169+
if (length_content > 0) {
170+
# changing polish characters for any character
171+
content <- stri_replace_all_regex(content, "[^a-zA-Z %]", "_")
172+
content <- paste0("(%", content, "%)")
173+
content <- paste0(content, collapse = "|")
174+
statements <- filter(statements, statement %SIMILAR TO% content)
175+
}
176+
177+
# reading data
178+
statements <- as.data.frame(collect(statements, stringsAsFactors = FALSE, n = max_rows))
179+
180+
# if empty result of query
181+
if (nrow(statements) == 0) {
182+
suppressWarnings(dbDisconnect(database_diet$con))
183+
return(statements)
184+
}
185+
186+
# encoding for windows
187+
if (windows) {
188+
statements[, 3] <- iconv(statements[, 3], from = "UTF-8", to = "Windows-1250")
189+
statements[, 5] <- iconv(statements[, 5], from = "UTF-8", to = "Windows-1250")
190+
statements[, 6] <- iconv(statements[, 6], from = "UTF-8", to = "Windows-1250")
191+
}
192+
193+
suppressWarnings(dbDisconnect(database_diet$con))
194+
return(invisible(statements))
195+
196+
}

0 commit comments

Comments
 (0)