-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract.R
161 lines (142 loc) · 5.44 KB
/
extract.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# TODO: include table names and descriptions
# Values "..C" for "confidential" and "..." for "not applicable" are both
# imported as `NA`
library(tidyverse)
library(stringr)
library(tidyxl)
library(unpivotr)
library(here)
book_path <- here("inst", "extdata", "2013-census-electorate-tables.xlsx")
out_path <- here("inst", "extdata", "nzcensus2013.tsv")
book <- tidy_xlsx(book_path)
# Combine all sheets into one data frame (the new tidyxl api will do this)
sheets <- bind_rows(book$data, .id = "sheet")
formats <- book$formats
# Omit the first sheet
datasheets <- filter(sheets, sheet != "Contents")
# Formatting vectors for lookup
bottom_border_ids <- which(!is.na(formats$local$border$bottom$style))
top_border_ids <- which(!is.na(formats$local$border$top$style))
indents <- formats$local$alignment$indent
# Functions to help locate the top-left data cell
get_top_left_header_cell <- function(cells, top_border_ids) {
# First cell in col 1 that has a top border
cells %>%
filter(col == 1, local_format_id %in% top_border_ids) %>%
top_n(1, desc(row))
}
get_top_left_data_cell <- function(cells, top_left_header_cell) {
# From the top-left header cell, go down until the first non-blank cell (which
# is the first row header), then right one column. That cell might be blank,
# but it definitely is not a header.
top_left_header_cell %>%
offset_S(cells, boundary = ~ !is.na(character), include = TRUE) %>%
offset_E(cells, 1)
}
get_bottom_cell <- function(cells, bottom_border_ids) {
# The final cell in column 1 that has a bottom border
cells %>%
filter(col == 1, local_format_id %in% bottom_border_ids) %>%
arrange(desc(row)) %>%
slice(1)
}
# Functions to split headers into levels
get_row_headers <- function(cells, top_left_header_cell, bottom_cell) {
# Row headers are all in the same column, so use the amount of indentation as
# the hierarchy
out <-
cells %>%
filter(col == 1,
between(row,
top_left_header_cell$row + 1,
bottom_cell$row),
!is.na(character)) %>%
mutate(indent = indents[local_format_id]) %>%
select(row, col, indent, header = character) %>%
mutate(indent = if_else(is.na(indent), 0L, indent)) %>%
split(.$indent) %>%
map(~ select(.x, -indent))
c(list(select(top_left_header_cell, row, col, header = character)), out)
}
get_col_headers <- function(cells, top_left_header_cell, top_left_data_cell) {
# Column header hierarchy is given by the row number
headers <-
cells %>%
filter(col >= 2,
between(row,
top_left_header_cell$row,
top_left_data_cell$row - 1),
!is.na(character)) %>%
select(row, col, header = character)
# Fill down, fill right, and split by row into separate data frames
headers %>%
expand(row, col) %>%
left_join(headers, by = c("row", "col")) %>%
group_by(col) %>%
arrange(col, row) %>%
fill(header) %>%
group_by(row) %>%
arrange(row, col) %>%
fill(header) %>%
ungroup() %>%
split(.$row)
}
# Function to find the data cells
get_data_cells <- function(cells,
top_left_data_cell,
bottom_cell,
row_headers,
col_headers) {
# Data cells are at the intersection of row and column headers
cells %>%
filter(col >= 2,
between(row,
top_left_data_cell$row,
bottom_cell$row)) %>%
semi_join(bind_rows(row_headers), by = "row") %>%
semi_join(bind_rows(col_headers), by = "col") %>%
filter(!is.na(numeric) | character %in% c("..C", "...")) %>%
select(row, col, value = numeric)
}
# Function to match data cells to their nearest headers in each tier
match_headers <- function(datacells, row_headers, col_headers) {
out <- datacells
for (i in seq_along(row_headers)) {
out <- WNW(out, row_headers[[i]])
}
for (i in seq_along(col_headers)) {
out <- NNW(out, col_headers[[i]])
}
new_colnames <-
c("row", "col", "value",
paste0("col_header", rev(seq_along(col_headers))),
paste0("row_header", seq_along(row_headers)))
datacells <- arrange(datacells, row, col)
colnames(out) <- new_colnames
out
}
# Function to import the table on a worksheet
tidy_data <- function(cells) {
top_left_header_cell <- get_top_left_header_cell(cells, top_border_ids)
top_left_data_cell <- get_top_left_data_cell(cells, top_left_header_cell)
bottom_cell <- get_bottom_cell(cells, bottom_border_ids)
row_headers <- get_row_headers(cells, top_left_header_cell, bottom_cell)
col_headers <- get_col_headers(cells, top_left_header_cell, top_left_data_cell)
data_cells <- get_data_cells(cells, top_left_data_cell, bottom_cell, row_headers, col_headers)
out <- match_headers(data_cells, row_headers, col_headers)
out$title <- filter(cells, row == 3, col == 1)$character
out$dimensions <- filter(cells, row == 4, col == 1)$character
out$population <- filter(cells, row == 5, col == 1)$character
out
}
# Import all sheets
nzcensus2013 <-
datasheets %>%
nest(-sheet) %>%
mutate(tidy_data = map(data, tidy_data)) %>%
select(sheet, tidy_data) %>%
unnest() %>%
# Sort the columns by name, apart from sheet, row, col and value
select(sort(colnames(.))) %>%
select(sheet, title, dimensions, population, row, col, value, everything())
devtools::use_data(nzcensus2013, overwrite = TRUE)