Skip to content

Commit 99720e1

Browse files
committedFeb 28, 2025
feat: adds model (de)chunker
1 parent 300e24d commit 99720e1

File tree

2 files changed

+222
-0
lines changed

2 files changed

+222
-0
lines changed
 

‎R/model-de-chunker.R

+221
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
###
2+
# Large text file (de)chunker
3+
# Zane
4+
# 2024-06-30
5+
# Updated 2025-02-28
6+
# Takes a large file (typically Rds, qs, or qs2) and splits it into multiple
7+
# smaller files which are sufficiently small to be hosted on GitHub.
8+
# Those smaller files can then be recombined into the original full file on
9+
# the user-end and loadly correctly, preventing the need for a way to distribute
10+
# large model files as one huge file.
11+
# NOTE that GitHub repoistories must still be less than 5 GB in size, and they
12+
# sometimes get mad if your repo is more than 2 or so GB. So this is only
13+
# for model files of an intermediate size.
14+
###
15+
16+
# Utility function that, given a file path, returns the entire file path
17+
# without the final component that has a file extension. The point is to
18+
# get the full directory path that a file should live in.
19+
get_directory_path <- function(file_path) {
20+
# Check if the file path contains an extension
21+
if (!grepl("\\.[^\\\\/]+$", file_path)) {
22+
stop("Error: File name does not contain an extension.")
23+
}
24+
25+
# Extract the directory path
26+
directory_path <- dirname(file_path)
27+
return(directory_path)
28+
}
29+
30+
# We need to run a shell command for this, and the way we have to do that
31+
# is different or Windows vs unix-based OS. So this function will detect the
32+
# OS that an individual is using. If it isn't Windows or Unix, the user will
33+
# get an error and the (de)chunker won't work.
34+
detect_os <- function(verbose = TRUE) {
35+
# This part does the entire OS detection and seems to be the most stable
36+
# or preferred way to do that.
37+
os_detected <- .Platform$OS.type
38+
39+
# This part prints a nicely formatted message if the verbose argument is
40+
# set to TRUE.
41+
if (isTRUE(verbose)) {
42+
if (os_detected == "windows") {
43+
crayon::green("Detected Windows OS.") |> rlang::inform()
44+
} else if (os_detected == "unix") {
45+
crayon::yellow("Detected a unix OS.") |> rlang::inform()
46+
} else {
47+
rlang::abort(paste0(
48+
"OS detected isn't 'windows' or 'unix'!\nThis code isn't configured",
49+
" to work with your OS. Sorry!"
50+
))
51+
}
52+
}
53+
54+
invisible(os_detected)
55+
}
56+
57+
# This function constructs a shell call that is appropriate for the system
58+
# OS. The system OS is passed as the first argument, so it should be detected
59+
# before calling this function or passed manually.
60+
# The remaining arguments are the arguments for the shell call, which are the
61+
# same regardless of OS for the purposes of the model (de)chunker.
62+
# We also implement error handling via purrr::possibly(). The "otherwise" value
63+
# in purrr possibly is set to -1 because this is the error code normally
64+
# returned by a shell() or system() invocation if a system-level error occurs.
65+
os_call <- function(os_detected, ...) {
66+
67+
# Construct the function call based on the OS
68+
if (os_detected == "windows") {
69+
FUN <- shell
70+
} else if (os_detected == "unix") {
71+
FUN <- system
72+
} else {
73+
rlang::abort(paste0(
74+
"OS detected isn't 'windows' or 'unix'!\nThis code isn't configured",
75+
" to work with your OS. Sorry!"
76+
))
77+
}
78+
79+
# Add error handling to the call
80+
fun_possibly <- purrr::possibly(FUN, otherwise = -1)
81+
82+
# Run the call
83+
out <- fun_possibly(...)
84+
85+
invisible(out)
86+
}
87+
88+
# This function is the model chunker.
89+
# The two arguments are both character vectors that should be equal length.
90+
# It takes each file specified in the vector "files_to_chunk", separates it
91+
# into smaller chunks with a fixed memory size, and saves all of the chunks to
92+
# the DIRECTORY PATH at the same position specified in
93+
# "destination_directories".
94+
# This invokes the bash command "split" which is available natively on both
95+
# windows and MacOS. If you're on a different OS you might need to install it
96+
# or something, I don't know.
97+
# You can learn how the "split" command works here:
98+
# https://web.archive.org/web/20250228173733/https://man7.org/linux/man-pages/man1/split.1.html
99+
# or by googling it.
100+
# Note that the backslash character \ indicates an escape character in an
101+
# R string (https://en.wikipedia.org/wiki/Escape_character).
102+
model_split <- function(files_to_chunk, destination_directories) {
103+
# Detect the OS and message user about it
104+
this_os <- detect_os()
105+
106+
# Validate the input arguments
107+
if (!is.character(files_to_chunk)) {
108+
rlang::abort(paste0(
109+
"'current_location' should be a character vector, not an object of",
110+
" class ", class(files_to_chunk), "."
111+
))
112+
}
113+
114+
if (!is.character(destination_directories)) {
115+
rlang::abort(paste0(
116+
"'destination_location' should be a character vector, not an object of",
117+
" class ", class(destination_directories), "."
118+
))
119+
}
120+
121+
if (length(files_to_chunk) != length(destination_directories)) {
122+
rlang::abort(paste0(
123+
"'current_location' and 'destination_location' should be ",
124+
"the same length."
125+
))
126+
}
127+
128+
for (i in seq_along(files_to_chunk)) {
129+
# Make sure a directory exists to save the splitted models
130+
dir.create(
131+
destination_directories[[i]],
132+
showWarnings = FALSE, recursive = TRUE
133+
)
134+
135+
# Create a string for the system command that uses the right file names
136+
split_cmd <-
137+
paste0(
138+
# Split command and options
139+
"split -b 25m ",
140+
# Current location of file
141+
"\"", files_to_chunk[[i]], "\" ",
142+
# Directory the splitted files will go to
143+
"\"", destination_directories[[i]], "/\""
144+
)
145+
146+
# Invoke the bash system command
147+
res <- os_call(this_os, split_cmd)
148+
149+
# Message if successful -- i.e. we get error code 0.
150+
if (res == 0) message(paste0("Successfully chunked model ", i, "."))
151+
}
152+
153+
invisible(res)
154+
}
155+
156+
# This function is the model dechunker.
157+
# The two arguments are both character vectors that should be equal length.
158+
# Each directory specified in the vector "chunk_directories" should be a
159+
# directory created by invoking model_cat() that contains the individual
160+
# chunks from a splitted file.
161+
# This function recombines then and saves the combined file to the file name
162+
# in the same position in the vector "destination_file".
163+
# This invokes the bash command "cat" which is available natively on both
164+
# windows and MacOS. If you're on a different OS you might need to install it
165+
# or something, I don't know.
166+
# You can learn how the "cat" command works here:
167+
# https://web.archive.org/web/20250228174620/https://www.man7.org/linux/man-pages/man1/cat.1.html
168+
# or by googling it.
169+
model_cat <- function(chunk_directories, destination_file) {
170+
# Detect the OS and message user about it
171+
this_os <- detect_os()
172+
173+
# Validate the input arguments
174+
if (!is.character(chunk_directories)) {
175+
rlang::abort(paste0(
176+
"'current_location' should be a character vector, not an object of",
177+
" class ", class(chunk_directories), "."
178+
))
179+
}
180+
181+
if (!is.character(destination_file)) {
182+
rlang::abort(paste0(
183+
"'destination_location' should be a character vector, not an object of",
184+
" class ", class(destination_file), "."
185+
))
186+
}
187+
188+
if (length(chunk_directories) != length(destination_file)) {
189+
rlang::abort(paste0(
190+
"'current_location' and 'destination_location' should be ",
191+
"the same length."
192+
))
193+
}
194+
195+
for (i in seq_along(chunk_directories)) {
196+
# Make sure a directory exists to save the splitted models
197+
dir.create(
198+
get_directory_path(destination_file[[i]]),
199+
showWarnings = FALSE, recursive = TRUE
200+
)
201+
202+
# Create a string for the system command
203+
cat_cmd <-
204+
paste0(
205+
# Split command and options
206+
"cat ",
207+
# Current location of file
208+
"\"", paste0(chunk_directories[[i]]), "\"/* > ",
209+
# Directory the splitted files will go to
210+
"\"", paste0(destination_file[[i]]), "\""
211+
)
212+
213+
# Invoke the bash system command
214+
res <- os_call(this_os, cat_cmd)
215+
216+
# Message if successful -- i.e. we get error code 0.
217+
if (res == 0) message(paste0("Successfully dechunked model ", i, "."))
218+
}
219+
220+
invisible(res)
221+
}

‎hgp.Rproj

+1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
Version: 1.0
2+
ProjectId: f6d6f985-f920-40c5-895b-0bf8fa3a3c55
23

34
RestoreWorkspace: Default
45
SaveWorkspace: Default

0 commit comments

Comments
 (0)
Please sign in to comment.