|
| 1 | +### |
| 2 | +# Large text file (de)chunker |
| 3 | +# Zane |
| 4 | +# 2024-06-30 |
| 5 | +# Updated 2025-02-28 |
| 6 | +# Takes a large file (typically Rds, qs, or qs2) and splits it into multiple |
| 7 | +# smaller files which are sufficiently small to be hosted on GitHub. |
| 8 | +# Those smaller files can then be recombined into the original full file on |
| 9 | +# the user-end and loadly correctly, preventing the need for a way to distribute |
| 10 | +# large model files as one huge file. |
| 11 | +# NOTE that GitHub repoistories must still be less than 5 GB in size, and they |
| 12 | +# sometimes get mad if your repo is more than 2 or so GB. So this is only |
| 13 | +# for model files of an intermediate size. |
| 14 | +### |
| 15 | + |
| 16 | +# Utility function that, given a file path, returns the entire file path |
| 17 | +# without the final component that has a file extension. The point is to |
| 18 | +# get the full directory path that a file should live in. |
| 19 | +get_directory_path <- function(file_path) { |
| 20 | + # Check if the file path contains an extension |
| 21 | + if (!grepl("\\.[^\\\\/]+$", file_path)) { |
| 22 | + stop("Error: File name does not contain an extension.") |
| 23 | + } |
| 24 | + |
| 25 | + # Extract the directory path |
| 26 | + directory_path <- dirname(file_path) |
| 27 | + return(directory_path) |
| 28 | +} |
| 29 | + |
| 30 | +# We need to run a shell command for this, and the way we have to do that |
| 31 | +# is different or Windows vs unix-based OS. So this function will detect the |
| 32 | +# OS that an individual is using. If it isn't Windows or Unix, the user will |
| 33 | +# get an error and the (de)chunker won't work. |
| 34 | +detect_os <- function(verbose = TRUE) { |
| 35 | + # This part does the entire OS detection and seems to be the most stable |
| 36 | + # or preferred way to do that. |
| 37 | + os_detected <- .Platform$OS.type |
| 38 | + |
| 39 | + # This part prints a nicely formatted message if the verbose argument is |
| 40 | + # set to TRUE. |
| 41 | + if (isTRUE(verbose)) { |
| 42 | + if (os_detected == "windows") { |
| 43 | + crayon::green("Detected Windows OS.") |> rlang::inform() |
| 44 | + } else if (os_detected == "unix") { |
| 45 | + crayon::yellow("Detected a unix OS.") |> rlang::inform() |
| 46 | + } else { |
| 47 | + rlang::abort(paste0( |
| 48 | + "OS detected isn't 'windows' or 'unix'!\nThis code isn't configured", |
| 49 | + " to work with your OS. Sorry!" |
| 50 | + )) |
| 51 | + } |
| 52 | + } |
| 53 | + |
| 54 | + invisible(os_detected) |
| 55 | +} |
| 56 | + |
| 57 | +# This function constructs a shell call that is appropriate for the system |
| 58 | +# OS. The system OS is passed as the first argument, so it should be detected |
| 59 | +# before calling this function or passed manually. |
| 60 | +# The remaining arguments are the arguments for the shell call, which are the |
| 61 | +# same regardless of OS for the purposes of the model (de)chunker. |
| 62 | +# We also implement error handling via purrr::possibly(). The "otherwise" value |
| 63 | +# in purrr possibly is set to -1 because this is the error code normally |
| 64 | +# returned by a shell() or system() invocation if a system-level error occurs. |
| 65 | +os_call <- function(os_detected, ...) { |
| 66 | + |
| 67 | + # Construct the function call based on the OS |
| 68 | + if (os_detected == "windows") { |
| 69 | + FUN <- shell |
| 70 | + } else if (os_detected == "unix") { |
| 71 | + FUN <- system |
| 72 | + } else { |
| 73 | + rlang::abort(paste0( |
| 74 | + "OS detected isn't 'windows' or 'unix'!\nThis code isn't configured", |
| 75 | + " to work with your OS. Sorry!" |
| 76 | + )) |
| 77 | + } |
| 78 | + |
| 79 | + # Add error handling to the call |
| 80 | + fun_possibly <- purrr::possibly(FUN, otherwise = -1) |
| 81 | + |
| 82 | + # Run the call |
| 83 | + out <- fun_possibly(...) |
| 84 | + |
| 85 | + invisible(out) |
| 86 | +} |
| 87 | + |
| 88 | +# This function is the model chunker. |
| 89 | +# The two arguments are both character vectors that should be equal length. |
| 90 | +# It takes each file specified in the vector "files_to_chunk", separates it |
| 91 | +# into smaller chunks with a fixed memory size, and saves all of the chunks to |
| 92 | +# the DIRECTORY PATH at the same position specified in |
| 93 | +# "destination_directories". |
| 94 | +# This invokes the bash command "split" which is available natively on both |
| 95 | +# windows and MacOS. If you're on a different OS you might need to install it |
| 96 | +# or something, I don't know. |
| 97 | +# You can learn how the "split" command works here: |
| 98 | +# https://web.archive.org/web/20250228173733/https://man7.org/linux/man-pages/man1/split.1.html |
| 99 | +# or by googling it. |
| 100 | +# Note that the backslash character \ indicates an escape character in an |
| 101 | +# R string (https://en.wikipedia.org/wiki/Escape_character). |
| 102 | +model_split <- function(files_to_chunk, destination_directories) { |
| 103 | + # Detect the OS and message user about it |
| 104 | + this_os <- detect_os() |
| 105 | + |
| 106 | + # Validate the input arguments |
| 107 | + if (!is.character(files_to_chunk)) { |
| 108 | + rlang::abort(paste0( |
| 109 | + "'current_location' should be a character vector, not an object of", |
| 110 | + " class ", class(files_to_chunk), "." |
| 111 | + )) |
| 112 | + } |
| 113 | + |
| 114 | + if (!is.character(destination_directories)) { |
| 115 | + rlang::abort(paste0( |
| 116 | + "'destination_location' should be a character vector, not an object of", |
| 117 | + " class ", class(destination_directories), "." |
| 118 | + )) |
| 119 | + } |
| 120 | + |
| 121 | + if (length(files_to_chunk) != length(destination_directories)) { |
| 122 | + rlang::abort(paste0( |
| 123 | + "'current_location' and 'destination_location' should be ", |
| 124 | + "the same length." |
| 125 | + )) |
| 126 | + } |
| 127 | + |
| 128 | + for (i in seq_along(files_to_chunk)) { |
| 129 | + # Make sure a directory exists to save the splitted models |
| 130 | + dir.create( |
| 131 | + destination_directories[[i]], |
| 132 | + showWarnings = FALSE, recursive = TRUE |
| 133 | + ) |
| 134 | + |
| 135 | + # Create a string for the system command that uses the right file names |
| 136 | + split_cmd <- |
| 137 | + paste0( |
| 138 | + # Split command and options |
| 139 | + "split -b 25m ", |
| 140 | + # Current location of file |
| 141 | + "\"", files_to_chunk[[i]], "\" ", |
| 142 | + # Directory the splitted files will go to |
| 143 | + "\"", destination_directories[[i]], "/\"" |
| 144 | + ) |
| 145 | + |
| 146 | + # Invoke the bash system command |
| 147 | + res <- os_call(this_os, split_cmd) |
| 148 | + |
| 149 | + # Message if successful -- i.e. we get error code 0. |
| 150 | + if (res == 0) message(paste0("Successfully chunked model ", i, ".")) |
| 151 | + } |
| 152 | + |
| 153 | + invisible(res) |
| 154 | +} |
| 155 | + |
| 156 | +# This function is the model dechunker. |
| 157 | +# The two arguments are both character vectors that should be equal length. |
| 158 | +# Each directory specified in the vector "chunk_directories" should be a |
| 159 | +# directory created by invoking model_cat() that contains the individual |
| 160 | +# chunks from a splitted file. |
| 161 | +# This function recombines then and saves the combined file to the file name |
| 162 | +# in the same position in the vector "destination_file". |
| 163 | +# This invokes the bash command "cat" which is available natively on both |
| 164 | +# windows and MacOS. If you're on a different OS you might need to install it |
| 165 | +# or something, I don't know. |
| 166 | +# You can learn how the "cat" command works here: |
| 167 | +# https://web.archive.org/web/20250228174620/https://www.man7.org/linux/man-pages/man1/cat.1.html |
| 168 | +# or by googling it. |
| 169 | +model_cat <- function(chunk_directories, destination_file) { |
| 170 | + # Detect the OS and message user about it |
| 171 | + this_os <- detect_os() |
| 172 | + |
| 173 | + # Validate the input arguments |
| 174 | + if (!is.character(chunk_directories)) { |
| 175 | + rlang::abort(paste0( |
| 176 | + "'current_location' should be a character vector, not an object of", |
| 177 | + " class ", class(chunk_directories), "." |
| 178 | + )) |
| 179 | + } |
| 180 | + |
| 181 | + if (!is.character(destination_file)) { |
| 182 | + rlang::abort(paste0( |
| 183 | + "'destination_location' should be a character vector, not an object of", |
| 184 | + " class ", class(destination_file), "." |
| 185 | + )) |
| 186 | + } |
| 187 | + |
| 188 | + if (length(chunk_directories) != length(destination_file)) { |
| 189 | + rlang::abort(paste0( |
| 190 | + "'current_location' and 'destination_location' should be ", |
| 191 | + "the same length." |
| 192 | + )) |
| 193 | + } |
| 194 | + |
| 195 | + for (i in seq_along(chunk_directories)) { |
| 196 | + # Make sure a directory exists to save the splitted models |
| 197 | + dir.create( |
| 198 | + get_directory_path(destination_file[[i]]), |
| 199 | + showWarnings = FALSE, recursive = TRUE |
| 200 | + ) |
| 201 | + |
| 202 | + # Create a string for the system command |
| 203 | + cat_cmd <- |
| 204 | + paste0( |
| 205 | + # Split command and options |
| 206 | + "cat ", |
| 207 | + # Current location of file |
| 208 | + "\"", paste0(chunk_directories[[i]]), "\"/* > ", |
| 209 | + # Directory the splitted files will go to |
| 210 | + "\"", paste0(destination_file[[i]]), "\"" |
| 211 | + ) |
| 212 | + |
| 213 | + # Invoke the bash system command |
| 214 | + res <- os_call(this_os, cat_cmd) |
| 215 | + |
| 216 | + # Message if successful -- i.e. we get error code 0. |
| 217 | + if (res == 0) message(paste0("Successfully dechunked model ", i, ".")) |
| 218 | + } |
| 219 | + |
| 220 | + invisible(res) |
| 221 | +} |
0 commit comments