Add local-only preprocessor cache AKA "direct mode" #1882

Merged: 15 commits, Oct 30, 2023
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -32,6 +32,7 @@ bincode = "1"
blake3 = "1"
byteorder = "1.0"
bytes = "1"
chrono = "0.4"
clap = { version = "4.1.11", features = ["derive", "env", "wrap_help"] }
directories = "5.0.0"
encoding = "0.2"
@@ -53,6 +54,7 @@ jwt = { package = "jsonwebtoken", version = "8", optional = true }
libc = "0.2.140"
linked-hash-map = "0.5"
log = "0.4"
memchr = "2"
num_cpus = "1.15"
number_prefix = "0.4"
once_cell = "1.17"
14 changes: 13 additions & 1 deletion docs/Caching.md
@@ -24,7 +24,7 @@ In parallel, we also take into account in the hash:

For C/C++, the hash is generated with a blake3 digest of the preprocessed
file (-E with gcc/clang). For compilations that specify multiple `-arch` flags,
these flags are rewritten to their corresponding preprocessor defines to allow
pre-processing the file (e.g. `-arch x86_64` is rewritten to `-D__X86_64__=1`).
This can be enabled by setting the environment variable
`SCCACHE_CACHE_MULTIARCH`, but it is disabled by default as it may not work in all
@@ -43,3 +43,15 @@ We also take into account in the hash:
* Color mode
* Environment variables
See https://github.com/mozilla/sccache/blob/8567bbe2ba493153e76177c1f9a6f98cc7ba419f/src/compiler/c.rs#L84

### C/C++ preprocessor

In "preprocessor cache mode", [explained in the local doc](Local.md), an
extra key is computed to cache the preprocessor output itself. It is very close
to the C/C++ compiler one, but with additional elements:

* The path of the input file
* The hash of the input file

Note that some compiler options can disable preprocessor cache mode. As of this
writing, only `-Xpreprocessor` and `-Wp,*` do.
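
For illustration only, here is a minimal sketch of how such a key could be assembled with the `blake3` crate (the hasher sccache already uses for its digests); the exact fields, ordering, and encoding of the real implementation may differ:

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Illustrative only: combine the material that already feeds the C/C++
/// compiler hash with the input file's path and contents, as listed above.
fn preprocessor_cache_key(compiler_hash_material: &[u8], input: &Path) -> io::Result<String> {
    let mut hasher = blake3::Hasher::new();
    // Everything the compiler hash already covers (arguments, env vars, ...).
    hasher.update(compiler_hash_material);
    // The path of the input file.
    hasher.update(input.to_string_lossy().as_bytes());
    // The hash of the input file (here simply fed in as its raw contents).
    hasher.update(&fs::read(input)?);
    Ok(hasher.finalize().to_hex().to_string())
}
```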
16 changes: 16 additions & 0 deletions docs/Configuration.md
@@ -24,6 +24,21 @@ token = "secrettoken"
dir = "/tmp/.cache/sccache"
size = 7516192768 # 7 GiBytes

# See the local docs for more explanation of this mode
[cache.disk.preprocessor_cache_mode]
# Whether to use the preprocessor cache mode
use_preprocessor_cache_mode = true
# Whether to use file times to check for changes
file_stat_matches = true
# Whether to also use ctime (file status change) time to check for changes
use_ctime_for_stat = true
# Whether to ignore `__TIME__` when caching
ignore_time_macros = false
# Whether to skip (meaning not cache, only hash) system headers
skip_system_headers = false
# Whether to hash the current working directory
hash_working_directory = true

[cache.gcs]
# optional oauth url
oauth_url = "..."
@@ -84,6 +99,7 @@ configuration variables

* `SCCACHE_DIR` local on disk artifact cache directory
* `SCCACHE_CACHE_SIZE` maximum size of the local on disk cache, e.g. `2G` - default is 10G
* `SCCACHE_PREPROCESSOR_MODE` enable/disable preprocessor caching (see [the local doc](Local.md))
Collaborator

it should be `SCCACHE_DIRECT`, no?


#### s3 compatible

33 changes: 33 additions & 0 deletions docs/Local.md
@@ -5,3 +5,36 @@ sccache defaults to using local disk storage. You can set the `SCCACHE_DIR` envi
The default cache size is 10 gigabytes. To change this, set `SCCACHE_CACHE_SIZE`, for example `SCCACHE_CACHE_SIZE="1G"`.

The local storage only supports a single sccache server at a time. Multiple concurrent servers will race and cause spurious build failures.

## Preprocessor cache mode

This is inspired by [ccache's direct mode](https://ccache.dev/manual/3.7.9.html#_the_direct_mode) and works roughly the same.

In preprocessor cache mode, sccache caches the preprocessor step for C/C++ whenever possible. This can make the compilation a lot faster, since the preprocessor accounts for a non-negligible amount of time in the entire compile chain.

In order to cache the preprocessor step, sccache needs to remember, among other things, all files included by the given input file. To quote ccache's documentation:

> There is a catch with the [preprocessor cache] mode: header files that were used by the compiler are recorded, but header files that were not used, but would have been used if they existed, are not. So, when [sccache] checks if a result can be taken from the cache, it currently can’t check if the existence of a new header file should invalidate the result. In practice, the [preprocessor cache] mode is safe to use in the absolute majority of cases.

Preprocessor cache mode will be disabled if any of the following holds:

- the configuration setting `use_preprocessor_cache_mode` is false
- a modification time of one of the include files is too new (needed to avoid a race condition)
- a compiler option not supported by the preprocessor cache mode is used. Currently this is only `-Xpreprocessor` and `-Wp,*`, but the list may grow if/when sccache handles more compiler options.
- the string `__TIME__` is present in the source code

Configuration options and their default values:

- `use_preprocessor_cache_mode`: `false`. Whether to use preprocessor cache mode at all.
- `file_stat_matches`: `false`. If false, only compare header files by hashing their contents. If true, will use size + ctime + mtime to check whether a file has changed. See other flags below for more control over this behavior.
- `use_ctime_for_stat`: `true`. If true, uses the ctime (file status change on UNIX, creation time on Windows) to check that a file has/hasn't changed. Can be useful to disable when backdating modification times in a controlled manner.

- `ignore_time_macros`: `false`. If true, ignore `__DATE__`, `__TIME__` and `__TIMESTAMP__` being present in the source code. Will speed up preprocessor cache mode, but can result in false positives.

- `skip_system_headers`: `false`. If true, preprocessor cache mode will not cache system headers, only add them to the hash.

- `hash_working_directory`: `true`. If true, will add the current working directory in the hash to distinguish two compilations from different directories.

See where to write the config in [the configuration doc](Configuration.md).

*Note that preprocessor caching is currently only implemented for GCC and Clang and when using local storage.*
79 changes: 76 additions & 3 deletions src/cache/cache.rs
@@ -27,6 +27,7 @@ use crate::cache::redis::RedisCache;
use crate::cache::s3::S3Cache;
#[cfg(feature = "webdav")]
use crate::cache::webdav::WebdavCache;
use crate::compiler::PreprocessorCacheEntry;
use crate::config::Config;
#[cfg(any(
feature = "azure",
@@ -40,6 +41,7 @@ use crate::config::Config;
use crate::config::{self, CacheType};
use async_trait::async_trait;
use fs_err as fs;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::io::{self, Cursor, Read, Seek, Write};
use std::path::{Path, PathBuf};
@@ -361,6 +363,72 @@ pub trait Storage: Send + Sync {

/// Get the maximum storage size, if applicable.
async fn max_size(&self) -> Result<Option<u64>>;

/// Return the config for preprocessor cache mode if applicable
fn preprocessor_cache_mode_config(&self) -> PreprocessorCacheModeConfig {
// Disabled by default, only enabled in local mode
PreprocessorCacheModeConfig::default()
}
/// Return the preprocessor cache entry for a given preprocessor key,
/// if it exists.
/// Only applicable when using preprocessor cache mode.
fn get_preprocessor_cache_entry(
&self,
_key: &str,
) -> Result<Option<Box<dyn crate::lru_disk_cache::ReadSeek>>> {
Ok(None)
}
/// Insert a preprocessor cache entry at the given preprocessor key,
/// overwriting the entry if it exists.
/// Only applicable when using preprocessor cache mode.
fn put_preprocessor_cache_entry(
&self,
_key: &str,
_preprocessor_cache_entry: PreprocessorCacheEntry,
) -> Result<()> {
Ok(())
}
}

/// Configuration switches for preprocessor cache mode.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[serde(default)]
pub struct PreprocessorCacheModeConfig {
/// Whether to use preprocessor cache mode entirely
pub use_preprocessor_cache_mode: bool,
/// If false (default), only compare header files by hashing their contents.
/// If true, will use size + ctime + mtime to check whether a file has changed.
/// See other flags below for more control over this behavior.
pub file_stat_matches: bool,
/// If true (default), uses the ctime (file status change on UNIX,
/// creation time on Windows) to check that a file has/hasn't changed.
/// Can be useful to disable when backdating modification times
/// in a controlled manner.
pub use_ctime_for_stat: bool,
/// If true, ignore `__DATE__`, `__TIME__` and `__TIMESTAMP__` being present
/// in the source code. Will speed up preprocessor cache mode,
/// but can result in false positives.
pub ignore_time_macros: bool,
/// If true, preprocessor cache mode will not cache system headers, only
/// add them to the hash.
pub skip_system_headers: bool,
/// If true (default), will add the current working directory in the hash to
/// distinguish two compilations from different directories.
pub hash_working_directory: bool,
}

impl Default for PreprocessorCacheModeConfig {
fn default() -> Self {
Self {
use_preprocessor_cache_mode: false,
file_stat_matches: false,
use_ctime_for_stat: true,
ignore_time_macros: false,
skip_system_headers: false,
hash_working_directory: true,
}
}
}

/// Implement storage for operator.
@@ -455,8 +523,7 @@ impl Storage for opendal::Operator {
}

/// Normalize key `abcdef` into `a/b/c/abcdef`
#[allow(dead_code)]
fn normalize_key(key: &str) -> String {
pub(in crate::cache) fn normalize_key(key: &str) -> String {
format!("{}/{}/{}/{}", &key[0..1], &key[1..2], &key[2..3], &key)
}

@@ -574,8 +641,14 @@ pub fn storage_from_config(
}

let (dir, size) = (&config.fallback_cache.dir, config.fallback_cache.size);
let preprocessor_cache_mode_config = config.fallback_cache.preprocessor_cache_mode;
debug!("Init disk cache with dir {:?}, size {}", dir, size);
Ok(Arc::new(DiskCache::new(dir, size, pool)))
Ok(Arc::new(DiskCache::new(
dir,
size,
pool,
preprocessor_cache_mode_config,
)))
}

#[cfg(test)]
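The new `Storage` methods above are deliberately no-ops by default, so only backends that override them (in this PR, the local disk cache) participate in preprocessor cache mode. As a rough, hedged sketch of how a caller might enable the mode while keeping the other defaults (this helper is hypothetical, not part of the PR):

```rust
// Hypothetical helper; assumes `PreprocessorCacheModeConfig` from
// src/cache/cache.rs (added above) is in scope.
fn enabled_preprocessor_cache_config() -> PreprocessorCacheModeConfig {
    PreprocessorCacheModeConfig {
        use_preprocessor_cache_mode: true,
        // Everything else keeps its `Default` value: ctime checks stay on,
        // time macros are still honored, system headers are still cached,
        // and the working directory is hashed.
        ..Default::default()
    }
}
```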
43 changes: 42 additions & 1 deletion src/cache/disk.rs
@@ -13,8 +13,9 @@
// limitations under the License.

use crate::cache::{Cache, CacheRead, CacheWrite, Storage};
use crate::lru_disk_cache::Error as LruError;
use crate::compiler::PreprocessorCacheEntry;
use crate::lru_disk_cache::LruDiskCache;
use crate::lru_disk_cache::{Error as LruError, ReadSeek};
use async_trait::async_trait;
use std::ffi::{OsStr, OsString};
use std::path::{Path, PathBuf};
@@ -23,6 +24,8 @@ use std::time::{Duration, Instant};

use crate::errors::*;

use super::{normalize_key, PreprocessorCacheModeConfig};

enum LazyDiskCache {
Uninit { root: OsString, max_size: u64 },
Init(LruDiskCache),
@@ -60,6 +63,8 @@ pub struct DiskCache {
lru: Arc<Mutex<LazyDiskCache>>,
/// Thread pool to execute disk I/O
pool: tokio::runtime::Handle,
preprocessor_cache_mode_config: PreprocessorCacheModeConfig,
preprocessor_cache: Arc<Mutex<LazyDiskCache>>,
}

impl DiskCache {
@@ -68,13 +73,21 @@ impl DiskCache {
root: T,
max_size: u64,
pool: &tokio::runtime::Handle,
preprocessor_cache_mode_config: PreprocessorCacheModeConfig,
) -> DiskCache {
DiskCache {
lru: Arc::new(Mutex::new(LazyDiskCache::Uninit {
root: root.as_ref().to_os_string(),
max_size,
})),
pool: pool.clone(),
preprocessor_cache_mode_config,
preprocessor_cache: Arc::new(Mutex::new(LazyDiskCache::Uninit {
root: Path::new(root.as_ref())
.join("preprocessor")
.into_os_string(),
max_size,
})),
}
}
}
@@ -140,4 +153,32 @@ impl Storage for DiskCache {
async fn max_size(&self) -> Result<Option<u64>> {
Ok(self.lru.lock().unwrap().get().map(|l| l.capacity()))
}
fn preprocessor_cache_mode_config(&self) -> PreprocessorCacheModeConfig {
self.preprocessor_cache_mode_config
}
fn get_preprocessor_cache_entry(&self, key: &str) -> Result<Option<Box<dyn ReadSeek>>> {
let key = normalize_key(key);
Ok(self
.preprocessor_cache
.lock()
.unwrap()
.get_or_init()?
.get(key)
.ok())
}
fn put_preprocessor_cache_entry(
&self,
key: &str,
preprocessor_cache_entry: PreprocessorCacheEntry,
) -> Result<()> {
let key = normalize_key(key);
let mut buf = vec![];
preprocessor_cache_entry.serialize_to(&mut buf)?;
Collaborator

It's weird that you do a mix of amends and additional commits to fixup things.

Contributor Author

Yeah, sorry, I usually always do amends, but I was starting to take too much time on history editing. I should probably stick to new commits in this case, unless otherwise asked (like with the time macros thing above).

Ok(self
.preprocessor_cache
.lock()
.unwrap()
.get_or_init()?
.insert_bytes(key, &buf)?)
}
}
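`DiskCache::new` now takes the preprocessor cache configuration as a fourth argument, as the `storage_from_config` change above shows. Below is a rough construction sketch from inside the crate; the path, size, and error type are placeholders rather than values taken from the PR:

```rust
// Hypothetical example; assumes `DiskCache` and `PreprocessorCacheModeConfig`
// are in scope inside the crate.
fn make_disk_cache() -> anyhow::Result<DiskCache> {
    let runtime = tokio::runtime::Runtime::new()?;
    Ok(DiskCache::new(
        "/tmp/sccache-cache",    // cache root; preprocessor entries go in its "preprocessor" subdirectory
        10 * 1024 * 1024 * 1024, // 10 GiB, matching the documented default size
        runtime.handle(),
        PreprocessorCacheModeConfig {
            use_preprocessor_cache_mode: true,
            ..Default::default()
        },
    ))
}
```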
7 changes: 7 additions & 0 deletions src/cmdline.rs
@@ -84,6 +84,7 @@ pub enum Command {
/// The environment variables to use for execution.
env_vars: Vec<(OsString, OsString)>,
},
DebugPreprocessorCacheEntries,
}

fn flag_infer_long_and_short(name: &'static str) -> Arg {
@@ -130,6 +131,9 @@ fn get_clap_command() -> clap::Command {
flag_infer_long("start-server")
.help("start background server")
.action(ArgAction::SetTrue),
flag_infer_long("debug-preprocessor-cache")
.help("show all preprocessor cache entries")
.action(ArgAction::SetTrue),
flag_infer_long("stop-server")
.help("stop background server")
.action(ArgAction::SetTrue),
@@ -161,6 +165,7 @@
ArgGroup::new("one_and_only_one")
.args([
"dist-auth",
"debug-preprocessor-cache",
"dist-status",
"show-stats",
"show-adv-stats",
@@ -259,6 +264,8 @@ pub fn try_parse() -> Result<Command> {
Ok(Command::ShowStats(fmt, true))
} else if matches.get_flag("start-server") {
Ok(Command::StartServer)
} else if matches.get_flag("debug-preprocessor-cache") {
Ok(Command::DebugPreprocessorCacheEntries)
} else if matches.get_flag("stop-server") {
Ok(Command::StopServer)
} else if matches.get_flag("zero-stats") {
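With the flag added to the `one_and_only_one` group above, running `sccache --debug-preprocessor-cache` parses into `Command::DebugPreprocessorCacheEntries`; its handler, added to `src/commands.rs` below, walks the `preprocessor` subdirectory of the default disk cache directory and pretty-prints every stored entry.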
21 changes: 20 additions & 1 deletion src/commands.rs
@@ -15,7 +15,7 @@
use crate::client::{connect_to_server, connect_with_retry, ServerConnection};
use crate::cmdline::{Command, StatsFormat};
use crate::compiler::ColorMode;
use crate::config::Config;
use crate::config::{default_disk_cache_dir, Config};
use crate::jobserver::Client;
use crate::mock_command::{CommandChild, CommandCreatorSync, ProcessCommandCreator, RunCommand};
use crate::protocol::{Compile, CompileFinished, CompileResponse, Request, Response};
@@ -37,6 +37,7 @@ use std::time::Duration;
use strip_ansi_escapes::Writer;
use tokio::io::AsyncReadExt;
use tokio::runtime::Runtime;
use walkdir::WalkDir;
use which::which_in;

use crate::errors::*;
@@ -613,6 +614,24 @@ pub fn run_command(cmd: Command) -> Result<i32> {
StatsFormat::Json => serde_json::to_writer(&mut io::stdout(), &stats)?,
}
}
Command::DebugPreprocessorCacheEntries => {
trace!("Command::DebugPreprocessorCacheEntries");
let entries_dir = default_disk_cache_dir().join("preprocessor");
for entry in WalkDir::new(entries_dir).sort_by_file_name().into_iter() {
let preprocessor_cache_entry_file = entry?;
let path = preprocessor_cache_entry_file.path();
if !path.is_file() {
continue;
}
println!("=========================");
println!("Showing preprocessor entry file {}", &path.display());
let contents = std::fs::read(path)?;
let preprocessor_cache_entry =
crate::compiler::PreprocessorCacheEntry::read(&contents)?;
println!("{:#?}", preprocessor_cache_entry);
println!("=========================");
}
}
Command::InternalStartServer => {
trace!("Command::InternalStartServer");
if env::var("SCCACHE_ERROR_LOG").is_ok() {