Degrade gracefully when scanning a password-protected zip files (#39)

mmaitre314 · maitre-matt · web-flow · commit 75e60f2c02f3 · 2025-03-09T16:06:59.000-07:00
Co-authored-by: Matthieu Maitre &lt;mmaitre@microsoft.com&gt;
diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = picklescan
-version = 0.0.23
+version = 0.0.24
 author = Matthieu Maitre
 author_email = mmaitre314@users.noreply.github.com
 description = Security scanner detecting Python Pickle files performing suspicious actions
diff --git a/src/picklescan/scanner.py b/src/picklescan/scanner.py
@@ -160,6 +160,8 @@ def __str__(self) -> str:
 _pytorch_file_extensions = {".bin", ".pt", ".pth", ".ckpt"}
 _pickle_file_extensions = {".pkl", ".pickle", ".joblib", ".dat", ".data"}
 _zip_file_extensions = {".zip", ".npz", ".7z"}
+# Pickle files do not actually have magic bytes, but v2+ files
+# start with a PROTO (\x80) opcode followed by a byte with the protocol version
 _pickle_magic_bytes = {
     b"\x80\x00",
     b"\x80\x01",
@@ -168,6 +170,7 @@ def __str__(self) -> str:
     b"\x80\x04",
     b"\x80\x05",
 }
+_numpy_magic_bytes = b"\x93NUMPY"
 
 
 def _is_7z_file(f: IO[bytes]) -> bool:
@@ -364,37 +367,36 @@ def scan_7z_bytes(data: IO[bytes], file_id) -> ScanResult:
             return result
 
 
-def get_magic_bytes_from_zipfile(zip: zipfile.ZipFile, num_bytes=8):
-    magic_bytes = {}
-    for file_info in zip.infolist():
-        with zip.open(file_info.filename) as f:
-            magic_bytes[file_info.filename] = f.read(num_bytes)
-
-    return magic_bytes
-
-
 def scan_zip_bytes(data: IO[bytes], file_id) -> ScanResult:
     result = ScanResult([])
 
     with RelaxedZipFile(data, "r") as zip:
-        magic_bytes = get_magic_bytes_from_zipfile(zip)
         file_names = zip.namelist()
         _log.debug("Files in zip archive %s: %s", file_id, file_names)
         for file_name in file_names:
-            magic_number = magic_bytes.get(file_name, b"")
-            file_ext = os.path.splitext(file_name)[1]
-            if file_ext in _pickle_file_extensions or any(
-                magic_number.startswith(mn) for mn in _pickle_magic_bytes
-            ):
-                _log.debug("Scanning file %s in zip archive %s", file_name, file_id)
-                with zip.open(file_name, "r") as file:
-                    result.merge(scan_pickle_bytes(file, f"{file_id}:{file_name}"))
-            elif file_ext in _numpy_file_extensions or magic_number.startswith(
-                b"\x93NUMPY"
-            ):
-                _log.debug("Scanning file %s in zip archive %s", file_name, file_id)
+            try:
                 with zip.open(file_name, "r") as file:
-                    result.merge(scan_numpy(file, f"{file_id}:{file_name}"))
+                    magic_bytes = file.read(8)
+                file_ext = os.path.splitext(file_name)[1]
+
+                if file_ext in _pickle_file_extensions or any(
+                    magic_bytes.startswith(mn) for mn in _pickle_magic_bytes
+                ):
+                    _log.debug("Scanning file %s in zip archive %s", file_name, file_id)
+                    with zip.open(file_name, "r") as file:
+                        result.merge(scan_pickle_bytes(file, f"{file_id}:{file_name}"))
+
+                elif file_ext in _numpy_file_extensions or magic_bytes.startswith(
+                    _numpy_magic_bytes
+                ):
+                    _log.debug("Scanning file %s in zip archive %s", file_name, file_id)
+                    with zip.open(file_name, "r") as file:
+                        result.merge(scan_numpy(file, f"{file_id}:{file_name}"))
+            except (zipfile.BadZipFile, RuntimeError) as e:
+                # Log decompression issues (password protected, corrupted, etc.)
+                _log.warning(
+                    "Invalid file %s in zip archive %s: %s", file_name, file_id, str(e)
+                )
 
     return result
 
diff --git a/tests/data/benign_password_protected.zip b/tests/data/benign_password_protected.zip