|
7 | 7 | import os
|
8 | 8 | import pickletools
|
9 | 9 | from tarfile import TarError
|
| 10 | +from tempfile import TemporaryDirectory |
10 | 11 | from typing import IO, List, Optional, Set, Tuple
|
11 | 12 | import urllib.parse
|
12 | 13 | import zipfile
|
@@ -151,7 +152,23 @@ def __str__(self) -> str:
|
# File-extension sets used to pick the right scanner for a given file.
_numpy_file_extensions = {".npy"}  # Note: .npz is handled as zip files
_pytorch_file_extensions = {".bin", ".pt", ".pth", ".ckpt"}
_pickle_file_extensions = {".pkl", ".pickle", ".joblib", ".dat", ".data"}
# Archive-style containers; .7z entries are additionally confirmed by the
# 7z magic-byte check (_is_7z_file) rather than by the zipfile module.
_zip_file_extensions = {".zip", ".npz", ".7z"}
| 156 | + |
| 157 | + |
| 158 | +def _is_7z_file(f: IO[bytes]) -> bool: |
| 159 | + read_bytes = [] |
| 160 | + start = f.tell() |
| 161 | + |
| 162 | + byte = f.read(1) |
| 163 | + while byte != b"": |
| 164 | + read_bytes.append(byte) |
| 165 | + if len(read_bytes) == 6: |
| 166 | + break |
| 167 | + byte = f.read(1) |
| 168 | + f.seek(start) |
| 169 | + |
| 170 | + local_header_magic_number = [b"7", b"z", b"\xbc", b"\xaf", b"\x27", b"\x1c"] |
| 171 | + return read_bytes == local_header_magic_number |
155 | 172 |
|
156 | 173 |
|
157 | 174 | def _http_get(url) -> bytes:
|
@@ -307,12 +324,37 @@ def scan_pickle_bytes(data: IO[bytes], file_id, multiple_pickles=True) -> ScanRe
|
307 | 324 | return _build_scan_result_from_raw_globals(raw_globals, file_id)
|
308 | 325 |
|
309 | 326 |
|
# XXX: py7zr appears to offer no way to obtain a byte stream for a single
# member of a 7z archive, so we are forced to extract to disk before scanning.
def scan_7z_bytes(data: IO[bytes], file_id) -> ScanResult:
    """Scan every pickle-bearing member of a 7z archive.

    Members whose names end with a known pickle extension are extracted
    to a temporary directory and scanned individually; the merged
    ScanResult is returned. Requires the optional ``py7zr`` dependency.
    """
    try:
        import py7zr
    except ImportError:
        raise Exception(
            "py7zr is required to scan 7z archives, install picklescan using: 'pip install picklescan[7z]'"
        )

    aggregate = ScanResult([])

    with py7zr.SevenZipFile(data, mode="r") as archive:
        pickle_suffixes = tuple(_pickle_file_extensions)
        targets = [
            name for name in archive.getnames() if name.endswith(pickle_suffixes)
        ]
        _log.debug("Files in 7z archive %s: %s", file_id, targets)
        with TemporaryDirectory() as tmpdir:
            archive.extract(path=tmpdir, targets=targets)
            for member in targets:
                extracted_path = os.path.join(tmpdir, member)
                _log.debug("Scanning file %s in 7z archive %s", member, file_id)
                if os.path.isfile(extracted_path):
                    aggregate.merge(scan_file_path(extracted_path))

    return aggregate
| 350 | + |
| 351 | + |
310 | 352 | def scan_zip_bytes(data: IO[bytes], file_id) -> ScanResult:
|
311 | 353 | result = ScanResult([])
|
312 | 354 |
|
313 | 355 | with zipfile.ZipFile(data, "r") as zip:
|
314 | 356 | file_names = zip.namelist()
|
315 |
| - _log.debug("Files in archive %s: %s", file_id, file_names) |
| 357 | + _log.debug("Files in zip archive %s: %s", file_id, file_names) |
316 | 358 | for file_name in file_names:
|
317 | 359 | file_ext = os.path.splitext(file_name)[1]
|
318 | 360 | if file_ext in _pickle_file_extensions:
|
@@ -361,6 +403,8 @@ def scan_pytorch(data: IO[bytes], file_id) -> ScanResult:
|
361 | 403 | # new pytorch format
|
362 | 404 | if _is_zipfile(data):
|
363 | 405 | return scan_zip_bytes(data, file_id)
|
| 406 | + elif _is_7z_file(data): |
| 407 | + return scan_7z_bytes(data, file_id) |
364 | 408 | # old pytorch format
|
365 | 409 | else:
|
366 | 410 | scan_result = ScanResult([])
|
@@ -395,11 +439,12 @@ def scan_bytes(data: IO[bytes], file_id, file_ext: Optional[str] = None) -> Scan
|
395 | 439 | else:
|
396 | 440 | is_zip = zipfile.is_zipfile(data)
|
397 | 441 | data.seek(0)
|
398 |
| - return ( |
399 |
| - scan_zip_bytes(data, file_id) |
400 |
| - if is_zip |
401 |
| - else scan_pickle_bytes(data, file_id) |
402 |
| - ) |
| 442 | + if is_zip: |
| 443 | + return scan_zip_bytes(data, file_id) |
| 444 | + elif _is_7z_file(data): |
| 445 | + return scan_7z_bytes(data, file_id) |
| 446 | + else: |
| 447 | + return scan_pickle_bytes(data, file_id) |
403 | 448 |
|
404 | 449 |
|
405 | 450 | def scan_huggingface_model(repo_id):
|
|
0 commit comments