Skip to content

Commit 25d753f

Browse files
authored
feat: add support for 7z files (#35)
* feat: add support for 7z files * feat: update actions versions * refactor: remove unused imports * feat(ci): bump actions versions * fix: PR comments * fix(ci): install picklescan with 7z dependency in tests
1 parent 78debf9 commit 25d753f

File tree

8 files changed

+93
-24
lines changed

8 files changed

+93
-24
lines changed

.github/workflows/codeql-analysis.yml

+6-6
Original file line numberDiff line numberDiff line change
@@ -27,25 +27,25 @@ jobs:
2727

2828
steps:
2929
- name: Checkout repository
30-
uses: actions/checkout@v3
30+
uses: actions/checkout@v4
3131

3232
# Initializes the CodeQL tools for scanning.
3333
- name: Initialize CodeQL
34-
uses: github/codeql-action/init@v2
34+
uses: github/codeql-action/init@v3
3535
with:
3636
languages: ${{ matrix.language }}
3737
# If you wish to specify custom queries, you can do so here or in a config file.
3838
# By default, queries listed here will override any specified in a config file.
3939
# Prefix the list here with "+" to use these queries and those in the config file.
40-
40+
4141
# Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
4242
# queries: security-extended,security-and-quality
4343

44-
44+
4545
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
4646
# If this step fails, then you should remove it and run the build manually (see below)
4747
- name: Autobuild
48-
uses: github/codeql-action/autobuild@v2
48+
uses: github/codeql-action/autobuild@v3
4949

5050
# ℹ️ Command-line programs to run using the OS shell.
5151
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
@@ -58,4 +58,4 @@ jobs:
5858
# ./location_of_script_within_repo/buildscript.sh
5959

6060
- name: Perform CodeQL Analysis
61-
uses: github/codeql-action/analyze@v2
61+
uses: github/codeql-action/analyze@v3

.github/workflows/publish.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ jobs:
1010
publish:
1111
runs-on: ubuntu-latest
1212
steps:
13-
- uses: actions/checkout@v3
13+
- uses: actions/checkout@v4
1414
- name: Set up Python
15-
uses: actions/setup-python@v4
15+
uses: actions/setup-python@v5
1616
with:
1717
python-version: "3.9"
1818
- name: Install dependencies

.github/workflows/test.yml

+4-4
Original file line numberDiff line numberDiff line change
@@ -12,24 +12,24 @@ jobs:
1212
test:
1313
runs-on: ubuntu-latest
1414
steps:
15-
- uses: actions/checkout@v3
15+
- uses: actions/checkout@v4
1616
- name: Set up Python
17-
uses: actions/setup-python@v4
17+
uses: actions/setup-python@v5
1818
with:
1919
python-version: "3.9"
2020
- name: Install packages
2121
run: |
2222
python -m pip install --upgrade pip
2323
python -m pip install -r requirements.txt
24-
python -m pip install -e .
24+
python -m pip install -e '.[7z]'
2525
- name: Check code format
2626
run: black --check src tests
2727
- name: Lint with flake8
2828
run: python -m flake8 . --count --show-source --statistics
2929
- name: Test with pytest
3030
run: python -m pytest tests --cov=picklescan --doctest-modules --junitxml=junit/test-results.xml --cov-report=xml --cov-report=html
3131
- name: Archive test results
32-
uses: actions/upload-artifact@v3
32+
uses: actions/upload-artifact@v4
3333
with:
3434
name: test
3535
path: |

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ requests==2.31.0
66
aiohttp==3.9.1
77
black==22.8.0
88
numpy>1.24.0
9+
py7zr==0.22.0

setup.cfg

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = picklescan
3-
version = 0.0.19
3+
version = 0.0.20
44
author = Matthieu Maitre
55
author_email = mmaitre314@users.noreply.github.com
66
description = Security scanner detecting Python Pickle files performing suspicious actions
@@ -21,6 +21,9 @@ packages = find:
2121
python_requires = >=3.9
2222
install_requires =
2323

24+
[options.extras_require]
25+
7z=py7zr
26+
2427
[options.packages.find]
2528
where = src
2629

src/picklescan/scanner.py

+52-7
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import os
88
import pickletools
99
from tarfile import TarError
10+
from tempfile import TemporaryDirectory
1011
from typing import IO, List, Optional, Set, Tuple
1112
import urllib.parse
1213
import zipfile
@@ -151,7 +152,23 @@ def __str__(self) -> str:
151152
_numpy_file_extensions = {".npy"} # Note: .npz is handled as zip files
152153
_pytorch_file_extensions = {".bin", ".pt", ".pth", ".ckpt"}
153154
_pickle_file_extensions = {".pkl", ".pickle", ".joblib", ".dat", ".data"}
154-
_zip_file_extensions = {".zip", ".npz"}
155+
_zip_file_extensions = {".zip", ".npz", ".7z"}
156+
157+
158+
def _is_7z_file(f: IO[bytes]) -> bool:
159+
read_bytes = []
160+
start = f.tell()
161+
162+
byte = f.read(1)
163+
while byte != b"":
164+
read_bytes.append(byte)
165+
if len(read_bytes) == 6:
166+
break
167+
byte = f.read(1)
168+
f.seek(start)
169+
170+
local_header_magic_number = [b"7", b"z", b"\xbc", b"\xaf", b"\x27", b"\x1c"]
171+
return read_bytes == local_header_magic_number
155172

156173

157174
def _http_get(url) -> bytes:
@@ -307,12 +324,37 @@ def scan_pickle_bytes(data: IO[bytes], file_id, multiple_pickles=True) -> ScanRe
307324
return _build_scan_result_from_raw_globals(raw_globals, file_id)
308325

309326

327+
# XXX: it appears there is no way to get the byte stream for a given file
# within the 7z archive, thus forcing us to extract to disk before scanning.
def scan_7z_bytes(data: IO[bytes], file_id) -> ScanResult:
    """Scan the pickle files contained in a 7z archive.

    Members whose names end with one of the pickle file extensions are
    extracted to a temporary directory, scanned with ``scan_file_path`` and
    merged into a single result. The temporary directory is removed before
    returning.

    Args:
        data: Binary stream holding the 7z archive.
        file_id: Identifier of the archive, used in log messages only.

    Returns:
        The merged scan result of every scanned member (empty when the
        archive holds no pickle member).

    Raises:
        Exception: If the optional ``py7zr`` dependency is not installed.
    """
    try:
        import py7zr
    except ImportError as err:
        raise Exception(
            "py7zr is required to scan 7z archives, install picklescan using: 'pip install picklescan[7z]'"
        ) from err
    result = ScanResult([])
    # str.endswith needs a tuple; build it once rather than once per member.
    pickle_suffixes = tuple(_pickle_file_extensions)

    with py7zr.SevenZipFile(data, mode="r") as archive:
        targets = [name for name in archive.getnames() if name.endswith(pickle_suffixes)]
        _log.debug("Files in 7z archive %s: %s", file_id, targets)
        if not targets:
            # Nothing to scan: skip creating a temp dir and extracting.
            return result

        with TemporaryDirectory() as tmpdir:
            archive.extract(path=tmpdir, targets=targets)
            extraction_root = os.path.realpath(tmpdir)
            for file_name in targets:
                file_path = os.path.join(tmpdir, file_name)
                _log.debug("Scanning file %s in 7z archive %s", file_name, file_id)
                # The archive is untrusted input: a crafted member name or
                # symlink must not make us scan a file outside the temp dir.
                if not os.path.realpath(file_path).startswith(extraction_root + os.sep):
                    _log.warning(
                        "Skipping file %s in 7z archive %s: path escapes the extraction directory",
                        file_name,
                        file_id,
                    )
                    continue
                if os.path.isfile(file_path):
                    result.merge(scan_file_path(file_path))

    return result
350+
351+
310352
def scan_zip_bytes(data: IO[bytes], file_id) -> ScanResult:
311353
result = ScanResult([])
312354

313355
with zipfile.ZipFile(data, "r") as zip:
314356
file_names = zip.namelist()
315-
_log.debug("Files in archive %s: %s", file_id, file_names)
357+
_log.debug("Files in zip archive %s: %s", file_id, file_names)
316358
for file_name in file_names:
317359
file_ext = os.path.splitext(file_name)[1]
318360
if file_ext in _pickle_file_extensions:
@@ -361,6 +403,8 @@ def scan_pytorch(data: IO[bytes], file_id) -> ScanResult:
361403
# new pytorch format
362404
if _is_zipfile(data):
363405
return scan_zip_bytes(data, file_id)
406+
elif _is_7z_file(data):
407+
return scan_7z_bytes(data, file_id)
364408
# old pytorch format
365409
else:
366410
scan_result = ScanResult([])
@@ -395,11 +439,12 @@ def scan_bytes(data: IO[bytes], file_id, file_ext: Optional[str] = None) -> Scan
395439
else:
396440
is_zip = zipfile.is_zipfile(data)
397441
data.seek(0)
398-
return (
399-
scan_zip_bytes(data, file_id)
400-
if is_zip
401-
else scan_pickle_bytes(data, file_id)
402-
)
442+
if is_zip:
443+
return scan_zip_bytes(data, file_id)
444+
elif _is_7z_file(data):
445+
return scan_7z_bytes(data, file_id)
446+
else:
447+
return scan_pickle_bytes(data, file_id)
403448

404449

405450
def scan_huggingface_model(repo_id):

tests/data/malicious1.7z

198 Bytes
Binary file not shown.

tests/test_scanner.py

+24-4
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
import importlib
55
import io
66
import os
7-
import runpy
7+
import pathlib
88
import pickle
9+
import py7zr
910
import pytest
1011
import requests
12+
import runpy
1113
import socket
1214
import subprocess
1315
import sys
@@ -177,6 +179,18 @@ def initialize_data_file(path, data):
177179
file.write(data)
178180

179181

182+
def initialize_7z_file(archive_path, file_name):
    """Create the malicious 7z test archive unless it already exists.

    A ``Malicious1`` pickle is written to a scratch file under the test data
    directory, added to the archive under ``file_name``, and the scratch file
    is then removed.

    Args:
        archive_path: Path of the 7z archive to create.
        file_name: Name the pickle gets inside the archive.
    """
    # Nothing to do when the fixture is already there; in particular do not
    # write (and then delete) the scratch pickle for no reason.
    if os.path.exists(archive_path):
        return

    scratch_path = pathlib.Path(f"{_root_path}/data/malicious1.pkl")
    try:
        with open(scratch_path, "wb") as f:
            pickle.dump(Malicious1(), f, protocol=4)

        with py7zr.SevenZipFile(archive_path, "w") as archive:
            archive.write(str(scratch_path), file_name)
    finally:
        # Always clean up the scratch pickle, even if archiving failed, so it
        # is not left behind in the data directory.
        scratch_path.unlink(missing_ok=True)
192+
193+
180194
def initialize_zip_file(path, file_name, data):
181195
if not os.path.exists(path):
182196
with zipfile.ZipFile(path, "w") as zip:
@@ -399,6 +413,11 @@ def initialize_pickle_files():
399413
initialize_pickle_file(f"{_root_path}/data/malicious15a.pkl", Malicious15(), 2)
400414
initialize_pickle_file(f"{_root_path}/data/malicious15b.pkl", Malicious15(), 4)
401415

416+
initialize_7z_file(
417+
f"{_root_path}/data/malicious1.7z",
418+
"data.pkl",
419+
)
420+
402421
initialize_zip_file(
403422
f"{_root_path}/data/malicious1.zip",
404423
"data.pkl",
@@ -732,10 +751,11 @@ def test_scan_directory_path():
732751
Global("bdb", "Bdb", SafetyLevel.Dangerous),
733752
Global("bdb", "Bdb.run", SafetyLevel.Dangerous),
734753
Global("builtins", "exec", SafetyLevel.Dangerous),
754+
Global("builtins", "eval", SafetyLevel.Dangerous),
735755
],
736-
scanned_files=31,
737-
issues_count=31,
738-
infected_files=26,
756+
scanned_files=32,
757+
issues_count=32,
758+
infected_files=27,
739759
scan_err=True,
740760
)
741761
compare_scan_results(scan_directory_path(f"{_root_path}/data/"), sr)

0 commit comments

Comments
 (0)