# --- main.py ---
#!/usr/bin/env python3
"""Find old DNAnexus run tar files and record them in a csv for deletion."""
import argparse
import time
from datetime import datetime

import pandas as pd
from dateutil.relativedelta import relativedelta

import dxpy as dx


def get_credentials(path: str) -> str:
    """Read a DNAnexus auth token from a file.

    Args:
        path (str): path to a file containing a DNAnexus auth token.

    Returns:
        str: DNAnexus token stripped of trailing whitespace/newlines
    """
    with open(path, "r") as token_file:
        return token_file.read().rstrip()


def dx_login(token: str):
    """Set the authentication context for DNAnexus.

    Args:
        token (str): DNAnexus auth token

    Raises:
        dx.exceptions.InvalidAuthentication: if the token is rejected
    """
    try:
        dx.set_security_context(
            {"auth_token_type": "Bearer", "auth_token": str(token)}
        )
        print(dx.api.system_whoami())
    except dx.exceptions.InvalidAuthentication as err:
        # chain the original exception so the root cause is preserved
        raise dx.exceptions.InvalidAuthentication(
            f"DNAnexus Authentication failed: {err}"
        ) from err


def find_files(project: str, older_than: int) -> list:
    """Find run tar files in a project created before a given time.

    Args:
        project (str): DNAnexus project id
        older_than (int): unix epoch time in milliseconds

    Returns:
        list: metadata dict for each tar file found
    """
    print(f"older than:{older_than}")
    # dots are escaped so ".tar.gz" is matched literally, not "any char"
    results = list(
        dx.find_data_objects(
            project=project,
            name_mode="regexp",
            name=r"^run.*\.tar\.gz$",
            created_before=older_than,
            describe={
                "fields": {"name": True, "id": True, "project": True, "size": True}
            },
        )
    )
    print(len(results))
    return results


def tar_details(files: list) -> pd.DataFrame:
    """Extract the needed fields from tar file metadata.

    Args:
        files (list): tar file metadata dicts as returned by find_files()

    Returns:
        pd.DataFrame: one row per file with columns name, file (file id),
            project (project id) and size (bytes)
    """
    data = pd.DataFrame(
        {
            "name": [f["describe"]["name"] for f in files],
            "file": [f["id"] for f in files],
            "project": [f["project"] for f in files],
            "size": [f["describe"]["size"] for f in files],
        }
    )
    # single quotes for the column key: reusing double quotes inside this
    # f-string is a SyntaxError on Python < 3.12
    print(f"Total size of data: {sizeof_fmt(data['size'].sum())}")
    return data


# TODO: delete the tar files once the recorded deletion list is reviewed


# TODO: need a better way of adjusting the retention period
def get_time_limit() -> int:
    """Get the cut-off timestamp (6 months ago) in unix milliseconds.

    Returns:
        int: unix epoch time in milliseconds
    """
    # dx uses unix epoch in milliseconds
    cutoff = datetime.now() - relativedelta(months=6)
    return int(cutoff.timestamp() * 1000)


def parse_args() -> argparse.Namespace:
    """Parse command line arguments.

    Returns:
        argparse.Namespace: parsed command line arguments
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--token-file", help="a file containing dx login token")

    parser.add_argument("--project", help="DNANexus project id")

    parser.add_argument(
        "--output",
        help="destination of output file containing DNANexus files to be deleted",
    )

    return parser.parse_args()


def sizeof_fmt(num) -> str:
    """
    Turn a byte count into a human-readable file size string.

    Adapted from https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size

    Parameters
    ----------
    num : int
        total size in bytes

    Returns
    -------
    str
        file size in human-readable format
    """
    for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
        if abs(num) < 1024.0:
            return f"{num:3.2f}{unit}B"
        num /= 1024.0
    # "YB" for consistency with the unit names above ("KB", not "KiB")
    return f"{num:.2f}YB"


def main():
    """Entry point: log in, find old tar files, write them to a csv."""
    args = parse_args()

    print(args.token_file)
    auth_token = get_credentials(args.token_file)

    dx_login(auth_token)

    # get tar files older than the cut-off
    timelimit = get_time_limit()
    tars = find_files(args.project, timelimit)

    details = tar_details(tars)

    # record files for deletion
    details.to_csv(args.output, header=False, index=False)


if __name__ == "__main__":
    main()


# --- tests/tests.py ---
import unittest
from unittest import mock
from unittest.mock import patch
import sys
import time

from main import get_time_limit, find_files, tar_details


class TestGetTimeLimit(unittest.TestCase):

    def test_time_is_int(self):
        limit = get_time_limit()
        assert isinstance(limit, int)

    # TODO: test that the limit is in milliseconds?
class TestFindFiles(unittest.TestCase):
    """Tests for find_files(): files newer than the cut-off are excluded."""
    # mock/patch the underlying dx api call

    @patch("main.dx.api.system_find_data_objects")
    def test_filter_files_by_date(self, mock_find):
        """Only the file created before the cut-off is returned."""
        now = round(time.time()) * 1000

        # reduced version of the dx api output: one old file, one new file.
        # NOTE: "created"/"modified" must be plain ints — the previous
        # fixture used f-string keys with {now} values, which are SET
        # literals, not integers.
        mock_find.return_value = {
            "results": [
                {
                    "project": "project-XXXXXXXXX",
                    "id": "file-AAAAAAAAAAA",
                    "describe": {
                        "id": "file-AAAAAAAAAAA",
                        "project": "project-XXXXXXXXX",
                        "class": "file",
                        "name": "run.A_RUN_NAME.lane.all_004.tar.gz",
                        "folder": "/fake_runfolder_01/runs",
                        "created": 1728913404000,
                        "modified": 1728913406925,
                        "createdBy": {"user": "user-jsims"},
                        "media": "application/gzip",
                        "archivalState": "live",
                    },
                },
                {
                    "project": "project-XXXXXXXXX",
                    "id": "file-BBBBBBBBBBB",
                    "describe": {
                        "id": "file-BBBBBBBBBBB",
                        "project": "project-XXXXXXXXX",
                        "class": "file",
                        "name": "run.A_RUN_NAME.lane.all_001.tar.gz",
                        "folder": "/fake_runfolder_01/runs",
                        "created": now,
                        "modified": now,
                        "createdBy": {"user": "user-jsims"},
                        "media": "application/gzip",
                        "archivalState": "live",
                    },
                },
            ],
            "next": {"project": "project-XXXXXXXXX", "id": "file-CCCCCCCCCCCC"},
        }

        expected_results = [
            {
                "project": "project-XXXXXXXXX",
                "id": "file-AAAAAAAAAAA",
                "describe": {
                    "id": "file-AAAAAAAAAAA",
                    "project": "project-XXXXXXXXX",
                    "class": "file",
                    "name": "run.A_RUN_NAME.lane.all_004.tar.gz",
                    "folder": "/fake_runfolder_01/runs",
                    "created": 1728913404000,
                    "modified": 1728913406925,
                    "createdBy": {"user": "user-jsims"},
                    "media": "application/gzip",
                    "archivalState": "live",
                },
            }
        ]

        files = find_files("fake-project", 1728913404001)
        self.assertEqual([files[0]], expected_results)

    # test tar details
    ## test project
    ## test csv format
    ## independent test of object/tar file age?
class TestTarDetails(unittest.TestCase):
    """Tests for tar_details()."""

    def test_csv_details_extraction(self):
        """name, file id, project id and size are extracted into a DataFrame.

        tar_details() returns a pandas DataFrame (not a list of csv
        strings), so the expected value is a DataFrame and the comparison
        uses pandas' own frame equality helper. The fixture must also carry
        a "size" field — tar_details() reads describe["size"].
        """
        import pandas as pd  # local import keeps this test self-contained

        found_tars = [
            {
                "project": "project-XXXXXXXXX",
                "id": "file-AAAAAAAAAAA",
                "describe": {
                    "id": "file-AAAAAAAAAAA",
                    "project": "project-XXXXXXXXX",
                    "class": "file",
                    "name": "run.A_RUN_NAME.lane.all_004.tar.gz",
                    "folder": "/fake_runfolder_01/runs",
                    "created": 1728913404000,
                    "modified": 1728913406925,
                    "createdBy": {"user": "user-jsims"},
                    "media": "application/gzip",
                    "archivalState": "live",
                    "size": 4096,
                },
            }
        ]

        expected_details = pd.DataFrame(
            {
                "name": ["run.A_RUN_NAME.lane.all_004.tar.gz"],
                "file": ["file-AAAAAAAAAAA"],
                "project": ["project-XXXXXXXXX"],
                "size": [4096],
            }
        )

        pd.testing.assert_frame_equal(tar_details(found_tars), expected_details)

    # test output of tar details
    ## test lines = n of tar files
    ## test csv format


if __name__ == "__main__":
    unittest.main()