Commit

Merge pull request #1 from eastgenomics/dev
Initial work on basic function of tar finding script (#1)
jethror1 authored Nov 27, 2024
2 parents 632cf5d + 302c9b4 commit 1473bde
Showing 2 changed files with 324 additions and 0 deletions.
195 changes: 195 additions & 0 deletions main.py
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
# imports
import argparse
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import time

import dxpy as dx


def get_credentials(path: str) -> str:
"""reads DNAnexus token from file
Args:
path (str): path to a file with DNAnexus auth token.
Returns:
str: DNAnexus token stripped of newline characters
"""

with open(f"{path}", "r") as file:
auth_token = file.read().rstrip()

return auth_token


def dx_login(token: str):
"""Function to set authentication for DNAneuxs
Args:
token (str): DNAnexus token_
"""
try:
dx_security_context = {"auth_token_type": "Bearer", "auth_token": str(token)}

dx.set_security_context(dx_security_context)
print(dx.api.system_whoami())
except dx.exceptions.InvalidAuthentication as err:
raise dx.exceptions.InvalidAuthentication(
f"DNAnexus Authentication failed: {err}"
)


##find tar files
def find_files(project: str, older_than: int) -> list:
"""function to wrap dx api methods that can find
tar files older than a given date in unix epoch milliseconds
Args:
project (str): DNAnexus project id
older_than (int): unix epoch time in milliseconds
Returns:
        list: contains the metadata for each tar file found
"""
print(f"older than:{older_than}")
results = list(
dx.find_data_objects(
project=project,
name_mode="regexp",
name="^run.*.tar.gz$",
created_before=older_than,
describe={
"fields": {"name": True, "id": True, "project": True, "size": True}
},
)
)
    print(f"found {len(results)} tar files")
return results


##output tar file details
def tar_details(files: list) -> pd.DataFrame:
"""a method for extracting the needed information from the tar file meta data
Args:
files (list): list of tar file metadata
Returns:
list: list where each item contains the name,
file id and project id for a corisponding file in the input list
"""
name = []
file = []
project = []
size = []
    for x in files:
        name.append(x["describe"]["name"])
        file.append(x["id"])
        project.append(x["project"])
        size.append(x["describe"]["size"])
    data = pd.DataFrame({"name": name, "file": file, "project": project, "size": size})

    print(f"Total size of data: {sizeof_fmt(data['size'].sum())}")
return data


##delete tar files

##check date


## get date for deletion (6 months ago)
### TODO: need a better way of adjusting this (one possible approach is sketched below)
def get_time_limit() -> int:
    """a method to get a timestamp in unix epoch milliseconds
    Returns:
        int: unix epoch time in milliseconds
    """
    # dx uses unix epoch time in milliseconds, hence the * 1000
    now = datetime.now() - relativedelta(months=6)
    limit = int(time.mktime(now.timetuple()))

    return limit * 1000
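

# The following is a sketch, not part of this commit, of how the TODO above
# could be addressed: the retention window is taken from a parameter (here a
# hypothetical `months` argument) instead of being hard-coded to 6 months.
def get_time_limit_configurable(months: int = 6) -> int:
    """Return the unix epoch time in milliseconds as of `months` months ago
    Args:
        months (int): size of the retention window in months
    Returns:
        int: unix epoch time in milliseconds
    """
    # same approach as get_time_limit, but with the window as an argument
    cutoff = datetime.now() - relativedelta(months=months)
    return int(time.mktime(cutoff.timetuple())) * 1000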


# inputs
## arguments or read from config?


def parse_args() -> argparse.Namespace:
"""parse command line arguments
Returns:
namespace: input command line arguments
"""

parser = argparse.ArgumentParser()

parser.add_argument("--token-file", help="a file containing dx login token")

parser.add_argument("--project", help="DNANexus project id")

parser.add_argument(
"--output",
help="destination of output file containing DNANexus files to be deleted",
)

return parser.parse_args()


def sizeof_fmt(num) -> str:
"""
Function to turn bytes to human readable file size format.
Taken from https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size
Parameters
----------
num : int
total size in bytes
Returns
-------
str
file size in human-readable format
"""
for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
if abs(num) < 1024.0:
return f"{num:3.2f}{unit}B"
num /= 1024.0
return f"{num:.2f}YiB"


# get/check credentials
def main():

args = parse_args()

print(args.token_file)
auth_token = get_credentials(args.token_file)
project = args.project
output = args.output

dx_login(auth_token)

# get old tar files
timelimit = get_time_limit()
tars = find_files(project, timelimit)

details = tar_details(tars)

# record files for deletion
details.to_csv(output, header=False, index=False)


if __name__ == "__main__":
main()
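
A hypothetical invocation of the script is shown below; the token file, project id and output path are placeholder values, not taken from this commit:

python main.py --token-file dx_token.txt --project project-XXXXXXXXX --output tars_for_deletion.csv
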
129 changes: 129 additions & 0 deletions tests/tests.py
@@ -0,0 +1,129 @@
import time
import unittest
from unittest.mock import patch

from main import get_time_limit, find_files, tar_details


class TestGetTimeLimit(unittest.TestCase):

def test_time_is_int(self):
limit = get_time_limit()
assert isinstance(limit, int)

# test limit is in milliseconds?


class TestFindFiles(unittest.TestCase):
# test find files
## mock/patch dx.api.system_find_data_objects

@patch("main.dx.api.system_find_data_objects")
def test_filter_files_by_date(self, mock_find):

now = round(time.time()) * 1000

# reduced version of the dx api output
mock_find.return_value = {
"results": [
{
"project": "project-XXXXXXXXX",
"id": "file-AAAAAAAAAAA",
"describe": {
"id": "file-AAAAAAAAAAA",
"project": "project-XXXXXXXXX",
"class": "file",
"name": "run.A_RUN_NAME.lane.all_004.tar.gz",
"folder": "/fake_runfolder_01/runs",
"created": 1728913404000,
"modified": 1728913406925,
"createdBy": {"user": "user-jsims"},
"media": "application/gzip",
"archivalState": "live",
},
},
{
"project": "project-XXXXXXXXX",
"id": "file-BBBBBBBBBBB",
"describe": {
"id": "file-BBBBBBBBBBB",
"project": "project-XXXXXXXXX",
"class": "file",
"name": "run.A_RUN_NAME.lane.all_001.tar.gz",
"folder": "/fake_runfolder_01/runs",
f"created": {now},
f"modified": {now},
"createdBy": {"user": "user-jsims"},
"media": "application/gzip",
"archivalState": "live",
},
},
],
"next": {"project": "project-XXXXXXXXX", "id": "file-CCCCCCCCCCCC"},
}

        expected_results = [
{
"project": "project-XXXXXXXXX",
"id": "file-AAAAAAAAAAA",
"describe": {
"id": "file-AAAAAAAAAAA",
"project": "project-XXXXXXXXX",
"class": "file",
"name": "run.A_RUN_NAME.lane.all_004.tar.gz",
"folder": "/fake_runfolder_01/runs",
"created": 1728913404000,
"modified": 1728913406925,
"createdBy": {"user": "user-jsims"},
"media": "application/gzip",
"archivalState": "live",
},
}
]

files = find_files("fake-project", 1728913404001)
        self.assertEqual([files[0]], expected_results)

# test tar details
##test project
##test csv format
## independent test of object/tar file age?


class TestTarDetails(unittest.TestCase):
def test_csv_details_extraction(self):

found_tars = [
{
"project": "project-XXXXXXXXX",
"id": "file-AAAAAAAAAAA",
"describe": {
"id": "file-AAAAAAAAAAA",
"project": "project-XXXXXXXXX",
"class": "file",
"name": "run.A_RUN_NAME.lane.all_004.tar.gz",
"folder": "/fake_runfolder_01/runs",
"created": 1728913404000,
"modified": 1728913406925,
"createdBy": {"user": "user-jsims"},
"media": "application/gzip",
"archivalState": "live",
},
}
]

        expected_details = [
            "run.A_RUN_NAME.lane.all_004.tar.gz,file-AAAAAAAAAAA,project-XXXXXXXXX,1024"
        ]

        self.assertEqual(
            tar_details(found_tars).to_csv(header=False, index=False).splitlines(),
            expected_details,
        )

# test output of tar details
## test lines = n of tar files
## test csv format
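
# A sketch, not part of this commit, of the "test lines = n of tar files" TODO
# above: it assumes tar_details returns one row per input record and only reads
# the name, id, project and size fields (all fixture values here are made up).
class TestTarDetailsRowCount(unittest.TestCase):
    def test_one_row_per_tar_file(self):
        found_tars = [
            {
                "project": "project-XXXXXXXXX",
                "id": f"file-{i}",
                "describe": {
                    "name": f"run.FAKE_RUN.lane.all_00{i}.tar.gz",
                    "size": 1024 * i,
                },
            }
            for i in (1, 2, 3)
        ]

        # one dataframe row expected per tar file passed in
        self.assertEqual(len(tar_details(found_tars)), len(found_tars))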


if __name__ == "__main__":
unittest.main()
