Commit

Merge pull request #1 from eastgenomics/dev
Initial work on basic function of tar finding script (#1)
jethror1 authored Nov 27, 2024
2 parents 632cf5d + 302c9b4 commit 1473bde
Showing 2 changed files with 324 additions and 0 deletions.
195 changes: 195 additions & 0 deletions main.py
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
# imports
import argparse
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import time

import dxpy as dx


def get_credentials(path: str) -> str:
"""reads DNAnexus token from file
Args:
path (str): path to a file with DNAnexus auth token.
Returns:
str: DNAnexus token stripped of newline characters
"""

with open(f"{path}", "r") as file:
auth_token = file.read().rstrip()

return auth_token


def dx_login(token: str):
"""Function to set authentication for DNAneuxs
Args:
token (str): DNAnexus token_
"""
try:
dx_security_context = {"auth_token_type": "Bearer", "auth_token": str(token)}

dx.set_security_context(dx_security_context)
print(dx.api.system_whoami())
except dx.exceptions.InvalidAuthentication as err:
raise dx.exceptions.InvalidAuthentication(
f"DNAnexus Authentication failed: {err}"
)


##find tar files
def find_files(project: str, older_than: int) -> list:
"""function to wrap dx api methods that can find
tar files older than a given date in unix epoch milliseconds
Args:
project (str): DNAnexus project id
older_than (int): unix epoch time in milliseconds
Returns:
        list: contains the metadata for each tar file found
"""
print(f"older than:{older_than}")
results = list(
dx.find_data_objects(
project=project,
name_mode="regexp",
name="^run.*.tar.gz$",
created_before=older_than,
describe={
"fields": {"name": True, "id": True, "project": True, "size": True}
},
)
)
    print(f"found {len(results)} tar files")
return results


##output tar file details
def tar_details(files: list) -> pd.DataFrame:
"""a method for extracting the needed information from the tar file meta data
Args:
files (list): list of tar file metadata
Returns:
list: list where each item contains the name,
file id and project id for a corisponding file in the input list
"""
name = []
file = []
project = []
size = []
    for x in files:
        name.append(x["describe"]["name"])
        file.append(x["id"])
        project.append(x["project"])
        size.append(x["describe"]["size"])
    data = pd.DataFrame({"name": name, "file": file, "project": project, "size": size})

    print(f"Total size of data: {sizeof_fmt(data['size'].sum())}")
return data


##delete tar files

##check date


## get date for deletion (6 months ago)
### TODO: need a better way of adjusting this (one possible approach is sketched below)
def get_time_limit() -> int:
    """a method to get a timestamp in unix epoch milliseconds
    Returns:
        int: unix epoch time in milliseconds
    """
    # dx uses unix epoch time in milliseconds, hence the * 1000
    now = datetime.now() - relativedelta(months=6)
    limit = int(time.mktime(now.timetuple()))

    return limit * 1000
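

# The following is a sketch, not part of this commit, of how the TODO above
# could be addressed: the retention window is taken from a parameter (here a
# hypothetical `months` argument) instead of being hard-coded to 6 months.
def get_time_limit_configurable(months: int = 6) -> int:
    """Return the unix epoch time in milliseconds as of `months` months ago
    Args:
        months (int): size of the retention window in months
    Returns:
        int: unix epoch time in milliseconds
    """
    # same approach as get_time_limit, but with the window as an argument
    cutoff = datetime.now() - relativedelta(months=months)
    return int(time.mktime(cutoff.timetuple())) * 1000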


# inputs
## arguments or read from config?


def parse_args() -> argparse.Namespace:
"""parse command line arguments
Returns:
namespace: input command line arguments
"""

parser = argparse.ArgumentParser()

parser.add_argument("--token-file", help="a file containing dx login token")

parser.add_argument("--project", help="DNANexus project id")

parser.add_argument(
"--output",
help="destination of output file containing DNANexus files to be deleted",
)

return parser.parse_args()


def sizeof_fmt(num) -> str:
"""
Function to turn bytes to human readable file size format.
Taken from https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size
Parameters
----------
num : int
total size in bytes
Returns
-------
str
file size in human-readable format
"""
for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
if abs(num) < 1024.0:
return f"{num:3.2f}{unit}B"
num /= 1024.0
return f"{num:.2f}YiB"


# get/check credentials
def main():

args = parse_args()

print(args.token_file)
auth_token = get_credentials(args.token_file)
project = args.project
output = args.output

dx_login(auth_token)

# get old tar files
timelimit = get_time_limit()
tars = find_files(project, timelimit)

details = tar_details(tars)

# record files for deletion
details.to_csv(output, header=False, index=False)


if __name__ == "__main__":
main()
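
A hypothetical invocation of the script is shown below; the token file, project id and output path are placeholder values, not taken from this commit:

python main.py --token-file dx_token.txt --project project-XXXXXXXXX --output tars_for_deletion.csv
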
129 changes: 129 additions & 0 deletions tests/tests.py
@@ -0,0 +1,129 @@
import time
import unittest
from unittest.mock import patch

from main import get_time_limit, find_files, tar_details


class TestGetTimeLimit(unittest.TestCase):

def test_time_is_int(self):
limit = get_time_limit()
assert isinstance(limit, int)

# test limit is in milliseconds?


class TestFindFiles(unittest.TestCase):
# test find files
## mock/patch dx.api.system_find_data_objects

@patch("main.dx.api.system_find_data_objects")
def test_filter_files_by_date(self, mock_find):

now = round(time.time()) * 1000

# reduced version of the dx api output
mock_find.return_value = {
"results": [
{
"project": "project-XXXXXXXXX",
"id": "file-AAAAAAAAAAA",
"describe": {
"id": "file-AAAAAAAAAAA",
"project": "project-XXXXXXXXX",
"class": "file",
"name": "run.A_RUN_NAME.lane.all_004.tar.gz",
"folder": "/fake_runfolder_01/runs",
"created": 1728913404000,
"modified": 1728913406925,
"createdBy": {"user": "user-jsims"},
"media": "application/gzip",
"archivalState": "live",
},
},
{
"project": "project-XXXXXXXXX",
"id": "file-BBBBBBBBBBB",
"describe": {
"id": "file-BBBBBBBBBBB",
"project": "project-XXXXXXXXX",
"class": "file",
"name": "run.A_RUN_NAME.lane.all_001.tar.gz",
"folder": "/fake_runfolder_01/runs",
f"created": {now},
f"modified": {now},
"createdBy": {"user": "user-jsims"},
"media": "application/gzip",
"archivalState": "live",
},
},
],
"next": {"project": "project-XXXXXXXXX", "id": "file-CCCCCCCCCCCC"},
}

        expected_results = [
{
"project": "project-XXXXXXXXX",
"id": "file-AAAAAAAAAAA",
"describe": {
"id": "file-AAAAAAAAAAA",
"project": "project-XXXXXXXXX",
"class": "file",
"name": "run.A_RUN_NAME.lane.all_004.tar.gz",
"folder": "/fake_runfolder_01/runs",
"created": 1728913404000,
"modified": 1728913406925,
"createdBy": {"user": "user-jsims"},
"media": "application/gzip",
"archivalState": "live",
},
}
]

files = find_files("fake-project", 1728913404001)
        self.assertEqual([files[0]], expected_results)

# test tar details
##test project
##test csv format
## independent test of object/tar file age?


class TestTarDetails(unittest.TestCase):
def test_csv_details_extraction(self):

found_tars = [
{
"project": "project-XXXXXXXXX",
"id": "file-AAAAAAAAAAA",
"describe": {
"id": "file-AAAAAAAAAAA",
"project": "project-XXXXXXXXX",
"class": "file",
"name": "run.A_RUN_NAME.lane.all_004.tar.gz",
"folder": "/fake_runfolder_01/runs",
"created": 1728913404000,
"modified": 1728913406925,
"createdBy": {"user": "user-jsims"},
"media": "application/gzip",
"archivalState": "live",
},
}
]

        expected_details = [
            "run.A_RUN_NAME.lane.all_004.tar.gz,file-AAAAAAAAAAA,project-XXXXXXXXX,1024"
        ]

        self.assertEqual(
            tar_details(found_tars).to_csv(header=False, index=False).splitlines(),
            expected_details,
        )

# test output of tar details
## test lines = n of tar files
## test csv format
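
# A sketch, not part of this commit, of the "test lines = n of tar files" TODO
# above: it assumes tar_details returns one row per input record and only reads
# the name, id, project and size fields (all fixture values here are made up).
class TestTarDetailsRowCount(unittest.TestCase):
    def test_one_row_per_tar_file(self):
        found_tars = [
            {
                "project": "project-XXXXXXXXX",
                "id": f"file-{i}",
                "describe": {
                    "name": f"run.FAKE_RUN.lane.all_00{i}.tar.gz",
                    "size": 1024 * i,
                },
            }
            for i in (1, 2, 3)
        ]

        # one dataframe row expected per tar file passed in
        self.assertEqual(len(tar_details(found_tars)), len(found_tars))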


if __name__ == "__main__":
unittest.main()
