Skip to content

Commit

Permalink
Merge branch 'main' into scanpy-1.10
Browse files Browse the repository at this point in the history
  • Loading branch information
bgruening authored Aug 25, 2024
2 parents c8952a5 + 7f32f58 commit 8002c40
Show file tree
Hide file tree
Showing 383 changed files with 121,978 additions and 24,702 deletions.
Original file line number Diff line number Diff line change
@@ -1,25 +1,63 @@
#!/usr/bin/env python

import argparse
import gzip
import json
import os
import shutil
import sys
import tarfile
from datetime import datetime
from urllib.parse import urlparse
from urllib.request import Request
from urllib.request import urlopen
from urllib.request import HTTPError, Request, urlopen

# Download locations for every supported GTDB release, keyed by the release
# number as a string. The URLs are hard-coded per release (rather than typed
# in by the admin) because that leaves less room for error.
#
# Each release entry provides three URLs:
#   "full"     - the complete GTDB-Tk reference data tarball
#   "meta_ar"  - the archaeal metadata table
#   "meta_bac" - the bacterial metadata table
#
# The metadata files are *.tar.gz archives for releases 202 and 207 and
# plain gzip-compressed *.tsv.gz files for releases 214 and 220; the archaeal
# file is named ar122_* for release 202 and ar53_* afterwards.
# NOTE(review): "auxillary_files" is spelled this way in all four URLs and is
# presumably the upstream directory name - verify before "correcting" it.
urls = {
"202": {
"full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz",
"meta_ar": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_metadata_r202.tar.gz",
"meta_bac": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_metadata_r202.tar.gz",
},
"207": {
"full": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/auxillary_files/gtdbtk_r207_data.tar.gz",
"meta_ar": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_metadata_r207.tar.gz",
"meta_bac": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_metadata_r207.tar.gz",
},
"214": {
"full": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/auxillary_files/gtdbtk_r214_data.tar.gz",
"meta_ar": "https://data.gtdb.ecogenomic.org/releases/release214/214.1/ar53_metadata_r214.tsv.gz",
"meta_bac": "https://data.gtdb.ecogenomic.org/releases/release214/214.1/bac120_metadata_r214.tsv.gz",
},
"220": {
"full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz",
"meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz",
"meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz",
},
}

def url_download(url, target_directory):

def is_urlfile(url):
    """Return whether *url* can be opened and answers with a status below 400.

    Args:
        url: The URL to probe.

    Returns:
        True when the URL opens and its status code is below 400; False when
        opening it fails with an HTTP error or any other URL-level error
        (unreachable host, refused connection, missing local file, ...).
    """
    # URLError is the base class of HTTPError, so catching it covers both the
    # "server answered with an error status" and the "could not connect at
    # all" cases. Imported locally so this function does not depend on the
    # module's import block.
    from urllib.error import URLError

    try:
        # The context manager closes the connection as soon as the status has
        # been read, so probing many URLs does not leak open sockets.
        with urlopen(url) as response:
            return response.getcode() < 400
    except URLError:
        return False


def url_download(url, target_directory, meta):

# download the url
url_parts = urlparse(url)
tarball = os.path.abspath(os.path.join(target_directory, os.path.basename(url_parts.path)))
tarball = os.path.abspath(
os.path.join(target_directory, os.path.basename(url_parts.path))
)
src = None
dst = None
try:
req = Request(url)
src = urlopen(req)
with open(tarball, 'wb') as dst:
with open(tarball, "wb") as dst:
while True:
chunk = src.read(2**10)
if chunk:
Expand All @@ -31,54 +69,143 @@ def url_download(url, target_directory):
finally:
if src is not None:
src.close()
if tarfile.is_tarfile(tarball):
fh = tarfile.open(tarball, 'r:*')

# extract the metadata
if meta:
# extract the content of *.tar.gz into the target dir
if tarfile.is_tarfile(tarball):
fh = tarfile.open(tarball, "r:*")
fh.extractall(target_directory)
fh.close()
os.remove(tarball)
return target_directory # return path to output folder
# extract the content of *.gz into the target dir
elif ".gz" in tarball:
with gzip.open(tarball, "rb") as f_in:
unzipped_file = tarball.strip(".gz")
with open(unzipped_file, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(tarball)
folder_of_unzipped_file = os.path.dirname(unzipped_file)
return folder_of_unzipped_file
else:
sys.exit(
"No correct input format for metadata file, must be .tar.gz or .gz"
)
else:
return tarball
fh.extractall(target_directory)
fh.close()
os.remove(tarball)
# The tarball extraction will create a directory named
# something like release202 in the target_directory, so
# we need to move the items in that directory to the
# target directory.
subdir = next(os.walk(target_directory))[1][0]
subdir_path = os.path.join(target_directory, subdir)
items = os.listdir(subdir_path)
for item in items:
item_path = os.path.join(subdir_path, item)
shutil.move(item_path, target_directory)
os.rmdir(subdir_path)
return target_directory


def download(database_id, database_name, url, out_file):
# handle the DB
# extract the content of the folder in the tar.gz into the target dir
if tarfile.is_tarfile(tarball):
fh = tarfile.open(tarball, "r:*")
fh.extractall(target_directory)
fh.close()
os.remove(tarball)
else:
# handle the test case for the DB
return tarball

fh.extractall(target_directory)
fh.close()
os.remove(tarball)
# The tarball extraction will create a directory named
# something like release202 in the target_directory, so
# we need to move the items in that directory to the
# target directory.
subdir = next(os.walk(target_directory))[1][0]
subdir_path = os.path.join(target_directory, subdir)
items = os.listdir(subdir_path)
for item in items:
item_path = os.path.join(subdir_path, item)
shutil.move(item_path, target_directory)
os.rmdir(subdir_path)
return target_directory


def download(database_name, release, meta, test, out_file):
    """Download a GTDB-Tk database (or its metadata) and write the data manager JSON.

    Reads the JSON parameters from *out_file*, downloads the files for the
    requested release into the ``extra_files_path`` of the first output
    dataset, and then overwrites *out_file* with the data table entry.

    Args:
        database_name: Display name stored in the data table entry.
        release: GTDB release key; must be one of the keys of ``urls``.
        meta: When true, download the archaeal and bacterial metadata tables
            and register the entry in ``gtdbtk_database_metadata_versioned``;
            otherwise download the full database and register it in
            ``gtdbtk_database_versioned``.
        test: When true, replace the "full" URL of the release with a small
            VERSION.txt file and verify that every configured URL is
            reachable before downloading.
        out_file: Path of the JSON file that is read for parameters and then
            rewritten with the data manager result.

    Raises:
        SystemExit: If *release* is unknown or, in test mode, a configured
            URL cannot be reached.
    """
    # Imported locally so this function does not depend on the module's
    # import block providing ``timezone``.
    from datetime import timezone

    if release not in urls:
        sys.exit(
            f"Unknown GTDB release '{release}', expected one of: "
            + ", ".join(sorted(urls))
        )

    with open(out_file) as fh:
        params = json.load(fh)

    target_directory = params["output_data"][0]["extra_files_path"]
    # exist_ok: do not fail when the directory has already been created.
    os.makedirs(target_directory, exist_ok=True)

    if test:
        # switch the DB to use the test case
        urls[release][
            "full"
        ] = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt"

        # make use of the test to check if all urls exist; an explicit check
        # is used instead of ``assert`` so it still runs under ``python -O``.
        for _version, items in urls.items():
            for url in items.values():
                if not is_urlfile(url):
                    sys.exit(f"URL is not reachable: {url}")

    if meta:
        # download both taxonomy metadata tables
        for key in ("meta_ar", "meta_bac"):
            file_path = url_download(urls[release][key], target_directory, meta)
    else:
        # download the full DB
        file_path = url_download(urls[release]["full"], target_directory, meta)

    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # formatted UTC date is the same.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    data_manager_entry = {
        "value": f"{database_name}_release_{release}_downloaded_{today}",
        "name": database_name,
        "path": file_path,
        "version": release,
    }

    # store metadata downloads in their dedicated data table
    table_name = (
        "gtdbtk_database_metadata_versioned" if meta else "gtdbtk_database_versioned"
    )
    data_manager_json = {"data_tables": {table_name: data_manager_entry}}

    with open(out_file, "w") as fh:
        json.dump(data_manager_json, fh, sort_keys=True)


# Command line interface of the data manager script. Every option is
# registered exactly once (argparse rejects a repeated option string) and the
# call below matches the signature
# download(database_name, release, meta, test, out_file).
parser = argparse.ArgumentParser()

parser.add_argument(
    "--database_name", dest="database_name", help="GTDB-Tk database display name"
)

# Accepted on the command line but not passed on to download().
parser.add_argument("--version", dest="version", help="DB version")

parser.add_argument(
    "--release",
    dest="release",
    required=True,
    help="Release of the GTDB-Tk database version",
)
parser.add_argument(
    "--out_file", dest="out_file", required=True, help="JSON output file"
)
parser.add_argument(
    "--meta",
    dest="meta",
    action="store_true",
    help="Store meta data flag",
)

parser.add_argument(
    "--test",
    dest="test",
    action="store_true",
    help="Run test",
)

args = parser.parse_args()

download(
    args.database_name,
    args.release,
    args.meta,
    args.test,
    args.out_file,
)
Original file line number Diff line number Diff line change
Expand Up @@ -11,41 +11,71 @@
<command>
<![CDATA[
python '$__tool_directory__/gtdbtk_database_installer.py'
--database_id '$database_id'
--database_name '$database_name'
--url '$url'
--release '$release'
--out_file '$out_file'
$meta
$test
]]>
</command>
<inputs>
<param name="database_name" type="text" value="" label="Database name or description" help="This value will be displayed in the GTDB-Tk Database select list"/>
<param name="database_id" type="text" value="" label="Database id" help="This value must be unique with no whitespace allowed - use underscores"/>
<param
name="url"
type="text"
value="https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz"
label="URL for GTDB release"
help="This should point to a GTDB release tarball. A table of available databases and their version compatability can be found at https://ecogenomics.github.io/GTDBTk/installing/index.html#gtdb-tk-reference-data."
/>
<param name="meta" type="boolean" truevalue="--meta" falsevalue="" checked="false" label="Only store GTDBTK metadata in a dedicated data table. " />
<param name="test" type="hidden" value="" checked="false" label="Run a dry test run !" />
<param name="release" type="select" multiple="false" label="GTDB Release">
<option value="202">202</option>
<option value="207">207</option>
<option value="214">214</option>
<option value="220">220</option>
</param>
</inputs>
<outputs>
<data name="out_file" format="data_manager_json"/>
</outputs>
<tests>
<test>
<!-- TODO -->
<!-- Not actually installing a huge GTDB-Tk database -->
<param name="database_id" value="release202"/>
<!-- but it will check if all urls exist -->
<param name="release" value="202"/>
<param name="database_name" value="GTDB-Tk database release 202"/>
<param name="url" value="https://data.gtdb.ecogenomic.org/releases/release202/202.0/VERSION"/>
<param name="test" value="--test"/>
<output name="out_file">
<assert_contents>
<has_text text="GTDB-Tk database release 202"/>
<has_text text="release202"/>
<has_text text="release_202"/>
</assert_contents>
</output>
</test>
<test>
<!-- Test meta data download with tsv.gz-->
<param name="release" value="220"/>
<param name="database_name" value="GTDB-Tk database release 220 metadata"/>
<param name="meta" value="true"/>
<output name="out_file">
<assert_contents>
<has_text text="GTDB-Tk database release 220 metadata"/>
<has_text text="release_220"/>
</assert_contents>
</output>
</test>
<test>
<!-- Test meta data download with tar.gz -->
<param name="release" value="207"/>
<param name="database_name" value="GTDB-Tk database release 207 metadata"/>
<param name="meta" value="true"/>
<output name="out_file">
<assert_contents>
<has_text text="GTDB-Tk database release 207 metadata"/>
<has_text text="release_207"/>
</assert_contents>
</output>
</test>
</tests>
<help>
This data manager downloads the database required by GTDB-Tk tools such as
`gtdbtk classify_wf`. The meta option allows downloading only the metadata for the
corresponding database, which is used by tools like `gtdb_to_taxdump`.
</help>
<citations>
<citation type="doi">doi.org/10.1038/s41587-020-0501-8</citation>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,29 @@
<data_managers>
<data_manager tool_file="data_manager/gtdbtk_database_installer.xml" id="gtdbtk_database_installer">
<data_table name="gtdbtk_database">
<data_table name="gtdbtk_database_versioned">
<output>
<column name="value"/>
<column name="name"/>
<column name="version"/>
<column name="path" output_ref="out_file">
<move type="directory" relativize_symlinks="True">
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">gtdbtk_database/${value}</target>
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">gtdbtk_database_versioned/${value}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/gtdbtk_database/${value}</value_translation>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/gtdbtk_database_versioned/${value}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
</data_table>
<data_table name="gtdbtk_database_metadata_versioned">
<output>
<column name="value"/>
<column name="name"/>
<column name="version"/>
<column name="path" output_ref="out_file">
<move type="directory" relativize_symlinks="True">
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">gtdbtk_database_metadata_versioned/${value}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/gtdbtk_database_metadata_versioned/${value}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# This is a sample file distributed with Galaxy that enables tools
# to use a directory of GTDB-Tk databases. The gtdbtk_databases.loc
# file has this format (longer white space characters are TAB characters):
#
# <unique_build_id> <display_name> <version> <directory_path>
Loading

0 comments on commit 8002c40

Please sign in to comment.