From 8ce26e64eaa391b15a32df3c97a0a675bc34510f Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 7 Jan 2020 15:53:16 -0600 Subject: [PATCH 001/400] initial kofam.py with imports based on pfam.py --- anvio/kofam.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 anvio/kofam.py diff --git a/anvio/kofam.py b/anvio/kofam.py new file mode 100644 index 0000000000..a2b61f879f --- /dev/null +++ b/anvio/kofam.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +# -*- coding: utf-8 +""" + This file contains KofamSetup and Kofam classes. + +""" + +import os +import gzip +import shutil +import requests + +import anvio +import anvio.dbops as dbops +import anvio.utils as utils +import anvio.terminal as terminal +import anvio.filesnpaths as filesnpaths + +__author__ = "Developers of anvi'o (see AUTHORS.txt)" +__copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" +__license__ = "GPL 3.0" +__version__ = anvio.__version__ +__maintainer__ = "Özcan Esen" +__email__ = "ozcanesen@gmail.com" + +run = terminal.Run() +progress = terminal.Progress() +pp = terminal.pretty_print From bc0fd3986e0ade17c4d00c64f32043190bbdd7d5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 7 Jan 2020 16:02:41 -0600 Subject: [PATCH 002/400] kofam setup script backbone based on anvi-setup-pfams --- bin/anvi-setup-kegg-kofams | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 bin/anvi-setup-kegg-kofams diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams new file mode 100755 index 0000000000..21d2b9d52c --- /dev/null +++ b/bin/anvi-setup-kegg-kofams @@ -0,0 +1,18 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +import sys + +import anvio +import anvio.kofam as kofam + +from anvio.errors import ConfigError, FilesNPathsError + +__author__ = "Developers of anvi'o (see AUTHORS.txt)" +__copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" +__license__ = "GPL 3.0" +__version__ = anvio.__version__ +__maintainer__ = "Özcan Esen" +__email__ = "ozcanesen@gmail.com" +#__provides__ = ## TODO: fill this in +__description__ = "Download and setup KEGG KOfam HMM profiles." From 8d12ed84fb1fb67f3aaaf809a93469a6b7dcf4ce Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 7 Jan 2020 16:07:16 -0600 Subject: [PATCH 003/400] add kofam_data_dir parameter --- anvio/kofam.py | 8 ++++++++ bin/anvi-setup-kegg-kofams | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index a2b61f879f..ba584d29a8 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -26,3 +26,11 @@ run = terminal.Run() progress = terminal.Progress() pp = terminal.pretty_print + + +class KofamSetup(object): + def __init__(self, args, run=run, progress=progress): + self.args = args + self.run = run + self.progress = progress + self.kofam_data_dir = args.kofam_data_dir diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index 21d2b9d52c..97afabf8ed 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -16,3 +16,13 @@ __maintainer__ = "Özcan Esen" __email__ = "ozcanesen@gmail.com" #__provides__ = ## TODO: fill this in __description__ = "Download and setup KEGG KOfam HMM profiles." + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description=__description__) + parser.add_argument('--kofam-data-dir', default=None, type=str, help="The directory for KEGG KOfam HMM profiles to be stored. 
If you leave it\ + as is without specifying anything, the default destination for the data directory will be used to set things\ + up. The advantage of it is that everyone will be using a single data directory, but then you may need\ + superuser privileges to do it. Using this parameter you can choose the location of the data directory somewhere\ + you like. However, when it is time to run Kofam, you will need to remember that path and provide it to the program.") From 264be894c864badfccd96382639ab16f105b88d5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 7 Jan 2020 16:14:13 -0600 Subject: [PATCH 004/400] make KEGG data directory; check for hmmpress --- anvio/kofam.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index ba584d29a8..301d19ee2e 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -34,3 +34,9 @@ def __init__(self, args, run=run, progress=progress): self.run = run self.progress = progress self.kofam_data_dir = args.kofam_data_dir + + filesnpaths.is_program_exists('hmmpress') + + # default directory will be called KEGG and will store the KEGG Module data as well + if not self.kofam_data_dir: + self.kofam_data_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') From f1ac05101bbdc1293ea30ddfc500232777479323 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 7 Jan 2020 16:21:16 -0600 Subject: [PATCH 005/400] add reset flag, check for existing kofam profile --- anvio/kofam.py | 10 ++++++++++ bin/anvi-setup-kegg-kofams | 3 +++ 2 files changed, 13 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 301d19ee2e..612666bc53 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -40,3 +40,13 @@ def __init__(self, args, run=run, progress=progress): # default directory will be called KEGG and will store the KEGG Module data as well if not self.kofam_data_dir: self.kofam_data_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') + + if not args.reset: + self.is_database_exists() + + filesnpaths.gen_output_directory(self.kofam_data_dir, delete_if_exists=args.reset) + + + def is_database_exists(self): + if os.path.exists(os.path.join(self.kofam_data_dir, 'K00001.hmm')): # we arbitrarily check for the first profile + raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag if you want to re-download it." % self.kofam_data_dir) diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index 97afabf8ed..12c47e69e7 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -26,3 +26,6 @@ if __name__ == '__main__': up. The advantage of it is that everyone will be using a single data directory, but then you may need\ superuser privileges to do it. Using this parameter you can choose the location of the data directory somewhere\ you like. However, when it is time to run Kofam, you will need to remember that path and provide it to the program.") + parser.add_argument('--reset', default=False, action="store_true", help="This program by default attempts to use previously\ + downloaded files in your KEGG KOfam data directory if there are any. 
If something is wrong for some reason you\ + can use this parameter to tell anvi'o to remove everything, and start over.") From 2d66f32e9f868fef64937793acc7ab84889f2086 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 7 Jan 2020 16:26:42 -0600 Subject: [PATCH 006/400] add url for kofam download --- anvio/kofam.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 612666bc53..155a150435 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -46,6 +46,11 @@ def __init__(self, args, run=run, progress=progress): filesnpaths.gen_output_directory(self.kofam_data_dir, delete_if_exists=args.reset) + # ftp path for HMM profiles and KO list + # for ko list, add /ko_list.gz to end of url + # for profiles, add /profiles.tar.gz to end of url + self.database_url = "ftp://ftp.genome.jp/pub/db/kofam" + def is_database_exists(self): if os.path.exists(os.path.join(self.kofam_data_dir, 'K00001.hmm')): # we arbitrarily check for the first profile From 1755314116fbc75a3b64ed577bf66c4fe879bfe7 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 7 Jan 2020 16:51:11 -0600 Subject: [PATCH 007/400] add download code --- anvio/kofam.py | 49 +++++++++++++++++++++++++++++++++++++- bin/anvi-setup-kegg-kofams | 13 ++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 155a150435..c8af057c8d 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -48,10 +48,57 @@ def __init__(self, args, run=run, progress=progress): # ftp path for HMM profiles and KO list # for ko list, add /ko_list.gz to end of url - # for profiles, add /profiles.tar.gz to end of url + # for profiles, add /profiles.tar.gz to end of url self.database_url = "ftp://ftp.genome.jp/pub/db/kofam" + self.files = ['ko_list.gz', 'profiles.tar.gz'] def is_database_exists(self): if os.path.exists(os.path.join(self.kofam_data_dir, 'K00001.hmm')): # we arbitrarily check for the first profile raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag if you want to re-download it." % self.kofam_data_dir) + + def download(self): + self.run.info("Database URL", self.database_url) + + for file_name in self.files: + utils.download_file(self.database_url + '/' + file_name, + os.path.join(self.kofam_data_dir, file_name), progress=self.progress, run=self.run) + + self.confirm_downloaded_files() + self.decompress_files() + + + def confirm_downloaded_files(self): + print("Not implemented yet") + # try: + # checksums_file = read_remote_file(self.database_url + '/md5_checksums', is_gzip=False).strip() + # checksums = {} + # except: + # self.run.warning("Checksum file '%s' is not available in FTP, Anvi'o won't be able to verify downloaded files." % (self.database_url + '/md5_checksums')) + # return + # + # for line in checksums_file.split('\n'): + # checksum, file_name = [item.strip() for item in line.strip().split()] + # checksums[file_name] = checksum + # + # for file_name in self.files: + # if not filesnpaths.is_file_exists(os.path.join(self.pfam_data_dir, file_name), dont_raise=True): + # # TO DO: Fix messages :( + # raise ConfigError("Have missing file %s, please run --reset" % file_name) + # + # hash_on_disk = utils.get_file_md5(os.path.join(self.pfam_data_dir, file_name)) + # expected_hash = checksums[file_name] + # + # if not expected_hash == hash_on_disk: + # # TO DO: Fix messages :( + # raise ConfigError("Please run with --reset, one file hash doesn't match. 
%s" % file_name) + + + def decompress_files(self): + # Decompressing Pfam-A.hmm.gz is not necessary, HMMer class works with .gz + print("Not implemented yet") + # for file_name in ['Pfam.version.gz', 'Pfam-A.clans.tsv.gz']: + # full_path = os.path.join(self.pfam_data_dir, file_name) + # + # utils.gzip_decompress_file(full_path) + # os.remove(full_path) diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index 12c47e69e7..b0de5eef21 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -29,3 +29,16 @@ if __name__ == '__main__': parser.add_argument('--reset', default=False, action="store_true", help="This program by default attempts to use previously\ downloaded files in your KEGG KOfam data directory if there are any. If something is wrong for some reason you\ can use this parameter to tell anvi'o to remove everything, and start over.") + + args = anvio.get_args(parser) + + try: + setup = kofam.KofamSetup(args) + setup.download() + + except ConfigError as e: + print(e) + sys.exit(-1) + except FilesNPathsError as e: + print(e) + sys.exit(-1) From e5be322b30887887ed3be05065580343cbdc3a82 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 8 Jan 2020 17:04:39 -0600 Subject: [PATCH 008/400] get rid of download verification, got no checksums --- anvio/kofam.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index c8af057c8d..df0e6cd8e4 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -64,36 +64,9 @@ def download(self): utils.download_file(self.database_url + '/' + file_name, os.path.join(self.kofam_data_dir, file_name), progress=self.progress, run=self.run) - self.confirm_downloaded_files() self.decompress_files() - def confirm_downloaded_files(self): - print("Not implemented yet") - # try: - # checksums_file = read_remote_file(self.database_url + '/md5_checksums', is_gzip=False).strip() - # checksums = {} - # except: - # self.run.warning("Checksum file '%s' is not available in FTP, Anvi'o won't be able to verify downloaded files." % (self.database_url + '/md5_checksums')) - # return - # - # for line in checksums_file.split('\n'): - # checksum, file_name = [item.strip() for item in line.strip().split()] - # checksums[file_name] = checksum - # - # for file_name in self.files: - # if not filesnpaths.is_file_exists(os.path.join(self.pfam_data_dir, file_name), dont_raise=True): - # # TO DO: Fix messages :( - # raise ConfigError("Have missing file %s, please run --reset" % file_name) - # - # hash_on_disk = utils.get_file_md5(os.path.join(self.pfam_data_dir, file_name)) - # expected_hash = checksums[file_name] - # - # if not expected_hash == hash_on_disk: - # # TO DO: Fix messages :( - # raise ConfigError("Please run with --reset, one file hash doesn't match. 
%s" % file_name) - - def decompress_files(self): # Decompressing Pfam-A.hmm.gz is not necessary, HMMer class works with .gz print("Not implemented yet") From 1be4df0363ddf98db4fe2927576fb1c86bb03f54 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 8 Jan 2020 21:07:33 -0600 Subject: [PATCH 009/400] note to self lol --- anvio/kofam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index df0e6cd8e4..f9120baeb2 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -54,7 +54,7 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): - if os.path.exists(os.path.join(self.kofam_data_dir, 'K00001.hmm')): # we arbitrarily check for the first profile + if os.path.exists(os.path.join(self.kofam_data_dir, 'K00001.hmm')): # TODO: update this after determining final structure raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag if you want to re-download it." % self.kofam_data_dir) def download(self): From 4a96746b513b12f099593b57f3971f9314fafbe4 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 8 Jan 2020 21:08:01 -0600 Subject: [PATCH 010/400] add in decompress function --- anvio/kofam.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index f9120baeb2..9d1e4fe114 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -68,10 +68,9 @@ def download(self): def decompress_files(self): - # Decompressing Pfam-A.hmm.gz is not necessary, HMMer class works with .gz print("Not implemented yet") - # for file_name in ['Pfam.version.gz', 'Pfam-A.clans.tsv.gz']: - # full_path = os.path.join(self.pfam_data_dir, file_name) - # - # utils.gzip_decompress_file(full_path) - # os.remove(full_path) + for file_name in self.files: + full_path = os.path.join(self.pfam_data_dir, file_name) + + utils.gzip_decompress_file(full_path) + os.remove(full_path) From d9df9e08ef56856d9917aa172781066db641acbf Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 9 Jan 2020 08:59:46 -0600 Subject: [PATCH 011/400] fix decompress function to work with tar files --- anvio/kofam.py | 8 ++++++-- anvio/utils.py | 23 +++++++++++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 9d1e4fe114..1295e84d78 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -68,9 +68,13 @@ def download(self): def decompress_files(self): - print("Not implemented yet") for file_name in self.files: - full_path = os.path.join(self.pfam_data_dir, file_name) + full_path = os.path.join(self.kofam_data_dir, file_name) + + if full_path.endswith("tar.gz"): # extract tar file instead of doing gzip + utils.tar_extract_file(full_path, output_file_path = self.kofam_data_dir, keep_original=False) + else: + utils.gzip_decompress_file(full_path, keep_original=False) utils.gzip_decompress_file(full_path) os.remove(full_path) diff --git a/anvio/utils.py b/anvio/utils.py index 5679cc0263..344d094069 100644 --- a/anvio/utils.py +++ b/anvio/utils.py @@ -6,6 +6,7 @@ import os import sys import gzip +import tarfile import time import copy import socket @@ -147,7 +148,7 @@ def run_processes(self, processes_to_run, progress=Progress(verbose=False)): def get_total_memory_usage(): if not PSUTIL_OK: return None - + current_process = psutil.Process(os.getpid()) mem = current_process.memory_info().rss for child in current_process.children(recursive=True): @@ -385,6 +386,25 @@ def gzip_decompress_file(input_file_path, output_file_path=None, 
keep_original=T return output_file_path +def tar_extract_file(input_file_path, output_file_path=None, keep_original=True): + filesnpaths.is_file_exists(input_file_path) + + if not tarfile.is_tarfile(input_file_path): + raise ConfigError("the tar_extract_file function is terribly upset because your input file ('%s') is\ + apparently not a tar file 🤷") + + if not output_file_path: + raise ConfigError("the tar_extract_file function is displeased because an output file path has not been specified.\ + If you are seeing this message, you are probably a developer, so go fix your code please, and \ + everyone will be happy then.") + + tf = tarfile.open(input_file_path) + tf.extractall(path = output_file_path) + + if not keep_original: + os.remove(input_file_path) + + class RunInDirectory(object): """ Run any block of code in a specified directory. Return to original directory @@ -3054,4 +3074,3 @@ def send(self, to, subject, content): self.progress.end() self.run.info('E-mail', 'Successfully sent to "%s"' % to) - From e8e4352df56d6e4683b6cdd0a2a038460f1c6b9d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 10 Jan 2020 15:52:13 -0600 Subject: [PATCH 012/400] add KEGG directory to ignore list --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8001f47052..e40fbb5b4e 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ diamond-log-file.txt anvio/data/misc/SCG_TAXONOMY/GTDB/SCG_SEARCH_DATABASES/*.dmnd anvio/tests/sandbox/test_visualize_split_coverages/TEST_OUTDIR +anvio/data/misc/KEGG/ From cf71424b10d8087beac1d1fb2bf9023ac08cf680 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 10 Jan 2020 23:40:46 -0600 Subject: [PATCH 013/400] now user can choose whether to keep hmm profiles decompressed or not --- anvio/pfam.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index 0efafb46d1..6a11bab821 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -55,6 +55,7 @@ def __init__(self, args, run=run, progress=progress): self.run = run self.progress = progress self.pfam_data_dir = args.pfam_data_dir + self.keep_compressed = args.keep_compressed filesnpaths.is_program_exists('hmmpress') @@ -71,7 +72,7 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): - if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')): + if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')) or os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm')): raise ConfigError("It seems you already have Pfam database installed in '%s', please use --reset flag if you want to re-download it." % self.pfam_data_dir) @@ -127,14 +128,24 @@ def confirm_downloaded_files(self): def decompress_files(self): - # Decompressing Pfam-A.hmm.gz is not necessary, HMMer class works with .gz - - for file_name in ['Pfam.version.gz', 'Pfam-A.clans.tsv.gz']: - full_path = os.path.join(self.pfam_data_dir, file_name) + if self.keep_compressed: + # Some folks may want the old behavior of this program that kept the HMM profiles compressed. + # This block preserves that way of doing things. 
+ for file_name in ['Pfam.version.gz', 'Pfam-A.clans.tsv.gz']: + full_path = os.path.join(self.pfam_data_dir, file_name) + utils.gzip_decompress_file(full_path, keep_original=False) + else: + for file_name in self.files: + full_path = os.path.join(self.pfam_data_dir, file_name) - utils.gzip_decompress_file(full_path) - os.remove(full_path) + if full_path.endswith('.gz'): + utils.gzip_decompress_file(full_path) + os.remove(full_path) + for file_path in glob.glob(os.path.join(self.pfam_data_dir, '*')): + if file_path.endswith('.hmm'): + print("HMMPRESS Not implemented here yet") + #TODO HMMPRESS HERE. class Pfam(object): def __init__(self, args, run=run, progress=progress): From 89d1a61b1d68d55ea4c290130fae0cf5e8184cc5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 10 Jan 2020 23:42:32 -0600 Subject: [PATCH 014/400] check for compression to determine whether to run hmmscan in place or not --- anvio/pfam.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index 6a11bab821..ccef5f1f40 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -165,7 +165,7 @@ def __init__(self, args, run=run, progress=progress): if not self.pfam_data_dir: self.pfam_data_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/Pfam') - self.is_database_exists() + self.run_in_place = self.is_database_exists() self.run.info('Pfam database directory', self.pfam_data_dir) @@ -174,8 +174,13 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): - if not os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')): + if not (os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')) or os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm'))): raise ConfigError("It seems you do not have Pfam database installed, please run 'anvi-setup-pfams' to download it.") + # here we check if the HMM profile is compressed or not so we can adjust hmmscan behavior accordingly + in_place = True + if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')): + in_place = False + return in_place def get_version(self): From 3f71f2d69bf507380e90b05a2d3015a14dfab60a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 10 Jan 2020 23:43:49 -0600 Subject: [PATCH 015/400] pass correct hmm_file and in_place param to hmmscan --- anvio/pfam.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index ccef5f1f40..05bed98460 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -220,7 +220,11 @@ def get_function_from_catalog(self, accession, ok_if_missing_from_catalog=False) def process(self): - hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz') + hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm') + # this file may be compressed if keep_compressed was set to True during setup + # and if the file is compressed, we cannot run in place + if not self.run_in_place: + hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz') # initialize contigs database class Args: pass @@ -241,7 +245,7 @@ class Args: pass # run hmmscan hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads) - hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None, len(self.function_catalog), hmm_file, None, '--cut_ga') + hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None, len(self.function_catalog), hmm_file, None, '--cut_ga', in_place=self.run_in_place) if not hmm_hits_file: run.info_single("The HMM search returned no hits :/ So there is nothing to add to the 
contigs database. But\ From 1cbd28a94120e772c5fc20dcc9bc36f60495209e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 10 Jan 2020 23:44:39 -0600 Subject: [PATCH 016/400] add keep_compressed param to pfam setup script --- bin/anvi-setup-pfams | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/anvi-setup-pfams b/bin/anvi-setup-pfams index b525336611..34fa4a4fd4 100755 --- a/bin/anvi-setup-pfams +++ b/bin/anvi-setup-pfams @@ -31,6 +31,10 @@ if __name__ == '__main__': parser.add_argument('--reset', default=False, action="store_true", help="This program by default attempts to use previously\ downloaded files in your Pfam data directory if there are any. If something is wrong for some reason you\ can use this to tell anvi'o to remove everything, and start over.") + parser.add_argument('--keep_compressed', default=False, help="Whether to keep Pfam HMM profiles in gzipped format. By default, these\ + files are decompressed and hmmpressed after being downloaded. If you want them to stay compressed to save space,\ + you can use this parameter; however, please keep in mind that this will increase the processing time for `anvi-run-pfams`\ + since the profiles will need to be decompressed and hmmpressed every time `anvi-run-pfams` is run.") args = anvio.get_args(parser) From 04b34a483e0bf4f51c9c9dfae1d6f268a3f189c2 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 10 Jan 2020 23:50:41 -0600 Subject: [PATCH 017/400] in_place param added to hmmscan; we use it to preserve the old way of running hmmscan --- anvio/drivers/hmmer.py | 53 +++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index 3435b1e458..c0f02049d2 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -46,12 +46,11 @@ def __init__(self, target_files_dict, num_threads_to_use=1, progress=progress, r part_file_name = os.path.join(tmp_dir, os.path.basename(target_files_dict[source])) # create splitted fasta files inside tmp directory - self.target_files_dict[source] = utils.split_fasta(target_files_dict[source], + self.target_files_dict[source] = utils.split_fasta(target_files_dict[source], parts=self.num_threads_to_use, prefix=part_file_name) - - def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms): + def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms, in_place=False): target = ':'.join([alphabet, context]) if target not in self.target_files_dict: @@ -81,29 +80,33 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.run.info('Temporary work dir', tmp_dir) self.run.info('Log file', log_file_path) - self.progress.new('Unpacking the model into temporary work directory') - self.progress.update('...') - hmm_file_path = os.path.join(tmp_dir, source + '_hmm.txt') - hmm_file = open(hmm_file_path, 'wb') - hmm_file.write(gzip.open(hmm, 'rb').read()) - hmm_file.close() - self.progress.end() + if not in_place: + self.progress.new('Unpacking the model into temporary work directory') + self.progress.update('...') + hmm_file_path = os.path.join(tmp_dir, source + '_hmm.txt') # referenced below, likely needs to move + hmm_file = open(hmm_file_path, 'wb') + hmm_file.write(gzip.open(hmm, 'rb').read()) + hmm_file.close() + self.progress.end() - self.progress.new('Processing') - self.progress.update('Compressing the pfam model') - - cmd_line = ['hmmpress', hmm_file_path] - ret_val = 
utils.run_command(cmd_line, log_file_path) - - if ret_val: - raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have\ - installed is either not up-to-date enough, or too new :/ Just to make sure what went\ - wrong please take a look at the log file ('%s'). Please visit %s to see what\ - is the latest version availalbe if you think updating HMMER can resolve it. You can\ - learn which version of HMMER you have on your system by typing 'hmmpress -h'."\ - % (log_file_path, 'http://hmmer.janelia.org/download.html')) - self.progress.end() + self.progress.new('Processing') + self.progress.update('Compressing the pfam model') + cmd_line = ['hmmpress', hmm_file_path] + ret_val = utils.run_command(cmd_line, log_file_path) + + if ret_val: + raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have\ + installed is either not up-to-date enough, or too new :/ Just to make sure what went\ + wrong please take a look at the log file ('%s'). Please visit %s to see what\ + is the latest version availalbe if you think updating HMMER can resolve it. You can\ + learn which version of HMMER you have on your system by typing 'hmmpress -h'."\ + % (log_file_path, 'http://hmmer.janelia.org/download.html')) + self.progress.end() + else: + # check if bunhc of files with different extensions are + # in the same directory with hmm + print("HMMSCAN run in place not implemented yet") workers = [] @@ -199,5 +202,3 @@ def hmmscan_worker(self, part_file, cmd_line, shitty_output_file, log_file, merg def clean_tmp_dirs(self): for tmp_dir in self.tmp_dirs: shutil.rmtree(tmp_dir) - - From 560e787c55f4dd0b3d9972ea1809b3afaf41b17e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 10 Jan 2020 23:55:45 -0600 Subject: [PATCH 018/400] import glob --- anvio/pfam.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anvio/pfam.py b/anvio/pfam.py index 05bed98460..62767be412 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -9,6 +9,7 @@ import shutil import requests from io import BytesIO +import glob import anvio import anvio.dbops as dbops From 3095a572d8a168b495f86739b0c43d01ebb85fa8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 13 Jan 2020 09:37:41 -0600 Subject: [PATCH 019/400] adding some descriptive comments --- anvio/kofam.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 1295e84d78..32843bb4ca 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -29,6 +29,15 @@ class KofamSetup(object): + """ Class for setting up KEGG Kofam HMM profiles. It performs sanity checks and downloads, unpacks, and prepares + the profiles for later use by `hmmscan`. + + Parameters + ========== + args: Namespace object + All the arguments supplied by user to anvi-setup-kegg-kofams + """ + def __init__(self, args, run=run, progress=progress): self.args = args self.run = run @@ -54,10 +63,12 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): + """This function determines whether the user has already downloaded the Kofam HMM profiles.""" if os.path.exists(os.path.join(self.kofam_data_dir, 'K00001.hmm')): # TODO: update this after determining final structure raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag if you want to re-download it." 
% self.kofam_data_dir) def download(self): + """This function downloads the Kofam profiles.""" self.run.info("Database URL", self.database_url) for file_name in self.files: @@ -68,6 +79,7 @@ def download(self): def decompress_files(self): + """This function decompresses the Kofam profiles.""" for file_name in self.files: full_path = os.path.join(self.kofam_data_dir, file_name) From 8bf46bf92a91d64e75de6f8ea36e544fd6e6a701 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 13 Jan 2020 09:41:07 -0600 Subject: [PATCH 020/400] download function only downloads now --- anvio/kofam.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 32843bb4ca..88f7e3c058 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -75,8 +75,6 @@ def download(self): utils.download_file(self.database_url + '/' + file_name, os.path.join(self.kofam_data_dir, file_name), progress=self.progress, run=self.run) - self.decompress_files() - def decompress_files(self): """This function decompresses the Kofam profiles.""" @@ -87,6 +85,3 @@ def decompress_files(self): utils.tar_extract_file(full_path, output_file_path = self.kofam_data_dir, keep_original=False) else: utils.gzip_decompress_file(full_path, keep_original=False) - - utils.gzip_decompress_file(full_path) - os.remove(full_path) From 2772b359b4924771def5eaa782b76b0c987bcac0 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 13 Jan 2020 09:42:24 -0600 Subject: [PATCH 021/400] now we use a setup function instead of doing everything from download() --- anvio/kofam.py | 7 +++++++ bin/anvi-setup-kegg-kofams | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 88f7e3c058..836707d684 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -85,3 +85,10 @@ def decompress_files(self): utils.tar_extract_file(full_path, output_file_path = self.kofam_data_dir, keep_original=False) else: utils.gzip_decompress_file(full_path, keep_original=False) + + + def setup_profiles(self): + """This is a driver function which executes the Kofam setup process by downloading, decompressing, and hmmpressing the profiles.""" + self.download() + self.decompress_files() + # TODO: add concatenation and hmmpress diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index b0de5eef21..de9865c9b3 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -34,7 +34,7 @@ if __name__ == '__main__': try: setup = kofam.KofamSetup(args) - setup.download() + setup.setup_profiles() except ConfigError as e: print(e) From db2d3fa1b1034e57ebecf6bfc5b14798fd4cbba4 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 13 Jan 2020 10:13:11 -0600 Subject: [PATCH 022/400] add a setup warning for when --keep-compressed is set to true --- anvio/pfam.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anvio/pfam.py b/anvio/pfam.py index 62767be412..1385a4a080 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -132,6 +132,8 @@ def decompress_files(self): if self.keep_compressed: # Some folks may want the old behavior of this program that kept the HMM profiles compressed. # This block preserves that way of doing things. + self.run.warning("Just to let you know, you elected to keep the Pfam profiles in gzipped format. This is fine and will save you some space,\ + but you may experience slower processing times for `anvi-run-pfams`. You have been warned. 
:)") for file_name in ['Pfam.version.gz', 'Pfam-A.clans.tsv.gz']: full_path = os.path.join(self.pfam_data_dir, file_name) utils.gzip_decompress_file(full_path, keep_original=False) From d91ec14f51be0ab5d5824ba0b216e2d1d34901c8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 13 Jan 2020 15:50:18 -0600 Subject: [PATCH 023/400] fix indent --- anvio/pfam.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index 1385a4a080..7d3ccc8a84 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -145,10 +145,10 @@ def decompress_files(self): utils.gzip_decompress_file(full_path) os.remove(full_path) - for file_path in glob.glob(os.path.join(self.pfam_data_dir, '*')): - if file_path.endswith('.hmm'): - print("HMMPRESS Not implemented here yet") - #TODO HMMPRESS HERE. + for file_path in glob.glob(os.path.join(self.pfam_data_dir, '*.hmm')): + print("HMMPRESS Not implemented here yet") + print(file_path) + #TODO HMMPRESS HERE. class Pfam(object): def __init__(self, args, run=run, progress=progress): From 195d38b8a5a24451a1393e3539e27734a51a6586 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 13 Jan 2020 15:50:46 -0600 Subject: [PATCH 024/400] skeleton hmmpress function --- anvio/kofam.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 836707d684..1321085122 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -86,6 +86,13 @@ def decompress_files(self): else: utils.gzip_decompress_file(full_path, keep_original=False) + def run_hmmpress(self): + """This function concatenates the Kofam profiles and runs hmmpress on them.""" + print("Not implemented yet") + log_file_path = os.path.join(self.kofam_data_dir, '00_hmmpress_log.txt') + cmd_line = ["cat", os.path.join(self.kofam_data_dir, 'profiles/*.hmm')] + ret_val = utils.run_command(cmd_line, log_file_path) + # TODO: finish me - need sanity check for all files, concat, and press def setup_profiles(self): """This is a driver function which executes the Kofam setup process by downloading, decompressing, and hmmpressing the profiles.""" From 96949643e6c9e02560d94b8e5eaba990f7a4d243 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 13 Jan 2020 15:54:22 -0600 Subject: [PATCH 025/400] we shall not track the Pfam data --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e40fbb5b4e..b2ec9a204e 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ anvio/data/misc/SCG_TAXONOMY/GTDB/SCG_SEARCH_DATABASES/*.dmnd anvio/tests/sandbox/test_visualize_split_coverages/TEST_OUTDIR anvio/data/misc/KEGG/ +anvio/data/misc/Pfam/ From f2ceaa492bd25d4980fcf68b8511df92c658d3ae Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 10:44:45 +0800 Subject: [PATCH 026/400] add in call for hmmpress --- anvio/pfam.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index 7d3ccc8a84..e143788962 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -146,9 +146,13 @@ def decompress_files(self): os.remove(full_path) for file_path in glob.glob(os.path.join(self.pfam_data_dir, '*.hmm')): - print("HMMPRESS Not implemented here yet") - print(file_path) - #TODO HMMPRESS HERE. + cmd_line = ['hmmpress', file_path] + log_file_path = os.path.join(self.pfam_data_dir, '00_hmmpress_log.txt') + ret_val = utils.run_command(cmd_line, log_file_path) + + if ret_val: + raise ConfigError("Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. 
\ + Check out the log file ('%s') to see what went wrong." % (log_file_path)) class Pfam(object): def __init__(self, args, run=run, progress=progress): From 8aba45022a3649c9a405333e30b7c58c0228dd47 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 11:33:49 +0800 Subject: [PATCH 027/400] fix flag hyphen --- bin/anvi-setup-pfams | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/anvi-setup-pfams b/bin/anvi-setup-pfams index 34fa4a4fd4..f10ea37e39 100755 --- a/bin/anvi-setup-pfams +++ b/bin/anvi-setup-pfams @@ -31,7 +31,7 @@ if __name__ == '__main__': parser.add_argument('--reset', default=False, action="store_true", help="This program by default attempts to use previously\ downloaded files in your Pfam data directory if there are any. If something is wrong for some reason you\ can use this to tell anvi'o to remove everything, and start over.") - parser.add_argument('--keep_compressed', default=False, help="Whether to keep Pfam HMM profiles in gzipped format. By default, these\ + parser.add_argument('--keep-compressed', default=False, help="Whether to keep Pfam HMM profiles in gzipped format. By default, these\ files are decompressed and hmmpressed after being downloaded. If you want them to stay compressed to save space,\ you can use this parameter; however, please keep in mind that this will increase the processing time for `anvi-run-pfams`\ since the profiles will need to be decompressed and hmmpressed every time `anvi-run-pfams` is run.") From 5a23f4015087d4eb53b8ae124b08dcb9c3046e11 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 11:34:41 +0800 Subject: [PATCH 028/400] allow skipping of file check when testing using --debug --- anvio/pfam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index e143788962..a124f6f96f 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -63,7 +63,7 @@ def __init__(self, args, run=run, progress=progress): if not self.pfam_data_dir: self.pfam_data_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/Pfam') - if not args.reset: + if not args.reset or not anvio.DEBUG: self.is_database_exists() filesnpaths.gen_output_directory(self.pfam_data_dir, delete_if_exists=args.reset) From 12c2bdb25e0375dba350eac4516df7f277b7cc38 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 11:49:43 +0800 Subject: [PATCH 029/400] oopsie. this should be an and --- anvio/pfam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index a124f6f96f..52f2f5ffb0 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -63,7 +63,7 @@ def __init__(self, args, run=run, progress=progress): if not self.pfam_data_dir: self.pfam_data_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/Pfam') - if not args.reset or not anvio.DEBUG: + if not args.reset and not anvio.DEBUG: self.is_database_exists() filesnpaths.gen_output_directory(self.pfam_data_dir, delete_if_exists=args.reset) From 842f91cf91ecac79db837689562009ebacd59308 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 12:04:43 +0800 Subject: [PATCH 030/400] add some sanity checks --- anvio/pfam.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/anvio/pfam.py b/anvio/pfam.py index 52f2f5ffb0..2eb7ac49bf 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -136,6 +136,14 @@ def decompress_files(self): but you may experience slower processing times for `anvi-run-pfams`. You have been warned. 
:)") for file_name in ['Pfam.version.gz', 'Pfam-A.clans.tsv.gz']: full_path = os.path.join(self.pfam_data_dir, file_name) + if not os.path.exists(full_path) and os.path.exists(full_path[:-3]): + self.run.warning("It seems the file at %s is already de-compressed. Perhaps you already downloaded the Pfam profiles. \ + If you want to re-do the Pfam setup, please run this program again and use both the --reset and --keep-compressed flags." \ + % (full_path[:-3])) + continue + elif not os.path.exists(full_path): + raise ConfigError("Oh no. The file at %s does not exist. Something is terribly wrong. :( Anvi'o suggests re-running \ + this program using the --reset flag." % (full_path)) utils.gzip_decompress_file(full_path, keep_original=False) else: for file_name in self.files: From 6234c5061c16f048242a14d9ee94e2c4abade21a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 12:07:57 +0800 Subject: [PATCH 031/400] add sanity checks to other case. hmmpress on pfam profiles after download is now working nicely --- anvio/pfam.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/anvio/pfam.py b/anvio/pfam.py index 2eb7ac49bf..cc36fafdb0 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -150,6 +150,14 @@ def decompress_files(self): full_path = os.path.join(self.pfam_data_dir, file_name) if full_path.endswith('.gz'): + if not os.path.exists(full_path) and os.path.exists(full_path[:-3]): + self.run.warning("It seems the file at %s is already de-compressed. Perhaps you already downloaded the Pfam profiles. \ + If you want to re-do the Pfam setup, please run this program again and use the --reset flag." \ + % (full_path[:-3])) + continue + elif not os.path.exists(full_path): + raise ConfigError("Oh no. The file at %s does not exist. Something is terribly wrong. :( Anvi'o suggests re-running \ + this program using the --reset flag." % (full_path)) utils.gzip_decompress_file(full_path) os.remove(full_path) From f938b7590361f5b0f301c0e26432ba77eb0798a4 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 12:44:40 +0800 Subject: [PATCH 032/400] if successful remove log --- anvio/pfam.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/pfam.py b/anvio/pfam.py index cc36fafdb0..092aa7e9d8 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -169,6 +169,9 @@ def decompress_files(self): if ret_val: raise ConfigError("Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. \ Check out the log file ('%s') to see what went wrong." % (log_file_path)) + else: + # getting rid of the log file because hmmpress was successful + os.remove(log_file_path) class Pfam(object): def __init__(self, args, run=run, progress=progress): From 790a3d4f52cccc9af6f93f9539d7c2999b184769 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 13:47:48 +0800 Subject: [PATCH 033/400] add message when in_place is set to false --- anvio/pfam.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anvio/pfam.py b/anvio/pfam.py index 092aa7e9d8..d976053f7c 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -206,6 +206,8 @@ def is_database_exists(self): in_place = True if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')): in_place = False + self.run.warning("Anvi'o has detected that your Pfam database is currently compressed. 
It will be unpacked before \ + running HMMs.") return in_place From b5f176c7823daaecb28ff2798d7e5202984051d8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 13:59:57 +0800 Subject: [PATCH 034/400] we only create tmp dir if not running in place --- anvio/drivers/hmmer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index c0f02049d2..aae969a4b8 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -73,14 +73,14 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.run.info('Noise cutoff term(s)', noise_cutoff_terms) self.run.info('Number of CPUs will be used for search', self.num_threads_to_use) - # we want to create hmm files in the same direcotry - tmp_dir = os.path.dirname(self.target_files_dict[target][0]) - log_file_path = os.path.join(tmp_dir, '00_log.txt') - - self.run.info('Temporary work dir', tmp_dir) - self.run.info('Log file', log_file_path) - if not in_place: + # we want to create hmm files in the same direcotry + tmp_dir = os.path.dirname(self.target_files_dict[target][0]) + log_file_path = os.path.join(tmp_dir, '00_log.txt') + + self.run.info('Temporary work dir', tmp_dir) + self.run.info('Log file', log_file_path) + self.progress.new('Unpacking the model into temporary work directory') self.progress.update('...') hmm_file_path = os.path.join(tmp_dir, source + '_hmm.txt') # referenced below, likely needs to move From 9042fe607d456a3a8bafa31d6149236c7f5bf4d1 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 14:18:18 +0800 Subject: [PATCH 035/400] add function to check that all hmmpress files are there when we expect them --- anvio/drivers/hmmer.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index aae969a4b8..a9fffb971c 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -6,6 +6,7 @@ import gzip import shutil from threading import Thread, Lock +import glob import anvio import anvio.utils as utils @@ -50,6 +51,28 @@ def __init__(self, target_files_dict, num_threads_to_use=1, progress=progress, r parts=self.num_threads_to_use, prefix=part_file_name) + def verify_hmmpress_output(self, hmm_path): + """This function verifies that the HMM profiles located at hmm_path have been successfully hmmpressed. + What this means is that every .hmm profile in the directory has an associated .h3f, .h3i, .h3m, and + .h3p file. + + PARAMETERS: + hmm_path string, the path at which the HMM profiles are located + + RETURNS: N/A + + """ + for file_path in glob.glob(os.path.join(hmm_path, '*.hmm')): + base_path = file_path[:-3] + expected_extensions = ['h3f', 'h3i', 'h3m', 'h3p'] + for ext in expected_extensions: + if not os.path.exists(base_path + ext): + raise ConfigError("It appears that hmmpress was not properly run on the hmm profiles at %s. The \ + file %s does not exist. It is likely that you will have to set up your profiles \ + again by running a program such as `anvi-setup-pfams` or `anvi-setup-kegg-kofams`. \ + We are very sorry about this." 
% (hmm_path, base_path + ext) + + def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms, in_place=False): target = ':'.join([alphabet, context]) @@ -80,7 +103,7 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.run.info('Temporary work dir', tmp_dir) self.run.info('Log file', log_file_path) - + self.progress.new('Unpacking the model into temporary work directory') self.progress.update('...') hmm_file_path = os.path.join(tmp_dir, source + '_hmm.txt') # referenced below, likely needs to move From 731e3cfbaf547b717dd06f85c60b2928538f077b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 14:51:57 +0800 Subject: [PATCH 036/400] call to verify hmmpress --- anvio/drivers/hmmer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index a9fffb971c..dc92c16b0e 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -127,8 +127,10 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode % (log_file_path, 'http://hmmer.janelia.org/download.html')) self.progress.end() else: - # check if bunhc of files with different extensions are - # in the same directory with hmm + # check if all hmmpress files are in the HMM directory + self.verify_hmmpress_output(hmm) + # we may want to throw a more descriptive error *here* instead of failing in the verify function + print("HMMSCAN run in place not implemented yet") From e06832aee493680fb5ee888e4f0b496a86f5ee8b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Jan 2020 23:48:19 +0800 Subject: [PATCH 037/400] fix missing parenthesis --- anvio/drivers/hmmer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index dc92c16b0e..cd8932daa8 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -70,7 +70,7 @@ def verify_hmmpress_output(self, hmm_path): raise ConfigError("It appears that hmmpress was not properly run on the hmm profiles at %s. The \ file %s does not exist. It is likely that you will have to set up your profiles \ again by running a program such as `anvi-setup-pfams` or `anvi-setup-kegg-kofams`. \ - We are very sorry about this." % (hmm_path, base_path + ext) + We are very sorry about this." 
% (hmm_path, base_path + ext)) def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms, in_place=False): From fee274ff6591e67bc6122fc4236dc15482e742ae Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 00:03:09 +0800 Subject: [PATCH 038/400] move some variables around to accomodate both in_place and not --- anvio/drivers/hmmer.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index cd8932daa8..2c2fb8d12d 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -96,14 +96,19 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.run.info('Noise cutoff term(s)', noise_cutoff_terms) self.run.info('Number of CPUs will be used for search', self.num_threads_to_use) - if not in_place: - # we want to create hmm files in the same direcotry - tmp_dir = os.path.dirname(self.target_files_dict[target][0]) - log_file_path = os.path.join(tmp_dir, '00_log.txt') + # set up variables for later call to hmmscan - values will be filled in depending on whether hmmscan is run in place or not + hmm_file_path = None + + # results go in the tmp directory no matter whether run in place or not; + # if we don't run in place then the unpacked hmm profiles will go to tmp too + tmp_dir = os.path.dirname(self.target_files_dict[target][0]) + log_file_path = os.path.join(tmp_dir, '00_log.txt') - self.run.info('Temporary work dir', tmp_dir) - self.run.info('Log file', log_file_path) + self.run.info('Temporary work dir', tmp_dir) + self.run.info('Log file', log_file_path) + if not in_place: + # we want to create hmm files in the same directory self.progress.new('Unpacking the model into temporary work directory') self.progress.update('...') hmm_file_path = os.path.join(tmp_dir, source + '_hmm.txt') # referenced below, likely needs to move @@ -129,7 +134,9 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode else: # check if all hmmpress files are in the HMM directory self.verify_hmmpress_output(hmm) - # we may want to throw a more descriptive error *here* instead of failing in the verify function + # we may want to throw a more descriptive error *here* instead of failing in the verify function + + hmm_file_path = hmm print("HMMSCAN run in place not implemented yet") From a4f373a1755bc708dd51cb2233ed00bed3b31ba2 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 01:12:22 +0800 Subject: [PATCH 039/400] make keep-compressed a boolean flag --- bin/anvi-setup-pfams | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/anvi-setup-pfams b/bin/anvi-setup-pfams index f10ea37e39..4bb326c76d 100755 --- a/bin/anvi-setup-pfams +++ b/bin/anvi-setup-pfams @@ -31,7 +31,7 @@ if __name__ == '__main__': parser.add_argument('--reset', default=False, action="store_true", help="This program by default attempts to use previously\ downloaded files in your Pfam data directory if there are any. If something is wrong for some reason you\ can use this to tell anvi'o to remove everything, and start over.") - parser.add_argument('--keep-compressed', default=False, help="Whether to keep Pfam HMM profiles in gzipped format. By default, these\ + parser.add_argument('--keep-compressed', default=False, action="store_true", help="Whether to keep Pfam HMM profiles in gzipped format. By default, these\ files are decompressed and hmmpressed after being downloaded. 
If you want them to stay compressed to save space,\ you can use this parameter; however, please keep in mind that this will increase the processing time for `anvi-run-pfams`\ since the profiles will need to be decompressed and hmmpressed every time `anvi-run-pfams` is run.") From b75b8c64eeae379215d5c1a965e815681ed1e21b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 01:13:59 +0800 Subject: [PATCH 040/400] add descriptive output during hmm verification --- anvio/drivers/hmmer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index 2c2fb8d12d..3b36f4a8f5 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -133,13 +133,12 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.progress.end() else: # check if all hmmpress files are in the HMM directory + self.run.warning('Verifying that %s HMM profiles have been set up properly' % source, lc='green') self.verify_hmmpress_output(hmm) # we may want to throw a more descriptive error *here* instead of failing in the verify function hmm_file_path = hmm - print("HMMSCAN run in place not implemented yet") - workers = [] merged_file_buffer = io.StringIO() From 33072e8e72f9559d0d3073ab812d7ff2a11b45af Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 02:12:55 +0800 Subject: [PATCH 041/400] add some descriptive comments --- anvio/pfam.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/anvio/pfam.py b/anvio/pfam.py index d976053f7c..04c11cf8c0 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -191,6 +191,9 @@ def __init__(self, args, run=run, progress=progress): if not self.pfam_data_dir: self.pfam_data_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/Pfam') + # here, in the process of checking whether Pfam has been downloaded into the pfam_data_dir, + # we also determine whether the profiles have been decompressed + hmmpressed already, + # in which case hmmscan should be run in-place self.run_in_place = self.is_database_exists() self.run.info('Pfam database directory', self.pfam_data_dir) @@ -200,6 +203,13 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): + """ + This function verifies that pfam_data_dir contains the Pfam hmm profiles and checks whether they are compressed or not. 
+ + PARAMETERS: N/A + + RETURNS: in_place, boolean, whether hmmscan should be run in-place (.hmm already unpacked) or not + """ if not (os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')) or os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm'))): raise ConfigError("It seems you do not have Pfam database installed, please run 'anvi-setup-pfams' to download it.") # here we check if the HMM profile is compressed or not so we can adjust hmmscan behavior accordingly From 6db0cfbc7e0d2408f17eb61808af36af66e8a9fc Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 08:28:10 +0800 Subject: [PATCH 042/400] eliminate --keep-compressed option --- anvio/pfam.py | 60 ++++++++++++++++---------------------------- bin/anvi-setup-pfams | 4 --- 2 files changed, 22 insertions(+), 42 deletions(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index 04c11cf8c0..e240775e90 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -56,7 +56,6 @@ def __init__(self, args, run=run, progress=progress): self.run = run self.progress = progress self.pfam_data_dir = args.pfam_data_dir - self.keep_compressed = args.keep_compressed filesnpaths.is_program_exists('hmmpress') @@ -73,10 +72,11 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): - if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')) or os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm')): + if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm')): raise ConfigError("It seems you already have Pfam database installed in '%s', please use --reset flag if you want to re-download it." % self.pfam_data_dir) + def get_remote_version(self): content = read_remote_file(self.database_url + '/Pfam.version.gz') @@ -129,49 +129,33 @@ def confirm_downloaded_files(self): def decompress_files(self): - if self.keep_compressed: - # Some folks may want the old behavior of this program that kept the HMM profiles compressed. - # This block preserves that way of doing things. - self.run.warning("Just to let you know, you elected to keep the Pfam profiles in gzipped format. This is fine and will save you some space,\ - but you may experience slower processing times for `anvi-run-pfams`. You have been warned. :)") - for file_name in ['Pfam.version.gz', 'Pfam-A.clans.tsv.gz']: - full_path = os.path.join(self.pfam_data_dir, file_name) + + for file_name in self.files: + full_path = os.path.join(self.pfam_data_dir, file_name) + + if full_path.endswith('.gz'): if not os.path.exists(full_path) and os.path.exists(full_path[:-3]): self.run.warning("It seems the file at %s is already de-compressed. Perhaps you already downloaded the Pfam profiles. \ - If you want to re-do the Pfam setup, please run this program again and use both the --reset and --keep-compressed flags." \ + If you want to re-do the Pfam setup, please run this program again and use the --reset flag." \ % (full_path[:-3])) continue elif not os.path.exists(full_path): raise ConfigError("Oh no. The file at %s does not exist. Something is terribly wrong. :( Anvi'o suggests re-running \ this program using the --reset flag." % (full_path)) - utils.gzip_decompress_file(full_path, keep_original=False) - else: - for file_name in self.files: - full_path = os.path.join(self.pfam_data_dir, file_name) - - if full_path.endswith('.gz'): - if not os.path.exists(full_path) and os.path.exists(full_path[:-3]): - self.run.warning("It seems the file at %s is already de-compressed. Perhaps you already downloaded the Pfam profiles. 
\ - If you want to re-do the Pfam setup, please run this program again and use the --reset flag." \ - % (full_path[:-3])) - continue - elif not os.path.exists(full_path): - raise ConfigError("Oh no. The file at %s does not exist. Something is terribly wrong. :( Anvi'o suggests re-running \ - this program using the --reset flag." % (full_path)) - utils.gzip_decompress_file(full_path) - os.remove(full_path) - - for file_path in glob.glob(os.path.join(self.pfam_data_dir, '*.hmm')): - cmd_line = ['hmmpress', file_path] - log_file_path = os.path.join(self.pfam_data_dir, '00_hmmpress_log.txt') - ret_val = utils.run_command(cmd_line, log_file_path) - - if ret_val: - raise ConfigError("Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. \ - Check out the log file ('%s') to see what went wrong." % (log_file_path)) - else: - # getting rid of the log file because hmmpress was successful - os.remove(log_file_path) + utils.gzip_decompress_file(full_path) + os.remove(full_path) + + for file_path in glob.glob(os.path.join(self.pfam_data_dir, '*.hmm')): + cmd_line = ['hmmpress', file_path] + log_file_path = os.path.join(self.pfam_data_dir, '00_hmmpress_log.txt') + ret_val = utils.run_command(cmd_line, log_file_path) + + if ret_val: + raise ConfigError("Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. \ + Check out the log file ('%s') to see what went wrong." % (log_file_path)) + else: + # getting rid of the log file because hmmpress was successful + os.remove(log_file_path) class Pfam(object): def __init__(self, args, run=run, progress=progress): diff --git a/bin/anvi-setup-pfams b/bin/anvi-setup-pfams index 4bb326c76d..b525336611 100755 --- a/bin/anvi-setup-pfams +++ b/bin/anvi-setup-pfams @@ -31,10 +31,6 @@ if __name__ == '__main__': parser.add_argument('--reset', default=False, action="store_true", help="This program by default attempts to use previously\ downloaded files in your Pfam data directory if there are any. If something is wrong for some reason you\ can use this to tell anvi'o to remove everything, and start over.") - parser.add_argument('--keep-compressed', default=False, action="store_true", help="Whether to keep Pfam HMM profiles in gzipped format. By default, these\ - files are decompressed and hmmpressed after being downloaded. If you want them to stay compressed to save space,\ - you can use this parameter; however, please keep in mind that this will increase the processing time for `anvi-run-pfams`\ - since the profiles will need to be decompressed and hmmpressed every time `anvi-run-pfams` is run.") args = anvio.get_args(parser) From c12226ebb83ae87ccb66d58a489896e0b8dac196 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 08:30:32 +0800 Subject: [PATCH 043/400] if user already has the compressed profiles and tries to run setup again, we ask them to use --reset --- anvio/pfam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index e240775e90..2b7665a01f 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -72,7 +72,7 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): - if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm')): + if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm') or os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz'))): raise ConfigError("It seems you already have Pfam database installed in '%s', please use --reset flag if you want to re-download it." 
% self.pfam_data_dir) From 67e6559c04ceea961fb036822f35610855debccc Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 08:56:26 +0800 Subject: [PATCH 044/400] now we unpack Pfam profiles if they are gzipped, and hmmscan only runs in place --- anvio/drivers/hmmer.py | 42 +++++------------------------------------- anvio/pfam.py | 31 ++++++++++++++----------------- 2 files changed, 19 insertions(+), 54 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index 3b36f4a8f5..4cd02ec557 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -73,7 +73,7 @@ def verify_hmmpress_output(self, hmm_path): We are very sorry about this." % (hmm_path, base_path + ext)) - def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms, in_place=False): + def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms): target = ':'.join([alphabet, context]) if target not in self.target_files_dict: @@ -96,48 +96,16 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.run.info('Noise cutoff term(s)', noise_cutoff_terms) self.run.info('Number of CPUs will be used for search', self.num_threads_to_use) - # set up variables for later call to hmmscan - values will be filled in depending on whether hmmscan is run in place or not - hmm_file_path = None - - # results go in the tmp directory no matter whether run in place or not; - # if we don't run in place then the unpacked hmm profiles will go to tmp too tmp_dir = os.path.dirname(self.target_files_dict[target][0]) log_file_path = os.path.join(tmp_dir, '00_log.txt') self.run.info('Temporary work dir', tmp_dir) self.run.info('Log file', log_file_path) - if not in_place: - # we want to create hmm files in the same directory - self.progress.new('Unpacking the model into temporary work directory') - self.progress.update('...') - hmm_file_path = os.path.join(tmp_dir, source + '_hmm.txt') # referenced below, likely needs to move - hmm_file = open(hmm_file_path, 'wb') - hmm_file.write(gzip.open(hmm, 'rb').read()) - hmm_file.close() - self.progress.end() - - self.progress.new('Processing') - self.progress.update('Compressing the pfam model') - - cmd_line = ['hmmpress', hmm_file_path] - ret_val = utils.run_command(cmd_line, log_file_path) - - if ret_val: - raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have\ - installed is either not up-to-date enough, or too new :/ Just to make sure what went\ - wrong please take a look at the log file ('%s'). Please visit %s to see what\ - is the latest version availalbe if you think updating HMMER can resolve it. 
You can\ - learn which version of HMMER you have on your system by typing 'hmmpress -h'."\ - % (log_file_path, 'http://hmmer.janelia.org/download.html')) - self.progress.end() - else: - # check if all hmmpress files are in the HMM directory - self.run.warning('Verifying that %s HMM profiles have been set up properly' % source, lc='green') - self.verify_hmmpress_output(hmm) - # we may want to throw a more descriptive error *here* instead of failing in the verify function - hmm_file_path = hmm + # check if all hmmpress files are in the HMM directory + self.verify_hmmpress_output(hmm) + # we may want to throw a more descriptive error *here* instead of failing in the verify function workers = [] @@ -163,7 +131,7 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode '-o', output_file, *noise_cutoff_terms.split(), '--cpu', cores_per_process, '--tblout', shitty_file, - hmm_file_path, part_file] + hmm, part_file] t = Thread(target=self.hmmscan_worker, args=(part_file, cmd_line, diff --git a/anvio/pfam.py b/anvio/pfam.py index 2b7665a01f..ec1acb7a23 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -135,13 +135,15 @@ def decompress_files(self): if full_path.endswith('.gz'): if not os.path.exists(full_path) and os.path.exists(full_path[:-3]): - self.run.warning("It seems the file at %s is already de-compressed. Perhaps you already downloaded the Pfam profiles. \ - If you want to re-do the Pfam setup, please run this program again and use the --reset flag." \ + self.run.warning("It seems the file at %s is already decompressed. You are probably seeing \ + this message because Pfams was set up previously on this computer. Hakuna Matata. Anvi'o will \ + simply skip decompressing this file at this time. But if you think there is an issue, you can \ + re-do the Pfam setup by running `anvi-setup-pfams` again and using the --reset flag." \ % (full_path[:-3])) continue elif not os.path.exists(full_path): raise ConfigError("Oh no. The file at %s does not exist. Something is terribly wrong. :( Anvi'o suggests re-running \ - this program using the --reset flag." % (full_path)) + `anvi-setup-pfams` using the --reset flag." % (full_path)) utils.gzip_decompress_file(full_path) os.remove(full_path) @@ -176,9 +178,8 @@ def __init__(self, args, run=run, progress=progress): self.pfam_data_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/Pfam') # here, in the process of checking whether Pfam has been downloaded into the pfam_data_dir, - # we also determine whether the profiles have been decompressed + hmmpressed already, - # in which case hmmscan should be run in-place - self.run_in_place = self.is_database_exists() + # we also decompress and hmmpress the profile if it is currently gzipped + self.is_database_exists() self.run.info('Pfam database directory', self.pfam_data_dir) @@ -189,20 +190,20 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): """ This function verifies that pfam_data_dir contains the Pfam hmm profiles and checks whether they are compressed or not. + If they are compressed, we decompress them and run hmmpress. 
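+
+ (For orientation only, the decompression step is roughly equivalent to the following shell commands;
+ the code below does the same thing through utils.gzip_decompress_file() and utils.run_command():
+     gunzip Pfam-A.hmm.gz
+     hmmpress Pfam-A.hmm
+ )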
PARAMETERS: N/A - RETURNS: in_place, boolean, whether hmmscan should be run in-place (.hmm already unpacked) or not + RETURNS: N/A """ if not (os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')) or os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm'))): raise ConfigError("It seems you do not have Pfam database installed, please run 'anvi-setup-pfams' to download it.") - # here we check if the HMM profile is compressed or not so we can adjust hmmscan behavior accordingly - in_place = True + # here we check if the HMM profile is compressed so we can decompress it for next time if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')): - in_place = False - self.run.warning("Anvi'o has detected that your Pfam database is currently compressed. It will be unpacked before \ + self.run.warning("Anvi'o has detected that your Pfam database is currently compressed. It will now be unpacked before \ running HMMs.") - return in_place + self.decompress_files() + def get_version(self): @@ -243,10 +244,6 @@ def get_function_from_catalog(self, accession, ok_if_missing_from_catalog=False) def process(self): hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm') - # this file may be compressed if keep_compressed was set to True during setup - # and if the file is compressed, we cannot run in place - if not self.run_in_place: - hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz') # initialize contigs database class Args: pass @@ -267,7 +264,7 @@ class Args: pass # run hmmscan hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads) - hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None, len(self.function_catalog), hmm_file, None, '--cut_ga', in_place=self.run_in_place) + hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None, len(self.function_catalog), hmm_file, None, '--cut_ga') if not hmm_hits_file: run.info_single("The HMM search returned no hits :/ So there is nothing to add to the contigs database. But\ From 3556c5248532fe609e78902a22d32a2628b2ab88 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 09:39:56 +0800 Subject: [PATCH 045/400] decompress function is from a different class so we use new code here instead --- anvio/pfam.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index ec1acb7a23..7b9e7f26e9 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -201,9 +201,19 @@ def is_database_exists(self): # here we check if the HMM profile is compressed so we can decompress it for next time if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')): self.run.warning("Anvi'o has detected that your Pfam database is currently compressed. It will now be unpacked before \ - running HMMs.") - self.decompress_files() + running HMMs.")) + utils.gzip_decompress_file(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz'), keep_original=False) + cmd_line = ['hmmpress', os.path.join(self.pfam_data_dir, 'Pfam-A.hmm')] + log_file_path = os.path.join(self.pfam_data_dir, '00_hmmpress_log.txt') + ret_val = utils.run_command(cmd_line, log_file_path) + + if ret_val: + raise ConfigError("Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. \ + Check out the log file ('%s') to see what went wrong." 
% (log_file_path)) + else: + # getting rid of the log file because hmmpress was successful + os.remove(log_file_path) def get_version(self): From 8fc6a5c23c8839f6161254435f3c60197d06fb14 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 09:41:09 +0800 Subject: [PATCH 046/400] errant parenthesis --- anvio/pfam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index 7b9e7f26e9..ddb0112225 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -201,7 +201,7 @@ def is_database_exists(self): # here we check if the HMM profile is compressed so we can decompress it for next time if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')): self.run.warning("Anvi'o has detected that your Pfam database is currently compressed. It will now be unpacked before \ - running HMMs.")) + running HMMs.") utils.gzip_decompress_file(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz'), keep_original=False) cmd_line = ['hmmpress', os.path.join(self.pfam_data_dir, 'Pfam-A.hmm')] From 095acff11373965b75f2552f0e54771b817c720d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 12:18:53 +0800 Subject: [PATCH 047/400] this is an initial attempt at an hmm file sanity check. it needs to be reworked because there are too many KO numbers that are skipped. the ko_list file should be processed first. So this function does not work at this point, but it will be fixed later --- anvio/kofam.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 1321085122..54819ceec4 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -16,6 +16,8 @@ import anvio.terminal as terminal import anvio.filesnpaths as filesnpaths +from anvio.errors import ConfigError, FilesNPathsError + __author__ = "Developers of anvi'o (see AUTHORS.txt)" __copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" __license__ = "GPL 3.0" @@ -86,13 +88,28 @@ def decompress_files(self): else: utils.gzip_decompress_file(full_path, keep_original=False) + def confirm_downloaded_files(self): + """This function verifies that all Kofam profiles have been properly downloaded. It is intended to be run + after the files have been decompressed. The profiles directory should contain hmm files from K00001.hmm to + K23763.hmm with some exceptions.""" + skip_list = [17, 47, 56, 80, 92, 110] # the KO profiles that don't exist, based on ko_list + for k in range(1,23764): # there is likely a better way to do this. Perhaps we should process the ko_list file into a dict first + if k not in skip_list: + hmm_path = os.path.join(self.kofam_data_dir, "profiles/K%05d.hmm" % k) + if not os.path.exists(hmm_path): + raise ConfigError("The KOfam HMM profile at %s does not exist. This probably means that something went wrong \ + while downloading the KOfam database. Please run `anvi-setup-kegg-kofams` with the --reset \ + flag." 
% (hmm_path)) + + def run_hmmpress(self): """This function concatenates the Kofam profiles and runs hmmpress on them.""" - print("Not implemented yet") + self.progress.new('Preparing Kofam HMM Profiles') log_file_path = os.path.join(self.kofam_data_dir, '00_hmmpress_log.txt') - cmd_line = ["cat", os.path.join(self.kofam_data_dir, 'profiles/*.hmm')] - ret_val = utils.run_command(cmd_line, log_file_path) - # TODO: finish me - need sanity check for all files, concat, and press + self.progress.update('Verifying that the Kofam directory at %s contains all HMM profiles' % self.kofam_data_dir) + self.confirm_downloaded_files() + + self.progress.end() def setup_profiles(self): """This is a driver function which executes the Kofam setup process by downloading, decompressing, and hmmpressing the profiles.""" From 0608bdc55af8f6f2760dc0898c5285f1639c6900 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 12:19:22 +0800 Subject: [PATCH 048/400] add a way to skip checking database structure --- anvio/kofam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 54819ceec4..067de6ec12 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -52,7 +52,7 @@ def __init__(self, args, run=run, progress=progress): if not self.kofam_data_dir: self.kofam_data_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') - if not args.reset: + if not args.reset and not anvio.DEBUG: self.is_database_exists() filesnpaths.gen_output_directory(self.kofam_data_dir, delete_if_exists=args.reset) From f7ecee3a800c338e2eb2318b6e2921d48b41a7a5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 12:20:20 +0800 Subject: [PATCH 049/400] fix path to kofam profiles. this is likely not the final path to check for, but at least it works at this point --- anvio/kofam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 067de6ec12..39c815d33c 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -66,7 +66,7 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): """This function determines whether the user has already downloaded the Kofam HMM profiles.""" - if os.path.exists(os.path.join(self.kofam_data_dir, 'K00001.hmm')): # TODO: update this after determining final structure + if os.path.exists(os.path.join(self.kofam_data_dir, 'profiles/K00001.hmm')): # TODO: update this after determining final structure raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag if you want to re-download it." 
% self.kofam_data_dir) def download(self): From a4039cfe5914f2b01edaf6714e0bcde5abb09917 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 13:03:27 +0800 Subject: [PATCH 050/400] skeleton code for concatenating profiles; does not work yet --- anvio/kofam.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 39c815d33c..79e13e8b7b 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -106,13 +106,19 @@ def run_hmmpress(self): """This function concatenates the Kofam profiles and runs hmmpress on them.""" self.progress.new('Preparing Kofam HMM Profiles') log_file_path = os.path.join(self.kofam_data_dir, '00_hmmpress_log.txt') + self.progress.update('Verifying that the Kofam directory at %s contains all HMM profiles' % self.kofam_data_dir) self.confirm_downloaded_files() + self.progress.update('Concatenating HMM profiles into one file...') + concat_file_path = os.path.join(self.kofam_data_dir, 'Kofam.hmm') ## this should be a self variable from base class + utils.concatenate_files(concat_file_path, self.hmm_list, remove_concatenated_files=True) # self.hmm_list should be a self variable from base class + self.progress.end() def setup_profiles(self): """This is a driver function which executes the Kofam setup process by downloading, decompressing, and hmmpressing the profiles.""" self.download() self.decompress_files() + # TODO: set up ko_list dict, file list # TODO: add concatenation and hmmpress From 385fb28494aecebd9090476e7edf3035b1011c6d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 13:04:36 +0800 Subject: [PATCH 051/400] put myself as maintainer --- anvio/kofam.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 79e13e8b7b..ba4205e334 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -22,8 +22,8 @@ __copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" __license__ = "GPL 3.0" __version__ = anvio.__version__ -__maintainer__ = "Özcan Esen" -__email__ = "ozcanesen@gmail.com" +__maintainer__ = "Iva Veseli" +__email__ = "iveseli@uchicago.edu" run = terminal.Run() progress = terminal.Progress() From 6ffe5ecf9a7150c066a641bb1cb98a93ada24cae Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Jan 2020 14:41:09 +0800 Subject: [PATCH 052/400] add a base class to handle common needs like ko_list processing --- anvio/kofam.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index ba4205e334..cd77e42d3d 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -30,7 +30,16 @@ pp = terminal.pretty_print -class KofamSetup(object): +class KofamContext(object): + """ + The purpose of this base class is to define shared functions and file paths for all KOfam operations. + """ + def __init__(self, args): + # shared variables for all KOfam subclasses + self.kofam_hmm_file = "Kofam.hmm" # name of file containing concatenated KOfam hmms + + +class KofamSetup(KofamContext): """ Class for setting up KEGG Kofam HMM profiles. It performs sanity checks and downloads, unpacks, and prepares the profiles for later use by `hmmscan`. 
@@ -46,6 +55,9 @@ def __init__(self, args, run=run, progress=progress): self.progress = progress self.kofam_data_dir = args.kofam_data_dir + # init the base class + KofamContext.__init__(self, self.args) + filesnpaths.is_program_exists('hmmpress') # default directory will be called KEGG and will store the KEGG Module data as well @@ -111,7 +123,7 @@ def run_hmmpress(self): self.confirm_downloaded_files() self.progress.update('Concatenating HMM profiles into one file...') - concat_file_path = os.path.join(self.kofam_data_dir, 'Kofam.hmm') ## this should be a self variable from base class + concat_file_path = os.path.join(self.kofam_data_dir, self.kofam_hmm_file) ## this should be a self variable from base class utils.concatenate_files(concat_file_path, self.hmm_list, remove_concatenated_files=True) # self.hmm_list should be a self variable from base class self.progress.end() From 7e75e26a91a7592328d795e7d42f01e4c0bcda4a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 11:12:39 +0800 Subject: [PATCH 053/400] now we have a base class to handle common issues like reading the ko_list file --- anvio/kofam.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index cd77e42d3d..e4eaf4ca06 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -35,8 +35,33 @@ class KofamContext(object): The purpose of this base class is to define shared functions and file paths for all KOfam operations. """ def __init__(self, args): + A = lambda x: args.__dict__[x] if x in args.__dict__ else None + # default directory will be called KEGG and will store the KEGG Module data as well + self.kofam_data_dir = A('kofam_data_dir') or os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') + # shared variables for all KOfam subclasses - self.kofam_hmm_file = "Kofam.hmm" # name of file containing concatenated KOfam hmms + self.kofam_hmm_file_path = os.path.join(self.kofam_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms + self.ko_list_file_path = os.path.join(self.kofam_data_dir, "ko_list") + + """ + The ko_list file (which is downloaded along with the KOfam HMM profiles) contains important + information for each KEGG Orthology number (KO, or knum), incuding pre-defined scoring thresholds + for limiting HMM hits and annotation information. + + It looks something like this: + + knum threshold score_type profile_type F-measure nseq nseq_used alen mlen eff_nseq re/pos definition + K00001 329.57 domain trim 0.231663 1473 1069 1798 371 17.12 0.590 alcohol dehydrogenase [EC:1.1.1.1] + + Since this information is useful for both the setup process (we need to know all the knums) and HMM process, + all Kofam subclasses need to have access to this dictionary. + + This is a dictionary (indexed by knum) of dictionaries(indexed by column name). 
+ Here is an example of the dictionary structure: + self.ko_dict[K00001][threshold] = 329.57 + """ + self.ko_dict = utils.get_TAB_delimited_file_as_dictionary(self.ko_list_file_path) + class KofamSetup(KofamContext): @@ -53,17 +78,12 @@ def __init__(self, args, run=run, progress=progress): self.args = args self.run = run self.progress = progress - self.kofam_data_dir = args.kofam_data_dir # init the base class KofamContext.__init__(self, self.args) filesnpaths.is_program_exists('hmmpress') - # default directory will be called KEGG and will store the KEGG Module data as well - if not self.kofam_data_dir: - self.kofam_data_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') - if not args.reset and not anvio.DEBUG: self.is_database_exists() From f4cb129b11ac064ca4f93dbbe1361a3f2129a26d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 11:21:52 +0800 Subject: [PATCH 054/400] modify download confirmation to go through ko_list numbers --- anvio/kofam.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index e4eaf4ca06..7c555d9cb9 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -123,15 +123,14 @@ def decompress_files(self): def confirm_downloaded_files(self): """This function verifies that all Kofam profiles have been properly downloaded. It is intended to be run after the files have been decompressed. The profiles directory should contain hmm files from K00001.hmm to - K23763.hmm with some exceptions.""" - skip_list = [17, 47, 56, 80, 92, 110] # the KO profiles that don't exist, based on ko_list - for k in range(1,23764): # there is likely a better way to do this. Perhaps we should process the ko_list file into a dict first - if k not in skip_list: - hmm_path = os.path.join(self.kofam_data_dir, "profiles/K%05d.hmm" % k) - if not os.path.exists(hmm_path): - raise ConfigError("The KOfam HMM profile at %s does not exist. This probably means that something went wrong \ - while downloading the KOfam database. Please run `anvi-setup-kegg-kofams` with the --reset \ - flag." % (hmm_path)) + K23763.hmm with some exceptions; all KO numbers from ko_list file should be included.""" + ko_nums = self.ko_dict.keys() + for k in ko_nums: + hmm_path = os.path.join(self.kofam_data_dir, "profiles/%s.hmm" % k) + if not os.path.exists(hmm_path): + raise ConfigError("The KOfam HMM profile at %s does not exist. This probably means that something went wrong \ + while downloading the KOfam database. Please run `anvi-setup-kegg-kofams` with the --reset \ + flag." % (hmm_path)) def run_hmmpress(self): From 568181c7c01e0f0269d1b1fc5e046be7b3bc36dc Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 19:09:58 +0800 Subject: [PATCH 055/400] add function to determine which KOs to skip when confirming downloaded profiles --- anvio/kofam.py | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 7c555d9cb9..17b2fa627b 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -58,11 +58,38 @@ def __init__(self, args): This is a dictionary (indexed by knum) of dictionaries(indexed by column name). 
Here is an example of the dictionary structure: - self.ko_dict[K00001][threshold] = 329.57 + self.ko_dict["K00001"]["threshold"] = 329.57 """ self.ko_dict = utils.get_TAB_delimited_file_as_dictionary(self.ko_list_file_path) + self.ko_skip_list = self.get_ko_skip_list() + def get_ko_skip_list(self): + """ + The purpose of this function is to determine which KO numbers have no associated data in the ko_list file. + That is, their ko_list entries look like this, with hypens in all but the first and last columns: + + K14936 - - - - - - - - - - small nucleolar RNA snR191 + K15035 - - - - - - - - - - transfer-messenger RNA + K15841 - - - - - - - - - - small regulatory RNA GlmY + K15851 - - - - - - - - - - quorum regulatory RNA Qrr + K16736 - - - - - - - - - - bantam + K16863 - - - - - - - - - - microRNA 21 + These are RNAs. + + Returns: skip_list list of strings, each string is a KO number + """ + col_names_to_check = ["threshold","score_type","profile_type","F-measure","nseq","nseq_used","alen","mlen","eff_nseq","re/pos"] + skip_list = [] + for k in self.ko_dict.keys(): + should_skip = True + for c in col_names_to_check: + if not self.ko_dict[k][c] == "-": + should_skip = False + break # here we stop checking this KO num because we already found a value in our columns of interest + if should_skip: # should be True unless we found a value above + skip_list.append(k) + return skip_list class KofamSetup(KofamContext): """ Class for setting up KEGG Kofam HMM profiles. It performs sanity checks and downloads, unpacks, and prepares @@ -126,11 +153,12 @@ def confirm_downloaded_files(self): K23763.hmm with some exceptions; all KO numbers from ko_list file should be included.""" ko_nums = self.ko_dict.keys() for k in ko_nums: - hmm_path = os.path.join(self.kofam_data_dir, "profiles/%s.hmm" % k) - if not os.path.exists(hmm_path): - raise ConfigError("The KOfam HMM profile at %s does not exist. This probably means that something went wrong \ - while downloading the KOfam database. Please run `anvi-setup-kegg-kofams` with the --reset \ - flag." % (hmm_path)) + if k not in self.ko_skip_list: + hmm_path = os.path.join(self.kofam_data_dir, "profiles/%s.hmm" % k) + if not os.path.exists(hmm_path): + raise ConfigError("The KOfam HMM profile at %s does not exist. This probably means that something went wrong \ + while downloading the KOfam database. Please run `anvi-setup-kegg-kofams` with the --reset \ + flag." 
% (hmm_path)) def run_hmmpress(self): From ff889858badb3531823e892ff0dc51c15b978832 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 19:21:03 +0800 Subject: [PATCH 056/400] add HMM profile concatenation --- anvio/kofam.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 17b2fa627b..948354362a 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -170,8 +170,9 @@ def run_hmmpress(self): self.confirm_downloaded_files() self.progress.update('Concatenating HMM profiles into one file...') - concat_file_path = os.path.join(self.kofam_data_dir, self.kofam_hmm_file) ## this should be a self variable from base class - utils.concatenate_files(concat_file_path, self.hmm_list, remove_concatenated_files=True) # self.hmm_list should be a self variable from base class + concat_file_path = os.path.join(self.kofam_data_dir, self.kofam_hmm_file_path) + hmm_list = [os.path.join(self.kofam_data_dir, "profiles/%s.hmm" % k) for k in self.ko_dict.keys() if k not in self.ko_skip_list] + utils.concatenate_files(concat_file_path, hmm_list, remove_concatenated_files=False) self.progress.end() From 26570bca29a56a342c275e32646aa591e9b42711 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 19:24:36 +0800 Subject: [PATCH 057/400] delete original HMM profiles after concatenation --- anvio/kofam.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 948354362a..20645fe86e 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -174,6 +174,10 @@ def run_hmmpress(self): hmm_list = [os.path.join(self.kofam_data_dir, "profiles/%s.hmm" % k) for k in self.ko_dict.keys() if k not in self.ko_skip_list] utils.concatenate_files(concat_file_path, hmm_list, remove_concatenated_files=False) + # there is no reason to keep the original HMM profiles around, unless we are debugging + if not anvio.DEBUG: + os.remove(os.path.join(self.kofam_data_dir, "profiles")) + self.progress.end() def setup_profiles(self): From 6a5344d9fb1a1c589662dc76469dacb3b795f01d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 19:26:04 +0800 Subject: [PATCH 058/400] sorry. 
we should shutil.rmtree instead --- anvio/kofam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 20645fe86e..08f36f6864 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -176,7 +176,7 @@ def run_hmmpress(self): # there is no reason to keep the original HMM profiles around, unless we are debugging if not anvio.DEBUG: - os.remove(os.path.join(self.kofam_data_dir, "profiles")) + shutil.rmtree((os.path.join(self.kofam_data_dir, "profiles"))) self.progress.end() From e6729b2f748f7f5ceee130447f12e81a2a28336b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 19:33:29 +0800 Subject: [PATCH 059/400] add in hmmpress on concatenated hmms --- anvio/kofam.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 08f36f6864..d2dc49a77c 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -170,14 +170,25 @@ def run_hmmpress(self): self.confirm_downloaded_files() self.progress.update('Concatenating HMM profiles into one file...') - concat_file_path = os.path.join(self.kofam_data_dir, self.kofam_hmm_file_path) hmm_list = [os.path.join(self.kofam_data_dir, "profiles/%s.hmm" % k) for k in self.ko_dict.keys() if k not in self.ko_skip_list] - utils.concatenate_files(concat_file_path, hmm_list, remove_concatenated_files=False) + utils.concatenate_files(self.kofam_hmm_file_path, hmm_list, remove_concatenated_files=False) # there is no reason to keep the original HMM profiles around, unless we are debugging if not anvio.DEBUG: shutil.rmtree((os.path.join(self.kofam_data_dir, "profiles"))) + self.progress.update('Running hmmpress...') + cmd_line = ['hmmpress', self.kofam_hmm_file_path] + log_file_path = os.path.join(self.kofam_data_dir, '00_hmmpress_log.txt') + ret_val = utils.run_command(cmd_line, log_file_path) + + if ret_val: + raise ConfigError("Hmm. There was an error while running `hmmpress` on the Kofam HMM profiles. \ + Check out the log file ('%s') to see what went wrong." % (log_file_path)) + else: + # getting rid of the log file because hmmpress was successful + os.remove(log_file_path) + self.progress.end() def setup_profiles(self): From 546387aa474bd052f9b00cca7ee7fe1e281510d5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 19:34:46 +0800 Subject: [PATCH 060/400] add hmmpress to setup function --- anvio/kofam.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index d2dc49a77c..33ea090e76 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -195,5 +195,4 @@ def setup_profiles(self): """This is a driver function which executes the Kofam setup process by downloading, decompressing, and hmmpressing the profiles.""" self.download() self.decompress_files() - # TODO: set up ko_list dict, file list - # TODO: add concatenation and hmmpress + self.run_hmmpress() From 0a91dc09963470791ff016912276cb6fb15b011a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 19:44:02 +0800 Subject: [PATCH 061/400] oh poop. forgot that ko_list does not include all the .hmm files. 
now we concat all .hmm files instead of relying on ko_list --- anvio/kofam.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 33ea090e76..6456201ac0 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -9,6 +9,7 @@ import gzip import shutil import requests +import glob import anvio import anvio.dbops as dbops @@ -170,7 +171,7 @@ def run_hmmpress(self): self.confirm_downloaded_files() self.progress.update('Concatenating HMM profiles into one file...') - hmm_list = [os.path.join(self.kofam_data_dir, "profiles/%s.hmm" % k) for k in self.ko_dict.keys() if k not in self.ko_skip_list] + hmm_list = [k for k in glob.glob(os.path.join(self.kofam_data_dir, 'profiles/*.hmm'))] utils.concatenate_files(self.kofam_hmm_file_path, hmm_list, remove_concatenated_files=False) # there is no reason to keep the original HMM profiles around, unless we are debugging From 34cffb6092c37967e96b25913bf9ed80efff65d0 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 20:01:20 +0800 Subject: [PATCH 062/400] add provides line to kofam setup script --- bin/anvi-setup-kegg-kofams | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index de9865c9b3..c21cc9dcd8 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -14,7 +14,7 @@ __license__ = "GPL 3.0" __version__ = anvio.__version__ __maintainer__ = "Özcan Esen" __email__ = "ozcanesen@gmail.com" -#__provides__ = ## TODO: fill this in +__provides__ = ["kofam-data"] __description__ = "Download and setup KEGG KOfam HMM profiles." if __name__ == '__main__': From e93f9013641393a36d952ab1540269e24cf76a9b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 20:45:24 +0800 Subject: [PATCH 063/400] initial code setup for running kofam hmms --- anvio/kofam.py | 19 +++++++++++++++++ bin/anvi-run-kegg-kofams | 45 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 bin/anvi-run-kegg-kofams diff --git a/anvio/kofam.py b/anvio/kofam.py index 6456201ac0..ba1484bb5f 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -197,3 +197,22 @@ def setup_profiles(self): self.download() self.decompress_files() self.run_hmmpress() + +class KofamRunHMMs(KofamContext): + """ Class for running `hmmscan` against the KOfam database and adding the resulting hits to contigs DBs + for later metabolism prediction. 
+ + Parameters + ========== + args: Namespace object + All the arguments supplied by user to anvi-run-kegg-kofams + """ + def __init__(self, args, run=run, progress=progress): + self.args = args + self.run = run + self.progress = progress + + # init the base class + KofamContext.__init__(self, self.args) + + filesnpaths.is_program_exists('hmmscan') diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams new file mode 100644 index 0000000000..835740b84f --- /dev/null +++ b/bin/anvi-run-kegg-kofams @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +import sys + +import anvio +import anvio.terminal as terminal +import anvio.kofam as kofam + +from anvio.errors import ConfigError, FilesNPathsError +from anvio.terminal import time_program + +__author__ = "Developers of anvi'o (see AUTHORS.txt)" +__copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" +__license__ = "GPL 3.0" +__version__ = anvio.__version__ +__maintainer__ = "Iva Veseli" +__email__ = "iveseli@uchicago.edu" +__requires__ = ['contigs-db', "kofam-data",] +#__provides__ = ## TODO: fill in +__description__ = "Run KOfam HMMs on an anvi'o contigs database." + +@time_program +def main(args): + p = kofam.KofamRunHMMs(args) + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description=__description__) + + parser.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db')) + parser.add_argument(*anvio.A('kofam-data-dir'), **anvio.K('kofam-data-dir')) + parser.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads')) + + args = anvio.get_args(parser) + + try: + main(args) + except ConfigError as e: + print(e) + sys.exit(-1) + except FilesNPathsError as e: + print(e) + sys.exit(-1) From 63c0f6ec5c59e19da66f5dafee8b64b22df7dee0 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 20:52:35 +0800 Subject: [PATCH 064/400] update path to check for downloaded kofam data --- anvio/kofam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index ba1484bb5f..f8855fe3ac 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -126,7 +126,7 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): """This function determines whether the user has already downloaded the Kofam HMM profiles.""" - if os.path.exists(os.path.join(self.kofam_data_dir, 'profiles/K00001.hmm')): # TODO: update this after determining final structure + if os.path.exists(self.kofam_hmm_file_path): raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag if you want to re-download it." % self.kofam_data_dir) def download(self): From 5a4cf6cbe287e9f90771e710059f27e30e7c3621 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 20:59:47 +0800 Subject: [PATCH 065/400] it will be problematic if we try to setup the ko dict before downloading it. here we fix this by doing this in a separate function --- anvio/kofam.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index f8855fe3ac..bc5ed8c69a 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -44,7 +44,10 @@ def __init__(self, args): self.kofam_hmm_file_path = os.path.join(self.kofam_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms self.ko_list_file_path = os.path.join(self.kofam_data_dir, "ko_list") + def setup_ko_dict(self): """ + The purpose of this function is to process the ko_list file into usable form by Kofam sub-classes. 
+ The ko_list file (which is downloaded along with the KOfam HMM profiles) contains important information for each KEGG Orthology number (KO, or knum), incuding pre-defined scoring thresholds for limiting HMM hits and annotation information. @@ -61,6 +64,7 @@ def __init__(self, args): Here is an example of the dictionary structure: self.ko_dict["K00001"]["threshold"] = 329.57 """ + self.ko_dict = utils.get_TAB_delimited_file_as_dictionary(self.ko_list_file_path) self.ko_skip_list = self.get_ko_skip_list() @@ -196,6 +200,7 @@ def setup_profiles(self): """This is a driver function which executes the Kofam setup process by downloading, decompressing, and hmmpressing the profiles.""" self.download() self.decompress_files() + self.setup_ko_dict() self.run_hmmpress() class KofamRunHMMs(KofamContext): From 39de0e2a55db805d386ce6e2631ecf02dfbc007c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 21:08:35 +0800 Subject: [PATCH 066/400] sanity check for kofam data --- anvio/kofam.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index bc5ed8c69a..f6ef990a5f 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -64,7 +64,7 @@ def setup_ko_dict(self): Here is an example of the dictionary structure: self.ko_dict["K00001"]["threshold"] = 329.57 """ - + self.ko_dict = utils.get_TAB_delimited_file_as_dictionary(self.ko_list_file_path) self.ko_skip_list = self.get_ko_skip_list() @@ -221,3 +221,10 @@ def __init__(self, args, run=run, progress=progress): KofamContext.__init__(self, self.args) filesnpaths.is_program_exists('hmmscan') + + # verify that Kofam HMM profiles have been set up + if not os.path.exists(self.kofam_hmm_file_path): + raise ConfigError("Anvi'o is unable to find the Kofam.hmm file at %s. This can happen one of two ways. Either you \ + didn't specify the correct Kofam data directory using the flag --kofam-data-dir, or you haven't \ + yet set up the Kofam data by running `anvi-setup-kegg-kofams`. Hopefully you now know what to do \ + to fix this problem. :) " % self.kofam_data_dir) From ca878a0670dce5a50bb41c74d4e7b984e215e00d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 21:09:16 +0800 Subject: [PATCH 067/400] collect some args --- anvio/kofam.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index f6ef990a5f..d20b648eba 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -216,6 +216,8 @@ def __init__(self, args, run=run, progress=progress): self.args = args self.run = run self.progress = progress + self.contigs_db_path = args.contigs_db + self.num_threads = args.num_threads # init the base class KofamContext.__init__(self, self.args) From f7b5dea115c9b3fb5f62f63ea2f07faabf6ab3a9 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 21:13:12 +0800 Subject: [PATCH 068/400] sanity check for contigs db --- anvio/kofam.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index d20b648eba..be0d8a1f5d 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -230,3 +230,5 @@ def __init__(self, args, run=run, progress=progress): didn't specify the correct Kofam data directory using the flag --kofam-data-dir, or you haven't \ yet set up the Kofam data by running `anvi-setup-kegg-kofams`. Hopefully you now know what to do \ to fix this problem. 
:) " % self.kofam_data_dir) + + utils.is_contigs_db(self.contigs_db_path) From b4c9cea37ebe5e5505170843355c0532d0f0c3f2 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 21:44:57 +0800 Subject: [PATCH 069/400] code to run hmms on kofam, similar to Pfams. not tested yet --- anvio/kofam.py | 26 ++++++++++++++++++++++++++ bin/anvi-run-kegg-kofams | 1 + 2 files changed, 27 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index be0d8a1f5d..a34f802615 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -232,3 +232,29 @@ def __init__(self, args, run=run, progress=progress): to fix this problem. :) " % self.kofam_data_dir) utils.is_contigs_db(self.contigs_db_path) + + self.setup_ko_dict() # read the ko_list file into self.ko_dict + + def process_kofam_hmms(self): + """This is a driver function for running HMMs against the KOfam database and processing the hits into the + provided contigs DB""" + + tmp_directory_path = filesnpaths.get_temp_directory_path() + contigs_db = dbops.ContigsSuperclass(self.args) # initialize contigs db + + # get AA sequences as FASTA + target_files_dict = {'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')} + contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['AA:GENE'], + simple_headers=True, + rna_alphabet=False, + report_aa_sequences=True) + + # run hmmscan + hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads) + hmm_hits_file = hmmer.run_hmmscan('KOfam', 'AA', 'GENE', None, None, len(self.ko_dict), self.kofam_hmm_file_path, None, '--cut_ga') + ## TODO: here we have an issue. the number of genes in our HMM model (len(self.ko_dict) is wrong, because ko_dict (derived from ko_list) + ## does not contain the same number of entries as there were .hmm files originally in the downloaded KOfam profiles... + ## As far as I can tell this argument is not used for much except for printing out the value, but it would be nice if it was right + + # get an instance of gene functions table + gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress) diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams index 835740b84f..ed292c3353 100644 --- a/bin/anvi-run-kegg-kofams +++ b/bin/anvi-run-kegg-kofams @@ -23,6 +23,7 @@ __description__ = "Run KOfam HMMs on an anvi'o contigs database." 
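+
+# A hypothetical invocation, shown only for illustration -- the flags correspond to the arguments
+# registered with the parser below, and the paths are placeholders:
+#
+#   anvi-run-kegg-kofams --contigs-db CONTIGS.db --kofam-data-dir /path/to/KEGG --num-threads 4
+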
@time_program def main(args): p = kofam.KofamRunHMMs(args) + p.process_kofam_hmms() if __name__ == '__main__': import argparse From 7160c5a263d3fabbb5b93d9a20462af2d2f18006 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 21:50:09 +0800 Subject: [PATCH 070/400] add requisite imports --- anvio/kofam.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index a34f802615..61fd237082 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -18,6 +18,9 @@ import anvio.filesnpaths as filesnpaths from anvio.errors import ConfigError, FilesNPathsError +from anvio.drivers.hmmer import HMMer +from anvio.parsers import parser_modules +from anvio.tables.genefunctions import TableForGeneFunctions __author__ = "Developers of anvi'o (see AUTHORS.txt)" __copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" From 61cd8bb041345751736631fa4cbcdff497a94ad3 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 21:50:39 +0800 Subject: [PATCH 071/400] handle case of no hmm hits --- anvio/kofam.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 61fd237082..ab59c7b51b 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -261,3 +261,12 @@ def process_kofam_hmms(self): # get an instance of gene functions table gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress) + + if not hmm_hits_file: + run.info_single("The HMM search returned no hits :/ So there is nothing to add to the contigs database. But\ + now anvi'o will add KOfam as a functional source with no hits, clean the temporary directories\ + and gracefully quit.", nl_before=1, nl_after=1) + shutil.rmtree(tmp_directory_path) + hmmer.clean_tmp_dirs() + gene_function_calls_table.add_empty_sources_to_functional_sources({'KOfam'}) + return From eb49a8f1258b2d06406e7f93b30a870d5ce3be57 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 21:52:27 +0800 Subject: [PATCH 072/400] add parsing --- anvio/kofam.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index ab59c7b51b..59805f01b6 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -270,3 +270,7 @@ def process_kofam_hmms(self): hmmer.clean_tmp_dirs() gene_function_calls_table.add_empty_sources_to_functional_sources({'KOfam'}) return + + # parse hmmscan output + parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE') + search_results_dict = parser.get_search_results() From 9c186b7d3a09cfe516c40672c1d9a2ef7f1702e2 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 22:01:18 +0800 Subject: [PATCH 073/400] function for looking up KO num in ko_dict --- anvio/kofam.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 59805f01b6..877f139c27 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -221,6 +221,7 @@ def __init__(self, args, run=run, progress=progress): self.progress = progress self.contigs_db_path = args.contigs_db self.num_threads = args.num_threads + self.ko_dict = None # should be set up by setup_ko_dict() # init the base class KofamContext.__init__(self, self.args) @@ -238,6 +239,21 @@ def __init__(self, args, run=run, progress=progress): self.setup_ko_dict() # read the ko_list file into self.ko_dict + def get_annotation_from_ko_dict(self, knum, ok_if_missing_from_dict=False): + if not self.ko_dict: + raise ConfigError("Oops! 
The ko_list file has not been properly loaded, so get_annotation_from_ko_dict() is \ + extremely displeased and unable to function properly. Please refrain from calling this \ + function until after setup_ko_dict() has been called.") + + if not knum in self.ko_dict: + if ok_if_missing_from_dict: + return "Unkown function with KO num" % knum + else: + raise ConfigError("It seems hmmscan found a KO number that does not exist\ + in the KOfam ko_list file: %s" % knum) + + return self.ko_dict[knum]['definition'] + def process_kofam_hmms(self): """This is a driver function for running HMMs against the KOfam database and processing the hits into the provided contigs DB""" From 9c28898746ea072bd81db714877973b9dc2ba44f Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 22:03:08 +0800 Subject: [PATCH 074/400] typo --- anvio/kofam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 877f139c27..5937bac966 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -247,7 +247,7 @@ def get_annotation_from_ko_dict(self, knum, ok_if_missing_from_dict=False): if not knum in self.ko_dict: if ok_if_missing_from_dict: - return "Unkown function with KO num" % knum + return "Unknown function with KO num" % knum else: raise ConfigError("It seems hmmscan found a KO number that does not exist\ in the KOfam ko_list file: %s" % knum) From 0849cb370f8ce1e2bc47ae0230eb1c0af91d13d0 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 22:05:39 +0800 Subject: [PATCH 075/400] add hmm hits to contigs db --- anvio/kofam.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 5937bac966..6334daf4ea 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -290,3 +290,24 @@ def process_kofam_hmms(self): # parse hmmscan output parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE') search_results_dict = parser.get_search_results() + + # add functions to database + functions_dict = {} + counter = 0 + for hmm_hit in search_results_dict.values(): + functions_dict[counter] = { + 'gene_callers_id': hmm_hit['gene_callers_id'], + 'source': 'KOfam', + 'accession': hmm_hit['gene_hmm_id'], + 'function': self.get_annotation_from_ko_dict(hmm_hit['gene_hmm_id'], ok_if_missing_from_catalog=True), + 'e_value': hmm_hit['e_value'], + } + + counter += 1 + + if functions_dict: + gene_function_calls_table.create(functions_dict) + else: + self.run.warning("KOfam class has no hits to process. Returning empty handed, but still adding KOfam as \ + a functional source.") + gene_function_calls_table.add_empty_sources_to_functional_sources({'KOfam'}) From 7c0117916c854b594732db8b10d77cddb82bd151 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 22:06:29 +0800 Subject: [PATCH 076/400] removal of temp dirs --- anvio/kofam.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 6334daf4ea..68fcf525ca 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -311,3 +311,12 @@ def process_kofam_hmms(self): self.run.warning("KOfam class has no hits to process. Returning empty handed, but still adding KOfam as \ a functional source.") gene_function_calls_table.add_empty_sources_to_functional_sources({'KOfam'}) + + if anvio.DEBUG: + run.warning("The temp directories, '%s' and '%s' are kept. 
Please don't forget to clean those up\ + later" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)), header="Debug") + else: + run.info_single('Cleaning up the temp directory (you can use `--debug` if you would\ + like to keep it for testing purposes)', nl_before=1, nl_after=1) + shutil.rmtree(tmp_directory_path) + hmmer.clean_tmp_dirs() From 88f8aa520228a29f8a81f44df15b4beddd69441b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 22:23:18 +0800 Subject: [PATCH 077/400] add --kofam-data-dir to the master arg dictionary --- anvio/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/anvio/__init__.py b/anvio/__init__.py index 725499f056..c45d1da9d1 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -634,6 +634,13 @@ def get_args(parser): 'help': "The directory path for your Pfam setup. Anvi'o will try to use the default path\ if you do not specify anything."} ), + 'kofam-data-dir': ( + ['--koam-data-dir'], + {'default': None, + 'type': str, + 'help': "The directory path for your KOfam setup. Anvi'o will try to use the default path\ + if you do not specify anything."} + ), 'hide-outlier-SNVs': ( ['--hide-outlier-SNVs'], {'default': False, From 131b1b1ef3770222ca09704c86b3c2eb0908af3e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 22:23:57 +0800 Subject: [PATCH 078/400] oopsie typo --- anvio/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/__init__.py b/anvio/__init__.py index c45d1da9d1..e095041974 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -635,7 +635,7 @@ def get_args(parser): if you do not specify anything."} ), 'kofam-data-dir': ( - ['--koam-data-dir'], + ['--kofam-data-dir'], {'default': None, 'type': str, 'help': "The directory path for your KOfam setup. Anvi'o will try to use the default path\ From 3576024679b056f535ade8011b73614300d40817 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 19 Jan 2020 22:36:11 +0800 Subject: [PATCH 079/400] Code is working, but no hits for my test db. Here is a note to self about investigating more thoroughly later --- anvio/kofam.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 68fcf525ca..91cbc93ec4 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -275,6 +275,10 @@ def process_kofam_hmms(self): ## does not contain the same number of entries as there were .hmm files originally in the downloaded KOfam profiles... ## As far as I can tell this argument is not used for much except for printing out the value, but it would be nice if it was right + # TODO: this code runs nicely (error-free) but returns no HMM hits. Not sure yet if there is an issue with the code or if my partial contigs + # DB legitimately has no hits. 
This note is here so that I remember to investigate tomorrow when I am not brain-dead from coding in a train + # for five hours :) (ps I think looking into this noise_cutoff_terms parameter could be wise) + # get an instance of gene functions table gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress) From 6971f1e7318c2ea033d4c310fc9882fb446c8e1f Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 21 Jan 2020 13:07:23 +0800 Subject: [PATCH 080/400] alter skip list function to also get list of ko with no score threshold --- anvio/kofam.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 91cbc93ec4..aa457e05a0 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -69,11 +69,11 @@ def setup_ko_dict(self): """ self.ko_dict = utils.get_TAB_delimited_file_as_dictionary(self.ko_list_file_path) - self.ko_skip_list = self.get_ko_skip_list() + self.ko_skip_list, self.ko_no_threshold_list = self.get_ko_skip_list() def get_ko_skip_list(self): """ - The purpose of this function is to determine which KO numbers have no associated data in the ko_list file. + The purpose of this function is to determine which KO numbers have no associated data or just no score threshold in the ko_list file. That is, their ko_list entries look like this, with hypens in all but the first and last columns: K14936 - - - - - - - - - - small nucleolar RNA snR191 @@ -85,19 +85,32 @@ def get_ko_skip_list(self): These are RNAs. - Returns: skip_list list of strings, each string is a KO number + Or, their ko_list entries look like this, with no score threshold (but the rest of the data is not completely blank): + + K23749 - - - - 1 1 2266 2266 0.39 0.592 spectinabilin polyketide synthase system NorC [EC:2.3.1.290] + + Returns: + skip_list list of strings, each string is a KO number + no_threshold_list list of strings, each string is a KO number """ col_names_to_check = ["threshold","score_type","profile_type","F-measure","nseq","nseq_used","alen","mlen","eff_nseq","re/pos"] skip_list = [] + no_threshold_list = [] for k in self.ko_dict.keys(): should_skip = True + no_threshold = False for c in col_names_to_check: if not self.ko_dict[k][c] == "-": should_skip = False break # here we stop checking this KO num because we already found a value in our columns of interest + + if c == "threshold": + no_threshold = True # if we got to this line of code, there is a '-' in the threshold column if should_skip: # should be True unless we found a value above skip_list.append(k) - return skip_list + elif no_threshold: + no_threshold_list.append(k) + return skip_list, no_threshold_list class KofamSetup(KofamContext): """ Class for setting up KEGG Kofam HMM profiles. 
It performs sanity checks and downloads, unpacks, and prepares From b336ba8404b44dc79fc1fddefcc91fe4a74f2bd3 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 21 Jan 2020 15:16:30 +0800 Subject: [PATCH 081/400] remove unwanted KOs from ko dict --- anvio/kofam.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index aa457e05a0..8fdd102cf1 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -70,6 +70,9 @@ def setup_ko_dict(self): self.ko_dict = utils.get_TAB_delimited_file_as_dictionary(self.ko_list_file_path) self.ko_skip_list, self.ko_no_threshold_list = self.get_ko_skip_list() + # here we remove KOs from the dictionary if they are in the skip list or no threshold list + [self.ko_dict.pop(ko) for ko in self.ko_skip_list] + [self.ko_dict.pop(ko) for ko in self.ko_no_threshold_list] def get_ko_skip_list(self): """ From 05f37c9104498804c5d6e1f519451d8c609db8c9 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 21 Jan 2020 15:16:51 +0800 Subject: [PATCH 082/400] fix comment --- anvio/kofam.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 8fdd102cf1..7e217593b5 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -174,7 +174,8 @@ def decompress_files(self): def confirm_downloaded_files(self): """This function verifies that all Kofam profiles have been properly downloaded. It is intended to be run after the files have been decompressed. The profiles directory should contain hmm files from K00001.hmm to - K23763.hmm with some exceptions; all KO numbers from ko_list file should be included.""" + K23763.hmm with some exceptions; all KO numbers from ko_list file (except those in ko_skip_list) should be + included.""" ko_nums = self.ko_dict.keys() for k in ko_nums: if k not in self.ko_skip_list: From c5da5ec479cffe55e8623371ff4674b015aac394 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 21 Jan 2020 16:25:55 +0800 Subject: [PATCH 083/400] generate text file with orphan KO data --- anvio/kofam.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 7e217593b5..45afa22a3f 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -42,6 +42,7 @@ def __init__(self, args): A = lambda x: args.__dict__[x] if x in args.__dict__ else None # default directory will be called KEGG and will store the KEGG Module data as well self.kofam_data_dir = A('kofam_data_dir') or os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') + self.orphan_data_dir = os.path.join(self.kofam_data_dir, "orphan_data") # shared variables for all KOfam subclasses self.kofam_hmm_file_path = os.path.join(self.kofam_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms @@ -70,6 +71,20 @@ def setup_ko_dict(self): self.ko_dict = utils.get_TAB_delimited_file_as_dictionary(self.ko_list_file_path) self.ko_skip_list, self.ko_no_threshold_list = self.get_ko_skip_list() + + # if we are currently setting up KOfams, we should generate a text file with the ko_list entries + # of the KOs that have no scoring threshold + if self.__class__.__name__ in ['KofamSetup']: + orphan_ko_dict = {ko:self.ko_dict[ko] for ko in self.ko_skip_list} + orphan_ko_dict.update({ko:self.ko_dict[ko] for ko in self.ko_no_threshold_list}) + + if not os.path.exists(self.orphan_data_dir): # should not happen but we check just in case + raise ConfigError("Hmm. Something is out of order. 
The orphan data directory %s does not exist \ + yet, but it needs to in order for the setup_ko_dict() function to work.") + orphan_ko_path = os.path.join(self.orphan_data_dir, "01_ko_fams_with_no_threshold.txt") + orphan_ko_headers = ["threshold","score_type","profile_type","F-measure","nseq","nseq_used","alen","mlen","eff_nseq","re/pos", "definition"] + utils.store_dict_as_TAB_delimited_file(orphan_ko_dict, orphan_ko_path, key_header="knum", headers=orphan_ko_headers) + # here we remove KOs from the dictionary if they are in the skip list or no threshold list [self.ko_dict.pop(ko) for ko in self.ko_skip_list] [self.ko_dict.pop(ko) for ko in self.ko_no_threshold_list] @@ -139,6 +154,7 @@ def __init__(self, args, run=run, progress=progress): self.is_database_exists() filesnpaths.gen_output_directory(self.kofam_data_dir, delete_if_exists=args.reset) + filesnpaths.gen_output_directory(self.orphan_data_dir, delete_if_exists=args.reset) # ftp path for HMM profiles and KO list # for ko list, add /ko_list.gz to end of url From 5e0c2be56d1405f09589cd7ff9c6e26092cdef9f Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 21 Jan 2020 17:05:24 +0800 Subject: [PATCH 084/400] determine which orphan KO files to move --- anvio/kofam.py | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 45afa22a3f..def56da3ec 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -10,6 +10,7 @@ import shutil import requests import glob +import re import anvio import anvio.dbops as dbops @@ -80,7 +81,7 @@ def setup_ko_dict(self): if not os.path.exists(self.orphan_data_dir): # should not happen but we check just in case raise ConfigError("Hmm. Something is out of order. The orphan data directory %s does not exist \ - yet, but it needs to in order for the setup_ko_dict() function to work.") + yet, but it needs to in order for the setup_ko_dict() function to work." % self.orphan_data_dir) orphan_ko_path = os.path.join(self.orphan_data_dir, "01_ko_fams_with_no_threshold.txt") orphan_ko_headers = ["threshold","score_type","profile_type","F-measure","nseq","nseq_used","alen","mlen","eff_nseq","re/pos", "definition"] utils.store_dict_as_TAB_delimited_file(orphan_ko_dict, orphan_ko_path, key_header="knum", headers=orphan_ko_headers) @@ -201,6 +202,40 @@ def confirm_downloaded_files(self): while downloading the KOfam database. Please run `anvi-setup-kegg-kofams` with the --reset \ flag." % (hmm_path)) + def move_orphan_files(self): + """ + This function moves the following to the orphan files directory: + - profiles that do not have ko_list entries + - profiles whose ko_list entries have no scoring threshold (in ko_no_threshold_list) + And, the following profiles should not have been downloaded, but we check if they exist and move any that do: + - profiles whose ko_list entries have no data at all (in ko_skip_list) + """ + if not os.path.exists(self.orphan_data_dir): # should not happen but we check just in case + raise ConfigError("Hmm. Something is out of order. The orphan data directory %s does not exist \ + yet, but it needs to in order for the move_orphan_files() function to work." 
% self.orphan_data_dir) + + no_kofam_path = os.path.join(self.orphan_data_dir, "00_hmm_profiles_with_no_ko_fams.hmm") + no_kofam_file_list = [] + no_threshold_path = os.path.join(self.orphan_data_dir, "02_hmm_profiles_with_ko_fams_with_no_threshold.txt") + no_threshold_file_list = [] + no_data_path = os.path.join(self.orphan_data_dir, "03_hmm_profiles_with_ko_fams_with_no_data.txt") + no_data_file_list = [] + + hmm_list = [k for k in glob.glob(os.path.join(self.kofam_data_dir, 'profiles/*.hmm'))] + for hmm_file in hmm_list: + ko = re.search('profiles/(K\d{5})\.hmm', hmm_file).group(1) + if ko not in self.ko_dict.keys(): + no_kofam_file_list.append(hmm_file) + elif ko in self.ko_no_threshold_list: + no_threshold_file_list.append(hmm_file) + elif ko in self.ko_skip_list: # these should not have been downloaded, but if they were we will move them + self.run.warning("Interesting. The KOfam HMM profile %s was downloaded even though its entry in the `ko_list` file\ + was mostly blank. Oh well, it will be moved to the orphan files directory at %s.", % (hmm_file, self.orphan_data_dir)) + no_data_file_list.append(hmm_file) + + + + def run_hmmpress(self): """This function concatenates the Kofam profiles and runs hmmpress on them.""" @@ -210,6 +245,9 @@ def run_hmmpress(self): self.progress.update('Verifying that the Kofam directory at %s contains all HMM profiles' % self.kofam_data_dir) self.confirm_downloaded_files() + self.progress.update('Handling orphan files') + self.move_orphan_files() + self.progress.update('Concatenating HMM profiles into one file...') hmm_list = [k for k in glob.glob(os.path.join(self.kofam_data_dir, 'profiles/*.hmm'))] utils.concatenate_files(self.kofam_hmm_file_path, hmm_list, remove_concatenated_files=False) From a0671dd0944102a8e210e2da51d62013be3c64d2 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 21 Jan 2020 17:09:45 +0800 Subject: [PATCH 085/400] concatenate and move orphan KOs --- anvio/kofam.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index def56da3ec..2c587a255c 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -216,9 +216,9 @@ def move_orphan_files(self): no_kofam_path = os.path.join(self.orphan_data_dir, "00_hmm_profiles_with_no_ko_fams.hmm") no_kofam_file_list = [] - no_threshold_path = os.path.join(self.orphan_data_dir, "02_hmm_profiles_with_ko_fams_with_no_threshold.txt") + no_threshold_path = os.path.join(self.orphan_data_dir, "02_hmm_profiles_with_ko_fams_with_no_threshold.hmm") no_threshold_file_list = [] - no_data_path = os.path.join(self.orphan_data_dir, "03_hmm_profiles_with_ko_fams_with_no_data.txt") + no_data_path = os.path.join(self.orphan_data_dir, "03_hmm_profiles_with_ko_fams_with_no_data.hmm") no_data_file_list = [] hmm_list = [k for k in glob.glob(os.path.join(self.kofam_data_dir, 'profiles/*.hmm'))] @@ -233,7 +233,13 @@ def move_orphan_files(self): was mostly blank. 
Oh well, it will be moved to the orphan files directory at %s.", % (hmm_file, self.orphan_data_dir)) no_data_file_list.append(hmm_file) - + # now we concatenate the orphan KO hmms into the orphan data directory + if no_kofam_file_list: + utils.concatenate_files(no_kofam_path, no_kofam_file_list, remove_concatenated_files=False) + if no_threshold_file_list: + utils.concatenate_files(no_threshold_path, no_threshold_file_list, remove_concatenated_files=False) + if no_data_file_list: + utils.concatenate_files(no_data_path, no_data_file_list, remove_concatenated_files=False) From 909cc11a53569ca2c9fd5953140242fbc98deb70 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 21 Jan 2020 23:13:17 +0800 Subject: [PATCH 086/400] only remove old files if not in debug mode --- anvio/kofam.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 2c587a255c..27b06c60e8 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -234,12 +234,13 @@ def move_orphan_files(self): no_data_file_list.append(hmm_file) # now we concatenate the orphan KO hmms into the orphan data directory + remove_old_files = not anvio.DEBUG # if we are running in debug mode, we will not remove the individual hmm files after concatenation if no_kofam_file_list: - utils.concatenate_files(no_kofam_path, no_kofam_file_list, remove_concatenated_files=False) + utils.concatenate_files(no_kofam_path, no_kofam_file_list, remove_concatenated_files=remove_old_files) if no_threshold_file_list: - utils.concatenate_files(no_threshold_path, no_threshold_file_list, remove_concatenated_files=False) + utils.concatenate_files(no_threshold_path, no_threshold_file_list, remove_concatenated_files=remove_old_files) if no_data_file_list: - utils.concatenate_files(no_data_path, no_data_file_list, remove_concatenated_files=False) + utils.concatenate_files(no_data_path, no_data_file_list, remove_concatenated_files=remove_old_files) From 9fcefe6955be5a9d1ae6082a9f8cf5a2528bf299 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 21 Jan 2020 23:20:34 +0800 Subject: [PATCH 087/400] add warning message about orphan files --- anvio/kofam.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 27b06c60e8..8ac0a7c364 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -242,6 +242,13 @@ def move_orphan_files(self): if no_data_file_list: utils.concatenate_files(no_data_path, no_data_file_list, remove_concatenated_files=remove_old_files) + # report orphan files + self.run.warning("Please note that while anvi'o was building your databases, she found %d \ + ko_fam entries that did not have any matching HMM profiles, and another %d of them \ + that did not have any threshold to remove weak hits. We have removed those %d HMM \ + profiles from the final database. You can find entries for each of these categories under the directory '%s'." 
+ % (len(no_kofam_file_list), len(no_threshold_file_list), len(no_kofam_file_list) + len(no_threshold_file_list), self.orphan_data_dir)) + def run_hmmpress(self): From 3b10145b9e07aec3fa06c48bef73f5d4043c9852 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Jan 2020 02:52:57 +0800 Subject: [PATCH 088/400] comma --- anvio/kofam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 8ac0a7c364..5680291b9d 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -230,7 +230,7 @@ def move_orphan_files(self): no_threshold_file_list.append(hmm_file) elif ko in self.ko_skip_list: # these should not have been downloaded, but if they were we will move them self.run.warning("Interesting. The KOfam HMM profile %s was downloaded even though its entry in the `ko_list` file\ - was mostly blank. Oh well, it will be moved to the orphan files directory at %s.", % (hmm_file, self.orphan_data_dir)) + was mostly blank. Oh well, it will be moved to the orphan files directory at %s." % (hmm_file, self.orphan_data_dir)) no_data_file_list.append(hmm_file) # now we concatenate the orphan KO hmms into the orphan data directory From 7df67f0f0e61cd5d8a0668907e613901771619e8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Jan 2020 02:53:43 +0800 Subject: [PATCH 089/400] fix logic :) --- anvio/kofam.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 5680291b9d..80ccbc196a 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -225,13 +225,14 @@ def move_orphan_files(self): for hmm_file in hmm_list: ko = re.search('profiles/(K\d{5})\.hmm', hmm_file).group(1) if ko not in self.ko_dict.keys(): - no_kofam_file_list.append(hmm_file) - elif ko in self.ko_no_threshold_list: - no_threshold_file_list.append(hmm_file) - elif ko in self.ko_skip_list: # these should not have been downloaded, but if they were we will move them - self.run.warning("Interesting. The KOfam HMM profile %s was downloaded even though its entry in the `ko_list` file\ - was mostly blank. Oh well, it will be moved to the orphan files directory at %s." % (hmm_file, self.orphan_data_dir)) - no_data_file_list.append(hmm_file) + if ko in self.ko_no_threshold_list: + no_threshold_file_list.append(hmm_file) + elif ko in self.ko_skip_list: # these should not have been downloaded, but if they were we will move them + self.run.warning("Interesting. The KOfam HMM profile %s was downloaded even though its entry in the `ko_list` file\ + was mostly blank. Oh well, it will be moved to the orphan files directory at %s." 
% (hmm_file, self.orphan_data_dir)) + no_data_file_list.append(hmm_file) + else: + no_kofam_file_list.append(hmm_file) # now we concatenate the orphan KO hmms into the orphan data directory remove_old_files = not anvio.DEBUG # if we are running in debug mode, we will not remove the individual hmm files after concatenation From ad6208fb8a0c7ae639fb9ef49f6a339fe227154a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Jan 2020 04:44:11 +0800 Subject: [PATCH 090/400] update warning messages to be individual to situation --- anvio/kofam.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 80ccbc196a..00a0401150 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -228,8 +228,6 @@ def move_orphan_files(self): if ko in self.ko_no_threshold_list: no_threshold_file_list.append(hmm_file) elif ko in self.ko_skip_list: # these should not have been downloaded, but if they were we will move them - self.run.warning("Interesting. The KOfam HMM profile %s was downloaded even though its entry in the `ko_list` file\ - was mostly blank. Oh well, it will be moved to the orphan files directory at %s." % (hmm_file, self.orphan_data_dir)) no_data_file_list.append(hmm_file) else: no_kofam_file_list.append(hmm_file) @@ -238,10 +236,23 @@ def move_orphan_files(self): remove_old_files = not anvio.DEBUG # if we are running in debug mode, we will not remove the individual hmm files after concatenation if no_kofam_file_list: utils.concatenate_files(no_kofam_path, no_kofam_file_list, remove_concatenated_files=remove_old_files) + self.run.warning("Please note that while anvi'o was building your databases, she found %d \ + HMM profiles that did not have any matching KOfam entries. We have removed those HMM \ + profiles from the final database. You can find them under the directory '%s'." + % (len(no_kofam_file_list), self.orphan_data_dir)) + if no_threshold_file_list: utils.concatenate_files(no_threshold_path, no_threshold_file_list, remove_concatenated_files=remove_old_files) + self.run.warning("Please note that while anvi'o was building your databases, she found %d \ + KOfam entries that did not have any threshold to remove weak hits. We have removed those HMM \ + profiles from the final database. You can find them under the directory '%s'." + % (len(no_threshold_file_list), self.orphan_data_dir)) if no_data_file_list: utils.concatenate_files(no_data_path, no_data_file_list, remove_concatenated_files=remove_old_files) + self.run.warning("Please note that while anvi'o was building your databases, she found %d \ + HMM profiles that did not have any associated data (besides an annotation) in their KOfam entries. \ + We have removed those HMM profiles from the final database. You can find them under the directory '%s'." + % (len(no_data_file_list), self.orphan_data_dir)) # report orphan files self.run.warning("Please note that while anvi'o was building your databases, she found %d \ From 7df26126798d22fa90437067e2c5435fd99a9a5e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 27 Jan 2020 19:07:27 +0800 Subject: [PATCH 091/400] remove superfluous warning message --- anvio/kofam.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 00a0401150..8c9eb1a6b2 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -254,14 +254,6 @@ def move_orphan_files(self): We have removed those HMM profiles from the final database. You can find them under the directory '%s'." 
% (len(no_data_file_list), self.orphan_data_dir)) - # report orphan files - self.run.warning("Please note that while anvi'o was building your databases, she found %d \ - ko_fam entries that did not have any matching HMM profiles, and another %d of them \ - that did not have any threshold to remove weak hits. We have removed those %d HMM \ - profiles from the final database. You can find entries for each of these categories under the directory '%s'." - % (len(no_kofam_file_list), len(no_threshold_file_list), len(no_kofam_file_list) + len(no_threshold_file_list), self.orphan_data_dir)) - - def run_hmmpress(self): """This function concatenates the Kofam profiles and runs hmmpress on them.""" From 017451f6a889d32a03af09c157a3019046432104 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 30 Jan 2020 03:04:29 -0600 Subject: [PATCH 092/400] removed TODO; this has been resolved --- anvio/kofam.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 8c9eb1a6b2..82d0123398 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -360,9 +360,6 @@ def process_kofam_hmms(self): # run hmmscan hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads) hmm_hits_file = hmmer.run_hmmscan('KOfam', 'AA', 'GENE', None, None, len(self.ko_dict), self.kofam_hmm_file_path, None, '--cut_ga') - ## TODO: here we have an issue. the number of genes in our HMM model (len(self.ko_dict) is wrong, because ko_dict (derived from ko_list) - ## does not contain the same number of entries as there were .hmm files originally in the downloaded KOfam profiles... - ## As far as I can tell this argument is not used for much except for printing out the value, but it would be nice if it was right # TODO: this code runs nicely (error-free) but returns no HMM hits. Not sure yet if there is an issue with the code or if my partial contigs # DB legitimately has no hits. This note is here so that I remember to investigate tomorrow when I am not brain-dead from coding in a train From 71fdd76b90120a80aa0c29ed4af536a06281855c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 30 Jan 2020 03:47:19 -0600 Subject: [PATCH 093/400] do not clean up tmp dir if --debug --- anvio/kofam.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 82d0123398..df72c203f5 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -372,8 +372,13 @@ def process_kofam_hmms(self): run.info_single("The HMM search returned no hits :/ So there is nothing to add to the contigs database. But\ now anvi'o will add KOfam as a functional source with no hits, clean the temporary directories\ and gracefully quit.", nl_before=1, nl_after=1) - shutil.rmtree(tmp_directory_path) - hmmer.clean_tmp_dirs() + if not anvio.DEBUG: + shutil.rmtree(tmp_directory_path) + hmmer.clean_tmp_dirs() + else: + self.run.warning("Because you ran this script with the --debug flag, anvi'o will not clean up the temporary\ + directory located at %s. 
She hopes that you will be responsible for cleaning up this directory yourself \ + after you are finished debugging :)" % tmp_directory_path) gene_function_calls_table.add_empty_sources_to_functional_sources({'KOfam'}) return From d0a5a7b56f5bc25ee9ff41aa080d34e4e8f18424 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 30 Jan 2020 03:49:18 -0600 Subject: [PATCH 094/400] fix log file output for hmmscan --- anvio/drivers/hmmer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index 4cd02ec557..c559662609 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -97,10 +97,10 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.run.info('Number of CPUs will be used for search', self.num_threads_to_use) tmp_dir = os.path.dirname(self.target_files_dict[target][0]) - log_file_path = os.path.join(tmp_dir, '00_log.txt') + log_file_path = os.path.join(tmp_dir, '*_log') self.run.info('Temporary work dir', tmp_dir) - self.run.info('Log file', log_file_path) + self.run.info('Log files', log_file_path) # check if all hmmpress files are in the HMM directory From 5416dd4becb8bcec8ecfb874047aa70314221273 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 30 Jan 2020 04:36:59 -0600 Subject: [PATCH 095/400] allow no score cutoff to be passed to hmmscan --- anvio/drivers/hmmer.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index c559662609..9cdc40eeac 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -127,11 +127,18 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode output_file = part_file + '_output' shitty_file = part_file + '_shitty' - cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan', - '-o', output_file, *noise_cutoff_terms.split(), - '--cpu', cores_per_process, - '--tblout', shitty_file, - hmm, part_file] + if noise_cutoff_terms: + cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan', + '-o', output_file, *noise_cutoff_terms.split(), + '--cpu', cores_per_process, + '--tblout', shitty_file, + hmm, part_file] + else: # if we didn't pass any noise cutoff terms, here we don't include them in the command line + cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan', + '-o', output_file, + '--cpu', cores_per_process, + '--tblout', shitty_file, + hmm, part_file] t = Thread(target=self.hmmscan_worker, args=(part_file, cmd_line, From f130850f2ff419f4c9e29e6768099a9f611b1a21 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 30 Jan 2020 04:37:34 -0600 Subject: [PATCH 096/400] do not pass score threshold to hmmscan --- anvio/kofam.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index df72c203f5..ba1656a25c 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -359,11 +359,7 @@ def process_kofam_hmms(self): # run hmmscan hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads) - hmm_hits_file = hmmer.run_hmmscan('KOfam', 'AA', 'GENE', None, None, len(self.ko_dict), self.kofam_hmm_file_path, None, '--cut_ga') - - # TODO: this code runs nicely (error-free) but returns no HMM hits. Not sure yet if there is an issue with the code or if my partial contigs - # DB legitimately has no hits. 
This note is here so that I remember to investigate tomorrow when I am not brain-dead from coding in a train - # for five hours :) (ps I think looking into this noise_cutoff_terms parameter could be wise) + hmm_hits_file = hmmer.run_hmmscan('KOfam', 'AA', 'GENE', None, None, len(self.ko_dict), self.kofam_hmm_file_path, None, None) # get an instance of gene functions table gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress) From b52d02df67d77e870897b04b1adea32f99972dc9 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 30 Jan 2020 04:37:53 -0600 Subject: [PATCH 097/400] fix debug header --- anvio/kofam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index ba1656a25c..1f3dcb745d 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -374,7 +374,7 @@ def process_kofam_hmms(self): else: self.run.warning("Because you ran this script with the --debug flag, anvi'o will not clean up the temporary\ directory located at %s. She hopes that you will be responsible for cleaning up this directory yourself \ - after you are finished debugging :)" % tmp_directory_path) + after you are finished debugging :)" % tmp_directory_path, header="Debug") gene_function_calls_table.add_empty_sources_to_functional_sources({'KOfam'}) return From 9dc39afea5497c014df7fcdfa5a6a5450541e033 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 30 Jan 2020 05:05:55 -0600 Subject: [PATCH 098/400] make executable --- bin/anvi-run-kegg-kofams | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/anvi-run-kegg-kofams diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams old mode 100644 new mode 100755 From b3214b611c8e4d467e8e394a5cece64adb67d676 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 30 Jan 2020 05:08:03 -0600 Subject: [PATCH 099/400] various fixes. hmmscan now runs with hits but no filtering --- anvio/kofam.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 1f3dcb745d..44c9a018a5 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -336,7 +336,7 @@ def get_annotation_from_ko_dict(self, knum, ok_if_missing_from_dict=False): if not knum in self.ko_dict: if ok_if_missing_from_dict: - return "Unknown function with KO num" % knum + return "Unknown function with KO num %s" % knum else: raise ConfigError("It seems hmmscan found a KO number that does not exist\ in the KOfam ko_list file: %s" % knum) @@ -373,8 +373,8 @@ def process_kofam_hmms(self): hmmer.clean_tmp_dirs() else: self.run.warning("Because you ran this script with the --debug flag, anvi'o will not clean up the temporary\ - directory located at %s. She hopes that you will be responsible for cleaning up this directory yourself \ - after you are finished debugging :)" % tmp_directory_path, header="Debug") + directories located at %s and %s. 
Please be responsible for cleaning up this directory yourself \ + after you are finished debugging :)" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)), header="Debug") gene_function_calls_table.add_empty_sources_to_functional_sources({'KOfam'}) return @@ -390,7 +390,7 @@ def process_kofam_hmms(self): 'gene_callers_id': hmm_hit['gene_callers_id'], 'source': 'KOfam', 'accession': hmm_hit['gene_hmm_id'], - 'function': self.get_annotation_from_ko_dict(hmm_hit['gene_hmm_id'], ok_if_missing_from_catalog=True), + 'function': self.get_annotation_from_ko_dict(hmm_hit['gene_hmm_id'], ok_if_missing_from_dict=True), 'e_value': hmm_hit['e_value'], } From d8f0f0ef87cce4ba59b9ddece7381b6acf7e74b8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 4 Feb 2020 10:26:01 -0600 Subject: [PATCH 100/400] save bit scores in hits dict --- anvio/parsers/hmmscan.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index 9a114e2e00..b5522877b9 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -26,7 +26,9 @@ def __init__(self, hmm_scan_hits_txt, alphabet='AA', context='GENE'): if self.context == "GENE": # see the HMMER user guide for details of the fields for AA sequence search, and DNA sequence search. - col_names = ['gene_name', 'gene_hmm_id', 'gene_callers_id', 'f', 'e_value', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f'] + # --- full sequence ---- --- best 1 domain ---- --- domain number estimation ---- + # target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description + col_names = ['gene_name', 'gene_hmm_id', 'gene_callers_id', 'f', 'e_value', 'bit_score', 'f', 'f', 'dom_bit_score', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f'] col_mapping = [str, str, int, str, float, str, str, str, str, str, str, str, str, str, str, str, str, str] elif self.context == "CONTIG" and (self.alphabet == "DNA" or self.alphabet == "RNA"): # 'hmm_target', 'hmm_acc', 'query_id', 'query_acc', 'hmm_from', 'hmm_to', 'alignment_from', 'alignment_to', 'envelope_from', 'envelope_to', 'seq_len', 'strand', 'e_value', 'score', 'bias', 'desc'] From 57ced0cb65a8efe7b99f9dfe9c03078934287c9a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 4 Feb 2020 10:29:19 -0600 Subject: [PATCH 101/400] and change their data types --- anvio/parsers/hmmscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index b5522877b9..11ccd772fc 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -29,7 +29,7 @@ def __init__(self, hmm_scan_hits_txt, alphabet='AA', context='GENE'): # --- full sequence ---- --- best 1 domain ---- --- domain number estimation ---- # target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description col_names = ['gene_name', 'gene_hmm_id', 'gene_callers_id', 'f', 'e_value', 'bit_score', 'f', 'f', 'dom_bit_score', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f'] - col_mapping = [str, str, int, str, float, str, str, str, str, str, str, str, str, str, str, str, str, str] + col_mapping = [str, str, int, str, float, float, str, str, float, str, str, str, str, str, str, str, str, str] elif self.context == "CONTIG" and (self.alphabet == "DNA" or self.alphabet == "RNA"): # 'hmm_target', 'hmm_acc', 'query_id', 'query_acc', 'hmm_from', 'hmm_to', 'alignment_from', 'alignment_to', 'envelope_from', 'envelope_to', 'seq_len', 'strand', 
'e_value', 'score', 'bias', 'desc'] col_names = ['gene_name', 'gene_hmm_id', 'contig_name', 'f', 'hmm_from', 'hmm_to', 'alignment_from', 'alignment_to', 'envelope_from', 'envelope_to', 'f', 'f', 'e_value', 'f', 'f', 'f'] From 21523d3021fd3ac373023a78137210cba7b6c349 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 4 Feb 2020 10:29:49 -0600 Subject: [PATCH 102/400] b --- anvio/parsers/hmmscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index 11ccd772fc..35e103a2c3 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -35,7 +35,7 @@ def __init__(self, hmm_scan_hits_txt, alphabet='AA', context='GENE'): col_names = ['gene_name', 'gene_hmm_id', 'contig_name', 'f', 'hmm_from', 'hmm_to', 'alignment_from', 'alignment_to', 'envelope_from', 'envelope_to', 'f', 'f', 'e_value', 'f', 'f', 'f'] col_mapping = [str, str, str, str, str, str, int, int, int, int, str, str, float, str, str, str] else: - raise ConfigError("HMMScan driver is confused. Yor context and alphaet pair ('%s' and '%s')\ + raise ConfigError("HMMScan driver is confused. Yor context and alphabet pair ('%s' and '%s')\ does not seem to be implemented in the parser module. If you think this is\ not a mistake on your part, please get in touch with the anvi'o developers\ and watch them fix it like actual pros." % (self.context, self.alphabet)) From fb0d57d3f7d0ed36d56d4c808632fea0259ec694 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 4 Feb 2020 11:30:29 -0600 Subject: [PATCH 103/400] ko_list dict param --- anvio/kofam.py | 2 +- anvio/parsers/hmmscan.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 44c9a018a5..f0cce41080 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -380,7 +380,7 @@ def process_kofam_hmms(self): # parse hmmscan output parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE') - search_results_dict = parser.get_search_results() + search_results_dict = parser.get_search_results(ko_list_dict=self.ko_dict) # add functions to database functions_dict = {} diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index 35e103a2c3..4c1e73d7d4 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -51,7 +51,19 @@ def __init__(self, hmm_scan_hits_txt, alphabet='AA', context='GENE'): Parser.__init__(self, 'HMMScan', [hmm_scan_hits_txt], files_expected, files_structure) - def get_search_results(self): + def get_search_results(self, ko_list_dict = None): + """ + This function goes through the hits provided by `hmmscan` and generates an annotation dictionary with the relevant information about each hit. + If we are parsing Kofam hits, then this function makes sure only hits with a high enough bit score make it into the annotation dictionary. 
+ + Parameters + ========== + ko_list_dict dictionary of the ko_list file; see setup_ko_dict in kofam.py for more details + + Returns + ======= + annotations_dict dictionary of annotations + """ annotations_dict = {} # this is the stuff we are going to try to fill with this: From 9e4ec97d7738777cd9cde6b87bd4c4ab2acedb77 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 4 Feb 2020 11:31:06 -0600 Subject: [PATCH 104/400] get_search_results now parses kofam hits --- anvio/parsers/hmmscan.py | 49 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index 4c1e73d7d4..5599a4f39a 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -72,11 +72,50 @@ def get_search_results(self, ko_list_dict = None): entry_id = 0 for hit in list(self.dicts['hits'].values()): if self.context == 'GENE': - entry = {'entry_id': entry_id, - 'gene_name': hit['gene_name'], - 'gene_hmm_id': hit['gene_hmm_id'], - 'gene_callers_id': hit['gene_callers_id'], - 'e_value': hit['e_value']} + # This is for KEGG Kofams. Here we only add the hit to the annotations_dict if the appropriate bit score is above the + # threshold set in ko_list_dict (which is indexed by ko num, aka gene_name in the hits dict) + if ko_list_dict and hit['gene_name'] in ko_list_dict.keys(): + knum = hit['gene_name'] + score_type = ko_list_dict[knum]['score_type'] + threshold = ko_list_dict[knum]['threshold'] + keep = True + if score_type == 'full': + if hit['bit_score'] < threshold: + keep = False + elif score_type == 'domain': + if hit['dom_bit_score'] < threshold: + keep = False + else: + self.run.warning("Oh dear. The Kofam profile %s has a strange score_type value: %s. The only accepted values \ + for this type are 'full' or 'domain', so anvi'o cannot parse the hits to this profile. All hits will be kept \ + regardless of bit score. You have been warned." % (hit['gene_name'], score_type) + + if keep: + entry = {'entry_id': entry_id, + 'gene_name': hit['gene_name'], + 'gene_hmm_id': hit['gene_hmm_id'], + 'gene_callers_id': hit['gene_callers_id'], + 'e_value': hit['e_value']} + + elif ko_list_dict and hit['gene_name'] not in ko_list_dict.keys(): + # this should never happen, in an ideal world where everything is filled with butterflies and happiness + self.run.warning("Hmm. While parsing your Kofam hits, it seems the Kofam profile %s was not found in the ko_list dictionary. \ + This should probably not ever happen, and you should contact a developer as soon as possible to figure out what \ + is going on. But for now, anvi'o is going to keep all hits to this profile. Consider those hits with a grain of salt, \ + as not all of them may be good." 
% hit['gene_name']) + entry = {'entry_id': entry_id, + 'gene_name': hit['gene_name'], + 'gene_hmm_id': hit['gene_hmm_id'], + 'gene_callers_id': hit['gene_callers_id'], + 'e_value': hit['e_value']} + + else: + # but in Pfams, we don't care, we just keep all hits + entry = {'entry_id': entry_id, + 'gene_name': hit['gene_name'], + 'gene_hmm_id': hit['gene_hmm_id'], + 'gene_callers_id': hit['gene_callers_id'], + 'e_value': hit['e_value']} elif self.context == 'CONTIG' and (self.alphabet == 'DNA' or self.alphabet == 'RNA'): entry = {'entry_id': entry_id, 'gene_name': hit['gene_name'], From 41cb9246c7b8d6cf2d97b9e76cae9d04f862f6ff Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 4 Feb 2020 13:33:36 -0600 Subject: [PATCH 105/400] missing ) --- anvio/parsers/hmmscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index 5599a4f39a..da4a469695 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -88,7 +88,7 @@ def get_search_results(self, ko_list_dict = None): else: self.run.warning("Oh dear. The Kofam profile %s has a strange score_type value: %s. The only accepted values \ for this type are 'full' or 'domain', so anvi'o cannot parse the hits to this profile. All hits will be kept \ - regardless of bit score. You have been warned." % (hit['gene_name'], score_type) + regardless of bit score. You have been warned." % (hit['gene_name'], score_type)) if keep: entry = {'entry_id': entry_id, From b120c6619294aaf95b3070dc52a764f0ee24a59a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 4 Feb 2020 16:30:58 -0600 Subject: [PATCH 106/400] fix parsing so that we do not add previous entries to annotation dict if current hit is weak --- anvio/parsers/hmmscan.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index da4a469695..10132d986d 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -70,7 +70,9 @@ def get_search_results(self, ko_list_dict = None): # search_table_structure = ['entry_id', 'source', 'alphabet', 'contig', 'gene_callers_id' 'gene_name', 'gene_hmm_id', 'e_value'] entry_id = 0 + num_hits_removed = 0 # a counter for the number of hits we don't add to the annotation dictionary for hit in list(self.dicts['hits'].values()): + entry = None if self.context == 'GENE': # This is for KEGG Kofams. Here we only add the hit to the annotations_dict if the appropriate bit score is above the # threshold set in ko_list_dict (which is indexed by ko num, aka gene_name in the hits dict) @@ -80,10 +82,10 @@ def get_search_results(self, ko_list_dict = None): threshold = ko_list_dict[knum]['threshold'] keep = True if score_type == 'full': - if hit['bit_score'] < threshold: + if hit['bit_score'] < float(threshold): keep = False elif score_type == 'domain': - if hit['dom_bit_score'] < threshold: + if hit['dom_bit_score'] < float(threshold): keep = False else: self.run.warning("Oh dear. The Kofam profile %s has a strange score_type value: %s. 
The only accepted values \ @@ -96,6 +98,8 @@ def get_search_results(self, ko_list_dict = None): 'gene_hmm_id': hit['gene_hmm_id'], 'gene_callers_id': hit['gene_callers_id'], 'e_value': hit['e_value']} + else: + num_hits_removed += 1 elif ko_list_dict and hit['gene_name'] not in ko_list_dict.keys(): # this should never happen, in an ideal world where everything is filled with butterflies and happiness @@ -127,7 +131,11 @@ def get_search_results(self, ko_list_dict = None): else: raise ConfigError("Anvi'o does not know how to parse %s:%s" % (self.alphabet, self.context)) - entry_id += 1 - annotations_dict[entry_id] = entry + if entry: + entry_id += 1 + annotations_dict[entry_id] = entry + + print("Number of weak hits removed", num_hits_removed) + print("Number of hits in annotation dict ", len(annotations_dict.keys())) return annotations_dict From 444159f17f08c6a6735dfb9090c7d1e2714b1e9a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 4 Feb 2020 16:33:37 -0600 Subject: [PATCH 107/400] prettier output --- anvio/parsers/hmmscan.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index 10132d986d..f241d0009c 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -16,12 +16,16 @@ __maintainer__ = "A. Murat Eren" __email__ = "a.murat.eren@gmail.com" +run = anvio.terminal.Run() + class HMMScan(Parser): def __init__(self, hmm_scan_hits_txt, alphabet='AA', context='GENE'): self.alphabet = alphabet self.context = context + self.run = run + files_expected = {'hits': hmm_scan_hits_txt} if self.context == "GENE": @@ -135,7 +139,7 @@ def get_search_results(self, ko_list_dict = None): entry_id += 1 annotations_dict[entry_id] = entry - print("Number of weak hits removed", num_hits_removed) - print("Number of hits in annotation dict ", len(annotations_dict.keys())) + self.run.info("Number of weak hits removed", num_hits_removed) + self.run.info("Number of hits in annotation dict ", len(annotations_dict.keys())) return annotations_dict From b297e4aefac53fb7b2d414b0cbc37c8126cbaf43 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 5 Feb 2020 13:48:22 -0600 Subject: [PATCH 108/400] fix annotation bug --- anvio/kofam.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index f0cce41080..06099c3756 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -389,8 +389,8 @@ def process_kofam_hmms(self): functions_dict[counter] = { 'gene_callers_id': hmm_hit['gene_callers_id'], 'source': 'KOfam', - 'accession': hmm_hit['gene_hmm_id'], - 'function': self.get_annotation_from_ko_dict(hmm_hit['gene_hmm_id'], ok_if_missing_from_dict=True), + 'accession': hmm_hit['gene_name'], + 'function': self.get_annotation_from_ko_dict(hmm_hit['gene_name'], ok_if_missing_from_dict=True), 'e_value': hmm_hit['e_value'], } From 2c77f0ebc2c92de9f6995e0f81d91c7934f407c1 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 5 Feb 2020 16:21:32 -0600 Subject: [PATCH 109/400] download kegg orthology file --- anvio/kofam.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 06099c3756..a3155431ea 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -163,6 +163,9 @@ def __init__(self, args, run=run, progress=progress): self.database_url = "ftp://ftp.genome.jp/pub/db/kofam" self.files = ['ko_list.gz', 'profiles.tar.gz'] + # Kegg Orthology text file + self.kegg_orthology_download_path = 
"https://www.genome.jp/kegg-bin/download_htext?htext=ko00001.keg&format=htext&filedir=" + def is_database_exists(self): """This function determines whether the user has already downloaded the Kofam HMM profiles.""" @@ -177,6 +180,9 @@ def download(self): utils.download_file(self.database_url + '/' + file_name, os.path.join(self.kofam_data_dir, file_name), progress=self.progress, run=self.run) + # download the kegg orthology file + utils.download_file(self.kegg_orthology_download_path, os.path.join(self.kofam_data_dir, "ko00001.keg"), progress=self.progress, run=self.run) + def decompress_files(self): """This function decompresses the Kofam profiles.""" From a4e6f4a8d9cd3b854a1b50bfb371826f48b0a364 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 7 Feb 2020 11:09:39 -0600 Subject: [PATCH 110/400] download KEGG modules only instead of all orthology --- anvio/kofam.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index a3155431ea..a721f0e673 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -163,8 +163,8 @@ def __init__(self, args, run=run, progress=progress): self.database_url = "ftp://ftp.genome.jp/pub/db/kofam" self.files = ['ko_list.gz', 'profiles.tar.gz'] - # Kegg Orthology text file - self.kegg_orthology_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00001.keg&format=htext&filedir=" + # Kegg module text file + self.kegg_module_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002.keg&format=htext&filedir=" def is_database_exists(self): @@ -181,7 +181,7 @@ def download(self): os.path.join(self.kofam_data_dir, file_name), progress=self.progress, run=self.run) # download the kegg orthology file - utils.download_file(self.kegg_orthology_download_path, os.path.join(self.kofam_data_dir, "ko00001.keg"), progress=self.progress, run=self.run) + utils.download_file(self.kegg_module_download_path, os.path.join(self.kofam_data_dir, "ko00002.keg"), progress=self.progress, run=self.run) def decompress_files(self): From 5fcf6418e7f02397c327637e40d0d9632f9055f3 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 7 Feb 2020 11:12:31 -0600 Subject: [PATCH 111/400] make module file an attribute --- anvio/kofam.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index a721f0e673..ef2b457aa2 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -165,6 +165,7 @@ def __init__(self, args, run=run, progress=progress): # Kegg module text file self.kegg_module_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002.keg&format=htext&filedir=" + self.kegg_module_file = os.path.join(self.kofam_data_dir, "ko00002.keg") def is_database_exists(self): @@ -181,7 +182,7 @@ def download(self): os.path.join(self.kofam_data_dir, file_name), progress=self.progress, run=self.run) # download the kegg orthology file - utils.download_file(self.kegg_module_download_path, os.path.join(self.kofam_data_dir, "ko00002.keg"), progress=self.progress, run=self.run) + utils.download_file(self.kegg_module_download_path, self.kegg_module_file, progress=self.progress, run=self.run) def decompress_files(self): From 0bd5d8d5232a832ba073ae3d3f8ffac67e9477e3 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 7 Feb 2020 17:09:39 -0600 Subject: [PATCH 112/400] update kofam attributes --- anvio/kofam.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index ef2b457aa2..227f06a6c3 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ 
-44,10 +44,13 @@ def __init__(self, args): # default directory will be called KEGG and will store the KEGG Module data as well self.kofam_data_dir = A('kofam_data_dir') or os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') self.orphan_data_dir = os.path.join(self.kofam_data_dir, "orphan_data") + self.module_data_dir = os.path.join(self.kofam_data_dir, "modules") # shared variables for all KOfam subclasses self.kofam_hmm_file_path = os.path.join(self.kofam_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms self.ko_list_file_path = os.path.join(self.kofam_data_dir, "ko_list") + self.kegg_module_file = os.path.join(self.kofam_data_dir, "ko00002.keg") + self.module_dict = {} # this dict will be filled in by other functions def setup_ko_dict(self): """ From b86bde683b9f6a4329d26b2e4975a4e075c98793 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 7 Feb 2020 17:11:21 -0600 Subject: [PATCH 113/400] check if already downloaded; and now profile download function has different name --- anvio/kofam.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 227f06a6c3..197e127185 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -159,6 +159,7 @@ def __init__(self, args, run=run, progress=progress): filesnpaths.gen_output_directory(self.kofam_data_dir, delete_if_exists=args.reset) filesnpaths.gen_output_directory(self.orphan_data_dir, delete_if_exists=args.reset) + filesnpaths.gen_output_directory(self.module_data_dir, delete_if_exists=args.reset) # ftp path for HMM profiles and KO list # for ko list, add /ko_list.gz to end of url @@ -172,11 +173,19 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): - """This function determines whether the user has already downloaded the Kofam HMM profiles.""" + """This function determines whether the user has already downloaded the Kofam HMM profiles and KEGG modules.""" if os.path.exists(self.kofam_hmm_file_path): raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag if you want to re-download it." % self.kofam_data_dir) - def download(self): + if os.path.exists(self.kegg_module_file): + raise ConfigError("Interestingly, though KOfam HMM profiles are not installed on your system, KEGG module information seems to have been \ + already downloaded in %s. Please use the --reset flag to re-download everything from scratch." % self.kofam_data_dir) + + if os.path.exists(self.module_data_dir): + raise ConfigError("It seems the KEGG module directory %s already exists on your system. This is even more strange because Kofam HMM \ + profiles have not been downloaded. We suggest you to use the --reset flag to download everything from scratch." 
% self.module_data_dir) + + def download_profiles(self): """This function downloads the Kofam profiles.""" self.run.info("Database URL", self.database_url) @@ -300,7 +309,7 @@ def run_hmmpress(self): def setup_profiles(self): """This is a driver function which executes the Kofam setup process by downloading, decompressing, and hmmpressing the profiles.""" - self.download() + self.download_profiles() self.decompress_files() self.setup_ko_dict() self.run_hmmpress() From 726c56ec0732a21fca9359da0d93b0adfb7df165 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 7 Feb 2020 17:12:56 -0600 Subject: [PATCH 114/400] update some vars and output --- anvio/kofam.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 197e127185..a45452d93f 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -169,7 +169,7 @@ def __init__(self, args, run=run, progress=progress): # Kegg module text file self.kegg_module_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002.keg&format=htext&filedir=" - self.kegg_module_file = os.path.join(self.kofam_data_dir, "ko00002.keg") + self.kegg_rest_api_get = "http://rest.kegg.jp/get" def is_database_exists(self): @@ -187,7 +187,7 @@ def is_database_exists(self): def download_profiles(self): """This function downloads the Kofam profiles.""" - self.run.info("Database URL", self.database_url) + self.run.info("Kofam Profile Database URL", self.database_url) for file_name in self.files: utils.download_file(self.database_url + '/' + file_name, From 5dbf6bda0668a19275f7184c9696b6bc89db996a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 7 Feb 2020 17:13:26 -0600 Subject: [PATCH 115/400] functions for downloading the modules :) --- anvio/kofam.py | 91 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index a45452d93f..bdb0e2cf6f 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -193,9 +193,98 @@ def download_profiles(self): utils.download_file(self.database_url + '/' + file_name, os.path.join(self.kofam_data_dir, file_name), progress=self.progress, run=self.run) - # download the kegg orthology file + def process_module_file(self): + """This function reads the kegg module file into a dictionary. It should be called during setup to get the KEGG module numbers + so that KEGG modules can be downloaded. + + The structure of this file is like this: + + +D Module + #
  KEGG Modules
+ ! + APathway modules + B + B Carbohydrate metabolism + C Central carbohydrate metabolism + D M00001 Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate [PATH:map00010 map01200 map01100] + D M00002 Glycolysis, core module involving three-carbon compounds [PATH:map00010 map01200 map01230 map01100] + D M00003 Gluconeogenesis, oxaloacetate => fructose-6P [PATH:map00010 map00020 map01100] + + In other words, a bunch of initial lines to be ignored, and thereafter the line's information can be determined by the one-letter code at the start. + A = Pathway modules (metabolic pathways) or signature modules (gene sets that indicate a phenotypic trait, ie toxins). + B = Category of module (a type of metabolism for pathway modules. For signature modules, either Gene Set or Module Set) + C = Sub-category of module + D = Module + + """ + filesnpaths.is_file_exists(self.kegg_module_file) + filesnpaths.is_file_plain_text(self.kegg_module_file) + + f = open(self.kegg_module_file, 'rU') + + current_module_type = None + current_category = None + current_subcategory = None + + for line in f.readlines(): + line.strip('\n') + first_char = line[0] + + # garbage lines + if first_char in ["+", "#", "!"]: + continue + else: + # module type + if first_char == "A": + fields = re.split('<[^>]*>', line) # we split by the html tag here + current_module_type = fields[1] + # Category + elif first_char == "B": + fields = re.split('<[^>]*>', line) # we split by the html tag here + if len(fields) == 1: # sometimes this level has lines with only a B + continue + current_category = fields[1] + # Sub-category + elif first_char == "C": + fields = re.split('\s{2,}', line) # don't want to split the subcategory name, so we have to split at least 2 spaces + current_subcategory = fields[1] + # module + elif first_char == "D": + fields = re.split('\s{2,}', line) + mnum = fields[1] + module_name = fields[2] + self.module_dict[mnum] = {"name" : module_name} + # unknown code + else: + raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has \ + made the file unparseable. Sad. :(" % (self.kegg_module_file, first_char)) + + def download_modules(self): + """This function downloads the KEGG modules. To do so, it also processes the KEGG module file into a dictionary via the + process_module_file() function. To verify that each file has been downloaded properly, we check that the last line is '///'. + """ + self.run.info("KEGG Module Database URL", self.kegg_rest_api_get) + + # download the kegg module file, which lists all modules utils.download_file(self.kegg_module_download_path, self.kegg_module_file, progress=self.progress, run=self.run) + # get module dict + self.process_module_file() + self.run.info("Number of KEGG Modules", len(self.module_dict.keys())) + + # download all modules + for mnum in self.module_dict.keys(): + file_path = os.path.join(self.module_data_dir, mnum) + utils.download_file(self.kegg_rest_api_get + '/' + mnum, + file_path, progress=self.progress, run=self.run) + # verify entire file has been downloaded + f = open(file_path, 'rU') + f.seek(0, os.SEEK_END) + f.seek(f.tell() - 4, os.SEEK_SET) + last_line = f.readline().strip('\n') + if not last_line == '///': + raise ConfigError("The KEGG module file %s was not downloaded properly. We were expecting the last line in the file \ + to be '///', but instead it was %s." 
% (file_path, last_line)) def decompress_files(self): """This function decompresses the Kofam profiles.""" From fb7b79df37df1b79fedcc44f22a00ef1b1e65b8f Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 7 Feb 2020 17:29:42 -0600 Subject: [PATCH 116/400] add module download to the driver --- anvio/kofam.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index bdb0e2cf6f..cb874cfc44 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -400,6 +400,7 @@ def setup_profiles(self): """This is a driver function which executes the Kofam setup process by downloading, decompressing, and hmmpressing the profiles.""" self.download_profiles() self.decompress_files() + self.download_modules() self.setup_ko_dict() self.run_hmmpress() From de27b6b97bbd9d868bd4ab3e8f9e9a4288df1d29 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 7 Feb 2020 17:38:07 -0600 Subject: [PATCH 117/400] changed my mind, we will not save the module name here because we can get that from the module file later --- anvio/kofam.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index cb874cfc44..e90abb4284 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -252,7 +252,6 @@ def process_module_file(self): elif first_char == "D": fields = re.split('\s{2,}', line) mnum = fields[1] - module_name = fields[2] self.module_dict[mnum] = {"name" : module_name} # unknown code else: From 5672345e67cf30a8e5cf4decf5b6f6a4edfeb068 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 11 Feb 2020 14:58:52 -0600 Subject: [PATCH 118/400] parse modules skeleton function --- anvio/kofam.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index e90abb4284..b3504a0e0e 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -285,6 +285,10 @@ def download_modules(self): raise ConfigError("The KEGG module file %s was not downloaded properly. We were expecting the last line in the file \ to be '///', but instead it was %s." % (file_path, last_line)) + def parse_kegg_modules(self): + """This function reads information from each of the KEGG module flat files into the module_dict.""" + pass + def decompress_files(self): """This function decompresses the Kofam profiles.""" for file_name in self.files: From 87d4d3d5f1a311d9c0cea5f07a9f92c4fb1db7ed Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 11 Feb 2020 15:09:12 -0600 Subject: [PATCH 119/400] follow docstring convention --- anvio/drivers/hmmer.py | 7 +++-- anvio/kofam.py | 59 ++++++++++++++++++++++------------------ anvio/parsers/hmmscan.py | 5 ++-- 3 files changed, 40 insertions(+), 31 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index 9cdc40eeac..f0124b80c2 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -53,15 +53,16 @@ def __init__(self, target_files_dict, num_threads_to_use=1, progress=progress, r def verify_hmmpress_output(self, hmm_path): """This function verifies that the HMM profiles located at hmm_path have been successfully hmmpressed. + What this means is that every .hmm profile in the directory has an associated .h3f, .h3i, .h3m, and .h3p file. 
- PARAMETERS: + PARAMETERS + ========== hmm_path string, the path at which the HMM profiles are located - RETURNS: N/A - """ + for file_path in glob.glob(os.path.join(hmm_path, '*.hmm')): base_path = file_path[:-3] expected_extensions = ['h3f', 'h3i', 'h3m', 'h3p'] diff --git a/anvio/kofam.py b/anvio/kofam.py index b3504a0e0e..d774ce018d 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -1,9 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -""" - This file contains KofamSetup and Kofam classes. - -""" +"""This file contains KofamSetup and Kofam classes.""" import os import gzip @@ -36,9 +33,8 @@ class KofamContext(object): - """ - The purpose of this base class is to define shared functions and file paths for all KOfam operations. - """ + """The purpose of this base class is to define shared functions and file paths for all KOfam operations.""" + def __init__(self, args): A = lambda x: args.__dict__[x] if x in args.__dict__ else None # default directory will be called KEGG and will store the KEGG Module data as well @@ -53,8 +49,7 @@ def __init__(self, args): self.module_dict = {} # this dict will be filled in by other functions def setup_ko_dict(self): - """ - The purpose of this function is to process the ko_list file into usable form by Kofam sub-classes. + """The purpose of this function is to process the ko_list file into usable form by Kofam sub-classes. The ko_list file (which is downloaded along with the KOfam HMM profiles) contains important information for each KEGG Orthology number (KO, or knum), incuding pre-defined scoring thresholds @@ -94,8 +89,8 @@ def setup_ko_dict(self): [self.ko_dict.pop(ko) for ko in self.ko_no_threshold_list] def get_ko_skip_list(self): - """ - The purpose of this function is to determine which KO numbers have no associated data or just no score threshold in the ko_list file. + """The purpose of this function is to determine which KO numbers have no associated data or just no score threshold in the ko_list file. + That is, their ko_list entries look like this, with hypens in all but the first and last columns: K14936 - - - - - - - - - - small nucleolar RNA snR191 @@ -115,6 +110,7 @@ def get_ko_skip_list(self): skip_list list of strings, each string is a KO number no_threshold_list list of strings, each string is a KO number """ + col_names_to_check = ["threshold","score_type","profile_type","F-measure","nseq","nseq_used","alen","mlen","eff_nseq","re/pos"] skip_list = [] no_threshold_list = [] @@ -135,8 +131,7 @@ def get_ko_skip_list(self): return skip_list, no_threshold_list class KofamSetup(KofamContext): - """ Class for setting up KEGG Kofam HMM profiles. It performs sanity checks and downloads, unpacks, and prepares - the profiles for later use by `hmmscan`. + """Class for setting up KEGG Kofam HMM profiles. It performs sanity checks and downloads, unpacks, and prepares the profiles for later use by `hmmscan`. Parameters ========== @@ -174,6 +169,7 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): """This function determines whether the user has already downloaded the Kofam HMM profiles and KEGG modules.""" + if os.path.exists(self.kofam_hmm_file_path): raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag if you want to re-download it." 
% self.kofam_data_dir) @@ -187,6 +183,7 @@ def is_database_exists(self): def download_profiles(self): """This function downloads the Kofam profiles.""" + self.run.info("Kofam Profile Database URL", self.database_url) for file_name in self.files: @@ -194,8 +191,7 @@ def download_profiles(self): os.path.join(self.kofam_data_dir, file_name), progress=self.progress, run=self.run) def process_module_file(self): - """This function reads the kegg module file into a dictionary. It should be called during setup to get the KEGG module numbers - so that KEGG modules can be downloaded. + """This function reads the kegg module file into a dictionary. It should be called during setup to get the KEGG module numbers so that KEGG modules can be downloaded. The structure of this file is like this: @@ -217,6 +213,7 @@ def process_module_file(self): D = Module """ + filesnpaths.is_file_exists(self.kegg_module_file) filesnpaths.is_file_plain_text(self.kegg_module_file) @@ -259,9 +256,12 @@ def process_module_file(self): made the file unparseable. Sad. :(" % (self.kegg_module_file, first_char)) def download_modules(self): - """This function downloads the KEGG modules. To do so, it also processes the KEGG module file into a dictionary via the + """This function downloads the KEGG modules. + + To do so, it also processes the KEGG module file into a dictionary via the process_module_file() function. To verify that each file has been downloaded properly, we check that the last line is '///'. """ + self.run.info("KEGG Module Database URL", self.kegg_rest_api_get) # download the kegg module file, which lists all modules @@ -287,10 +287,12 @@ def download_modules(self): def parse_kegg_modules(self): """This function reads information from each of the KEGG module flat files into the module_dict.""" + pass def decompress_files(self): """This function decompresses the Kofam profiles.""" + for file_name in self.files: full_path = os.path.join(self.kofam_data_dir, file_name) @@ -300,10 +302,12 @@ def decompress_files(self): utils.gzip_decompress_file(full_path, keep_original=False) def confirm_downloaded_files(self): - """This function verifies that all Kofam profiles have been properly downloaded. It is intended to be run - after the files have been decompressed. The profiles directory should contain hmm files from K00001.hmm to - K23763.hmm with some exceptions; all KO numbers from ko_list file (except those in ko_skip_list) should be - included.""" + """This function verifies that all Kofam profiles have been properly downloaded. + + It is intended to be run after the files have been decompressed. The profiles directory should contain hmm files from K00001.hmm to + K23763.hmm with some exceptions; all KO numbers from ko_list file (except those in ko_skip_list) should be included. + """ + ko_nums = self.ko_dict.keys() for k in ko_nums: if k not in self.ko_skip_list: @@ -314,13 +318,15 @@ def confirm_downloaded_files(self): flag." 
% (hmm_path)) def move_orphan_files(self): - """ - This function moves the following to the orphan files directory: + """This function moves the following to the orphan files directory: + - profiles that do not have ko_list entries - profiles whose ko_list entries have no scoring threshold (in ko_no_threshold_list) + And, the following profiles should not have been downloaded, but we check if they exist and move any that do: - profiles whose ko_list entries have no data at all (in ko_skip_list) """ + if not os.path.exists(self.orphan_data_dir): # should not happen but we check just in case raise ConfigError("Hmm. Something is out of order. The orphan data directory %s does not exist \ yet, but it needs to in order for the move_orphan_files() function to work." % self.orphan_data_dir) @@ -368,6 +374,7 @@ def move_orphan_files(self): def run_hmmpress(self): """This function concatenates the Kofam profiles and runs hmmpress on them.""" + self.progress.new('Preparing Kofam HMM Profiles') log_file_path = os.path.join(self.kofam_data_dir, '00_hmmpress_log.txt') @@ -401,6 +408,7 @@ def run_hmmpress(self): def setup_profiles(self): """This is a driver function which executes the Kofam setup process by downloading, decompressing, and hmmpressing the profiles.""" + self.download_profiles() self.decompress_files() self.download_modules() @@ -408,14 +416,14 @@ def setup_profiles(self): self.run_hmmpress() class KofamRunHMMs(KofamContext): - """ Class for running `hmmscan` against the KOfam database and adding the resulting hits to contigs DBs - for later metabolism prediction. + """ Class for running `hmmscan` against the KOfam database and adding the resulting hits to contigs DB for later metabolism prediction. Parameters ========== args: Namespace object All the arguments supplied by user to anvi-run-kegg-kofams """ + def __init__(self, args, run=run, progress=progress): self.args = args self.run = run @@ -456,8 +464,7 @@ def get_annotation_from_ko_dict(self, knum, ok_if_missing_from_dict=False): return self.ko_dict[knum]['definition'] def process_kofam_hmms(self): - """This is a driver function for running HMMs against the KOfam database and processing the hits into the - provided contigs DB""" + """This is a driver function for running HMMs against the KOfam database and processing the hits into the provided contigs DB""" tmp_directory_path = filesnpaths.get_temp_directory_path() contigs_db = dbops.ContigsSuperclass(self.args) # initialize contigs db diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index f241d0009c..9da75b6319 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -56,8 +56,8 @@ def __init__(self, hmm_scan_hits_txt, alphabet='AA', context='GENE'): def get_search_results(self, ko_list_dict = None): - """ - This function goes through the hits provided by `hmmscan` and generates an annotation dictionary with the relevant information about each hit. + """This function goes through the hits provided by `hmmscan` and generates an annotation dictionary with the relevant information about each hit. + If we are parsing Kofam hits, then this function makes sure only hits with a high enough bit score make it into the annotation dictionary. 
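(A hedged aside, not part of the patch: the score comparison described above can be pictured roughly as in the sketch below. The key names 'gene_name', 'bit_score', and 'domain_bit_score' for a hit, and 'threshold'/'score_type' for a ko_list entry, are assumptions made for illustration; the real parser may use different names.)

    def hit_passes_kofam_threshold(hit, ko_list_dict):
        """Return True if one hmmscan hit meets the KO-specific bit score threshold."""
        ko_entry = ko_list_dict.get(hit['gene_name'])
        if not ko_entry:
            # a KO with no ko_list entry has no threshold to compare against
            return False
        # KOfam thresholds are defined against either the full-sequence or the domain score
        score = hit['bit_score'] if ko_entry['score_type'] == 'full' else hit['domain_bit_score']
        return float(score) >= float(ko_entry['threshold'])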
Parameters @@ -68,6 +68,7 @@ def get_search_results(self, ko_list_dict = None): ======= annotations_dict dictionary of annotations """ + annotations_dict = {} # this is the stuff we are going to try to fill with this: From efe1699b511ddaf97f210e4f1f9e7f88188dd6b2 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 17 Feb 2020 11:47:00 -0600 Subject: [PATCH 120/400] typos --- anvio/dbops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/dbops.py b/anvio/dbops.py index f69f8b1dc4..06dc63f856 100644 --- a/anvio/dbops.py +++ b/anvio/dbops.py @@ -3453,7 +3453,7 @@ def create(self, args): filesnpaths.is_file_exists(external_gene_calls) if external_gene_calls and skip_gene_calling: - raise ConfigError("You provided a file for external gene calls, and used requested gene calling to be\ + raise ConfigError("You provided a file for external gene calls, but also requested gene calling to be\ skipped. Please make up your mind.") if (external_gene_calls or skip_gene_calling) and prodigal_translation_table: @@ -3717,7 +3717,7 @@ def create(self, args): def compress_nt_position_info(self, contig_length, genes_in_contig, genes_in_contigs_dict): """This function compresses information regarding each nucleotide position in a given contig into a small int. Every nucleotide position is represented by four bits depending on whether - they occur in a complete opoen reading frame, and which base they correspond to in a codon. + they occur in a complete open reading frame, and which base they correspond to in a codon. 0000 |||| From 888ade4efeac7dd4678ded5c224071dd26ac1a11 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 17 Feb 2020 11:58:47 -0600 Subject: [PATCH 121/400] modules db class --- anvio/kofam.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index d774ce018d..2e407148c3 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -"""This file contains KofamSetup and Kofam classes.""" +"""This file contains KofamSetup, Kofam, and ModulesDatabase classes.""" import os import gzip @@ -530,3 +530,17 @@ def process_kofam_hmms(self): like to keep it for testing purposes)', nl_before=1, nl_after=1) shutil.rmtree(tmp_directory_path) hmmer.clean_tmp_dirs() + +class ModulesDatabase(): + """To create or access a Modules DB. + + This DB should be created in the Kegg Data folder during Kofam setup, and will be populated with information from the + Kegg Module files. + """ + + def __init__(self, db_path, run=run, progress=progress): + self.db = None + self.db_path = db_path + + self.run = run + self.progress = progress From be836198953d4481d21313ce635e1d191821a0d7 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 17 Feb 2020 11:58:58 -0600 Subject: [PATCH 122/400] func to setup modules.db --- anvio/kofam.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 2e407148c3..83ef3e2bfe 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -406,6 +406,12 @@ def run_hmmpress(self): self.progress.end() + def setup_modules_db(self): + """This function creates the Modules DB from the Kegg Module files. 
""" + + mod_db = ModulesDatabase(self.kofam_data_dir, run=run, progress=progress) + + def setup_profiles(self): """This is a driver function which executes the Kofam setup process by downloading, decompressing, and hmmpressing the profiles.""" @@ -414,6 +420,7 @@ def setup_profiles(self): self.download_modules() self.setup_ko_dict() self.run_hmmpress() + self.setup_modules_db() class KofamRunHMMs(KofamContext): """ Class for running `hmmscan` against the KOfam database and adding the resulting hits to contigs DB for later metabolism prediction. From 5c61346f06d701cd57f25efa4ad3dd6036881961 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 17 Feb 2020 13:06:23 -0600 Subject: [PATCH 123/400] add version for modules DB --- anvio/__init__.py | 10 +++++++--- anvio/tables/__init__.py | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/anvio/__init__.py b/anvio/__init__.py index e095041974..252dc4385a 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -2166,7 +2166,8 @@ def set_version(): t.genes_db_version, \ t.auxiliary_data_version, \ t.genomes_storage_vesion, \ - t.structure_db_version + t.structure_db_version, \ + t.modules_db_version def get_version_tuples(): @@ -2177,7 +2178,8 @@ def get_version_tuples(): ("Auxiliary data storage version", __auxiliary_data_version__), ("Pan DB version", __pan__version__), ("Genome data storage version", __genomes_storage_version__), - ("Structure DB version", __structure__version__)] + ("Structure DB version", __structure__version__), + ("Modules DB version", __modules__version__)] def print_version(): @@ -2188,6 +2190,7 @@ def print_version(): run.info("Genome data storage version", __genomes_storage_version__) run.info("Auxiliary data storage version", __auxiliary_data_version__) run.info("Structure DB version", __structure__version__) + run.info("Modules DB version", __modules__version__) __version__, \ @@ -2198,7 +2201,8 @@ def print_version(): __genes__version__, \ __auxiliary_data_version__, \ __genomes_storage_version__ , \ -__structure__version__ = set_version() +__structure__version__, \ +__modules__version__ = set_version() if '-v' in sys.argv or '--version' in sys.argv: diff --git a/anvio/tables/__init__.py b/anvio/tables/__init__.py index 0396a097cb..0fa81f71f5 100644 --- a/anvio/tables/__init__.py +++ b/anvio/tables/__init__.py @@ -21,6 +21,7 @@ structure_db_version = "1" genomes_storage_vesion = "6" workflow_config_version = "1" +modules_db_version = "1" versions_for_db_types = {'contigs': contigs_db_version, 'profile': profile_db_version, From 31f5c29306263b2779922e4efc24fd102dabbf01 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 17 Feb 2020 13:07:37 -0600 Subject: [PATCH 124/400] fix modules db path --- anvio/kofam.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index 83ef3e2bfe..428334cdd4 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -10,6 +10,7 @@ import re import anvio +import anvio.db as db import anvio.dbops as dbops import anvio.utils as utils import anvio.terminal as terminal @@ -409,7 +410,7 @@ def run_hmmpress(self): def setup_modules_db(self): """This function creates the Modules DB from the Kegg Module files. 
""" - mod_db = ModulesDatabase(self.kofam_data_dir, run=run, progress=progress) + mod_db = ModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), run=run, progress=progress) def setup_profiles(self): From b1b97994e43748f49ac1e1cbb762b47ca25cb9cf Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 17 Feb 2020 15:34:55 -0600 Subject: [PATCH 125/400] touch function --- anvio/kofam.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/anvio/kofam.py b/anvio/kofam.py index 428334cdd4..ad1d9782ca 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -552,3 +552,34 @@ def __init__(self, db_path, run=run, progress=progress): self.run = run self.progress = progress + + ## here we should call init function if the db exists + + def touch(): + """Creates an empty Modules database on disk, and sets `self.db` to access to it. + + At some point self.db.disconnect() must be called to complete the creation of the new db. + """ + + # sanity check to avoid overriding previous Modules DB + # this will probably never happen as long as this function is called through the setup script, but we check just in case + if os.path.exists(self.db_path): + raise ConfigError("A modules database at %s already exists. Please use the --reset flag when you restart the setup \ + if you really want to get rid of this one and make a new one." % (self.db_path)) + + + self.db = db.DB(self.db_path, anvio.__modules__version__, new_database=True) + + # I wonder if these should be moved to the tables __init__.py at some point? + module_table_name = "kegg_modules" + module_table_structure = ['module', 'data_name', 'data_value', 'data_definition'] + module_table_types = [ 'str' , 'str' , 'str' , 'str' ] + + self.db.create_table(module_table_name, module_table_structure, module_table_types) + + return self.db + + def create(): + """Creates the Modules DB""" + + self.touch() From 9c3d40fca26355a72a405365b62bc42174de1065 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 18 Feb 2020 20:48:24 -0600 Subject: [PATCH 126/400] create modules db is working --- anvio/kofam.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/anvio/kofam.py b/anvio/kofam.py index ad1d9782ca..f563fbcb63 100644 --- a/anvio/kofam.py +++ b/anvio/kofam.py @@ -411,6 +411,7 @@ def setup_modules_db(self): """This function creates the Modules DB from the Kegg Module files. """ mod_db = ModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), run=run, progress=progress) + mod_db.create() def setup_profiles(self): @@ -555,7 +556,7 @@ def __init__(self, db_path, run=run, progress=progress): ## here we should call init function if the db exists - def touch(): + def touch(self): """Creates an empty Modules database on disk, and sets `self.db` to access to it. At some point self.db.disconnect() must be called to complete the creation of the new db. 
@@ -579,7 +580,11 @@ def touch(): return self.db - def create(): + def create(self): """Creates the Modules DB""" self.touch() + + self.db.set_meta_value('db_type', 'modules') + + self.db.disconnect() From b3b743565e2e583f4c8946e4c8e872a1e6975e0f Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 19 Feb 2020 09:09:46 -0600 Subject: [PATCH 127/400] rename kofam to kegg --- anvio/{kofam.py => kegg.py} | 41 ++++++++++++++++++++++++++++++------- bin/anvi-run-kegg-kofams | 4 ++-- bin/anvi-setup-kegg-kofams | 8 ++++---- 3 files changed, 40 insertions(+), 13 deletions(-) rename anvio/{kofam.py => kegg.py} (95%) diff --git a/anvio/kofam.py b/anvio/kegg.py similarity index 95% rename from anvio/kofam.py rename to anvio/kegg.py index f563fbcb63..446f4d5f52 100644 --- a/anvio/kofam.py +++ b/anvio/kegg.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -"""This file contains KofamSetup, Kofam, and ModulesDatabase classes.""" +"""This file contains Kegg related classes.""" import os import gzip @@ -33,7 +33,7 @@ pp = terminal.pretty_print -class KofamContext(object): +class KeggContext(object): """The purpose of this base class is to define shared functions and file paths for all KOfam operations.""" def __init__(self, args): @@ -131,7 +131,7 @@ def get_ko_skip_list(self): no_threshold_list.append(k) return skip_list, no_threshold_list -class KofamSetup(KofamContext): +class KeggSetup(KeggContext): """Class for setting up KEGG Kofam HMM profiles. It performs sanity checks and downloads, unpacks, and prepares the profiles for later use by `hmmscan`. Parameters @@ -146,7 +146,7 @@ def __init__(self, args, run=run, progress=progress): self.progress = progress # init the base class - KofamContext.__init__(self, self.args) + KeggContext.__init__(self, self.args) filesnpaths.is_program_exists('hmmpress') @@ -424,10 +424,9 @@ def setup_profiles(self): self.run_hmmpress() self.setup_modules_db() -class KofamRunHMMs(KofamContext): +class KeggRunHMMs(KeggContext): """ Class for running `hmmscan` against the KOfam database and adding the resulting hits to contigs DB for later metabolism prediction. - Parameters ========== args: Namespace object All the arguments supplied by user to anvi-run-kegg-kofams @@ -442,7 +441,7 @@ def __init__(self, args, run=run, progress=progress): self.ko_dict = None # should be set up by setup_ko_dict() # init the base class - KofamContext.__init__(self, self.args) + KeggContext.__init__(self, self.args) filesnpaths.is_program_exists('hmmscan') @@ -588,3 +587,31 @@ def create(self): self.db.set_meta_value('db_type', 'modules') self.db.disconnect() + +class ModulesTable: + """This class defines operations for creating the Modules table in Modules.db""" + + def __init__(self, split_length): + self.db_entries = [] + self.total_modules = 0 + + """ UPDATE ME TO WORK FOR MODULES + def append(self, seq_id, sequence, gene_start_stops=None): + sequence_length = len(sequence) + gc_content = utils.get_GC_content_for_sequence(sequence) + + # how many splits will there be? 
+ split_start_stops = utils.get_split_start_stops(sequence_length, self.split_length, gene_start_stops) + + self.total_nts += sequence_length + self.total_contigs += 1 + db_entry = tuple([seq_id, sequence_length, gc_content, len(split_start_stops)]) + self.db_entries.append(db_entry) + + return (sequence_length, split_start_stops, gc_content) + + + def store(self, db): + if len(self.db_entries): + db._exec_many('''INSERT INTO %s VALUES (%s)''' % (t.contigs_info_table_name, (','.join(['?'] * len(self.db_entries[0])))), self.db_entries) + """ diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams index ed292c3353..915b31cc23 100755 --- a/bin/anvi-run-kegg-kofams +++ b/bin/anvi-run-kegg-kofams @@ -5,7 +5,7 @@ import sys import anvio import anvio.terminal as terminal -import anvio.kofam as kofam +import anvio.kegg as kegg from anvio.errors import ConfigError, FilesNPathsError from anvio.terminal import time_program @@ -22,7 +22,7 @@ __description__ = "Run KOfam HMMs on an anvi'o contigs database." @time_program def main(args): - p = kofam.KofamRunHMMs(args) + p = kegg.KeggRunHMMs(args) p.process_kofam_hmms() if __name__ == '__main__': diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index c21cc9dcd8..868e1026ae 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -4,7 +4,7 @@ import sys import anvio -import anvio.kofam as kofam +import anvio.kegg as kegg from anvio.errors import ConfigError, FilesNPathsError @@ -12,8 +12,8 @@ __author__ = "Developers of anvi'o (see AUTHORS.txt)" __copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" __license__ = "GPL 3.0" __version__ = anvio.__version__ -__maintainer__ = "Özcan Esen" -__email__ = "ozcanesen@gmail.com" +__maintainer__ = "Iva Veseli" +__email__ = "iveseli@uchicago.edu" __provides__ = ["kofam-data"] __description__ = "Download and setup KEGG KOfam HMM profiles." 
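(An illustrative aside rather than part of the patch series: because KeggSetup only reads attributes off an args object, the flow driven by the setup script can also be reproduced from a Python session. The attribute names and the data directory below are assumptions for the sketch, and HMMER's hmmpress must be available for the class to initialize.)

    import argparse
    import anvio.kegg as kegg   # the module introduced by these patches

    # a stand-in for the parsed command-line arguments
    args = argparse.Namespace(kofam_data_dir='/some/where/KEGG', reset=False)

    setup = kegg.KeggSetup(args)   # sanity checks and data directory handling
    setup.setup_profiles()         # download, decompress, hmmpress, build MODULES.db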
@@ -33,7 +33,7 @@ if __name__ == '__main__': args = anvio.get_args(parser) try: - setup = kofam.KofamSetup(args) + setup = kegg.KeggSetup(args) setup.setup_profiles() except ConfigError as e: From ab4adc6a8337fdafc247585a6af9940d3d2a3246 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 19 Feb 2020 09:18:24 -0600 Subject: [PATCH 128/400] rename modules to kegg_modules --- anvio/__init__.py | 8 ++++---- anvio/kegg.py | 10 +++++----- anvio/tables/__init__.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/anvio/__init__.py b/anvio/__init__.py index 252dc4385a..2c90a65d4f 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -2167,7 +2167,7 @@ def set_version(): t.auxiliary_data_version, \ t.genomes_storage_vesion, \ t.structure_db_version, \ - t.modules_db_version + t.kegg_modules_db_version def get_version_tuples(): @@ -2179,7 +2179,7 @@ def get_version_tuples(): ("Pan DB version", __pan__version__), ("Genome data storage version", __genomes_storage_version__), ("Structure DB version", __structure__version__), - ("Modules DB version", __modules__version__)] + ("Kegg Modules DB version", __kegg_modules_version__)] def print_version(): @@ -2190,7 +2190,7 @@ def print_version(): run.info("Genome data storage version", __genomes_storage_version__) run.info("Auxiliary data storage version", __auxiliary_data_version__) run.info("Structure DB version", __structure__version__) - run.info("Modules DB version", __modules__version__) + run.info("Kegg Modules DB version", __kegg_modules_version__) __version__, \ @@ -2202,7 +2202,7 @@ def print_version(): __auxiliary_data_version__, \ __genomes_storage_version__ , \ __structure__version__, \ -__modules__version__ = set_version() +__kegg_modules_version__ = set_version() if '-v' in sys.argv or '--version' in sys.argv: diff --git a/anvio/kegg.py b/anvio/kegg.py index 446f4d5f52..affd3f0859 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -410,7 +410,7 @@ def run_hmmpress(self): def setup_modules_db(self): """This function creates the Modules DB from the Kegg Module files. """ - mod_db = ModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), run=run, progress=progress) + mod_db = KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), run=run, progress=progress) mod_db.create() @@ -539,7 +539,7 @@ def process_kofam_hmms(self): shutil.rmtree(tmp_directory_path) hmmer.clean_tmp_dirs() -class ModulesDatabase(): +class KeggModulesDatabase(): """To create or access a Modules DB. This DB should be created in the Kegg Data folder during Kofam setup, and will be populated with information from the @@ -568,7 +568,7 @@ def touch(self): if you really want to get rid of this one and make a new one." % (self.db_path)) - self.db = db.DB(self.db_path, anvio.__modules__version__, new_database=True) + self.db = db.DB(self.db_path, anvio.__kegg_modules_version__, new_database=True) # I wonder if these should be moved to the tables __init__.py at some point? 
module_table_name = "kegg_modules" @@ -588,8 +588,8 @@ def create(self): self.db.disconnect() -class ModulesTable: - """This class defines operations for creating the Modules table in Modules.db""" +class KeggModulesTable: + """This class defines operations for creating the KEGG Modules table in Modules.db""" def __init__(self, split_length): self.db_entries = [] diff --git a/anvio/tables/__init__.py b/anvio/tables/__init__.py index 0fa81f71f5..e363a9f261 100644 --- a/anvio/tables/__init__.py +++ b/anvio/tables/__init__.py @@ -21,7 +21,7 @@ structure_db_version = "1" genomes_storage_vesion = "6" workflow_config_version = "1" -modules_db_version = "1" +kegg_modules_db_version = "1" versions_for_db_types = {'contigs': contigs_db_version, 'profile': profile_db_version, From 5e62eaf492639bb0773807f9ab056c4038326b54 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 21 Feb 2020 13:03:18 -0600 Subject: [PATCH 129/400] move module table info to init --- anvio/kegg.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index affd3f0859..e08daaeced 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -553,6 +553,12 @@ def __init__(self, db_path, run=run, progress=progress): self.run = run self.progress = progress + # modules table info + # I wonder if these should be moved to the tables __init__.py at some point? + self.module_table_name = "kegg_modules" + self.module_table_structure = ['module', 'data_name', 'data_value', 'data_definition'] + self.module_table_types = [ 'str' , 'str' , 'str' , 'str' ] + ## here we should call init function if the db exists def touch(self): @@ -570,12 +576,7 @@ def touch(self): self.db = db.DB(self.db_path, anvio.__kegg_modules_version__, new_database=True) - # I wonder if these should be moved to the tables __init__.py at some point? - module_table_name = "kegg_modules" - module_table_structure = ['module', 'data_name', 'data_value', 'data_definition'] - module_table_types = [ 'str' , 'str' , 'str' , 'str' ] - - self.db.create_table(module_table_name, module_table_structure, module_table_types) + self.db.create_table(self.module_table_name, self.module_table_structure, self.module_table_types) return self.db From b63c1cbde89dbedf3c42739c003031ea1c03397b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 21 Feb 2020 13:03:57 -0600 Subject: [PATCH 130/400] ModulesDB now inherits KeggContext so it knows paths --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index e08daaeced..034dc51486 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -539,7 +539,7 @@ def process_kofam_hmms(self): shutil.rmtree(tmp_directory_path) hmmer.clean_tmp_dirs() -class KeggModulesDatabase(): +class KeggModulesDatabase(KeggContext): """To create or access a Modules DB. 
This DB should be created in the Kegg Data folder during Kofam setup, and will be populated with information from the From 21758f361a2ae673d36e8d21fd53ed71048bb13a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 21 Feb 2020 17:40:25 -0600 Subject: [PATCH 131/400] module table name as param for KeggModulesTable --- anvio/kegg.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 034dc51486..b548b487ce 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -592,10 +592,16 @@ def create(self): class KeggModulesTable: """This class defines operations for creating the KEGG Modules table in Modules.db""" - def __init__(self, split_length): + def __init__(self, mod_table_name = None): + """""" self.db_entries = [] self.total_modules = 0 + if mod_table_name: + self.module_table_name = mod_table_name + else: + raise ConfigError("Beep Beep. Warning. KeggModulesTable was initialized without knowing its own name.") + """ UPDATE ME TO WORK FOR MODULES def append(self, seq_id, sequence, gene_start_stops=None): sequence_length = len(sequence) From c5d5ed89b74971163c89eceb9cbd2852ab5239ab Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 21 Feb 2020 17:48:11 -0600 Subject: [PATCH 132/400] append and store funcs --- anvio/kegg.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index b548b487ce..466e218f1e 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -595,30 +595,29 @@ class KeggModulesTable: def __init__(self, mod_table_name = None): """""" self.db_entries = [] - self.total_modules = 0 + self.total_entries = 0 if mod_table_name: self.module_table_name = mod_table_name else: raise ConfigError("Beep Beep. Warning. KeggModulesTable was initialized without knowing its own name.") - """ UPDATE ME TO WORK FOR MODULES - def append(self, seq_id, sequence, gene_start_stops=None): - sequence_length = len(sequence) - gc_content = utils.get_GC_content_for_sequence(sequence) - # how many splits will there be? - split_start_stops = utils.get_split_start_stops(sequence_length, self.split_length, gene_start_stops) + def append_and_store(self, module_num, data_name, data_value, data_definition=None, line_num=None): + """This function handles collects db entries (as tuples) into a list, and once we have 10,000 of them it stores that set into the Modules table. - self.total_nts += sequence_length - self.total_contigs += 1 - db_entry = tuple([seq_id, sequence_length, gc_content, len(split_start_stops)]) + The db_entries list is cleared after each store so that future stores don't add duplicate entries to the table. 
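(As a side note that is not part of the patch: the accumulate-then-flush idea described here is a standard way to keep SQLite inserts fast. A self-contained version of the same pattern, using only the standard library rather than anvi'o's db wrapper, might look like this; note the final flush for whatever is left over after the last full batch.)

    import sqlite3

    def bulk_insert(db_path, table_name, rows, batch_size=10000):
        """Insert an iterable of equal-length tuples into an existing table in batches."""
        conn = sqlite3.connect(db_path)
        pending = []
        for row in rows:
            pending.append(row)
            if len(pending) >= batch_size:
                placeholders = ','.join(['?'] * len(pending[0]))
                conn.executemany('INSERT INTO %s VALUES (%s)' % (table_name, placeholders), pending)
                pending = []
        if pending:  # rows remaining after the last full batch still need to be written
            placeholders = ','.join(['?'] * len(pending[0]))
            conn.executemany('INSERT INTO %s VALUES (%s)' % (table_name, placeholders), pending)
        conn.commit()
        conn.close()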
+ """ + + db_entry = tuple([module_num, data_name, data_value, data_definition, line_num]) self.db_entries.append(db_entry) + self.total_entries += 1 - return (sequence_length, split_start_stops, gc_content) + if len(self.db_entries) > 10000: + self.store() + self.db_entries = [] - def store(self, db): + def store(self): if len(self.db_entries): - db._exec_many('''INSERT INTO %s VALUES (%s)''' % (t.contigs_info_table_name, (','.join(['?'] * len(self.db_entries[0])))), self.db_entries) - """ + db._exec_many('''INSERT INTO %s VALUES (%s)''' % (self.module_table_name, (','.join(['?'] * len(self.db_entries[0])))), self.db_entries) From 1d0ed9404725b49d081cdd7c26c0a1bb82467bdc Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 21 Feb 2020 18:12:38 -0600 Subject: [PATCH 133/400] loop through modules files in create() --- anvio/kegg.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 466e218f1e..2504573aeb 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -585,6 +585,48 @@ def create(self): self.touch() + self.progress.new("Loading KEGG modules into Modules DB...") + + # sanity check that we setup the modules previously. + # It shouldn't be a problem since this function should only be called during the setup process after modules download, but just in case. + if not os.exists(module_data_dir) or len(self.module_dict.keys()) == 0: + raise ConfigError("Appparently, the Kegg Modules were not correctly setup and now all sorts of things are broken. The \ + Modules DB cannot be created from broken things. BTW, this error is not supposed to happen to anyone except maybe developers, so \ + if you do not fall into that category you are likely in deep doo-doo. Maybe re-running setup with --reset will work? (if not, you \ + probably should email/Slack/telepathically cry out for help to the developers.)") + + + num_modules_parsed = 0 + for mnum in self.module_dict.keys(): + mod_file_path = os.path.join(self.module_data_dir, mnum) + f = open(mod_file_path, 'rU') + + prev_data_name_field = None + for line in f.readlines(): + line.strip('\n') + + # check for last line ///. We don't want to send the last line to the parsing function because it will break. + if not line == '///': + # parse the line into a tuple + + # here is the tricky bit about parsing these files. Not all lines start with the data_name field; those that don't start with a space. + # if this is the case, we need to tell the parsing function what the previous data_name field has been. 
+ if line[0] == ' ': + pass + else: + pass + + # extract that tuple info + # update prev_data_name_field + # call append_and_store which will collect db entries and store every 10000 at a time + + + num_modules_parsed += 1 + + # give some run info + # record number of modules in db + + self.db.set_meta_value('db_type', 'modules') self.db.disconnect() From 7646dd4011fe7f2318f9338cab8538315ba5cc93 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 24 Feb 2020 09:51:48 -0600 Subject: [PATCH 134/400] add line num tracking for module parsing --- anvio/kegg.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 2504573aeb..6ed72d6571 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -597,6 +597,7 @@ def create(self): num_modules_parsed = 0 + line_number = 0 for mnum in self.module_dict.keys(): mod_file_path = os.path.join(self.module_data_dir, mnum) f = open(mod_file_path, 'rU') @@ -604,6 +605,7 @@ def create(self): prev_data_name_field = None for line in f.readlines(): line.strip('\n') + line_number += 1 # check for last line ///. We don't want to send the last line to the parsing function because it will break. if not line == '///': From c430d8e5706c036d728febb68416732307b0b963 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 24 Feb 2020 15:44:47 -0600 Subject: [PATCH 135/400] line parsing function --- anvio/kegg.py | 59 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 6ed72d6571..2446032802 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -286,10 +286,6 @@ def download_modules(self): raise ConfigError("The KEGG module file %s was not downloaded properly. We were expecting the last line in the file \ to be '///', but instead it was %s." % (file_path, last_line)) - def parse_kegg_modules(self): - """This function reads information from each of the KEGG module flat files into the module_dict.""" - - pass def decompress_files(self): """This function decompresses the Kofam profiles.""" @@ -580,6 +576,61 @@ def touch(self): return self.db + def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None): + """This function parses information from one line of a KEGG module file. + + These files have fields separated by 2 or more spaces. Fields can include data name (not always), data value (always), and data definition (not always). + Lines for pathway module files can have between 2 and 4 fields, but in fact the only situation where there should be 4 lines is the ENTRY data, + which for some inexplicable reason has multiple spaces between "Pathway" and "Module" in the data definition field. We can safely ignore this last "Module", I think. + + Some lines will have multiple entities in the data_value field (ie, multiple KOs or reaction numbers) and will be split into multiple db entries. + + PARAMETERS + ========== + line str, the line to parse + line_num int, which line number we are working on. We need this to keep track of which entities come from the same line of the file. + current_data_name str, which data name we are working on. If this is None, we need to parse this info from the first field in the line. + + RETURNS + ======= + line_entries a list of tuples, each containing information for one db entry, namely data name, data value, data definition, and line number. 
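(To make the splitting concrete with a hypothetical walk-through that is not in the original docstring: suppose line 12 of a module file reads

    ORTHOLOGY   K00844,K12407       hexokinase/glucokinase [EC:2.7.1.1 2.7.1.2]

then the parser described here would hand back two tuples that differ only in the data value:

    [('ORTHOLOGY', 'K00844', 'hexokinase/glucokinase [EC:2.7.1.1 2.7.1.2]', 12),
     ('ORTHOLOGY', 'K12407', 'hexokinase/glucokinase [EC:2.7.1.1 2.7.1.2]', 12)]

and each tuple becomes one row of the kegg_modules table once the module number is added.)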
+ Not all parts of the db entry will be included (module num, for instance), so this information must be parsed and combined with + the missing information before being added to the database. + """ + + fields = re.split('\s{2,}', line) + data_vals = None + data_def = None + line_entries = [] + + # data name unknown, parse from first field + if not current_data_name: + # sanity check: if line starts with space then there is no data name field and we should have passed a current_data_name + if line[0] == ' ': + raise ConfigError("Oh, please. Some silly developer (you know who you are) has tried to call parse_kegg_modules_line() on \ + a line without a data name field, and forgot to give it the current data name. Shame on you, go fix this. (For reference here \ + is the line: %s)" % (line)) + + current_data_name = fields[0] + data_vals = fields[1] + if len(fields) > 2: # not all lines have a definition field + data_def = fields[2] + else: # data name known + data_vals = fields[0] + data_def = fields[1] + + # some types of information may need to be split into multiple db entries + data_types_to_split = ["ORTHOLOGY","REACTION"] # lines that fall under these categories need to have data_vals split on comma + if current_data_name in data_types_to_split: + for val in data_vals.split(','): + line_entries.append((current_data_name, val, data_def, line_num) + else: # just send what we found without splitting the line + line_entries.append((current_data_name, data_vals, data_def, line_num) + + # still need to figure out what to do about REFERENCE info type (includes AUTHORS, TITLE, JOURNAL) - do we want this? + return line_entries + + def create(self): """Creates the Modules DB""" From 91155c4308fbb2c8ed0e45f30f957f64ad10c640 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 24 Feb 2020 15:45:46 -0600 Subject: [PATCH 136/400] parens fixy --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 2446032802..28c0b0d9b7 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -623,7 +623,7 @@ def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None) data_types_to_split = ["ORTHOLOGY","REACTION"] # lines that fall under these categories need to have data_vals split on comma if current_data_name in data_types_to_split: for val in data_vals.split(','): - line_entries.append((current_data_name, val, data_def, line_num) + line_entries.append((current_data_name, val, data_def, line_num)) else: # just send what we found without splitting the line line_entries.append((current_data_name, data_vals, data_def, line_num) From 06ea6b8298ec06de724f3bc7281e9fea735ffd16 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 24 Feb 2020 16:01:44 -0600 Subject: [PATCH 137/400] now create can parse the lines and add to db --- anvio/kegg.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 28c0b0d9b7..8639e95b60 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -646,10 +646,13 @@ def create(self): if you do not fall into that category you are likely in deep doo-doo. Maybe re-running setup with --reset will work? 
(if not, you \ probably should email/Slack/telepathically cry out for help to the developers.)") + # init the Modules table + mod_table = KeggModulesTable(self.module_table_name) num_modules_parsed = 0 line_number = 0 for mnum in self.module_dict.keys(): + self.progress.update("Parsing KEGG Module %s" % mnum) mod_file_path = os.path.join(self.module_data_dir, mnum) f = open(mod_file_path, 'rU') @@ -661,17 +664,23 @@ def create(self): # check for last line ///. We don't want to send the last line to the parsing function because it will break. if not line == '///': # parse the line into a tuple - + entries_tuple_list = None # here is the tricky bit about parsing these files. Not all lines start with the data_name field; those that don't start with a space. # if this is the case, we need to tell the parsing function what the previous data_name field has been. if line[0] == ' ': - pass + entries_tuple_list = self.parse_kegg_modules_line(line, line_number, prev_data_name_field) else: - pass + entries_tuple_list = self.parse_kegg_modules_line(line, line_number) + + # update prev_data_name_field; use the first (and perhaps only) entry by default + prev_data_name_field = entries_tuple_list[0][0] + + # unpack that tuple info + for entry_info in entries_tuple_list: + name, val, def, line = entry_info + # call append_and_store which will collect db entries and store every 10000 at a time + mod_table.append_and_store(mnum, name, val, def, line) - # extract that tuple info - # update prev_data_name_field - # call append_and_store which will collect db entries and store every 10000 at a time num_modules_parsed += 1 From 175b219068348438dbe92c39b940630f66c38e37 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 25 Feb 2020 12:00:34 -0600 Subject: [PATCH 138/400] generate some output and save some metadata --- anvio/kegg.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 8639e95b60..20989ffa61 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -542,9 +542,10 @@ class KeggModulesDatabase(KeggContext): Kegg Module files. """ - def __init__(self, db_path, run=run, progress=progress): + def __init__(self, db_path, run=run, progress=progress, quiet=False): self.db = None self.db_path = db_path + self.quiet = quiet self.run = run self.progress = progress @@ -686,10 +687,14 @@ def create(self): num_modules_parsed += 1 # give some run info - # record number of modules in db - + self.run.info('Modules database', 'A new database, %s, has been created.' 
% (self.db_path), quiet=self.quiet) + self.run.info('Number of KEGG modules', num_modules_parsed, quiet=self.quiet) + self.run.info('Number of entries', mod_table.get_total_entries(), quiet=self.quiet) + # record some useful metadata self.db.set_meta_value('db_type', 'modules') + self.db.set_meta_value('num_modules', num_modules_parsed) + self.db.set_meta_value('total_entries', mod_table.get_total_entries()) self.db.disconnect() @@ -725,3 +730,6 @@ def append_and_store(self, module_num, data_name, data_value, data_definition=No def store(self): if len(self.db_entries): db._exec_many('''INSERT INTO %s VALUES (%s)''' % (self.module_table_name, (','.join(['?'] * len(self.db_entries[0])))), self.db_entries) + + def get_total_entries(self): + return self.total_entries From 26de4973c6b3df674b5830a0bd18346217e4bd81 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 26 Feb 2020 11:42:02 -0600 Subject: [PATCH 139/400] fix tab and syntax errors --- anvio/kegg.py | 227 +++++++++++++++++++++++++------------------------- 1 file changed, 113 insertions(+), 114 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 20989ffa61..1889fcbfa0 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -58,8 +58,8 @@ def setup_ko_dict(self): It looks something like this: - knum threshold score_type profile_type F-measure nseq nseq_used alen mlen eff_nseq re/pos definition - K00001 329.57 domain trim 0.231663 1473 1069 1798 371 17.12 0.590 alcohol dehydrogenase [EC:1.1.1.1] + knum threshold score_type profile_type F-measure nseq nseq_used alen mlen eff_nseq re/pos definition + K00001 329.57 domain trim 0.231663 1473 1069 1798 371 17.12 0.590 alcohol dehydrogenase [EC:1.1.1.1] Since this information is useful for both the setup process (we need to know all the knums) and HMM process, all Kofam subclasses need to have access to this dictionary. @@ -94,12 +94,12 @@ def get_ko_skip_list(self): That is, their ko_list entries look like this, with hypens in all but the first and last columns: - K14936 - - - - - - - - - - small nucleolar RNA snR191 - K15035 - - - - - - - - - - transfer-messenger RNA - K15841 - - - - - - - - - - small regulatory RNA GlmY - K15851 - - - - - - - - - - quorum regulatory RNA Qrr - K16736 - - - - - - - - - - bantam - K16863 - - - - - - - - - - microRNA 21 + K14936 - - - - - - - - - - small nucleolar RNA snR191 + K15035 - - - - - - - - - - transfer-messenger RNA + K15841 - - - - - - - - - - small regulatory RNA GlmY + K15851 - - - - - - - - - - quorum regulatory RNA Qrr + K16736 - - - - - - - - - - bantam + K16863 - - - - - - - - - - microRNA 21 These are RNAs. @@ -196,7 +196,7 @@ def process_module_file(self): The structure of this file is like this: - +D Module + +D Module #
<h2>...&nbsp; KEGG Modules</h2>
! APathway modules @@ -545,13 +545,13 @@ class KeggModulesDatabase(KeggContext): def __init__(self, db_path, run=run, progress=progress, quiet=False): self.db = None self.db_path = db_path - self.quiet = quiet + self.quiet = quiet self.run = run self.progress = progress - # modules table info - # I wonder if these should be moved to the tables __init__.py at some point? + # modules table info + # I wonder if these should be moved to the tables __init__.py at some point? self.module_table_name = "kegg_modules" self.module_table_structure = ['module', 'data_name', 'data_value', 'data_definition'] self.module_table_types = [ 'str' , 'str' , 'str' , 'str' ] @@ -577,59 +577,59 @@ def touch(self): return self.db - def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None): + def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None): """This function parses information from one line of a KEGG module file. - These files have fields separated by 2 or more spaces. Fields can include data name (not always), data value (always), and data definition (not always). - Lines for pathway module files can have between 2 and 4 fields, but in fact the only situation where there should be 4 lines is the ENTRY data, - which for some inexplicable reason has multiple spaces between "Pathway" and "Module" in the data definition field. We can safely ignore this last "Module", I think. + These files have fields separated by 2 or more spaces. Fields can include data name (not always), data value (always), and data definition (not always). + Lines for pathway module files can have between 2 and 4 fields, but in fact the only situation where there should be 4 lines is the ENTRY data, + which for some inexplicable reason has multiple spaces between "Pathway" and "Module" in the data definition field. We can safely ignore this last "Module", I think. - Some lines will have multiple entities in the data_value field (ie, multiple KOs or reaction numbers) and will be split into multiple db entries. + Some lines will have multiple entities in the data_value field (ie, multiple KOs or reaction numbers) and will be split into multiple db entries. - PARAMETERS - ========== - line str, the line to parse - line_num int, which line number we are working on. We need this to keep track of which entities come from the same line of the file. - current_data_name str, which data name we are working on. If this is None, we need to parse this info from the first field in the line. + PARAMETERS + ========== + line str, the line to parse + line_num int, which line number we are working on. We need this to keep track of which entities come from the same line of the file. + current_data_name str, which data name we are working on. If this is None, we need to parse this info from the first field in the line. - RETURNS - ======= - line_entries a list of tuples, each containing information for one db entry, namely data name, data value, data definition, and line number. - Not all parts of the db entry will be included (module num, for instance), so this information must be parsed and combined with - the missing information before being added to the database. - """ + RETURNS + ======= + line_entries a list of tuples, each containing information for one db entry, namely data name, data value, data definition, and line number. 
+ Not all parts of the db entry will be included (module num, for instance), so this information must be parsed and combined with + the missing information before being added to the database. + """ fields = re.split('\s{2,}', line) - data_vals = None - data_def = None - line_entries = [] - - # data name unknown, parse from first field - if not current_data_name: - # sanity check: if line starts with space then there is no data name field and we should have passed a current_data_name - if line[0] == ' ': - raise ConfigError("Oh, please. Some silly developer (you know who you are) has tried to call parse_kegg_modules_line() on \ - a line without a data name field, and forgot to give it the current data name. Shame on you, go fix this. (For reference here \ - is the line: %s)" % (line)) - - current_data_name = fields[0] - data_vals = fields[1] - if len(fields) > 2: # not all lines have a definition field - data_def = fields[2] - else: # data name known - data_vals = fields[0] - data_def = fields[1] - - # some types of information may need to be split into multiple db entries - data_types_to_split = ["ORTHOLOGY","REACTION"] # lines that fall under these categories need to have data_vals split on comma - if current_data_name in data_types_to_split: - for val in data_vals.split(','): - line_entries.append((current_data_name, val, data_def, line_num)) - else: # just send what we found without splitting the line - line_entries.append((current_data_name, data_vals, data_def, line_num) - - # still need to figure out what to do about REFERENCE info type (includes AUTHORS, TITLE, JOURNAL) - do we want this? - return line_entries + data_vals = None + data_def = None + line_entries = [] + + # data name unknown, parse from first field + if not current_data_name: + # sanity check: if line starts with space then there is no data name field and we should have passed a current_data_name + if line[0] == ' ': + raise ConfigError("Oh, please. Some silly developer (you know who you are) has tried to call parse_kegg_modules_line() on \ + a line without a data name field, and forgot to give it the current data name. Shame on you, go fix this. (For reference here \ + is the line: %s)" % (line)) + + current_data_name = fields[0] + data_vals = fields[1] + if len(fields) > 2: # not all lines have a definition field + data_def = fields[2] + else: # data name known + data_vals = fields[0] + data_def = fields[1] + + # some types of information may need to be split into multiple db entries + data_types_to_split = ["ORTHOLOGY","REACTION"] # lines that fall under these categories need to have data_vals split on comma + if current_data_name in data_types_to_split: + for val in data_vals.split(','): + line_entries.append((current_data_name, val, data_def, line_num)) + else: # just send what we found without splitting the line + line_entries.append((current_data_name, data_vals, data_def, line_num)) + + # still need to figure out what to do about REFERENCE info type (includes AUTHORS, TITLE, JOURNAL) - do we want this? + return line_entries def create(self): @@ -637,64 +637,63 @@ def create(self): self.touch() - self.progress.new("Loading KEGG modules into Modules DB...") + self.progress.new("Loading KEGG modules into Modules DB...") - # sanity check that we setup the modules previously. - # It shouldn't be a problem since this function should only be called during the setup process after modules download, but just in case. 
- if not os.exists(module_data_dir) or len(self.module_dict.keys()) == 0: - raise ConfigError("Appparently, the Kegg Modules were not correctly setup and now all sorts of things are broken. The \ - Modules DB cannot be created from broken things. BTW, this error is not supposed to happen to anyone except maybe developers, so \ - if you do not fall into that category you are likely in deep doo-doo. Maybe re-running setup with --reset will work? (if not, you \ - probably should email/Slack/telepathically cry out for help to the developers.)") + # sanity check that we setup the modules previously. + # It shouldn't be a problem since this function should only be called during the setup process after modules download, but just in case. + if not os.exists(module_data_dir) or len(self.module_dict.keys()) == 0: + raise ConfigError("Appparently, the Kegg Modules were not correctly setup and now all sorts of things are broken. The \ + Modules DB cannot be created from broken things. BTW, this error is not supposed to happen to anyone except maybe developers, so \ + if you do not fall into that category you are likely in deep doo-doo. Maybe re-running setup with --reset will work? (if not, you \ + probably should email/Slack/telepathically cry out for help to the developers.)") - # init the Modules table - mod_table = KeggModulesTable(self.module_table_name) + # init the Modules table + mod_table = KeggModulesTable(self.module_table_name) - num_modules_parsed = 0 - line_number = 0 - for mnum in self.module_dict.keys(): - self.progress.update("Parsing KEGG Module %s" % mnum) + num_modules_parsed = 0 + line_number = 0 + for mnum in self.module_dict.keys(): + self.progress.update("Parsing KEGG Module %s" % mnum) mod_file_path = os.path.join(self.module_data_dir, mnum) - f = open(mod_file_path, 'rU') + f = open(mod_file_path, 'rU') - prev_data_name_field = None - for line in f.readlines(): - line.strip('\n') - line_number += 1 + prev_data_name_field = None + for line in f.readlines(): + line.strip('\n') + line_number += 1 - # check for last line ///. We don't want to send the last line to the parsing function because it will break. - if not line == '///': - # parse the line into a tuple - entries_tuple_list = None - # here is the tricky bit about parsing these files. Not all lines start with the data_name field; those that don't start with a space. - # if this is the case, we need to tell the parsing function what the previous data_name field has been. - if line[0] == ' ': - entries_tuple_list = self.parse_kegg_modules_line(line, line_number, prev_data_name_field) - else: - entries_tuple_list = self.parse_kegg_modules_line(line, line_number) + # check for last line ///. We don't want to send the last line to the parsing function because it will break. + if not line == '///': + # parse the line into a tuple + entries_tuple_list = None + # here is the tricky bit about parsing these files. Not all lines start with the data_name field; those that don't start with a space. + # if this is the case, we need to tell the parsing function what the previous data_name field has been. 
+ if line[0] == ' ': + entries_tuple_list = self.parse_kegg_modules_line(line, line_number, prev_data_name_field) + else: + entries_tuple_list = self.parse_kegg_modules_line(line, line_number) - # update prev_data_name_field; use the first (and perhaps only) entry by default - prev_data_name_field = entries_tuple_list[0][0] + # update prev_data_name_field; use the first (and perhaps only) entry by default + prev_data_name_field = entries_tuple_list[0][0] - # unpack that tuple info - for entry_info in entries_tuple_list: - name, val, def, line = entry_info - # call append_and_store which will collect db entries and store every 10000 at a time - mod_table.append_and_store(mnum, name, val, def, line) + # unpack that tuple info + for name, val, definition, line in entries_tuple_list: + # call append_and_store which will collect db entries and store every 10000 at a time + mod_table.append_and_store(mnum, name, val, definition, line) - num_modules_parsed += 1 + num_modules_parsed += 1 - # give some run info - self.run.info('Modules database', 'A new database, %s, has been created.' % (self.db_path), quiet=self.quiet) - self.run.info('Number of KEGG modules', num_modules_parsed, quiet=self.quiet) + # give some run info + self.run.info('Modules database', 'A new database, %s, has been created.' % (self.db_path), quiet=self.quiet) + self.run.info('Number of KEGG modules', num_modules_parsed, quiet=self.quiet) self.run.info('Number of entries', mod_table.get_total_entries(), quiet=self.quiet) - # record some useful metadata + # record some useful metadata self.db.set_meta_value('db_type', 'modules') - self.db.set_meta_value('num_modules', num_modules_parsed) - self.db.set_meta_value('total_entries', mod_table.get_total_entries()) + self.db.set_meta_value('num_modules', num_modules_parsed) + self.db.set_meta_value('total_entries', mod_table.get_total_entries()) self.db.disconnect() @@ -702,34 +701,34 @@ class KeggModulesTable: """This class defines operations for creating the KEGG Modules table in Modules.db""" def __init__(self, mod_table_name = None): - """""" + """""" self.db_entries = [] - self.total_entries = 0 + self.total_entries = 0 - if mod_table_name: - self.module_table_name = mod_table_name - else: - raise ConfigError("Beep Beep. Warning. KeggModulesTable was initialized without knowing its own name.") + if mod_table_name: + self.module_table_name = mod_table_name + else: + raise ConfigError("Beep Beep. Warning. KeggModulesTable was initialized without knowing its own name.") def append_and_store(self, module_num, data_name, data_value, data_definition=None, line_num=None): """This function handles collects db entries (as tuples) into a list, and once we have 10,000 of them it stores that set into the Modules table. - The db_entries list is cleared after each store so that future stores don't add duplicate entries to the table. - """ + The db_entries list is cleared after each store so that future stores don't add duplicate entries to the table. 
+ """ db_entry = tuple([module_num, data_name, data_value, data_definition, line_num]) self.db_entries.append(db_entry) - self.total_entries += 1 + self.total_entries += 1 if len(self.db_entries) > 10000: - self.store() - self.db_entries = [] + self.store() + self.db_entries = [] def store(self): if len(self.db_entries): db._exec_many('''INSERT INTO %s VALUES (%s)''' % (self.module_table_name, (','.join(['?'] * len(self.db_entries[0])))), self.db_entries) - def get_total_entries(self): - return self.total_entries + def get_total_entries(self): + return self.total_entries From 9e17d8a99e8c9bd86f05be2df11b8260a9e596c7 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 26 Feb 2020 11:55:06 -0600 Subject: [PATCH 140/400] talk to the user while decompressing files --- anvio/kegg.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 1889fcbfa0..f55694d4df 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -291,6 +291,7 @@ def decompress_files(self): """This function decompresses the Kofam profiles.""" for file_name in self.files: + self.progress.new('Decompressing file', file_name) full_path = os.path.join(self.kofam_data_dir, file_name) if full_path.endswith("tar.gz"): # extract tar file instead of doing gzip @@ -298,6 +299,10 @@ def decompress_files(self): else: utils.gzip_decompress_file(full_path, keep_original=False) + self.progress.update("File decompressed. Yay.") + self.progress.end() + + def confirm_downloaded_files(self): """This function verifies that all Kofam profiles have been properly downloaded. From 877fcf21b5b7a4389d8be684198db2aabd2c643b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 26 Feb 2020 11:56:36 -0600 Subject: [PATCH 141/400] rename confirm function --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index f55694d4df..cfee3cb286 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -303,7 +303,7 @@ def decompress_files(self): self.progress.end() - def confirm_downloaded_files(self): + def confirm_downloaded_profiles(self): """This function verifies that all Kofam profiles have been properly downloaded. It is intended to be run after the files have been decompressed. 
The profiles directory should contain hmm files from K00001.hmm to @@ -381,7 +381,7 @@ def run_hmmpress(self): log_file_path = os.path.join(self.kofam_data_dir, '00_hmmpress_log.txt') self.progress.update('Verifying that the Kofam directory at %s contains all HMM profiles' % self.kofam_data_dir) - self.confirm_downloaded_files() + self.confirm_downloaded_profiles() self.progress.update('Handling orphan files') self.move_orphan_files() From 9e8be56667e9099a90f0ef8bc1441b713f6f76cd Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 26 Feb 2020 12:08:57 -0600 Subject: [PATCH 142/400] fix output --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index cfee3cb286..6b069a4aff 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -291,7 +291,7 @@ def decompress_files(self): """This function decompresses the Kofam profiles.""" for file_name in self.files: - self.progress.new('Decompressing file', file_name) + self.progress.new('Decompressing file %s' %s file_name) full_path = os.path.join(self.kofam_data_dir, file_name) if full_path.endswith("tar.gz"): # extract tar file instead of doing gzip From 7b29f38be59448f5595d0f07b573f8e3ce9b1ca6 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 26 Feb 2020 12:13:19 -0600 Subject: [PATCH 143/400] parsing module file is more verbose --- anvio/kegg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 6b069a4aff..06a7150e6e 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -219,6 +219,7 @@ def process_module_file(self): filesnpaths.is_file_plain_text(self.kegg_module_file) f = open(self.kegg_module_file, 'rU') + self.progress.new("Parsing KEGG Module file") current_module_type = None current_category = None @@ -255,6 +256,8 @@ def process_module_file(self): else: raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has \ made the file unparseable. Sad. :(" % (self.kegg_module_file, first_char)) + self.progress.update("Done 🍁") + self.progress.end() def download_modules(self): """This function downloads the KEGG modules. From 17317ee7824e5685a58f59baa572018f48db256b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 26 Feb 2020 12:13:54 -0600 Subject: [PATCH 144/400] fix module name in module dictionary --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 06a7150e6e..d357bd40d3 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -251,7 +251,7 @@ def process_module_file(self): elif first_char == "D": fields = re.split('\s{2,}', line) mnum = fields[1] - self.module_dict[mnum] = {"name" : module_name} + self.module_dict[mnum] = {"name" : fields[2]} # unknown code else: raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has \ From a3b005ea490b11ba395207cfe79ffc8930871fcd Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 26 Feb 2020 15:13:18 -0600 Subject: [PATCH 145/400] little bug fixies --- anvio/kegg.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index d357bd40d3..4ab1d85e72 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -256,7 +256,6 @@ def process_module_file(self): else: raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has \ made the file unparseable. Sad. 
:(" % (self.kegg_module_file, first_char)) - self.progress.update("Done 🍁") self.progress.end() def download_modules(self): @@ -294,7 +293,7 @@ def decompress_files(self): """This function decompresses the Kofam profiles.""" for file_name in self.files: - self.progress.new('Decompressing file %s' %s file_name) + self.progress.new('Decompressing file %s' % file_name) full_path = os.path.join(self.kofam_data_dir, file_name) if full_path.endswith("tar.gz"): # extract tar file instead of doing gzip @@ -383,7 +382,7 @@ def run_hmmpress(self): self.progress.new('Preparing Kofam HMM Profiles') log_file_path = os.path.join(self.kofam_data_dir, '00_hmmpress_log.txt') - self.progress.update('Verifying that the Kofam directory at %s contains all HMM profiles' % self.kofam_data_dir) + self.progress.update('Verifying the Kofam directory %s contains all HMM profiles' % self.kofam_data_dir) self.confirm_downloaded_profiles() self.progress.update('Handling orphan files') @@ -649,7 +648,7 @@ def create(self): # sanity check that we setup the modules previously. # It shouldn't be a problem since this function should only be called during the setup process after modules download, but just in case. - if not os.exists(module_data_dir) or len(self.module_dict.keys()) == 0: + if not os.path.exists(self.module_data_dir) or len(self.module_dict.keys()) == 0: raise ConfigError("Appparently, the Kegg Modules were not correctly setup and now all sorts of things are broken. The \ Modules DB cannot be created from broken things. BTW, this error is not supposed to happen to anyone except maybe developers, so \ if you do not fall into that category you are likely in deep doo-doo. Maybe re-running setup with --reset will work? (if not, you \ From 7526b1b8f2083f76bc8776be132a8982dcba65ba Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 26 Feb 2020 15:14:11 -0600 Subject: [PATCH 146/400] time setup process --- bin/anvi-setup-kegg-kofams | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index 868e1026ae..11ffbd3191 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -17,6 +17,11 @@ __email__ = "iveseli@uchicago.edu" __provides__ = ["kofam-data"] __description__ = "Download and setup KEGG KOfam HMM profiles." +@time_program +def main(args): + setup = kegg.KeggSetup(args) + setup.setup_profiles() + if __name__ == '__main__': import argparse @@ -33,8 +38,7 @@ if __name__ == '__main__': args = anvio.get_args(parser) try: - setup = kegg.KeggSetup(args) - setup.setup_profiles() + main(args) except ConfigError as e: print(e) From 8d9d187b75cc1782bfc416125ac1b7744aec1413 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 26 Feb 2020 15:46:52 -0600 Subject: [PATCH 147/400] previously forgot to save extra info in the module dict, now fixed --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 4ab1d85e72..5e84c85bea 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -251,7 +251,7 @@ def process_module_file(self): elif first_char == "D": fields = re.split('\s{2,}', line) mnum = fields[1] - self.module_dict[mnum] = {"name" : fields[2]} + self.module_dict[mnum] = {"name" : fields[2], "type" : current_module_type, "category" : current_category, "subcategory" : current_subcategory} # unknown code else: raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. 
This has \ From 9dafba6b4d47df09070a11bf9e553df259e7f982 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 26 Feb 2020 15:47:25 -0600 Subject: [PATCH 148/400] modules db class has to init the context class for shared variables to work. duh --- anvio/kegg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 5e84c85bea..1af1682472 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -413,7 +413,7 @@ def run_hmmpress(self): def setup_modules_db(self): """This function creates the Modules DB from the Kegg Module files. """ - mod_db = KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), run=run, progress=progress) + mod_db = KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), args=self.args, run=run, progress=progress) mod_db.create() @@ -549,7 +549,7 @@ class KeggModulesDatabase(KeggContext): Kegg Module files. """ - def __init__(self, db_path, run=run, progress=progress, quiet=False): + def __init__(self, db_path, args, run=run, progress=progress, quiet=False): self.db = None self.db_path = db_path self.quiet = quiet @@ -557,6 +557,9 @@ def __init__(self, db_path, run=run, progress=progress, quiet=False): self.run = run self.progress = progress + # init the base class for access to shared paths and such + KeggContext.__init__(self, args) + # modules table info # I wonder if these should be moved to the tables __init__.py at some point? self.module_table_name = "kegg_modules" From e6b8f0a65018c24db41398f70191685ce0a4ef76 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 26 Feb 2020 15:47:44 -0600 Subject: [PATCH 149/400] more verbose error output for sanity check --- anvio/kegg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 1af1682472..ba21547198 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -655,7 +655,8 @@ def create(self): raise ConfigError("Appparently, the Kegg Modules were not correctly setup and now all sorts of things are broken. The \ Modules DB cannot be created from broken things. BTW, this error is not supposed to happen to anyone except maybe developers, so \ if you do not fall into that category you are likely in deep doo-doo. Maybe re-running setup with --reset will work? (if not, you \ - probably should email/Slack/telepathically cry out for help to the developers.)") + probably should email/Slack/telepathically cry out for help to the developers). By the way, if this helps make things any clearer, \ + the number of modules in the module dictionary is currently %s" % len(self.module_dict.keys())) # init the Modules table mod_table = KeggModulesTable(self.module_table_name) From 6934c0abe665559b4ea7788b905e1340f677801a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 27 Feb 2020 09:28:53 -0600 Subject: [PATCH 150/400] fix progress indent --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index ba21547198..7879a23348 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -256,7 +256,7 @@ def process_module_file(self): else: raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has \ made the file unparseable. Sad. :(" % (self.kegg_module_file, first_char)) - self.progress.end() + self.progress.end() def download_modules(self): """This function downloads the KEGG modules. @@ -655,7 +655,7 @@ def create(self): raise ConfigError("Appparently, the Kegg Modules were not correctly setup and now all sorts of things are broken. 
The \ Modules DB cannot be created from broken things. BTW, this error is not supposed to happen to anyone except maybe developers, so \ if you do not fall into that category you are likely in deep doo-doo. Maybe re-running setup with --reset will work? (if not, you \ - probably should email/Slack/telepathically cry out for help to the developers). By the way, if this helps make things any clearer, \ + probably should email/Slack/telepathically cry out for help to the developers). Anyway, if this helps make things any clearer, \ the number of modules in the module dictionary is currently %s" % len(self.module_dict.keys())) # init the Modules table From 1bdc38af6059ea330b6445860d4483c376875757 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 27 Feb 2020 09:50:02 -0600 Subject: [PATCH 151/400] finally we got db to recognize module dict --- anvio/kegg.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 7879a23348..22c7dec331 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -47,7 +47,7 @@ def __init__(self, args): self.kofam_hmm_file_path = os.path.join(self.kofam_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms self.ko_list_file_path = os.path.join(self.kofam_data_dir, "ko_list") self.kegg_module_file = os.path.join(self.kofam_data_dir, "ko00002.keg") - self.module_dict = {} # this dict will be filled in by other functions + def setup_ko_dict(self): """The purpose of this function is to process the ko_list file into usable form by Kofam sub-classes. @@ -214,6 +214,7 @@ def process_module_file(self): D = Module """ + self.module_dict = {} filesnpaths.is_file_exists(self.kegg_module_file) filesnpaths.is_file_plain_text(self.kegg_module_file) @@ -413,7 +414,7 @@ def run_hmmpress(self): def setup_modules_db(self): """This function creates the Modules DB from the Kegg Module files. """ - mod_db = KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), args=self.args, run=run, progress=progress) + mod_db = KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), module_dictionary=self.module_dict, args=self.args, run=run, progress=progress) mod_db.create() @@ -549,13 +550,13 @@ class KeggModulesDatabase(KeggContext): Kegg Module files. """ - def __init__(self, db_path, args, run=run, progress=progress, quiet=False): + def __init__(self, db_path, module_dictionary, args, run=run, progress=progress, quiet=False): self.db = None self.db_path = db_path - self.quiet = quiet - + self.module_dict = module_dictionary self.run = run self.progress = progress + self.quiet = quiet # init the base class for access to shared paths and such KeggContext.__init__(self, args) From f404c11272de9ffae24e1a6ca105016c948a0cf5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 27 Feb 2020 12:12:11 -0600 Subject: [PATCH 152/400] save line stripped of whitespace --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 22c7dec331..41d9e58db7 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -227,7 +227,7 @@ def process_module_file(self): current_subcategory = None for line in f.readlines(): - line.strip('\n') + line = line.strip('\n') first_char = line[0] # garbage lines @@ -671,7 +671,7 @@ def create(self): prev_data_name_field = None for line in f.readlines(): - line.strip('\n') + line = line.strip('\n') line_number += 1 # check for last line ///. We don't want to send the last line to the parsing function because it will break. 
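[Technical editor's note] The parsing logic touched by the patches above relies on a simple convention in KEGG module flat files: a line either opens a new field with a data name (ENTRY, DEFINITION, ORTHOLOGY, ...) or starts with whitespace and continues the previous data name; fields are separated by runs of two or more spaces, and the record ends with '///'. The stand-alone sketch below only illustrates that convention — it is not the anvi'o implementation (the real logic lives in parse_kegg_modules_line() and create() and also handles malformed lines), and the function name and return format here are invented for illustration.

import re

def sketch_parse_kegg_module_file(path):
    """Toy parser for one KEGG module flat file; for illustration only."""
    entries = []
    prev_data_name = None
    with open(path) as f:
        for line_num, line in enumerate(f, start=1):
            line = line.rstrip('\n')
            # '///' terminates the record; completely blank lines are skipped
            if line == '///' or not line.strip():
                continue
            # fields are separated by 2+ spaces; a leading space marks a continuation line
            fields = [x for x in re.split(r'\s{2,}', line) if x]
            if line[0] == ' ':
                data_name = prev_data_name      # inherit the previous data name
                value_and_def = fields
            else:
                data_name = fields[0]           # line carries its own data name
                value_and_def = fields[1:]
            data_value = value_and_def[0] if value_and_def else None
            data_definition = value_and_def[1] if len(value_and_def) > 1 else None
            prev_data_name = data_name
            entries.append((data_name, data_value, data_definition, line_num))
    return entries
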
From b64a144266cf043ffbda8c84d57b5054e9f45a3b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 27 Feb 2020 15:08:05 -0600 Subject: [PATCH 153/400] need to pass the db to store --- anvio/kegg.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 41d9e58db7..f1c867efa2 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -586,7 +586,6 @@ def touch(self): self.db.create_table(self.module_table_name, self.module_table_structure, self.module_table_types) - return self.db def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None): """This function parses information from one line of a KEGG module file. @@ -691,7 +690,7 @@ def create(self): # unpack that tuple info for name, val, definition, line in entries_tuple_list: # call append_and_store which will collect db entries and store every 10000 at a time - mod_table.append_and_store(mnum, name, val, definition, line) + mod_table.append_and_store(self.db, mnum, name, val, definition, line) @@ -723,7 +722,7 @@ def __init__(self, mod_table_name = None): raise ConfigError("Beep Beep. Warning. KeggModulesTable was initialized without knowing its own name.") - def append_and_store(self, module_num, data_name, data_value, data_definition=None, line_num=None): + def append_and_store(self, db, module_num, data_name, data_value, data_definition=None, line_num=None): """This function handles collects db entries (as tuples) into a list, and once we have 10,000 of them it stores that set into the Modules table. The db_entries list is cleared after each store so that future stores don't add duplicate entries to the table. @@ -734,11 +733,11 @@ def append_and_store(self, module_num, data_name, data_value, data_definition=No self.total_entries += 1 if len(self.db_entries) > 10000: - self.store() + self.store(db) self.db_entries = [] - def store(self): + def store(self, db): if len(self.db_entries): db._exec_many('''INSERT INTO %s VALUES (%s)''' % (self.module_table_name, (','.join(['?'] * len(self.db_entries[0])))), self.db_entries) From ecba1dbcdea60f0b69b5599774b1d7ae1af786d2 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 27 Feb 2020 15:08:41 -0600 Subject: [PATCH 154/400] fix field assignment since first field is empty string --- anvio/kegg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index f1c867efa2..ba4f94f0c2 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -626,9 +626,9 @@ def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None) data_vals = fields[1] if len(fields) > 2: # not all lines have a definition field data_def = fields[2] - else: # data name known - data_vals = fields[0] - data_def = fields[1] + else: # data name known, first field still exists but is actually the empty string '' + data_vals = fields[1] + data_def = fields[2] # some types of information may need to be split into multiple db entries data_types_to_split = ["ORTHOLOGY","REACTION"] # lines that fall under these categories need to have data_vals split on comma From b3bf09f0e59ed129823fd133d7386e16349f9a22 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 27 Feb 2020 16:24:51 -0600 Subject: [PATCH 155/400] partial data value sanity check --- anvio/kegg.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index ba4f94f0c2..23a10f72a6 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -586,6 +586,63 @@ def touch(self): 
self.db.create_table(self.module_table_name, self.module_table_structure, self.module_table_types) + def data_vals_sanity_check(self, data_vals, current_data_name): + """This function checks if the data values were correctly parsed from a line in a KEGG module file. + + This is a sadly necessary step because some KEGG module file lines are problematic and don't follow the right format (ie, 2+ spaces + between different fields). So here we check if the values that we parsed look like they are the right format, without any extra bits. + Each data name (ORTHOLOGY, DEFINITION, etc) has a different format to check for. + + Note that we don't check the following data name types: NAME, REFERENCE + + PARAMETERS + ========== + data_vals str, the data values field (split from the kegg module line) + current_data_name str, which data name we are working on. It should never be None because we should have already figured this out by parsing the line. + + RETURNS + ======= + is_ok bool, whether the values look correctly formatted or not + """ + + is_ok = True + extra_info_to_print = None + + if not current_data_name: + raise ConfigError("data_vals_sanity_check() cannot be performed when the current data name is None. Something was not right when parsing the KEGG \ + module line.") + elif current_data_name == "ENTRY": + # example format: M00175 + if data_vals[0] != 'M' or len(data_vals) != 6: + is_ok = False + elif current_data_name == "DEFINITION": + # example format: (K01647,K05942) (K01681,K01682) (K00031,K00030) (K00164+K00658+K00382,K00174+K00175-K00177-K00176) + knums = [x for x in re.split('(|)|,| |+|-',data_vals) if x] + for k in knums: + if k[0] != 'K' or len(k) != 6: + is_ok = False + extra_info_to_print = knums + elif current_data_name == "ORTHOLOGY": + # example format: K00234,K00235,K00236,K00237 + knums = [x for x in re.split(',|+|-',data_vals) if x] + for k in knums: + if k[0] != 'K' or len(k) != 6: + is_ok = False + extra_info_to_print = knums + + + if not is_ok: + if extra_info_to_print: + self.run.warning("Found an issue with a KEGG Module line. Data values incorrectly parsed. Current data name is %s, here is the \ + incorrectly-formatted data value field: %s \ + and here is somem extra info that may be helpful: %s" % (current_data_name, data_vals, extra_info_to_print)) + else: + self.run.warning("Found an issue with a KEGG Module line. Data values incorrectly parsed. Current data name is %s, here is the \ + incorrectly-formatted data value field: %s" % (current_data_name, data_vals)) + + + return is_ok + def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None): """This function parses information from one line of a KEGG module file. 
@@ -623,11 +680,13 @@ def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None) is the line: %s)" % (line)) current_data_name = fields[0] - data_vals = fields[1] - if len(fields) > 2: # not all lines have a definition field - data_def = fields[2] - else: # data name known, first field still exists but is actually the empty string '' - data_vals = fields[1] + # note that if data name is known, first field still exists but is actually the empty string '' + # so no matter which situation, data value is field 1 and data definition (if any) is field 2 + data_vals = fields[1] + # need to sanity check data value field because SOME modules don't follow the 2-space separation formatting + vals_are_okay = self.data_vals_sanity_check(data_vals, current_data_name) + + if vals_are_okay and len(fields) > 2: # not all lines have a definition field data_def = fields[2] # some types of information may need to be split into multiple db entries From fd4e1e0e88dbaeb6f78be2cf8e642b5c60357fbf Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 28 Feb 2020 10:03:57 -0600 Subject: [PATCH 156/400] the rest of the line sanity check function though it does not fix badly parsed lines yet --- anvio/kegg.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 23a10f72a6..c24b931933 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -593,7 +593,7 @@ def data_vals_sanity_check(self, data_vals, current_data_name): between different fields). So here we check if the values that we parsed look like they are the right format, without any extra bits. Each data name (ORTHOLOGY, DEFINITION, etc) has a different format to check for. - Note that we don't check the following data name types: NAME, REFERENCE + Note that we don't check the following data name types: NAME, CLASS, REFERENCE PARAMETERS ========== @@ -617,25 +617,44 @@ def data_vals_sanity_check(self, data_vals, current_data_name): is_ok = False elif current_data_name == "DEFINITION": # example format: (K01647,K05942) (K01681,K01682) (K00031,K00030) (K00164+K00658+K00382,K00174+K00175-K00177-K00176) - knums = [x for x in re.split('(|)|,| |+|-',data_vals) if x] + knums = [x for x in re.split('\(|\)|,| |\+|-',data_vals) if x] for k in knums: if k[0] != 'K' or len(k) != 6: is_ok = False extra_info_to_print = knums elif current_data_name == "ORTHOLOGY": # example format: K00234,K00235,K00236,K00237 - knums = [x for x in re.split(',|+|-',data_vals) if x] + knums = [x for x in re.split(',|\+|-', data_vals) if x] for k in knums: if k[0] != 'K' or len(k) != 6: is_ok = False extra_info_to_print = knums + elif current_data_name == "PATHWAY": + # example format: map00020 + if data_vals[0:3] != "map" or len(data_vals) != 8: + is_ok = False + elif current_data_name == "REACTION": + # example format: R01899+R00268,R00267,R00709 + rnums = [x for x in re.split(',|\+', data_vals) if x] + for r in rnums: + if r[0] != 'R' or len(r) != 6: + is_ok = False + extra_info_to_print = rnums + elif current_data_name == "COMPOUND": + # example format: C00024 + if data_vals[0] != 'C' or len(data_vals) != 6: + is_ok = False + elif current_data_name == "RMODULE": + # example format: RM003 + if data_vals[0:2] != "RM" or len(data_vals) != 5: + is_ok = False if not is_ok: if extra_info_to_print: self.run.warning("Found an issue with a KEGG Module line. Data values incorrectly parsed. 
Current data name is %s, here is the \ incorrectly-formatted data value field: %s \ - and here is somem extra info that may be helpful: %s" % (current_data_name, data_vals, extra_info_to_print)) + and here is some extra info that may be helpful: %s" % (current_data_name, data_vals, extra_info_to_print)) else: self.run.warning("Found an issue with a KEGG Module line. Data values incorrectly parsed. Current data name is %s, here is the \ incorrectly-formatted data value field: %s" % (current_data_name, data_vals)) From 89ef0c6ce6474e7f74183a094d9a4dfbfcf3500c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 28 Feb 2020 10:11:47 -0600 Subject: [PATCH 157/400] also accept compound nums that start with G --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index c24b931933..0c859b243f 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -642,7 +642,7 @@ def data_vals_sanity_check(self, data_vals, current_data_name): extra_info_to_print = rnums elif current_data_name == "COMPOUND": # example format: C00024 - if data_vals[0] != 'C' or len(data_vals) != 6: + if data_vals[0] not in ['C','G'] or len(data_vals) != 6: is_ok = False elif current_data_name == "RMODULE": # example format: RM003 From 30303a02ebc5ae30ed85004d1e69610b6a602ed9 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 28 Feb 2020 10:15:33 -0600 Subject: [PATCH 158/400] get rid of extra printed info it is actually useless --- anvio/kegg.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 0c859b243f..8053201e62 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -606,7 +606,6 @@ def data_vals_sanity_check(self, data_vals, current_data_name): """ is_ok = True - extra_info_to_print = None if not current_data_name: raise ConfigError("data_vals_sanity_check() cannot be performed when the current data name is None. Something was not right when parsing the KEGG \ @@ -621,14 +620,12 @@ def data_vals_sanity_check(self, data_vals, current_data_name): for k in knums: if k[0] != 'K' or len(k) != 6: is_ok = False - extra_info_to_print = knums elif current_data_name == "ORTHOLOGY": # example format: K00234,K00235,K00236,K00237 knums = [x for x in re.split(',|\+|-', data_vals) if x] for k in knums: if k[0] != 'K' or len(k) != 6: is_ok = False - extra_info_to_print = knums elif current_data_name == "PATHWAY": # example format: map00020 if data_vals[0:3] != "map" or len(data_vals) != 8: @@ -639,7 +636,6 @@ def data_vals_sanity_check(self, data_vals, current_data_name): for r in rnums: if r[0] != 'R' or len(r) != 6: is_ok = False - extra_info_to_print = rnums elif current_data_name == "COMPOUND": # example format: C00024 if data_vals[0] not in ['C','G'] or len(data_vals) != 6: @@ -651,13 +647,8 @@ def data_vals_sanity_check(self, data_vals, current_data_name): if not is_ok: - if extra_info_to_print: - self.run.warning("Found an issue with a KEGG Module line. Data values incorrectly parsed. Current data name is %s, here is the \ - incorrectly-formatted data value field: %s \ - and here is some extra info that may be helpful: %s" % (current_data_name, data_vals, extra_info_to_print)) - else: - self.run.warning("Found an issue with a KEGG Module line. Data values incorrectly parsed. Current data name is %s, here is the \ - incorrectly-formatted data value field: %s" % (current_data_name, data_vals)) + self.run.warning("Found an issue with a KEGG Module line. Data values incorrectly parsed. 
Current data name is %s, here is the \ + incorrectly-formatted data value field: %s" % (current_data_name, data_vals)) return is_ok From ba9e610e50567d8acbe35c36e42aca34f17da9b1 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 28 Feb 2020 11:39:51 -0600 Subject: [PATCH 159/400] handle more complex orthology format --- anvio/kegg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 8053201e62..5b7ac819d7 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -622,7 +622,8 @@ def data_vals_sanity_check(self, data_vals, current_data_name): is_ok = False elif current_data_name == "ORTHOLOGY": # example format: K00234,K00235,K00236,K00237 - knums = [x for x in re.split(',|\+|-', data_vals) if x] + # more complex example: (K00163,K00161+K00162)+K00627+K00382-K13997 + knums = [x for x in re.split('\(|\)|,|\+|-', data_vals) if x] for k in knums: if k[0] != 'K' or len(k) != 6: is_ok = False From 09e0e119be994d21b40c5aff85167b2dc3fd3bb8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 28 Feb 2020 15:31:24 -0600 Subject: [PATCH 160/400] correction for incorrect orthology --- anvio/kegg.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 5b7ac819d7..9f9aca71bb 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -606,6 +606,9 @@ def data_vals_sanity_check(self, data_vals, current_data_name): """ is_ok = True + is_corrected = False + corrected_vals = None + corrected_def = None if not current_data_name: raise ConfigError("data_vals_sanity_check() cannot be performed when the current data name is None. Something was not right when parsing the KEGG \ @@ -627,6 +630,16 @@ def data_vals_sanity_check(self, data_vals, current_data_name): for k in knums: if k[0] != 'K' or len(k) != 6: is_ok = False + # try to fix it by splitting on first space + if not is_ok: + split_data_vals = data_vals.split(" ", maxsplit=1) + corrected_vals = split_data_vals[0] + corrected_def = split_data_vals[1] + # double check that we don't have a knum in the new definition + if re.match("K\d{5}",corrected_def): + corrected_vals = "".join([corrected_vals,corrected_def]) + corrected_def = None + is_corrected = True elif current_data_name == "PATHWAY": # example format: map00020 if data_vals[0:3] != "map" or len(data_vals) != 8: @@ -651,8 +664,10 @@ def data_vals_sanity_check(self, data_vals, current_data_name): self.run.warning("Found an issue with a KEGG Module line. Data values incorrectly parsed. Current data name is %s, here is the \ incorrectly-formatted data value field: %s" % (current_data_name, data_vals)) + if is_corrected: + print("Line has been corrected. 
Corrected data values: %s\nCorrected data definition: %s" % (corrected_vals, corrected_def)) - return is_ok + return is_ok, corrected_vals, corrected_def def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None): @@ -695,7 +710,7 @@ def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None) # so no matter which situation, data value is field 1 and data definition (if any) is field 2 data_vals = fields[1] # need to sanity check data value field because SOME modules don't follow the 2-space separation formatting - vals_are_okay = self.data_vals_sanity_check(data_vals, current_data_name) + vals_are_okay, corrected_vals, corrected_def = self.data_vals_sanity_check(data_vals, current_data_name) if vals_are_okay and len(fields) > 2: # not all lines have a definition field data_def = fields[2] From 9311f36b30420026cde23ba93985a82a4ba4bde3 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 28 Feb 2020 15:33:50 -0600 Subject: [PATCH 161/400] correction for incorrect pathway --- anvio/kegg.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 9f9aca71bb..412625dc11 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -644,6 +644,10 @@ def data_vals_sanity_check(self, data_vals, current_data_name): # example format: map00020 if data_vals[0:3] != "map" or len(data_vals) != 8: is_ok = False + split_data_vals = data_vals.split(" ", maxsplit=1) + corrected_vals = split_data_vals[0] + corrected_def = split_data_vals[1] + is_corrected = True elif current_data_name == "REACTION": # example format: R01899+R00268,R00267,R00709 rnums = [x for x in re.split(',|\+', data_vals) if x] From 76d14ed1e820a653128ed20ce24e2e012f4cd09e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 28 Feb 2020 15:37:16 -0600 Subject: [PATCH 162/400] correction for incorrect reaction --- anvio/kegg.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 412625dc11..b78edcfbdd 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -654,6 +654,11 @@ def data_vals_sanity_check(self, data_vals, current_data_name): for r in rnums: if r[0] != 'R' or len(r) != 6: is_ok = False + if not is_ok: + split_data_vals = data_vals.split(" ", maxsplit=1) + corrected_vals = split_data_vals[0] + corrected_def = split_data_vals[1] + is_corrected = True elif current_data_name == "COMPOUND": # example format: C00024 if data_vals[0] not in ['C','G'] or len(data_vals) != 6: From 0843ebaff029d5d929f48f066c3d9c2b6cde2eef Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 28 Feb 2020 15:49:46 -0600 Subject: [PATCH 163/400] update warning for corrected lines and raise error if we find uncorrected issue --- anvio/kegg.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index b78edcfbdd..9b462403fc 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -669,12 +669,18 @@ def data_vals_sanity_check(self, data_vals, current_data_name): is_ok = False - if not is_ok: - self.run.warning("Found an issue with a KEGG Module line. Data values incorrectly parsed. Current data name is %s, here is the \ + if not is_ok and not is_corrected: + # in production, this should not end with an error. This raises an error for now just so I can easily find errors that I haven't implemented + # correction for yet + raise ConfigError("Found an issue with a KEGG Module line. Data values incorrectly parsed. 
Current data name is %s, here is the \ incorrectly-formatted data value field: %s" % (current_data_name, data_vals)) if is_corrected: - print("Line has been corrected. Corrected data values: %s\nCorrected data definition: %s" % (corrected_vals, corrected_def)) + self.run.warning("While parsing a KEGG Module line, we found an issue with the formatting. We did our very best to parse the line \ + correctly, but please check that it looks right to you by examining the following values.") + self.run.info("Incorrectly parsed data value field", data_vals) + self.run.info("Corrected data values", corrected_vals) + self.run.info("Corrected data definition", corrected_def) return is_ok, corrected_vals, corrected_def From 8d92dac15ff03877bfe107b4711418cb4537c687 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 28 Feb 2020 15:55:45 -0600 Subject: [PATCH 164/400] fix module table structure --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 9b462403fc..a5a47c1feb 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -564,8 +564,8 @@ def __init__(self, db_path, module_dictionary, args, run=run, progress=progress, # modules table info # I wonder if these should be moved to the tables __init__.py at some point? self.module_table_name = "kegg_modules" - self.module_table_structure = ['module', 'data_name', 'data_value', 'data_definition'] - self.module_table_types = [ 'str' , 'str' , 'str' , 'str' ] + self.module_table_structure = ['module', 'data_name', 'data_value', 'data_definition', 'line'] + self.module_table_types = [ 'str' , 'str' , 'str' , 'str' ,'numeric' ] ## here we should call init function if the db exists From 07bcdc26cf4d9e4b476127ff0e25932491507a22 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 28 Feb 2020 15:58:09 -0600 Subject: [PATCH 165/400] change db store condition so we do not call store for just 1 entry --- anvio/kegg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index a5a47c1feb..a8d80a13d1 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -832,7 +832,8 @@ def append_and_store(self, db, module_num, data_name, data_value, data_definitio self.db_entries.append(db_entry) self.total_entries += 1 - if len(self.db_entries) > 10000: + # we can store chunks of 5000 at a time, so we don't want over 10,000 entries. + if len(self.db_entries) >= 10000: self.store(db) self.db_entries = [] From dfc5a3f994898f1f0c31ac6531627d1a7b9e4e1f Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 28 Feb 2020 16:05:53 -0600 Subject: [PATCH 166/400] accept M nums for orthology. 
KEGG module db creation is finally working --- anvio/kegg.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index a8d80a13d1..0387bfd91b 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -619,16 +619,18 @@ def data_vals_sanity_check(self, data_vals, current_data_name): is_ok = False elif current_data_name == "DEFINITION": # example format: (K01647,K05942) (K01681,K01682) (K00031,K00030) (K00164+K00658+K00382,K00174+K00175-K00177-K00176) + # another example: (M00161,M00163) M00165 knums = [x for x in re.split('\(|\)|,| |\+|-',data_vals) if x] for k in knums: - if k[0] != 'K' or len(k) != 6: + if k[0] not in ['K','M'] or len(k) != 6: is_ok = False elif current_data_name == "ORTHOLOGY": # example format: K00234,K00235,K00236,K00237 # more complex example: (K00163,K00161+K00162)+K00627+K00382-K13997 + # another example: (M00161 [ie, from (M00161 Photosystem II)] knums = [x for x in re.split('\(|\)|,|\+|-', data_vals) if x] for k in knums: - if k[0] != 'K' or len(k) != 6: + if k[0] not in ['K','M'] or len(k) != 6: is_ok = False # try to fix it by splitting on first space if not is_ok: From 384248a4214f3e0ca7a011cb653d8f353ca62028 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 4 Mar 2020 13:57:00 -0600 Subject: [PATCH 167/400] end module DB creation output --- anvio/kegg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 0387bfd91b..989f55de79 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -797,6 +797,7 @@ def create(self): num_modules_parsed += 1 + self.progress.end() # give some run info self.run.info('Modules database', 'A new database, %s, has been created.' % (self.db_path), quiet=self.quiet) From 8e19e685c744472dca29bbbe53d10c3d5bff22cb Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 4 Mar 2020 15:00:05 -0600 Subject: [PATCH 168/400] now we actually put the corrected values in the db --- anvio/kegg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 989f55de79..14fee9000c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -731,6 +731,9 @@ def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None) if vals_are_okay and len(fields) > 2: # not all lines have a definition field data_def = fields[2] + elif not vals_are_okay: + data_vals = corrected_vals + data_def = corrected_def # some types of information may need to be split into multiple db entries data_types_to_split = ["ORTHOLOGY","REACTION"] # lines that fall under these categories need to have data_vals split on comma From 7e0146bc106bca2ef629d3add303dd8ee77b4a98 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 4 Mar 2020 15:40:28 -0600 Subject: [PATCH 169/400] fix for orthology that has commas within () --- anvio/kegg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 14fee9000c..094d8c1e6f 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -738,7 +738,8 @@ def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None) # some types of information may need to be split into multiple db entries data_types_to_split = ["ORTHOLOGY","REACTION"] # lines that fall under these categories need to have data_vals split on comma if current_data_name in data_types_to_split: - for val in data_vals.split(','): + # here we should NOT split on any commas within parentheses + for val in re.split(',(?!.*\))', data_vals): line_entries.append((current_data_name, val, data_def, line_num)) else: # just send what we found 
without splitting the line line_entries.append((current_data_name, data_vals, data_def, line_num)) From 9b0d47ee11ef752a81fd7fdd66e7947a1f4caf8c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 4 Mar 2020 15:51:47 -0600 Subject: [PATCH 170/400] ignore lines that are completely blank --- anvio/kegg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 094d8c1e6f..d888f3c523 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -780,7 +780,8 @@ def create(self): line_number += 1 # check for last line ///. We don't want to send the last line to the parsing function because it will break. - if not line == '///': + # we also check here that the line is not entirely blank (this happens sometimes in KEGG modules, inexplicably) + if not line == '///' and re.search(r"\S+", line): # parse the line into a tuple entries_tuple_list = None # here is the tricky bit about parsing these files. Not all lines start with the data_name field; those that don't start with a space. From af7ed659ed2f8d9005fd021632037193c1175280 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 08:29:22 -0600 Subject: [PATCH 171/400] is_kegg_modules_db function --- anvio/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/anvio/utils.py b/anvio/utils.py index 344d094069..37ca0d4803 100644 --- a/anvio/utils.py +++ b/anvio/utils.py @@ -2695,6 +2695,11 @@ def is_genes_db(db_path): raise ConfigError("'%s' is not an anvi'o genes database." % db_path) return True +def is_kegg_modules_db(db_path): + if get_db_type(db_path) != 'modules': + raise ConfigError("'%s' is not an anvi'o KEGG modules database." % db_path) + return True + def is_profile_db_merged(profile_db_path): is_profile_db(profile_db_path) From 90bbf14ff3ee1e0ec307f148e2a9d78b19ee84b6 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 08:29:47 -0600 Subject: [PATCH 172/400] initialize an existing modules db --- anvio/kegg.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index d888f3c523..d37b61951e 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -568,6 +568,12 @@ def __init__(self, db_path, module_dictionary, args, run=run, progress=progress, self.module_table_types = [ 'str' , 'str' , 'str' , 'str' ,'numeric' ] ## here we should call init function if the db exists + if os.path.exists(self.db_path): + utils.is_kegg_modules_db(self.db_path) + self.db = db.DB(self.db_path, anvio.__kegg_modules_version__, new_database=False) + + self.run.info('Modules database', 'An existing database, %s, has been loaded.' % self.db_path, quiet=self.quiet) + self.run.info('Kegg Modules', '%d found' % self.db.get_meta_value('num_modules'), quiet=self.quiet) def touch(self): """Creates an empty Modules database on disk, and sets `self.db` to access to it. From 4f0ce5d33b6174bf331f1d2ab8ab1907c07d5064 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 08:43:19 -0600 Subject: [PATCH 173/400] make modules dict optional param for modules db; existing db should not need it --- anvio/kegg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index d37b61951e..6b57c55f03 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -414,7 +414,7 @@ def run_hmmpress(self): def setup_modules_db(self): """This function creates the Modules DB from the Kegg Module files. 
""" - mod_db = KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), module_dictionary=self.module_dict, args=self.args, run=run, progress=progress) + mod_db = KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), args=self.args, module_dictionary=self.module_dict, run=run, progress=progress) mod_db.create() @@ -550,7 +550,7 @@ class KeggModulesDatabase(KeggContext): Kegg Module files. """ - def __init__(self, db_path, module_dictionary, args, run=run, progress=progress, quiet=False): + def __init__(self, db_path, args, module_dictionary=None, run=run, progress=progress, quiet=False): self.db = None self.db_path = db_path self.module_dict = module_dictionary @@ -574,6 +574,9 @@ def __init__(self, db_path, module_dictionary, args, run=run, progress=progress, self.run.info('Modules database', 'An existing database, %s, has been loaded.' % self.db_path, quiet=self.quiet) self.run.info('Kegg Modules', '%d found' % self.db.get_meta_value('num_modules'), quiet=self.quiet) + else: + if not self.module_dict: + raise ConfigError("ERROR - a new KeggModulesDatabase() cannot be initialized without providing a modules dictionary. IT WILL DIE NOW.") def touch(self): """Creates an empty Modules database on disk, and sets `self.db` to access to it. From 1d10502bd0ed129babd9c4f5b4463e935c56bdce Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 13:23:22 -0600 Subject: [PATCH 174/400] update to get_some_rows_from_table_as_dict so that it works for DBs without unique key in first column of table --- anvio/db.py | 27 ++++++++++++++++++++------- anvio/kegg.py | 3 +++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/anvio/db.py b/anvio/db.py index c31a6afd44..a5970ce207 100644 --- a/anvio/db.py +++ b/anvio/db.py @@ -400,7 +400,7 @@ def get_table_as_dict(self, table_name, table_structure=None, string_the_key=Fal # entry assigns a new `entry_id`, enters the data. it is all good when there is a single process doing it. # but when there are multiple processes running in parallel, sometimes race conditions occur: two processes # learn the max entry id about the same time, and when they finally enter the data to the db, some entries - # end up not being unique. this is a toughie because sometimes entry ids are used to connect distinct + # end up not being unique. this is a toughie because sometimes entry ids are used to connect distinct # information from different tables, so they must be known before the data goes into the database, etc. # when these race conditions occur, anvi'o gives an error telling the user kindly that they are fucked. but in # some cases it is possible to recover from that (THE CODE BELOW TRIES TO DO THAT) by reassigning all ids on the @@ -542,7 +542,7 @@ def get_table_as_dataframe(self, table_name, table_structure = None, columns_of return results_df - def get_some_rows_from_table_as_dict(self, table_name, where_clause, error_if_no_data=True, string_the_key=False): + def get_some_rows_from_table_as_dict(self, table_name, where_clause, error_if_no_data=True, string_the_key=False, row_num_as_key=False): """This is similar to get_table_as_dict, but much less general. get_table_as_dict can do a lot, but it first reads all data into the memory to operate on it. 
@@ -557,16 +557,29 @@ def get_some_rows_from_table_as_dict(self, table_name, where_clause, error_if_no rows = self._exec('''SELECT * FROM %s WHERE %s''' % (table_name, where_clause)).fetchall() + row_num = 0 for row in rows: entry = {} - for i in columns_to_return[1:]: - entry[table_structure[i]] = row[i] + if row_num_as_key: + entry[table_structure[0]] = row[0] + for i in columns_to_return[1:]: + entry[table_structure[i]] = row[i] - if string_the_key: - results_dict[str(row[0])] = entry + if string_the_key: + results_dict[str(row_num)] = entry + else: + results_dict[row_num] = entry else: - results_dict[row[0]] = entry + for i in columns_to_return[1:]: + entry[table_structure[i]] = row[i] + + if string_the_key: + results_dict[str(row[0])] = entry + else: + results_dict[row[0]] = entry + + row_num += 1 if error_if_no_data and not len(results_dict): raise ConfigError("Query on %s with the where clause of '%s' did not return anything." % (table_name, where_clause)) diff --git a/anvio/kegg.py b/anvio/kegg.py index 6b57c55f03..bb784f7f82 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -460,6 +460,9 @@ def __init__(self, args, run=run, progress=progress): self.setup_ko_dict() # read the ko_list file into self.ko_dict + # load existing kegg modules db + self.kegg_db = kegg.KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), args=self.args) + def get_annotation_from_ko_dict(self, knum, ok_if_missing_from_dict=False): if not self.ko_dict: raise ConfigError("Oops! The ko_list file has not been properly loaded, so get_annotation_from_ko_dict() is \ From 5eb5ad0a790d3e8ffdc9d4e04bfd7f107470d169 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 14:34:29 -0600 Subject: [PATCH 175/400] we should use kegg_modules instead of just kegg or just modules wherever possible to avoid confusion later --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index bb784f7f82..6265a9b50c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -461,7 +461,7 @@ def __init__(self, args, run=run, progress=progress): self.setup_ko_dict() # read the ko_list file into self.ko_dict # load existing kegg modules db - self.kegg_db = kegg.KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), args=self.args) + self.kegg_modules_db = kegg.KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), args=self.args) def get_annotation_from_ko_dict(self, knum, ok_if_missing_from_dict=False): if not self.ko_dict: From 0868eb5e0c7d2da11c8142050965ffb07dc17c21 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 14:34:57 -0600 Subject: [PATCH 176/400] modules table accessor method for mnum + data_name --- anvio/kegg.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 6265a9b50c..8e116e38cd 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -828,6 +828,42 @@ def create(self): self.db.disconnect() + + # KEGG Modules Table functions for data access and parsing start below + # ==================================================================== + def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): + """This function returns data_value elements from the modules table for the specified module and data_name pair. + + All elements corresponding to the pair (ie, M00001 and ORTHOLOGY) will be returned. 
+ The function relies on the db.get_some_rows_from_table_as_dict() functino to first fetch all rows corresponding \ + to a particular model, and then parses the resulting dictionary to find all the elements with the given data_name field. + + PARAMETERS + ========== + module_num str, the module to fetch data for + data_name str, which data_name field we want + + RETURNS + ======= + data_values_to_ret list of str, the data_values corresponding to the module/data_name pair + """ + + where_clause_string = "module = '%s'" % (module_num) + dict_from_mod_table = self.db.get_some_rows_from_table_as_dict(self.module_table_name, where_clause_string, row_num_as_key=True) + # the returned dictionary is keyed by an arbitrary integer, and each value is a dict containing one row from the modules table + # ex of one row in this dict: 0: {'module': 'M00001', 'data_name': 'ENTRY', 'data_value': 'M00001', 'data_definition': 'Pathway', 'line': 1} + data_values_to_ret = [] + for key in dict_from_mod_table.keys(): + if dict_from_mod_table[key]['data_name'] == data_name: + data_values_to_ret.append(dict_from_mod_table[key]['data_value']) + + if not data_values_to_ret: # didn't find anything under that data_name + self.run.warning("Just so you know, we tried to fetch data from the KEGG Modules database for the data_name field %s and KEGG module %s, \ + but didn't come up with anything, so an empty list is being returned. This may cause errors down the line, and if so we're very sorry for that.") + + return data_values_to_ret + + class KeggModulesTable: """This class defines operations for creating the KEGG Modules table in Modules.db""" From 4bf9e2feaf7d9ce651b932f42aa92b9c5f7e6e96 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 15:01:02 -0600 Subject: [PATCH 177/400] parsing and accessor functions for class of a module --- anvio/kegg.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 8e116e38cd..f431532b60 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -864,6 +864,28 @@ def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): return data_values_to_ret + def parse_kegg_class_value(self, class_data_val): + """This function takes a data_value string for the CLASS field in the modules table and parses it into a dictionary. + + The data_value string of CLASS fields should look something like this: Pathway modules; Amino acid metabolism; Lysine metabolism + so they can be parsed into 3 parts: class, category, and subcategory. + """ + + fields = class_data_val.split("; ") + class_dict = {"class" : fields[0], "category" : fields[1], "subcategory" : fields[2] if len(fields) > 2 else None} + return class_dict + + def get_kegg_module_class_dict(self, mnum): + """This function returns a dictionary of values in the CLASS field for a specific module + + It really exists only for convenience to put together the data fetch and parsing functions. 
+ """ + + # there should only be one CLASS line per module, so we extract the first list element + class_value = self.get_data_value_entries_for_module_by_data_name(mnum, "CLASS")[0] + return self.parse_kegg_class_value(class_value) + + class KeggModulesTable: """This class defines operations for creating the KEGG Modules table in Modules.db""" From bd286fa37878e4faa414e9cea04f3a38b7c9c8b3 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 16:20:27 -0600 Subject: [PATCH 178/400] keep track of current module being parsed --- anvio/kegg.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index f431532b60..f0bb11085a 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -598,7 +598,7 @@ def touch(self): self.db.create_table(self.module_table_name, self.module_table_structure, self.module_table_types) - def data_vals_sanity_check(self, data_vals, current_data_name): + def data_vals_sanity_check(self, data_vals, current_data_name, current_module_num): """This function checks if the data values were correctly parsed from a line in a KEGG module file. This is a sadly necessary step because some KEGG module file lines are problematic and don't follow the right format (ie, 2+ spaces @@ -607,10 +607,13 @@ def data_vals_sanity_check(self, data_vals, current_data_name): Note that we don't check the following data name types: NAME, CLASS, REFERENCE + WARNING: The error checking and correction is by no means perfect and may well fail when KEGG is next updated. :( + PARAMETERS ========== data_vals str, the data values field (split from the kegg module line) current_data_name str, which data name we are working on. It should never be None because we should have already figured this out by parsing the line. + current_module_num str, which module we are working on. We need this to keep track of which modules throw parsing errors. RETURNS ======= @@ -699,7 +702,7 @@ def data_vals_sanity_check(self, data_vals, current_data_name): return is_ok, corrected_vals, corrected_def - def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None): + def parse_kegg_modules_line(self, line, current_module, line_num = None, current_data_name=None, error_dictionary=None): """This function parses information from one line of a KEGG module file. These files have fields separated by 2 or more spaces. Fields can include data name (not always), data value (always), and data definition (not always). @@ -711,6 +714,7 @@ def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None) PARAMETERS ========== line str, the line to parse + current_module str, which module we are working on. We need this to keep track of which modules throw parsing errors line_num int, which line number we are working on. We need this to keep track of which entities come from the same line of the file. current_data_name str, which data name we are working on. If this is None, we need to parse this info from the first field in the line. 
@@ -739,7 +743,7 @@ def parse_kegg_modules_line(self, line, line_num = None, current_data_name=None) # so no matter which situation, data value is field 1 and data definition (if any) is field 2 data_vals = fields[1] # need to sanity check data value field because SOME modules don't follow the 2-space separation formatting - vals_are_okay, corrected_vals, corrected_def = self.data_vals_sanity_check(data_vals, current_data_name) + vals_are_okay, corrected_vals, corrected_def = self.data_vals_sanity_check(data_vals, current_data_name, current_module) if vals_are_okay and len(fields) > 2: # not all lines have a definition field data_def = fields[2] @@ -799,9 +803,9 @@ def create(self): # here is the tricky bit about parsing these files. Not all lines start with the data_name field; those that don't start with a space. # if this is the case, we need to tell the parsing function what the previous data_name field has been. if line[0] == ' ': - entries_tuple_list = self.parse_kegg_modules_line(line, line_number, prev_data_name_field) + entries_tuple_list = self.parse_kegg_modules_line(line, mnum, line_number, prev_data_name_field) else: - entries_tuple_list = self.parse_kegg_modules_line(line, line_number) + entries_tuple_list = self.parse_kegg_modules_line(line, mnum, line_number) # update prev_data_name_field; use the first (and perhaps only) entry by default prev_data_name_field = entries_tuple_list[0][0] From 7aad6e70cf4eb2f5e9ad74769fd7033cde7d6550 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 16:21:20 -0600 Subject: [PATCH 179/400] keep track of parsing errors in dict for concise output --- anvio/kegg.py | 53 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index f0bb11085a..cbaa8d2462 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -632,6 +632,7 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu # example format: M00175 if data_vals[0] != 'M' or len(data_vals) != 6: is_ok = False + self.parsing_error_dict['bad_kegg_code_format'].append(current_module_num) elif current_data_name == "DEFINITION": # example format: (K01647,K05942) (K01681,K01682) (K00031,K00030) (K00164+K00658+K00382,K00174+K00175-K00177-K00176) # another example: (M00161,M00163) M00165 @@ -639,6 +640,8 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu for k in knums: if k[0] not in ['K','M'] or len(k) != 6: is_ok = False + if not is_ok: # this goes here to avoid counting multiple errors for the same line + self.parsing_error_dict['bad_kegg_code_format'].append(current_module_num) elif current_data_name == "ORTHOLOGY": # example format: K00234,K00235,K00236,K00237 # more complex example: (K00163,K00161+K00162)+K00627+K00382-K13997 @@ -649,6 +652,7 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu is_ok = False # try to fix it by splitting on first space if not is_ok: + self.parsing_error_dict['bad_line_splitting'].append(current_module_num) split_data_vals = data_vals.split(" ", maxsplit=1) corrected_vals = split_data_vals[0] corrected_def = split_data_vals[1] @@ -661,6 +665,7 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu # example format: map00020 if data_vals[0:3] != "map" or len(data_vals) != 8: is_ok = False + self.parsing_error_dict['bad_line_splitting'].append(current_module_num) split_data_vals = data_vals.split(" ", maxsplit=1) corrected_vals = split_data_vals[0] 
corrected_def = split_data_vals[1] @@ -672,6 +677,7 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu if r[0] != 'R' or len(r) != 6: is_ok = False if not is_ok: + self.parsing_error_dict['bad_line_splitting'].append(current_module_num) split_data_vals = data_vals.split(" ", maxsplit=1) corrected_vals = split_data_vals[0] corrected_def = split_data_vals[1] @@ -680,24 +686,28 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu # example format: C00024 if data_vals[0] not in ['C','G'] or len(data_vals) != 6: is_ok = False + self.parsing_error_dict['bad_kegg_code_format'].append(current_module_num) elif current_data_name == "RMODULE": # example format: RM003 if data_vals[0:2] != "RM" or len(data_vals) != 5: is_ok = False + self.parsing_error_dict['bad_kegg_code_format'].append(current_module_num) if not is_ok and not is_corrected: - # in production, this should not end with an error. This raises an error for now just so I can easily find errors that I haven't implemented - # correction for yet + self.num_uncorrected_errors += 1 + # we should allow a --just-do-it option here for people to ignore uncorrected errors raise ConfigError("Found an issue with a KEGG Module line. Data values incorrectly parsed. Current data name is %s, here is the \ incorrectly-formatted data value field: %s" % (current_data_name, data_vals)) if is_corrected: - self.run.warning("While parsing a KEGG Module line, we found an issue with the formatting. We did our very best to parse the line \ - correctly, but please check that it looks right to you by examining the following values.") - self.run.info("Incorrectly parsed data value field", data_vals) - self.run.info("Corrected data values", corrected_vals) - self.run.info("Corrected data definition", corrected_def) + self.num_corrected_errors += 1 + if anvio.DEBUG and not self.quiet: + self.run.warning("While parsing a KEGG Module line, we found an issue with the formatting. We did our very best to parse the line \ + correctly, but please check that it looks right to you by examining the following values.") + self.run.info("Incorrectly parsed data value field", data_vals) + self.run.info("Corrected data values", corrected_vals) + self.run.info("Corrected data definition", corrected_def) return is_ok, corrected_vals, corrected_def @@ -783,6 +793,11 @@ def create(self): # init the Modules table mod_table = KeggModulesTable(self.module_table_name) + # keep track of errors encountered while parsing + self.parsing_error_dict = {"two_definition_fields" : [], "bad_line_splitting" : [], "bad_kegg_code_format" : []} + self.num_corrected_errors = 0 + self.num_uncorrected_errors = 0 + num_modules_parsed = 0 line_number = 0 for mnum in self.module_dict.keys(): @@ -820,10 +835,34 @@ def create(self): num_modules_parsed += 1 self.progress.end() + # warn user about parsing errors + if anvio.DEBUG: + self.run.warning("Several parsing errors were encountered while building the KEGG Modules DB. \ + Below you will see which modules threw each type of parsing error. 
Note that modules which threw multiple \ + errors will occur in the list as many times as it threw each error.") + self.run.info("Two DEFINITION lines (in one module)", self.parsing_error_dict["two_definition_fields"]) + self.run.info("Bad line splitting (usually due to rogue or missing spaces)", self.parsing_error_dict["bad_line_splitting"]) + self.run.info("Bad KEGG code format (usually not correctable)", self.parsing_error_dict["bad_kegg_code_format"]) + else: # less verbose + self.run.warning("First things first - don't panic. Several parsing errors were encountered while building the KEGG Modules DB. But that \ + is probably okay, because if you got to this point it is likely that we already fixed all of them ourselves. So don't worry too much. \ + Below you will see how many of each type of error was encountered. If you would like to see which modules threw these errors, please \ + re-run the setup using the --debug flag (you will also probably need the --reset flag). When doing so, you will also see which lines \ + caused issues; this can be a lot of output, so you can suppress the line-specific output with the --quiet flag if that makes things easier to read. \ + So, in summary: You can probably ignore this warning. But if you want more info: \ + run setup again with --reset --debug --quiet to see exactly which modules had issues, or \ + run --reset --debug to see exactly which lines in which modules had issues. \ + Now, here is a kiss for you because you have been so patient and good with anvi'o 😚") + self.run.info("Two DEFINITION lines (in one module)", len(self.parsing_error_dict["two_definition_fields"])) + self.run.info("Bad line splitting (usually due to rogue or missing spaces)", len(self.parsing_error_dict["bad_line_splitting"])) + self.run.info("Bad KEGG code format (usually not correctable)", len(self.parsing_error_dict["bad_kegg_code_format"])) + # give some run info self.run.info('Modules database', 'A new database, %s, has been created.' 
% (self.db_path), quiet=self.quiet) self.run.info('Number of KEGG modules', num_modules_parsed, quiet=self.quiet) self.run.info('Number of entries', mod_table.get_total_entries(), quiet=self.quiet) + self.run.info('Number of parsing errors (corrected)', self.num_corrected_errors) + self.run.info('Number of parsing errors (uncorrected)', self.num_uncorrected_errors) # record some useful metadata self.db.set_meta_value('db_type', 'modules') From 5a2b444df5131d87d7b3174c1ae939d0eb8c7a5e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 16:21:49 -0600 Subject: [PATCH 180/400] add --quiet param --- anvio/kegg.py | 1 + bin/anvi-setup-kegg-kofams | 1 + 2 files changed, 2 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index cbaa8d2462..b3e84f71d6 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -42,6 +42,7 @@ def __init__(self, args): self.kofam_data_dir = A('kofam_data_dir') or os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') self.orphan_data_dir = os.path.join(self.kofam_data_dir, "orphan_data") self.module_data_dir = os.path.join(self.kofam_data_dir, "modules") + self.quiet = A('quiet') or False # shared variables for all KOfam subclasses self.kofam_hmm_file_path = os.path.join(self.kofam_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index 11ffbd3191..bf880b29ff 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -34,6 +34,7 @@ if __name__ == '__main__': parser.add_argument('--reset', default=False, action="store_true", help="This program by default attempts to use previously\ downloaded files in your KEGG KOfam data directory if there are any. If something is wrong for some reason you\ can use this parameter to tell anvi'o to remove everything, and start over.") + parser.add_argument('--quiet', default=False, action="store_true", help="Use this flag for less verbose output.") args = anvio.get_args(parser) From 57d95bee66198cf62106eca642adc7944c68c2e9 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 16:39:13 -0600 Subject: [PATCH 181/400] count multiple definition field "errors" --- anvio/kegg.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index b3e84f71d6..f40f92fde6 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -750,6 +750,15 @@ def parse_kegg_modules_line(self, line, current_module, line_num = None, current is the line: %s)" % (line)) current_data_name = fields[0] + # find the double DEFINITION field "errors" + elif current_data_name == "DEFINITION" and line[0] == ' ': + self.parsing_error_dict['multiple_definition_fields'].append(current_module) + self.num_uncorrected_errors += 1 + if anvio.DEBUG and not self.quiet: + self.run.warning("While parsing a KEGG Module line, we found more than one DEFINITION field for module %s. This is unusual, but \ + probably not REALLY an error, so we kept the extra line. Please note that we are counting it as an uncorrected error, though. \ + We hope that doesn't freak you out." 
% current_module) + self.run.info("Extra DEFINITION field", line) # note that if data name is known, first field still exists but is actually the empty string '' # so no matter which situation, data value is field 1 and data definition (if any) is field 2 data_vals = fields[1] @@ -795,7 +804,7 @@ def create(self): mod_table = KeggModulesTable(self.module_table_name) # keep track of errors encountered while parsing - self.parsing_error_dict = {"two_definition_fields" : [], "bad_line_splitting" : [], "bad_kegg_code_format" : []} + self.parsing_error_dict = {"multiple_definition_fields" : [], "bad_line_splitting" : [], "bad_kegg_code_format" : []} self.num_corrected_errors = 0 self.num_uncorrected_errors = 0 @@ -841,9 +850,9 @@ def create(self): self.run.warning("Several parsing errors were encountered while building the KEGG Modules DB. \ Below you will see which modules threw each type of parsing error. Note that modules which threw multiple \ errors will occur in the list as many times as it threw each error.") - self.run.info("Two DEFINITION lines (in one module)", self.parsing_error_dict["two_definition_fields"]) + self.run.info("Multiple DEFINITION lines (not corrected, but probably fine)", self.parsing_error_dict["multiple_definition_fields"]) self.run.info("Bad line splitting (usually due to rogue or missing spaces)", self.parsing_error_dict["bad_line_splitting"]) - self.run.info("Bad KEGG code format (usually not correctable)", self.parsing_error_dict["bad_kegg_code_format"]) + self.run.info("Bad KEGG code format (not corrected; possibly problematic)", self.parsing_error_dict["bad_kegg_code_format"]) else: # less verbose self.run.warning("First things first - don't panic. Several parsing errors were encountered while building the KEGG Modules DB. But that \ is probably okay, because if you got to this point it is likely that we already fixed all of them ourselves. So don't worry too much. \ @@ -854,7 +863,7 @@ def create(self): run setup again with --reset --debug --quiet to see exactly which modules had issues, or \ run --reset --debug to see exactly which lines in which modules had issues. \ Now, here is a kiss for you because you have been so patient and good with anvi'o 😚") - self.run.info("Two DEFINITION lines (in one module)", len(self.parsing_error_dict["two_definition_fields"])) + self.run.info("Multiple DEFINITION lines (in one module)", len(self.parsing_error_dict["multiple_definition_fields"])) self.run.info("Bad line splitting (usually due to rogue or missing spaces)", len(self.parsing_error_dict["bad_line_splitting"])) self.run.info("Bad KEGG code format (usually not correctable)", len(self.parsing_error_dict["bad_kegg_code_format"])) From b93b302efaa0c5ce2ba647fd5500452dd865746f Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 16:39:44 -0600 Subject: [PATCH 182/400] add quiet option to parsing error counts --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index f40f92fde6..7f273d2aa2 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -871,8 +871,8 @@ def create(self): self.run.info('Modules database', 'A new database, %s, has been created.' 
% (self.db_path), quiet=self.quiet) self.run.info('Number of KEGG modules', num_modules_parsed, quiet=self.quiet) self.run.info('Number of entries', mod_table.get_total_entries(), quiet=self.quiet) - self.run.info('Number of parsing errors (corrected)', self.num_corrected_errors) - self.run.info('Number of parsing errors (uncorrected)', self.num_uncorrected_errors) + self.run.info('Number of parsing errors (corrected)', self.num_corrected_errors, quiet=self.quiet) + self.run.info('Number of parsing errors (uncorrected)', self.num_uncorrected_errors, quiet=self.quiet) # record some useful metadata self.db.set_meta_value('db_type', 'modules') From 65d3979c22f128da9060b45c41a1ed0622edd3ba Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 16:53:15 -0600 Subject: [PATCH 183/400] I broke this somehow, but now I fixed it again --- bin/anvi-setup-kegg-kofams | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index bf880b29ff..7df2495beb 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -7,6 +7,7 @@ import anvio import anvio.kegg as kegg from anvio.errors import ConfigError, FilesNPathsError +from anvio.terminal import time_program __author__ = "Developers of anvi'o (see AUTHORS.txt)" __copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" From c561d9127f626f8352878db462ba2b4cd3775f63 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 17:05:38 -0600 Subject: [PATCH 184/400] argument convention --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 7f273d2aa2..0ccc6580e6 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -713,7 +713,7 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu return is_ok, corrected_vals, corrected_def - def parse_kegg_modules_line(self, line, current_module, line_num = None, current_data_name=None, error_dictionary=None): + def parse_kegg_modules_line(self, line, current_module, line_num=None, current_data_name=None, error_dictionary=None): """This function parses information from one line of a KEGG module file. These files have fields separated by 2 or more spaces. Fields can include data name (not always), data value (always), and data definition (not always). From d628248eb540a5ddc3278a30fd0a901fd7828b14 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 17:10:42 -0600 Subject: [PATCH 185/400] add just-do-it as acceptable param --- bin/anvi-setup-kegg-kofams | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index 7df2495beb..d2a6a04ef2 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -36,6 +36,7 @@ if __name__ == '__main__': downloaded files in your KEGG KOfam data directory if there are any. 
If something is wrong for some reason you\ can use this parameter to tell anvi'o to remove everything, and start over.") parser.add_argument('--quiet', default=False, action="store_true", help="Use this flag for less verbose output.") + parser.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) args = anvio.get_args(parser) From 06e4e88f34fc953b787a24bb66a663e9ae467679 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 17:17:46 -0600 Subject: [PATCH 186/400] get --reset the right way --- bin/anvi-setup-kegg-kofams | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index d2a6a04ef2..12159ddb2e 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -32,9 +32,7 @@ if __name__ == '__main__': up. The advantage of it is that everyone will be using a single data directory, but then you may need\ superuser privileges to do it. Using this parameter you can choose the location of the data directory somewhere\ you like. However, when it is time to run Kofam, you will need to remember that path and provide it to the program.") - parser.add_argument('--reset', default=False, action="store_true", help="This program by default attempts to use previously\ - downloaded files in your KEGG KOfam data directory if there are any. If something is wrong for some reason you\ - can use this parameter to tell anvi'o to remove everything, and start over.") + parser.add_argument(*anvio.A('reset'), **anvio.K('reset')) parser.add_argument('--quiet', default=False, action="store_true", help="Use this flag for less verbose output.") parser.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) From e418eda657e84a0247f66b69b5152b15743dbc90 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 17:22:56 -0600 Subject: [PATCH 187/400] update requires and provides for kegg programs --- bin/anvi-run-kegg-kofams | 4 ++-- bin/anvi-setup-kegg-kofams | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams index 915b31cc23..ea2f9184de 100755 --- a/bin/anvi-run-kegg-kofams +++ b/bin/anvi-run-kegg-kofams @@ -16,8 +16,8 @@ __license__ = "GPL 3.0" __version__ = anvio.__version__ __maintainer__ = "Iva Veseli" __email__ = "iveseli@uchicago.edu" -__requires__ = ['contigs-db', "kofam-data",] -#__provides__ = ## TODO: fill in +__requires__ = ['contigs-db', "kofam-data", "kegg-modules-db",] +__provides__ = ['functions',] __description__ = "Run KOfam HMMs on an anvi'o contigs database." @time_program diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index 12159ddb2e..896286df59 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -15,7 +15,7 @@ __license__ = "GPL 3.0" __version__ = anvio.__version__ __maintainer__ = "Iva Veseli" __email__ = "iveseli@uchicago.edu" -__provides__ = ["kofam-data"] +__provides__ = ["kofam-data", "kegg-modules-db",] __description__ = "Download and setup KEGG KOfam HMM profiles." 
@time_program From 451e1f2e762c18879547ec5b5b1347936575b116 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 17:36:11 -0600 Subject: [PATCH 188/400] argument groups for nicer -h --- bin/anvi-run-kegg-kofams | 9 ++++++--- bin/anvi-setup-kegg-kofams | 9 +++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams index ea2f9184de..e68e7c1b1c 100755 --- a/bin/anvi-run-kegg-kofams +++ b/bin/anvi-run-kegg-kofams @@ -30,9 +30,12 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description=__description__) - parser.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db')) - parser.add_argument(*anvio.A('kofam-data-dir'), **anvio.K('kofam-data-dir')) - parser.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads')) + groupR = parser.add_argument_group('REQUIRED INPUT', 'The stuff you need for this to work.') + groupO = parser.add_argument_group('OPTIONAL INPUT', "The stuff you (probably) don't need.") + + groupR.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db')) + groupO.add_argument(*anvio.A('kofam-data-dir'), **anvio.K('kofam-data-dir')) + groupO.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads')) args = anvio.get_args(parser) diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index 896286df59..b3be48d4c9 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -27,14 +27,15 @@ if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description=__description__) - parser.add_argument('--kofam-data-dir', default=None, type=str, help="The directory for KEGG KOfam HMM profiles to be stored. If you leave it\ + groupI = parser.add_argument_group('POSSIBLE INPUT', 'Not required for this program to run, but could be useful.') + groupI.add_argument('--kofam-data-dir', default=None, type=str, help="The directory for KEGG KOfam HMM profiles to be stored. If you leave it\ as is without specifying anything, the default destination for the data directory will be used to set things\ up. The advantage of it is that everyone will be using a single data directory, but then you may need\ superuser privileges to do it. Using this parameter you can choose the location of the data directory somewhere\ you like. 
However, when it is time to run Kofam, you will need to remember that path and provide it to the program.") - parser.add_argument(*anvio.A('reset'), **anvio.K('reset')) - parser.add_argument('--quiet', default=False, action="store_true", help="Use this flag for less verbose output.") - parser.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) + groupI.add_argument(*anvio.A('reset'), **anvio.K('reset')) + groupI.add_argument('--quiet', default=False, action="store_true", help="Use this flag for less verbose output.") + groupI.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) args = anvio.get_args(parser) From c469122e9461fcc67b8feebe7ca8e3c0f8c06ffd Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 5 Mar 2020 22:28:15 -0600 Subject: [PATCH 189/400] allow skipping uncorrectable errors with --just-do-it --- anvio/kegg.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 0ccc6580e6..26db419679 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -43,6 +43,7 @@ def __init__(self, args): self.orphan_data_dir = os.path.join(self.kofam_data_dir, "orphan_data") self.module_data_dir = os.path.join(self.kofam_data_dir, "modules") self.quiet = A('quiet') or False + self.just_do_it = A('just_do_it') # shared variables for all KOfam subclasses self.kofam_hmm_file_path = os.path.join(self.kofam_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms @@ -697,8 +698,12 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu if not is_ok and not is_corrected: self.num_uncorrected_errors += 1 - # we should allow a --just-do-it option here for people to ignore uncorrected errors - raise ConfigError("Found an issue with a KEGG Module line. Data values incorrectly parsed. Current data name is %s, here is the \ + if self.just_do_it: + self.run.warning("While parsing, anvi'o found an uncorrectable issue with a KEGG Module line in module %s, but since you used the --just-do-it flag, \ + anvi'o will quietly ignore this issue and add the line to the MODULES.db anyway. Please be warned that this may break things downstream. \ + In case you are interested, the line causing this issue has data name %s and data value %s" % (current_module_num, current_data_name, data_vals)) + else: + raise ConfigError("Found an issue with a KEGG Module line. Data values incorrectly parsed. Current data name is %s, here is the \ incorrectly-formatted data value field: %s" % (current_data_name, data_vals)) if is_corrected: From 074539c134a084f08beca970ee575d6ed6638eca Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 12:11:13 -0600 Subject: [PATCH 190/400] fix --just-do-it to save the erroneous lines in the DB; this is now tested and it works --- anvio/kegg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 26db419679..d8e3999b3e 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -702,9 +702,12 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu self.run.warning("While parsing, anvi'o found an uncorrectable issue with a KEGG Module line in module %s, but since you used the --just-do-it flag, \ anvi'o will quietly ignore this issue and add the line to the MODULES.db anyway. Please be warned that this may break things downstream. 
\ In case you are interested, the line causing this issue has data name %s and data value %s" % (current_module_num, current_data_name, data_vals)) + is_ok = True # let's pretend that everything is alright so that the next function will take the original parsed values else: - raise ConfigError("Found an issue with a KEGG Module line. Data values incorrectly parsed. Current data name is %s, here is the \ - incorrectly-formatted data value field: %s" % (current_data_name, data_vals)) + raise ConfigError("While parsing, anvi'o found an uncorrectable issue with a KEGG Module line in module %s. The current data name is %s, \ + here is the incorrectly-formatted data value field: %s. If you think this is totally fine and want to ignore errors like this, please \ + re-run the setup with the --just-do-it flag. But if you choose to do that of course we are obliged to inform you that things may eventually \ + break as a result." % (current_module_num, current_data_name, data_vals)) if is_corrected: self.num_corrected_errors += 1 From 4c417eb98b0f154457b78592bc90089e6d20b7f8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 12:29:28 -0600 Subject: [PATCH 191/400] removal of comment build-up --- anvio/kegg.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index d8e3999b3e..b7e4597d3f 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -38,7 +38,7 @@ class KeggContext(object): def __init__(self, args): A = lambda x: args.__dict__[x] if x in args.__dict__ else None - # default directory will be called KEGG and will store the KEGG Module data as well + # default data directory will be called KEGG and will store the KEGG Module data as well self.kofam_data_dir = A('kofam_data_dir') or os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') self.orphan_data_dir = os.path.join(self.kofam_data_dir, "orphan_data") self.module_data_dir = os.path.join(self.kofam_data_dir, "modules") @@ -87,7 +87,6 @@ def setup_ko_dict(self): orphan_ko_headers = ["threshold","score_type","profile_type","F-measure","nseq","nseq_used","alen","mlen","eff_nseq","re/pos", "definition"] utils.store_dict_as_TAB_delimited_file(orphan_ko_dict, orphan_ko_path, key_header="knum", headers=orphan_ko_headers) - # here we remove KOs from the dictionary if they are in the skip list or no threshold list [self.ko_dict.pop(ko) for ko in self.ko_skip_list] [self.ko_dict.pop(ko) for ko in self.ko_no_threshold_list] @@ -165,7 +164,7 @@ def __init__(self, args, run=run, progress=progress): self.database_url = "ftp://ftp.genome.jp/pub/db/kofam" self.files = ['ko_list.gz', 'profiles.tar.gz'] - # Kegg module text file + # Kegg module text files self.kegg_module_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002.keg&format=htext&filedir=" self.kegg_rest_api_get = "http://rest.kegg.jp/get" @@ -299,7 +298,7 @@ def decompress_files(self): self.progress.new('Decompressing file %s' % file_name) full_path = os.path.join(self.kofam_data_dir, file_name) - if full_path.endswith("tar.gz"): # extract tar file instead of doing gzip + if full_path.endswith("tar.gz"): utils.tar_extract_file(full_path, output_file_path = self.kofam_data_dir, keep_original=False) else: utils.gzip_decompress_file(full_path, keep_original=False) @@ -572,7 +571,6 @@ def __init__(self, db_path, args, module_dictionary=None, run=run, progress=prog self.module_table_structure = ['module', 'data_name', 'data_value', 'data_definition', 'line'] self.module_table_types = [ 'str' , 
'str' , 'str' , 'str' ,'numeric' ] - ## here we should call init function if the db exists if os.path.exists(self.db_path): utils.is_kegg_modules_db(self.db_path) self.db = db.DB(self.db_path, anvio.__kegg_modules_version__, new_database=False) @@ -749,7 +747,7 @@ def parse_kegg_modules_line(self, line, current_module, line_num=None, current_d data_def = None line_entries = [] - # data name unknown, parse from first field + # when data name unknown, parse from first field if not current_data_name: # sanity check: if line starts with space then there is no data name field and we should have passed a current_data_name if line[0] == ' ': @@ -768,7 +766,7 @@ def parse_kegg_modules_line(self, line, current_module, line_num=None, current_d We hope that doesn't freak you out." % current_module) self.run.info("Extra DEFINITION field", line) # note that if data name is known, first field still exists but is actually the empty string '' - # so no matter which situation, data value is field 1 and data definition (if any) is field 2 + # so no matter the situation, data value is field 1 and data definition (if any) is field 2 data_vals = fields[1] # need to sanity check data value field because SOME modules don't follow the 2-space separation formatting vals_are_okay, corrected_vals, corrected_def = self.data_vals_sanity_check(data_vals, current_data_name, current_module) @@ -785,10 +783,9 @@ def parse_kegg_modules_line(self, line, current_module, line_num=None, current_d # here we should NOT split on any commas within parentheses for val in re.split(',(?!.*\))', data_vals): line_entries.append((current_data_name, val, data_def, line_num)) - else: # just send what we found without splitting the line + else: line_entries.append((current_data_name, data_vals, data_def, line_num)) - # still need to figure out what to do about REFERENCE info type (includes AUTHORS, TITLE, JOURNAL) - do we want this? return line_entries @@ -840,12 +837,10 @@ def create(self): else: entries_tuple_list = self.parse_kegg_modules_line(line, mnum, line_number) - # update prev_data_name_field; use the first (and perhaps only) entry by default prev_data_name_field = entries_tuple_list[0][0] - # unpack that tuple info for name, val, definition, line in entries_tuple_list: - # call append_and_store which will collect db entries and store every 10000 at a time + # append_and_store will collect db entries and store every 10000 at a time mod_table.append_and_store(self.db, mnum, name, val, definition, line) @@ -918,7 +913,7 @@ def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): if dict_from_mod_table[key]['data_name'] == data_name: data_values_to_ret.append(dict_from_mod_table[key]['data_value']) - if not data_values_to_ret: # didn't find anything under that data_name + if not data_values_to_ret: self.run.warning("Just so you know, we tried to fetch data from the KEGG Modules database for the data_name field %s and KEGG module %s, \ but didn't come up with anything, so an empty list is being returned. 
This may cause errors down the line, and if so we're very sorry for that.") From a9e8a6a8b836c4b9107701b409b02355b9c11b36 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 12:37:50 -0600 Subject: [PATCH 192/400] update docstring --- anvio/db.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/anvio/db.py b/anvio/db.py index a5970ce207..c8c22b82c7 100644 --- a/anvio/db.py +++ b/anvio/db.py @@ -548,7 +548,11 @@ def get_some_rows_from_table_as_dict(self, table_name, where_clause, error_if_no get_table_as_dict can do a lot, but it first reads all data into the memory to operate on it. In some cases the programmer may like to access to only a small fraction of entries in a table by using `WHERE column = value` notation, which is not possible with the more generalized - function.""" + function. + + row_num_as_key bool added as parameter so this function works for KEGG MODULES.db, which does not have unique IDs in the + first column. If True, the returned dictionary will be keyed by integers from 0 to (# rows returned - 1) + """ results_dict = {} From 2f605eadb2ba5badb0784d56ee19f537c4f0d953 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 13:23:22 -0600 Subject: [PATCH 193/400] access function for modules by knum --- anvio/kegg.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index b7e4597d3f..57dda06248 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -919,6 +919,11 @@ def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): return data_values_to_ret + def get_modules_for_knum(self, knum): + """This function returns a list of modules that the given KO number belongs to.""" + where_clause_string = "data_value = '%s'" % (knum) + return self.db.get_single_column_from_table(self.module_table_name, 'module', unique=True, where_clause=where_clause_string) + def parse_kegg_class_value(self, class_data_val): """This function takes a data_value string for the CLASS field in the modules table and parses it into a dictionary. From 8c9a25a8b2e950b22b2e8f0f989a8d6a699ce434 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 13:34:35 -0600 Subject: [PATCH 194/400] access function for class by knum --- anvio/kegg.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 57dda06248..7c3fc46295 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -920,10 +920,20 @@ def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): return data_values_to_ret def get_modules_for_knum(self, knum): - """This function returns a list of modules that the given KO number belongs to.""" + """This function returns a list of modules that the given KO belongs to.""" where_clause_string = "data_value = '%s'" % (knum) return self.db.get_single_column_from_table(self.module_table_name, 'module', unique=True, where_clause=where_clause_string) + def get_module_classes_for_knum(self, knum): + """This function returns the classes for the modules that a given KO belongs to in a dictionary of dictionaries keyed by integer.""" + mods = self.get_modules_for_knum(knum) + module_counter = 0 + all_mods_classes_dict = {} + for mnum in mods: + all_mods_classes_dict[module_counter] = self.get_kegg_module_class_dict(mnum) + module_counter += 1 + return all_mods_classes_dict + def parse_kegg_class_value(self, class_data_val): """This function takes a data_value string for the CLASS field in the modules table and parses it into a dictionary. 
From 6f0ae9318156eac32f9d34f64247a116954da65d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 15:51:09 -0600 Subject: [PATCH 195/400] access functions for module names --- anvio/kegg.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 7c3fc46295..e9f23cbb20 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -934,6 +934,19 @@ def get_module_classes_for_knum(self, knum): module_counter += 1 return all_mods_classes_dict + def get_module_name(self, mnum): + """This function returns the name of the specified KEGG module.""" + where_clause_string = "module = '%s'" % (mnum) + # there should only be one NAME per module, so we return the first list element + return self.get_data_value_entries_for_module_by_data_name(mnum, "NAME")[0] + + def get_module_names_for_knum(self, knum): + """This function returns all names of each KEGG module that the given KO belongs to in a list.""" + mods = self.get_modules_for_knum(knum) + module_names_list = [] + for mnum in mods: + module_names_list.append(self.get_module_name(mnum)) + return module_names_list def parse_kegg_class_value(self, class_data_val): """This function takes a data_value string for the CLASS field in the modules table and parses it into a dictionary. From 0dac17bffa3c9f0b99f17650ce8676c347a68386 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 15:52:06 -0600 Subject: [PATCH 196/400] module class dict is now keyed by module number --- anvio/kegg.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index e9f23cbb20..4e106e132e 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -925,13 +925,11 @@ def get_modules_for_knum(self, knum): return self.db.get_single_column_from_table(self.module_table_name, 'module', unique=True, where_clause=where_clause_string) def get_module_classes_for_knum(self, knum): - """This function returns the classes for the modules that a given KO belongs to in a dictionary of dictionaries keyed by integer.""" + """This function returns the classes for the modules that a given KO belongs to in a dictionary of dictionaries keyed by module number.""" mods = self.get_modules_for_knum(knum) - module_counter = 0 all_mods_classes_dict = {} for mnum in mods: - all_mods_classes_dict[module_counter] = self.get_kegg_module_class_dict(mnum) - module_counter += 1 + all_mods_classes_dict[mnum] = self.get_kegg_module_class_dict(mnum) return all_mods_classes_dict def get_module_name(self, mnum): From 04a760df1c33954b4c573eea2ced066a2d520546 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 15:53:33 -0600 Subject: [PATCH 197/400] module name dict now keyed by module number --- anvio/kegg.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 4e106e132e..4ad66f757d 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -939,12 +939,12 @@ def get_module_name(self, mnum): return self.get_data_value_entries_for_module_by_data_name(mnum, "NAME")[0] def get_module_names_for_knum(self, knum): - """This function returns all names of each KEGG module that the given KO belongs to in a list.""" + """This function returns all names of each KEGG module that the given KO belongs to in a dictionary keyed by module number.""" mods = self.get_modules_for_knum(knum) - module_names_list = [] + module_names = {} for mnum in mods: - module_names_list.append(self.get_module_name(mnum)) - return module_names_list + module_names[mnum] = self.get_module_name(mnum) + 
return module_names def parse_kegg_class_value(self, class_data_val): """This function takes a data_value string for the CLASS field in the modules table and parses it into a dictionary. From 2f4929007a1101d706f2bb900599a1aeebcf0211 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 17:25:57 -0600 Subject: [PATCH 198/400] fix bug, remove undefined class reference --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 4ad66f757d..13d3acbec5 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -462,7 +462,7 @@ def __init__(self, args, run=run, progress=progress): self.setup_ko_dict() # read the ko_list file into self.ko_dict # load existing kegg modules db - self.kegg_modules_db = kegg.KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), args=self.args) + self.kegg_modules_db = KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), args=self.args) def get_annotation_from_ko_dict(self, knum, ok_if_missing_from_dict=False): if not self.ko_dict: From 9afb8dc5ef5ae1be27afaa99eab70ec45f2af699 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 17:26:36 -0600 Subject: [PATCH 199/400] rename accessor function to show it returns a dict --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 13d3acbec5..00a950cd42 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -924,7 +924,7 @@ def get_modules_for_knum(self, knum): where_clause_string = "data_value = '%s'" % (knum) return self.db.get_single_column_from_table(self.module_table_name, 'module', unique=True, where_clause=where_clause_string) - def get_module_classes_for_knum(self, knum): + def get_module_classes_for_knum_as_dict(self, knum): """This function returns the classes for the modules that a given KO belongs to in a dictionary of dictionaries keyed by module number.""" mods = self.get_modules_for_knum(knum) all_mods_classes_dict = {} From 96a5120d518b7ecdb5a5a54630dba338c3e2c6df Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 17:27:08 -0600 Subject: [PATCH 200/400] accessor function to get classes as a list --- anvio/kegg.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 00a950cd42..11e89a89f1 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -932,6 +932,15 @@ def get_module_classes_for_knum_as_dict(self, knum): all_mods_classes_dict[mnum] = self.get_kegg_module_class_dict(mnum) return all_mods_classes_dict + def get_module_classes_for_knum_as_list(self, knum): + """This function returns the classes for the modules that a given KO belongs to as a list of strings.""" + mods = self.get_modules_for_knum(knum) + all_mods_classes_list = [] + for mnum in mods: + mod_class = self.get_data_value_entries_for_module_by_data_name(mnum, "CLASS")[0] + all_mods_classes_list.append(mod_class) + return all_mods_classes_list + def get_module_name(self, mnum): """This function returns the name of the specified KEGG module.""" where_clause_string = "module = '%s'" % (mnum) From 91148477c6ec9facb6028e1d97164f4de40ffc75 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 6 Mar 2020 18:52:36 -0600 Subject: [PATCH 201/400] store module name/class ontology as functional sources --- anvio/kegg.py | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 11e89a89f1..4aa7542cb2 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -517,22 +517,59 
@@ def process_kofam_hmms(self): parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE') search_results_dict = parser.get_search_results(ko_list_dict=self.ko_dict) - # add functions to database + # add functions and KEGG modules info to database functions_dict = {} + kegg_module_names_dict = {} + kegg_module_classes_dict = {} counter = 0 for hmm_hit in search_results_dict.values(): + knum = hmm_hit['gene_name'] functions_dict[counter] = { 'gene_callers_id': hmm_hit['gene_callers_id'], 'source': 'KOfam', - 'accession': hmm_hit['gene_name'], + 'accession': knum, 'function': self.get_annotation_from_ko_dict(hmm_hit['gene_name'], ok_if_missing_from_dict=True), 'e_value': hmm_hit['e_value'], } + # add associated KEGG module information to database + mods = self.kegg_modules_db.get_modules_for_knum(knum) + names = self.kegg_modules_db.get_module_names_for_knum(knum) + classes = self.kegg_modules_db.get_module_classes_for_knum_as_list(knum) + + # FIXME? some KOs are not associated with modules. Should we report this? + if mods: + mod_annotation = "\n".join(mods) + mod_class_annotation = "\n".join(classes) + mod_name_annotation = "" + + for mod in mods: + if mod_name_annotation: + mod_name_annotation += "\n" + names[mod] + else: + mod_name_annotation = names[mod] + + kegg_module_names_dict[counter] = { + 'gene_callers_id': hmm_hit['gene_callers_id'], + 'source': 'KEGG_Module', + 'accession': mod_annotation, + 'function': mod_name_annotation, + 'e_value': None, + } + kegg_module_classes_dict[counter] = { + 'gene_callers_id': hmm_hit['gene_callers_id'], + 'source': 'KEGG_Class', + 'accession': mod_annotation, + 'function': mod_class_annotation, + 'e_value': None, + } + counter += 1 if functions_dict: gene_function_calls_table.create(functions_dict) + gene_function_calls_table.create(kegg_module_names_dict) + gene_function_calls_table.create(kegg_module_classes_dict) else: self.run.warning("KOfam class has no hits to process. Returning empty handed, but still adding KOfam as \ a functional source.") From a5568e06355f7f94cc04cf4d53a01b384c653fdc Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 8 Mar 2020 17:32:12 -0500 Subject: [PATCH 202/400] update error message for when user tries to access a nonexistant modules db --- anvio/kegg.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 4aa7542cb2..346abc0d98 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -615,8 +615,10 @@ def __init__(self, db_path, args, module_dictionary=None, run=run, progress=prog self.run.info('Modules database', 'An existing database, %s, has been loaded.' % self.db_path, quiet=self.quiet) self.run.info('Kegg Modules', '%d found' % self.db.get_meta_value('num_modules'), quiet=self.quiet) else: + # if self.module_dict is None, then we tried to initialize the DB outside of setup if not self.module_dict: - raise ConfigError("ERROR - a new KeggModulesDatabase() cannot be initialized without providing a modules dictionary. IT WILL DIE NOW.") + raise ConfigError("ERROR - a new KeggModulesDatabase() cannot be initialized without providing a modules dictionary. This \ + usually happens when you try to access a Modules DB before one has been setup. Running `anvi-setup-kegg-kofams` may fix this.") def touch(self): """Creates an empty Modules database on disk, and sets `self.db` to access to it. 
From 954e4c022dc958db0821dd02672d0dada67f47b1 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 9 Mar 2020 16:49:09 -0500 Subject: [PATCH 203/400] I looked into the modules with multiple DEFINITION lines and decided that these are not errors after all. So now we do not count them as errors --- anvio/kegg.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 346abc0d98..985d726ec8 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -795,15 +795,6 @@ def parse_kegg_modules_line(self, line, current_module, line_num=None, current_d is the line: %s)" % (line)) current_data_name = fields[0] - # find the double DEFINITION field "errors" - elif current_data_name == "DEFINITION" and line[0] == ' ': - self.parsing_error_dict['multiple_definition_fields'].append(current_module) - self.num_uncorrected_errors += 1 - if anvio.DEBUG and not self.quiet: - self.run.warning("While parsing a KEGG Module line, we found more than one DEFINITION field for module %s. This is unusual, but \ - probably not REALLY an error, so we kept the extra line. Please note that we are counting it as an uncorrected error, though. \ - We hope that doesn't freak you out." % current_module) - self.run.info("Extra DEFINITION field", line) # note that if data name is known, first field still exists but is actually the empty string '' # so no matter the situation, data value is field 1 and data definition (if any) is field 2 data_vals = fields[1] @@ -848,7 +839,7 @@ def create(self): mod_table = KeggModulesTable(self.module_table_name) # keep track of errors encountered while parsing - self.parsing_error_dict = {"multiple_definition_fields" : [], "bad_line_splitting" : [], "bad_kegg_code_format" : []} + self.parsing_error_dict = {"bad_line_splitting" : [], "bad_kegg_code_format" : []} self.num_corrected_errors = 0 self.num_uncorrected_errors = 0 @@ -892,7 +883,6 @@ def create(self): self.run.warning("Several parsing errors were encountered while building the KEGG Modules DB. \ Below you will see which modules threw each type of parsing error. Note that modules which threw multiple \ errors will occur in the list as many times as it threw each error.") - self.run.info("Multiple DEFINITION lines (not corrected, but probably fine)", self.parsing_error_dict["multiple_definition_fields"]) self.run.info("Bad line splitting (usually due to rogue or missing spaces)", self.parsing_error_dict["bad_line_splitting"]) self.run.info("Bad KEGG code format (not corrected; possibly problematic)", self.parsing_error_dict["bad_kegg_code_format"]) else: # less verbose @@ -905,7 +895,6 @@ def create(self): run setup again with --reset --debug --quiet to see exactly which modules had issues, or \ run --reset --debug to see exactly which lines in which modules had issues. 
\ Now, here is a kiss for you because you have been so patient and good with anvi'o 😚") - self.run.info("Multiple DEFINITION lines (in one module)", len(self.parsing_error_dict["multiple_definition_fields"])) self.run.info("Bad line splitting (usually due to rogue or missing spaces)", len(self.parsing_error_dict["bad_line_splitting"])) self.run.info("Bad KEGG code format (usually not correctable)", len(self.parsing_error_dict["bad_kegg_code_format"])) From 5f97dbc28268eaa288b9b3dff948f84e4f5a4e82 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 10:18:18 -0500 Subject: [PATCH 204/400] update requires and provides --- bin/anvi-run-kegg-kofams | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams index e68e7c1b1c..749fe981e4 100755 --- a/bin/anvi-run-kegg-kofams +++ b/bin/anvi-run-kegg-kofams @@ -16,8 +16,8 @@ __license__ = "GPL 3.0" __version__ = anvio.__version__ __maintainer__ = "Iva Veseli" __email__ = "iveseli@uchicago.edu" -__requires__ = ['contigs-db', "kofam-data", "kegg-modules-db",] -__provides__ = ['functions',] +__requires__ = ["contigs-db", "kofam-data", "kegg-modules-db",] +__provides__ = ["kegg-functions",] __description__ = "Run KOfam HMMs on an anvi'o contigs database." @time_program From c2f21b52f8d1b3c97b73006d2eb2c675f8dca372 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 10:18:38 -0500 Subject: [PATCH 205/400] skeleton script for estimating metabolism --- bin/anvi-estimate-kegg-metabolism | 50 +++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 bin/anvi-estimate-kegg-metabolism diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism new file mode 100644 index 0000000000..9dee53c6ae --- /dev/null +++ b/bin/anvi-estimate-kegg-metabolism @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +import sys + +import anvio +import anvio.kegg as kegg + +from anvio.errors import ConfigError, FilesNPathsError +from anvio.terminal import time_program + +__author__ = "Developers of anvi'o (see AUTHORS.txt)" +__copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" +__license__ = "GPL 3.0" +__version__ = anvio.__version__ +__maintainer__ = "Iva Veseli" +__email__ = "iveseli@uchicago.edu" +__requires__ = ["contigs-db", "kofam-data", "kegg-modules-db", "kegg-functions",] +#__provides__ = ["genome-metabolism", "genome-metabolism-txt",] #TODO: update when finished +__description__ = "Reconstructs metabolic pathways and estimates pathway completeness for a given set of contigs." + + +@terminal.time_program +def main(args): + # initialize class + # call estimate driver function + print("Sorry, nothing is implemented yet. :(") + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description=__description__) + + groupI = parser.add_argument_group('INPUT', "The minimum you must provide this program is a contigs database. In which case\ + anvi'o will attempt to estimate metabolism for all contigs in it, assuming that\ + the contigs database represents a single genome. 
If the contigs database is actually\ + a metagenome, you should use the `--metagenome` flag to explicitly declare that.") + groupI.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db', {'required': True})) + groupI.add_argument(*anvio.A('metagenome-mode'), **anvio.K('metagenome-mode')) + + args = anvio.get_args(parser) + + try: + main(args) + except ConfigError as e: + print(e) + sys.exit(-1) + except FilesNPathsError as e: + print(e) + sys.exit(-1) From 1e7dc40e378f2262da4a1692ac276cde8d461cd1 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 11:09:27 -0500 Subject: [PATCH 206/400] change kofam-data-dir arg to the less confusing kegg-data-dir --- anvio/__init__.py | 7 ++++--- bin/anvi-estimate-kegg-metabolism | 1 + bin/anvi-run-kegg-kofams | 2 +- bin/anvi-setup-kegg-kofams | 6 +----- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/anvio/__init__.py b/anvio/__init__.py index 2c90a65d4f..5ac8e4b9fa 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -634,11 +634,12 @@ def get_args(parser): 'help': "The directory path for your Pfam setup. Anvi'o will try to use the default path\ if you do not specify anything."} ), - 'kofam-data-dir': ( - ['--kofam-data-dir'], + 'kegg-data-dir': ( + ['--kegg-data-dir'], {'default': None, 'type': str, - 'help': "The directory path for your KOfam setup. Anvi'o will try to use the default path\ + 'help': "The directory path for your KEGG setup, which will include things like \ + KOfam profiles and KEGG MODULE data. Anvi'o will try to use the default path\ if you do not specify anything."} ), 'hide-outlier-SNVs': ( diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index 9dee53c6ae..b027f9d559 100644 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -37,6 +37,7 @@ if __name__ == '__main__': a metagenome, you should use the `--metagenome` flag to explicitly declare that.") groupI.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db', {'required': True})) groupI.add_argument(*anvio.A('metagenome-mode'), **anvio.K('metagenome-mode')) + groupI.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) args = anvio.get_args(parser) diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams index 749fe981e4..dbebaf8dea 100755 --- a/bin/anvi-run-kegg-kofams +++ b/bin/anvi-run-kegg-kofams @@ -34,7 +34,7 @@ if __name__ == '__main__': groupO = parser.add_argument_group('OPTIONAL INPUT', "The stuff you (probably) don't need.") groupR.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db')) - groupO.add_argument(*anvio.A('kofam-data-dir'), **anvio.K('kofam-data-dir')) + groupO.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) groupO.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads')) args = anvio.get_args(parser) diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index b3be48d4c9..3157f6c7df 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -28,11 +28,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description=__description__) groupI = parser.add_argument_group('POSSIBLE INPUT', 'Not required for this program to run, but could be useful.') - groupI.add_argument('--kofam-data-dir', default=None, type=str, help="The directory for KEGG KOfam HMM profiles to be stored. If you leave it\ - as is without specifying anything, the default destination for the data directory will be used to set things\ - up. 
The advantage of it is that everyone will be using a single data directory, but then you may need\ - superuser privileges to do it. Using this parameter you can choose the location of the data directory somewhere\ - you like. However, when it is time to run Kofam, you will need to remember that path and provide it to the program.") + groupI.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) groupI.add_argument(*anvio.A('reset'), **anvio.K('reset')) groupI.add_argument('--quiet', default=False, action="store_true", help="Use this flag for less verbose output.") groupI.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) From 9b29f6464f40832fe273bcf12b68f7c905a6d155 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 11:15:21 -0500 Subject: [PATCH 207/400] kofam-data-dir -> kegg-data-dir --- anvio/kegg.py | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 985d726ec8..e64f61bfdf 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -39,16 +39,16 @@ class KeggContext(object): def __init__(self, args): A = lambda x: args.__dict__[x] if x in args.__dict__ else None # default data directory will be called KEGG and will store the KEGG Module data as well - self.kofam_data_dir = A('kofam_data_dir') or os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') - self.orphan_data_dir = os.path.join(self.kofam_data_dir, "orphan_data") - self.module_data_dir = os.path.join(self.kofam_data_dir, "modules") + self.kegg_data_dir = A('kegg_data_dir') or os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') + self.orphan_data_dir = os.path.join(self.kegg_data_dir, "orphan_data") + self.module_data_dir = os.path.join(self.kegg_data_dir, "modules") self.quiet = A('quiet') or False self.just_do_it = A('just_do_it') # shared variables for all KOfam subclasses - self.kofam_hmm_file_path = os.path.join(self.kofam_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms - self.ko_list_file_path = os.path.join(self.kofam_data_dir, "ko_list") - self.kegg_module_file = os.path.join(self.kofam_data_dir, "ko00002.keg") + self.kofam_hmm_file_path = os.path.join(self.kegg_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms + self.ko_list_file_path = os.path.join(self.kegg_data_dir, "ko_list") + self.kegg_module_file = os.path.join(self.kegg_data_dir, "ko00002.keg") def setup_ko_dict(self): @@ -154,7 +154,7 @@ def __init__(self, args, run=run, progress=progress): if not args.reset and not anvio.DEBUG: self.is_database_exists() - filesnpaths.gen_output_directory(self.kofam_data_dir, delete_if_exists=args.reset) + filesnpaths.gen_output_directory(self.kegg_data_dir, delete_if_exists=args.reset) filesnpaths.gen_output_directory(self.orphan_data_dir, delete_if_exists=args.reset) filesnpaths.gen_output_directory(self.module_data_dir, delete_if_exists=args.reset) @@ -173,11 +173,11 @@ def is_database_exists(self): """This function determines whether the user has already downloaded the Kofam HMM profiles and KEGG modules.""" if os.path.exists(self.kofam_hmm_file_path): - raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag if you want to re-download it." % self.kofam_data_dir) + raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag if you want to re-download it." 
% self.kegg_data_dir) if os.path.exists(self.kegg_module_file): raise ConfigError("Interestingly, though KOfam HMM profiles are not installed on your system, KEGG module information seems to have been \ - already downloaded in %s. Please use the --reset flag to re-download everything from scratch." % self.kofam_data_dir) + already downloaded in %s. Please use the --reset flag to re-download everything from scratch." % self.kegg_data_dir) if os.path.exists(self.module_data_dir): raise ConfigError("It seems the KEGG module directory %s already exists on your system. This is even more strange because Kofam HMM \ @@ -190,7 +190,7 @@ def download_profiles(self): for file_name in self.files: utils.download_file(self.database_url + '/' + file_name, - os.path.join(self.kofam_data_dir, file_name), progress=self.progress, run=self.run) + os.path.join(self.kegg_data_dir, file_name), progress=self.progress, run=self.run) def process_module_file(self): """This function reads the kegg module file into a dictionary. It should be called during setup to get the KEGG module numbers so that KEGG modules can be downloaded. @@ -296,10 +296,10 @@ def decompress_files(self): for file_name in self.files: self.progress.new('Decompressing file %s' % file_name) - full_path = os.path.join(self.kofam_data_dir, file_name) + full_path = os.path.join(self.kegg_data_dir, file_name) if full_path.endswith("tar.gz"): - utils.tar_extract_file(full_path, output_file_path = self.kofam_data_dir, keep_original=False) + utils.tar_extract_file(full_path, output_file_path = self.kegg_data_dir, keep_original=False) else: utils.gzip_decompress_file(full_path, keep_original=False) @@ -317,7 +317,7 @@ def confirm_downloaded_profiles(self): ko_nums = self.ko_dict.keys() for k in ko_nums: if k not in self.ko_skip_list: - hmm_path = os.path.join(self.kofam_data_dir, "profiles/%s.hmm" % k) + hmm_path = os.path.join(self.kegg_data_dir, "profiles/%s.hmm" % k) if not os.path.exists(hmm_path): raise ConfigError("The KOfam HMM profile at %s does not exist. This probably means that something went wrong \ while downloading the KOfam database. 
Please run `anvi-setup-kegg-kofams` with the --reset \ @@ -344,7 +344,7 @@ def move_orphan_files(self): no_data_path = os.path.join(self.orphan_data_dir, "03_hmm_profiles_with_ko_fams_with_no_data.hmm") no_data_file_list = [] - hmm_list = [k for k in glob.glob(os.path.join(self.kofam_data_dir, 'profiles/*.hmm'))] + hmm_list = [k for k in glob.glob(os.path.join(self.kegg_data_dir, 'profiles/*.hmm'))] for hmm_file in hmm_list: ko = re.search('profiles/(K\d{5})\.hmm', hmm_file).group(1) if ko not in self.ko_dict.keys(): @@ -382,25 +382,25 @@ def run_hmmpress(self): """This function concatenates the Kofam profiles and runs hmmpress on them.""" self.progress.new('Preparing Kofam HMM Profiles') - log_file_path = os.path.join(self.kofam_data_dir, '00_hmmpress_log.txt') + log_file_path = os.path.join(self.kegg_data_dir, '00_hmmpress_log.txt') - self.progress.update('Verifying the Kofam directory %s contains all HMM profiles' % self.kofam_data_dir) + self.progress.update('Verifying the Kofam directory %s contains all HMM profiles' % self.kegg_data_dir) self.confirm_downloaded_profiles() self.progress.update('Handling orphan files') self.move_orphan_files() self.progress.update('Concatenating HMM profiles into one file...') - hmm_list = [k for k in glob.glob(os.path.join(self.kofam_data_dir, 'profiles/*.hmm'))] + hmm_list = [k for k in glob.glob(os.path.join(self.kegg_data_dir, 'profiles/*.hmm'))] utils.concatenate_files(self.kofam_hmm_file_path, hmm_list, remove_concatenated_files=False) # there is no reason to keep the original HMM profiles around, unless we are debugging if not anvio.DEBUG: - shutil.rmtree((os.path.join(self.kofam_data_dir, "profiles"))) + shutil.rmtree((os.path.join(self.kegg_data_dir, "profiles"))) self.progress.update('Running hmmpress...') cmd_line = ['hmmpress', self.kofam_hmm_file_path] - log_file_path = os.path.join(self.kofam_data_dir, '00_hmmpress_log.txt') + log_file_path = os.path.join(self.kegg_data_dir, '00_hmmpress_log.txt') ret_val = utils.run_command(cmd_line, log_file_path) if ret_val: @@ -415,7 +415,7 @@ def run_hmmpress(self): def setup_modules_db(self): """This function creates the Modules DB from the Kegg Module files. """ - mod_db = KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), args=self.args, module_dictionary=self.module_dict, run=run, progress=progress) + mod_db = KeggModulesDatabase(os.path.join(self.kegg_data_dir, "MODULES.db"), args=self.args, module_dictionary=self.module_dict, run=run, progress=progress) mod_db.create() @@ -455,14 +455,14 @@ def __init__(self, args, run=run, progress=progress): raise ConfigError("Anvi'o is unable to find the Kofam.hmm file at %s. This can happen one of two ways. Either you \ didn't specify the correct Kofam data directory using the flag --kofam-data-dir, or you haven't \ yet set up the Kofam data by running `anvi-setup-kegg-kofams`. Hopefully you now know what to do \ - to fix this problem. :) " % self.kofam_data_dir) + to fix this problem. 
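The concatenate-and-press step above reduces to joining the per-KO profile files into a single Kofam.hmm and running HMMER's hmmpress on the result. A rough standalone equivalent with placeholder paths (anvi'o drives this through utils.concatenate_files and utils.run_command):

    import glob
    import subprocess

    kegg_dir = "/path/to/KEGG"                      # placeholder for kegg_data_dir

    # concatenate every single-KO profile into one HMM database
    hmm_files = sorted(glob.glob(kegg_dir + "/profiles/*.hmm"))
    with open(kegg_dir + "/Kofam.hmm", "w") as out:
        for path in hmm_files:
            with open(path) as f:
                out.write(f.read())

    # index the concatenated database so hmmscan can use it
    ret = subprocess.call(["hmmpress", kegg_dir + "/Kofam.hmm"])
    if ret:
        raise RuntimeError("hmmpress failed, check the log file")
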
:) " % self.kegg_data_dir) utils.is_contigs_db(self.contigs_db_path) self.setup_ko_dict() # read the ko_list file into self.ko_dict # load existing kegg modules db - self.kegg_modules_db = KeggModulesDatabase(os.path.join(self.kofam_data_dir, "MODULES.db"), args=self.args) + self.kegg_modules_db = KeggModulesDatabase(os.path.join(self.kegg_data_dir, "MODULES.db"), args=self.args) def get_annotation_from_ko_dict(self, knum, ok_if_missing_from_dict=False): if not self.ko_dict: From b7033b94b3cf5efda4e045864ee44ba239b7d48b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 11:21:13 -0500 Subject: [PATCH 208/400] KOfam -> KEGG wherever necessary --- anvio/kegg.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index e64f61bfdf..8ee7dbee23 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -34,7 +34,7 @@ class KeggContext(object): - """The purpose of this base class is to define shared functions and file paths for all KOfam operations.""" + """The purpose of this base class is to define shared functions and file paths for all KEGG operations.""" def __init__(self, args): A = lambda x: args.__dict__[x] if x in args.__dict__ else None @@ -45,14 +45,14 @@ def __init__(self, args): self.quiet = A('quiet') or False self.just_do_it = A('just_do_it') - # shared variables for all KOfam subclasses + # shared variables for all KEGG subclasses self.kofam_hmm_file_path = os.path.join(self.kegg_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms self.ko_list_file_path = os.path.join(self.kegg_data_dir, "ko_list") self.kegg_module_file = os.path.join(self.kegg_data_dir, "ko00002.keg") def setup_ko_dict(self): - """The purpose of this function is to process the ko_list file into usable form by Kofam sub-classes. + """The purpose of this function is to process the ko_list file into usable form by KEGG sub-classes. The ko_list file (which is downloaded along with the KOfam HMM profiles) contains important information for each KEGG Orthology number (KO, or knum), incuding pre-defined scoring thresholds @@ -64,7 +64,7 @@ def setup_ko_dict(self): K00001 329.57 domain trim 0.231663 1473 1069 1798 371 17.12 0.590 alcohol dehydrogenase [EC:1.1.1.1] Since this information is useful for both the setup process (we need to know all the knums) and HMM process, - all Kofam subclasses need to have access to this dictionary. + all KEGG subclasses need to have access to this dictionary. This is a dictionary (indexed by knum) of dictionaries(indexed by column name). Here is an example of the dictionary structure: @@ -74,9 +74,9 @@ def setup_ko_dict(self): self.ko_dict = utils.get_TAB_delimited_file_as_dictionary(self.ko_list_file_path) self.ko_skip_list, self.ko_no_threshold_list = self.get_ko_skip_list() - # if we are currently setting up KOfams, we should generate a text file with the ko_list entries + # if we are currently setting up KEGG, we should generate a text file with the ko_list entries # of the KOs that have no scoring threshold - if self.__class__.__name__ in ['KofamSetup']: + if self.__class__.__name__ in ['KeggSetup']: orphan_ko_dict = {ko:self.ko_dict[ko] for ko in self.ko_skip_list} orphan_ko_dict.update({ko:self.ko_dict[ko] for ko in self.ko_no_threshold_list}) @@ -133,7 +133,10 @@ def get_ko_skip_list(self): return skip_list, no_threshold_list class KeggSetup(KeggContext): - """Class for setting up KEGG Kofam HMM profiles. 
It performs sanity checks and downloads, unpacks, and prepares the profiles for later use by `hmmscan`. + """Class for setting up KEGG Kofam HMM profiles and modules. + + It performs sanity checks and downloads, unpacks, and prepares the profiles for later use by `hmmscan`. + It also downloads module files and creates the MODULES.db. Parameters ========== @@ -420,7 +423,7 @@ def setup_modules_db(self): def setup_profiles(self): - """This is a driver function which executes the Kofam setup process by downloading, decompressing, and hmmpressing the profiles.""" + """This is a driver function which executes the KEGG setup process by downloading, decompressing, and hmmpressing the profiles.""" self.download_profiles() self.decompress_files() @@ -453,7 +456,7 @@ def __init__(self, args, run=run, progress=progress): # verify that Kofam HMM profiles have been set up if not os.path.exists(self.kofam_hmm_file_path): raise ConfigError("Anvi'o is unable to find the Kofam.hmm file at %s. This can happen one of two ways. Either you \ - didn't specify the correct Kofam data directory using the flag --kofam-data-dir, or you haven't \ + didn't specify the correct KEGG data directory using the flag --kegg-data-dir, or you haven't \ yet set up the Kofam data by running `anvi-setup-kegg-kofams`. Hopefully you now know what to do \ to fix this problem. :) " % self.kegg_data_dir) @@ -587,7 +590,7 @@ def process_kofam_hmms(self): class KeggModulesDatabase(KeggContext): """To create or access a Modules DB. - This DB should be created in the Kegg Data folder during Kofam setup, and will be populated with information from the + This DB should be created in the Kegg Data folder during KEGG setup, and will be populated with information from the Kegg Module files. """ From 64b5c29f6c593056ba0f276d215c96154fe63c16 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 11:25:21 -0500 Subject: [PATCH 209/400] metabolism estimator class with a basic init --- anvio/kegg.py | 27 +++++++++++++++++++++++++++ bin/anvi-estimate-kegg-metabolism | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 8ee7dbee23..6bb04362b1 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -587,6 +587,33 @@ def process_kofam_hmms(self): shutil.rmtree(tmp_directory_path) hmmer.clean_tmp_dirs() +class KeggMetabolismEstimator(KeggContext): + """ Class for reconstructing/estimating metabolism based on hits to KEGG databases. + + ========== + args: Namespace object + All the arguments supplied by user to anvi-estimate-kegg-metabolism + """ + + def __init__(self, args, run=run, progress=progress): + self.args = args + self.run = run + self.progress = progress + self.contigs_db_path = args.contigs_db + + # init the base class + KeggContext.__init__(self, self.args) + + # load existing kegg modules db + if not os.path.exists(os.path.join(self.kegg_data_dir, "MODULES.db")): + raise ConfigError("It appears that a modules database (%s) does not exist in the KEGG data directory %s. \ + Perhaps you need to specify a different KEGG directory using --kegg-data-dir. Or perhaps you didn't run \ + `anvi-setup-kegg-kofams`, though we are not sure how you got to this point in that case \ + since you also cannot run `anvi-run-kegg-kofams` without first having run KEGG setup. But fine. Hopefully \ + you now know what you need to do to make this message go away." 
% ("MODULES.db", self.kegg_data_dir)) + self.kegg_modules_db = KeggModulesDatabase(os.path.join(self.kegg_data_dir, "MODULES.db"), args=self.args) + + class KeggModulesDatabase(KeggContext): """To create or access a Modules DB. diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index b027f9d559..fc20fdf7ec 100644 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -22,7 +22,7 @@ __description__ = "Reconstructs metabolic pathways and estimates pathway complet @terminal.time_program def main(args): - # initialize class + m = kegg.KeggMetabolismEstimator(args) # call estimate driver function print("Sorry, nothing is implemented yet. :(") From 0bb6cb78956ff7c90e2c91cb9b0b9f9ddd8c67f9 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 14:54:43 -0500 Subject: [PATCH 210/400] make executable --- bin/anvi-estimate-kegg-metabolism | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/anvi-estimate-kegg-metabolism diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism old mode 100644 new mode 100755 From 319c9d2410d670f31aa9e8407d328345dbe4d9d2 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 14:57:44 -0500 Subject: [PATCH 211/400] bug fix --- bin/anvi-estimate-kegg-metabolism | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index fc20fdf7ec..993eafbcae 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -20,7 +20,7 @@ __requires__ = ["contigs-db", "kofam-data", "kegg-modules-db", "kegg-functions", __description__ = "Reconstructs metabolic pathways and estimates pathway completeness for a given set of contigs." 
-@terminal.time_program +@time_program def main(args): m = kegg.KeggMetabolismEstimator(args) # call estimate driver function From f3fb3d4f2501cec4d221721469db688bc4c6aec5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 18:37:24 -0500 Subject: [PATCH 212/400] additional imports that were needed --- anvio/kegg.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 6bb04362b1..000cc67539 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -15,11 +15,13 @@ import anvio.utils as utils import anvio.terminal as terminal import anvio.filesnpaths as filesnpaths +import anvio.tables as t from anvio.errors import ConfigError, FilesNPathsError from anvio.drivers.hmmer import HMMer from anvio.parsers import parser_modules from anvio.tables.genefunctions import TableForGeneFunctions +from anvio.dbops import ContigsSuperclass, ContigsDatabase __author__ = "Developers of anvi'o (see AUTHORS.txt)" __copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" @@ -486,7 +488,7 @@ def process_kofam_hmms(self): """This is a driver function for running HMMs against the KOfam database and processing the hits into the provided contigs DB""" tmp_directory_path = filesnpaths.get_temp_directory_path() - contigs_db = dbops.ContigsSuperclass(self.args) # initialize contigs db + contigs_db = ContigsSuperclass(self.args) # initialize contigs db # get AA sequences as FASTA target_files_dict = {'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')} From 1a6e5c7965e85160b7254f636cd2839a31525d42 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 18:37:53 -0500 Subject: [PATCH 213/400] sanity check on contigs db --- anvio/kegg.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 000cc67539..2356eb631a 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -601,11 +601,15 @@ def __init__(self, args, run=run, progress=progress): self.args = args self.run = run self.progress = progress - self.contigs_db_path = args.contigs_db + + A = lambda x: args.__dict__[x] if x in args.__dict__ else None + self.contigs_db_path = A('contigs_db') # init the base class KeggContext.__init__(self, self.args) + utils.is_contigs_db(self.contigs_db_path) + # load existing kegg modules db if not os.path.exists(os.path.join(self.kegg_data_dir, "MODULES.db")): raise ConfigError("It appears that a modules database (%s) does not exist in the KEGG data directory %s. \ From 660ed3db266b738ee60960986e3876dad5cdc5f5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 18:38:28 -0500 Subject: [PATCH 214/400] initialization and the beginnings of an estimate (driver) function --- anvio/kegg.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 2356eb631a..bf461c0999 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -619,6 +619,34 @@ def __init__(self, args, run=run, progress=progress): you now know what you need to do to make this message go away." % ("MODULES.db", self.kegg_data_dir)) self.kegg_modules_db = KeggModulesDatabase(os.path.join(self.kegg_data_dir, "MODULES.db"), args=self.args) + def init_hits_and_splits(self): + """This function loads splits and KOfam hits from the contigs DB. + + We will need the hits with their KO numbers (accessions) so that we can go through the MODULES.db and determine + which steps are present in each module. 
And we will need the splits so that we can determine which hits belong + to which genomes/bins when we are handling multiple of these. + """ + + self.progress.new('Loading') + self.progress.update('Contigs DB') + contigs_db = ContigsDatabase(self.contigs_db_path, run=self.run, progress=self.progress) + self.contigs_db_project_name = contigs_db.meta['project_name'] + self.progress.update('Splits') + genes_in_splits = contigs_db.db.get_some_columns_from_table(t.genes_in_splits_table_name, "split, gene_callers_id") + self.progress.update('KOfam hits') + kofam_hits = contigs_db.db.get_some_columns_from_table(t.gene_function_calls_table_name, "gene_callers_id, accession", + where_clause="source = 'KOfam'") + + self.progress.end() + + def estimate_metabolism(self): + """This is the driver function for estimating metabolism. + + It will decide what to do based on whether the input contigs DB is a genome or metagenome. + """ + + self.init_hits_and_splits() + class KeggModulesDatabase(KeggContext): """To create or access a Modules DB. From bb99b46b2f963d523e293a7455858428b466dd02 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 18:39:00 -0500 Subject: [PATCH 215/400] now our script calls the driver function --- bin/anvi-estimate-kegg-metabolism | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index 993eafbcae..2559c69f30 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -23,8 +23,7 @@ __description__ = "Reconstructs metabolic pathways and estimates pathway complet @time_program def main(args): m = kegg.KeggMetabolismEstimator(args) - # call estimate driver function - print("Sorry, nothing is implemented yet. :(") + m.estimate_metabolism() if __name__ == '__main__': import argparse From a46662a305d5ae0ed2ec4f1231db18e31728ea0c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 19:01:38 -0500 Subject: [PATCH 216/400] clean up gene calls that do not have kofam hits --- anvio/kegg.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index bf461c0999..fca045fbe6 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -636,6 +636,20 @@ def init_hits_and_splits(self): self.progress.update('KOfam hits') kofam_hits = contigs_db.db.get_some_columns_from_table(t.gene_function_calls_table_name, "gene_callers_id, accession", where_clause="source = 'KOfam'") + contigs_db.disconnect() + + # get rid of gene calls in genes_in_splits that are not associated with KOfam hits. + # Perhaps this is not a necessary step. But it makes me feel clean. 
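The two table reads above amount to a couple of plain SELECTs against the contigs database. A rough sqlite3 sketch for orientation; the literal table names below are assumptions standing in for t.genes_in_splits_table_name and t.gene_function_calls_table_name:

    import sqlite3

    conn = sqlite3.connect("CONTIGS.db")            # placeholder path

    # (split, gene_callers_id) tuples: which gene call lives on which split
    genes_in_splits = conn.execute(
        "SELECT split, gene_callers_id FROM genes_in_splits").fetchall()

    # (gene_callers_id, accession) tuples, restricted to the KOfam annotation source
    kofam_hits = conn.execute(
        "SELECT gene_callers_id, accession FROM gene_functions "
        "WHERE source = 'KOfam'").fetchall()

    conn.close()
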
+ all_gene_calls_in_splits = set([tpl[1] for tpl in genes_in_splits]) + gene_calls_with_kofam_hits = set([tpl[0] for tpl in kofam_hits]) + gene_calls_without_kofam_hits = all_gene_calls_in_splits.difference(gene_calls_with_kofam_hits) + + if gene_calls_without_kofam_hits: + self.progress.update("Removing %s gene calls without KOfam hits" % len(gene_calls_without_kofam_hits)) + genes_in_splits = [tpl for tpl in genes_in_splits if tpl[1] not in gene_calls_without_kofam_hits] + if anvio.DEBUG: + self.run.warning("The following gene calls in your contigs DB were removed from consideration as they \ + do not have any hits to the KOfam database: %s" % (gene_calls_without_kofam_hits)) self.progress.end() From 2ffb4b6fe98d7222067a7df93cde5f927c77babb Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 19:06:58 -0500 Subject: [PATCH 217/400] print some initialization info --- anvio/kegg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index fca045fbe6..3098861a44 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -653,6 +653,9 @@ def init_hits_and_splits(self): self.progress.end() + self.run.info("Contigs DB", 'An existing database, %s, has been loaded.' % self.contigs_db_path, quiet=self.quiet) + self.run.info("KOfam hits", "%d found" % len(kofam_hits)) + def estimate_metabolism(self): """This is the driver function for estimating metabolism. From 379689e5ce934d4204af74998a08128a27a33380 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 10 Mar 2020 19:17:09 -0500 Subject: [PATCH 218/400] add quiet parameter --- anvio/kegg.py | 2 +- bin/anvi-estimate-kegg-metabolism | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 3098861a44..b3dc1b90cb 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -654,7 +654,7 @@ def init_hits_and_splits(self): self.progress.end() self.run.info("Contigs DB", 'An existing database, %s, has been loaded.' % self.contigs_db_path, quiet=self.quiet) - self.run.info("KOfam hits", "%d found" % len(kofam_hits)) + self.run.info("KOfam hits", "%d found" % len(kofam_hits), quiet=self.quiet) def estimate_metabolism(self): """This is the driver function for estimating metabolism. diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index 2559c69f30..2108ddd63f 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -38,6 +38,9 @@ if __name__ == '__main__': groupI.add_argument(*anvio.A('metagenome-mode'), **anvio.K('metagenome-mode')) groupI.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) + groupC = parser.add_argument_group('CONTROL', "Customization parameters. 
Take the helm, captain.") + groupC.add_argument('--quiet', default=False, action="store_true", help="Use this flag for less verbose output.") + args = anvio.get_args(parser) try: From 560c297635f716471a13f6ae1dd27067c378e4dc Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 11 Mar 2020 12:01:04 -0500 Subject: [PATCH 219/400] add params for working with profiles --- anvio/kegg.py | 15 +++++++++++++++ bin/anvi-estimate-kegg-metabolism | 9 +++++++++ 2 files changed, 24 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index b3dc1b90cb..8aa06624f0 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -604,6 +604,21 @@ def __init__(self, args, run=run, progress=progress): A = lambda x: args.__dict__[x] if x in args.__dict__ else None self.contigs_db_path = A('contigs_db') + self.profile_db_path = A('profile_db') + self.collection_name = A('collection_name') + self.bin_id = A('bin_id') + self.bin_ids_file = A('bin_ids_file') + + self.bin_ids_to_process = None + if self.bin_id and self.bin_ids_file: + raise ConfigError("You have provided anvi'o with both the individual bin id %s and a file with bin ids (%s). \ + Please make up your mind. Which one do you want an estimate for? :)" % (self.bin_id, self.bin_ids_file)) + elif self.bin_id: + self.bin_ids_to_process = [self.bin_id] + elif self.bin_ids_file: + filesnpaths.is_file_exists(self.bin_ids_file) + self.bin_ids_to_process = [line.strip() for line in open(self.bin_ids_file).readlines()] + # init the base class KeggContext.__init__(self, self.args) diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index 2108ddd63f..19f0198168 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -38,6 +38,15 @@ if __name__ == '__main__': groupI.add_argument(*anvio.A('metagenome-mode'), **anvio.K('metagenome-mode')) groupI.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) + groupP = parser.add_argument_group('ADDITIONAL INPUT', "If you also provide a profile database AND a collection name, anvi'o will \ + estimate metabolism separately for each bin in your collection. You can also limit \ + those estimates to a specific bin or set of bins in the collection using the parameters \ + `--bin-id` or `--bin-ids-file`, respectively.") + groupP.add_argument(*anvio.A('profile-db'), **anvio.K('profile-db', {'required': False})) + groupP.add_argument(*anvio.A('collection-name'), **anvio.K('collection-name')) + groupP.add_argument(*anvio.A('bin-id'), **anvio.K('bin-id')) + groupP.add_argument(*anvio.A('bin-ids-file'), **anvio.K('bin-ids-file')) + groupC = parser.add_argument_group('CONTROL', "Customization parameters. Take the helm, captain.") groupC.add_argument('--quiet', default=False, action="store_true", help="Use this flag for less verbose output.") From 40aab01dd0141645fd3a51ccd4579b1a979b8d60 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 11 Mar 2020 12:02:02 -0500 Subject: [PATCH 220/400] remove quiet param, this should be global --- bin/anvi-estimate-kegg-metabolism | 3 --- 1 file changed, 3 deletions(-) diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index 19f0198168..bcc59b781d 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -47,9 +47,6 @@ if __name__ == '__main__': groupP.add_argument(*anvio.A('bin-id'), **anvio.K('bin-id')) groupP.add_argument(*anvio.A('bin-ids-file'), **anvio.K('bin-ids-file')) - groupC = parser.add_argument_group('CONTROL', "Customization parameters. 
Take the helm, captain.") - groupC.add_argument('--quiet', default=False, action="store_true", help="Use this flag for less verbose output.") - args = anvio.get_args(parser) try: From 13d11e406153a5701282f545d42069de55e1ae47 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 11 Mar 2020 12:12:27 -0500 Subject: [PATCH 221/400] remove quiet param, it is global --- bin/anvi-setup-kegg-kofams | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index 3157f6c7df..0ab10aceec 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -30,7 +30,6 @@ if __name__ == '__main__': groupI = parser.add_argument_group('POSSIBLE INPUT', 'Not required for this program to run, but could be useful.') groupI.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) groupI.add_argument(*anvio.A('reset'), **anvio.K('reset')) - groupI.add_argument('--quiet', default=False, action="store_true", help="Use this flag for less verbose output.") groupI.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) args = anvio.get_args(parser) From 2f02eabf842187c47176a9922ddea6f4191d193b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 11 Mar 2020 16:35:17 -0500 Subject: [PATCH 222/400] remove unprofiled splits --- anvio/kegg.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 8aa06624f0..f5e179e9fd 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -651,6 +651,7 @@ def init_hits_and_splits(self): self.progress.update('KOfam hits') kofam_hits = contigs_db.db.get_some_columns_from_table(t.gene_function_calls_table_name, "gene_callers_id, accession", where_clause="source = 'KOfam'") + min_contig_length_in_contigs_db = contigs_db.db.get_max_value_in_column(t.contigs_info_table_name, "length", return_min_instead=True) contigs_db.disconnect() # get rid of gene calls in genes_in_splits that are not associated with KOfam hits. @@ -666,6 +667,34 @@ def init_hits_and_splits(self): self.run.warning("The following gene calls in your contigs DB were removed from consideration as they \ do not have any hits to the KOfam database: %s" % (gene_calls_without_kofam_hits)) + # get rid of splits (and their associated gene calls) that are not in the profile DB + if self.profile_db_path: + split_names_in_profile_db = set(utils.get_all_item_names_from_the_database(self.profile_db_path)) + split_names_in_contigs_db = set([tpl[0] for tpl in genes_in_splits]) + splits_missing_in_profile_db = split_names_in_contigs_db.difference(split_names_in_profile_db) + + min_contig_length_in_profile_db = ProfileDatabase(self.profile_db_path).meta['min_contig_length'] + + if len(splits_missing_in_profile_db): + self.progress.reset() + self.run.warning("Please note that anvi'o found %s splits in your contigs database with KOfam hits. But only %s of them " + "appear in the profile database. As a result, anvi'o will now remove the %s splits with KOfam hits" + "that occur only in the contigs db from all downstream analyses. Where is this difference coming from though? " + "Well. This is often the case because the 'minimum contig length parameter' set during the `anvi-profile` " + "step can exclude many contigs from downstream analyses (often for good reasons, too). For " + "instance, in your case the minimum contig length goes as low as %s nts in your contigs database. " + "Yet, the minimum contig length set in the profile databaes is %s nts. Hence the difference. 
Anvi'o " + "hopes that this explaines some things." % (pp(len(split_names_in_contigs_db)), + pp(len(split_names_in_profile_db)), + pp(len(splits_missing_in_profile_db)), + pp(min_contig_length_in_contigs_db), + pp(min_contig_length_in_profile_db))) + + self.progress.update("Removing %s splits (and associated gene calls) that were missing from the profile db" % pp(len(splits_missing_in_profile_db))) + genes_in_splits = [tpl for tpl in genes_in_splits if tpl[0] not in splits_missing_in_profile_db] + remaining_gene_calls = [tpl[1] for tpl in genes_in_splits] + kofam_hits = [tpl for tpl in kofam_hits if tpl[0] in remaining_gene_calls] + self.progress.end() self.run.info("Contigs DB", 'An existing database, %s, has been loaded.' % self.contigs_db_path, quiet=self.quiet) From c947e07e9815f7461bca6b418fd5a3f321741d39 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 12 Mar 2020 11:58:17 -0500 Subject: [PATCH 223/400] import profile db --- anvio/kegg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index f5e179e9fd..6a0ec28096 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -21,7 +21,8 @@ from anvio.drivers.hmmer import HMMer from anvio.parsers import parser_modules from anvio.tables.genefunctions import TableForGeneFunctions -from anvio.dbops import ContigsSuperclass, ContigsDatabase +from anvio.dbops import ContigsSuperclass, ContigsDatabase, ProfileSuperclass, ProfileDatabase + __author__ = "Developers of anvi'o (see AUTHORS.txt)" __copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" From ff1341dafbfd044676992c56d740b9d3d78d9ad0 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 12 Mar 2020 18:01:25 -0500 Subject: [PATCH 224/400] update run info --- anvio/kegg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 6a0ec28096..7209097a94 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -698,8 +698,9 @@ def init_hits_and_splits(self): self.progress.end() - self.run.info("Contigs DB", 'An existing database, %s, has been loaded.' % self.contigs_db_path, quiet=self.quiet) + self.run.info("Contigs DB", self.contigs_db_path, quiet=self.quiet) self.run.info("KOfam hits", "%d found" % len(kofam_hits), quiet=self.quiet) + self.run.info("Profile DB", self.profile_db_path, quiet=self.quiet) def estimate_metabolism(self): """This is the driver function for estimating metabolism. 
From 0023d7052cf7d0c378b4da1117e2eabf30b6352a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 12 Mar 2020 18:14:39 -0500 Subject: [PATCH 225/400] profile db + collection param check --- anvio/kegg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 7209097a94..35f024dcb0 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -620,6 +620,9 @@ def __init__(self, args, run=run, progress=progress): filesnpaths.is_file_exists(self.bin_ids_file) self.bin_ids_to_process = [line.strip() for line in open(self.bin_ids_file).readlines()] + if self.profile_db_path and not self.collection_name: + raise ConfigError("If you provide a profiles DB, you should also provide a collection name.") + # init the base class KeggContext.__init__(self, self.args) From dc70e924157c1ace255fddc5fa84eedfc53d40e0 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 12 Mar 2020 18:20:00 -0500 Subject: [PATCH 226/400] get metagenome mode param --- anvio/kegg.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 35f024dcb0..e0eee813a7 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -609,6 +609,7 @@ def __init__(self, args, run=run, progress=progress): self.collection_name = A('collection_name') self.bin_id = A('bin_id') self.bin_ids_file = A('bin_ids_file') + self.metagenome_mode = True if A('metagenome_mode') else False self.bin_ids_to_process = None if self.bin_id and self.bin_ids_file: @@ -704,6 +705,7 @@ def init_hits_and_splits(self): self.run.info("Contigs DB", self.contigs_db_path, quiet=self.quiet) self.run.info("KOfam hits", "%d found" % len(kofam_hits), quiet=self.quiet) self.run.info("Profile DB", self.profile_db_path, quiet=self.quiet) + self.run.info('Metagenome mode', self.metagenome_mode) def estimate_metabolism(self): """This is the driver function for estimating metabolism. From ce6f85411497807ea927f938e9270734d1d50e5c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 12 Mar 2020 18:29:41 -0500 Subject: [PATCH 227/400] skeleton control for estimating situation --- anvio/kegg.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index e0eee813a7..27581c38ea 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -715,6 +715,26 @@ def estimate_metabolism(self): self.init_hits_and_splits() + if self.profile_db_path and not self.metagenome_mode: + raise ConfigError("This class doesn't know how to deal with that yet :/") + # isolate genome, with profiling + #something like self.estimate_for_bins_in_collection() + elif not self.profile_db_path and not self.metagenome_mode: + raise ConfigError("This class doesn't know how to deal with that yet :/") + # isolate genome without profiling + #something like self.estimate_for_contigs_db_for_genome() + elif self.profile_db_path and self.metagenome_mode: + raise ConfigError("This class doesn't know how to deal with that yet :/") + # metagenome, with profiling + #self.estimate_for_contigs_db_for_metagenome() + elif not self.profile_db_path and self.metagenome_mode: + raise ConfigError("This class doesn't know how to deal with that yet :/") + # metagenome without profiling + #self.estimate_for_contigs_db_for_metagenome() + else: + raise ConfigError("This class doesn't know how to deal with that yet :/") + + class KeggModulesDatabase(KeggContext): """To create or access a Modules DB. 
From 89a90469bd2c6555f9bb98c6e69c417acf1ecda4 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 12 Mar 2020 20:06:24 -0500 Subject: [PATCH 228/400] skeleton funcs for single genome estimation. we also now return hits and splits to consider - but these might become self attributes at some point --- anvio/kegg.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 27581c38ea..b87658673d 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -644,7 +644,13 @@ def init_hits_and_splits(self): We will need the hits with their KO numbers (accessions) so that we can go through the MODULES.db and determine which steps are present in each module. And we will need the splits so that we can determine which hits belong - to which genomes/bins when we are handling multiple of these. + to which genomes/bins when we are handling multiple of these. This function gets these hits and splits (as lists + of tuples), and it makes sure that these lists don't include hits/splits we shouldn't be considering. + + RETURNS + ======= + kofam_hits list of (gene_call_id, ko_num) tuples + genes_in_splits list of (split, gene_call_id) tuples """ self.progress.new('Loading') @@ -707,22 +713,38 @@ def init_hits_and_splits(self): self.run.info("Profile DB", self.profile_db_path, quiet=self.quiet) self.run.info('Metagenome mode', self.metagenome_mode) + return kofam_hits, genes_in_splits + + def estimate_for_genome(self): + """This is the metabolism estimation function for a contigs DB that contains a single genome. + + It returns the initial metabolism completion dictionary for that genome, wrapped in the superdict format. + This dictionary will contain the KOs that are present in the genome for each KEGG module. The dict can + be processed later to estimate the completion of each module. + """ + pass + + # for each kofam hit, get the modules it belongs to + # for each module it belongs to, update the presence list + def estimate_metabolism(self): """This is the driver function for estimating metabolism. It will decide what to do based on whether the input contigs DB is a genome or metagenome. + It returns the metabolism superdict which contains a metabolism completion dictionary for each genome/bin in the contigs db. + The metabolism completion dictionary is keyed by KEGG module number. 
""" - self.init_hits_and_splits() + hits_to_consider, splits_to_consider = self.init_hits_and_splits() + + kegg_metabolism_superdict = {} if self.profile_db_path and not self.metagenome_mode: raise ConfigError("This class doesn't know how to deal with that yet :/") # isolate genome, with profiling #something like self.estimate_for_bins_in_collection() elif not self.profile_db_path and not self.metagenome_mode: - raise ConfigError("This class doesn't know how to deal with that yet :/") - # isolate genome without profiling - #something like self.estimate_for_contigs_db_for_genome() + kegg_metabolism_superdict = self.estimate_for_genome() elif self.profile_db_path and self.metagenome_mode: raise ConfigError("This class doesn't know how to deal with that yet :/") # metagenome, with profiling From f2e82e8da20c4e630933670ffff554f941e2485c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 13 Mar 2020 12:42:56 -0500 Subject: [PATCH 229/400] structure for single genome estimation in place --- anvio/kegg.py | 61 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index b87658673d..a34cedf2d2 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -610,6 +610,7 @@ def __init__(self, args, run=run, progress=progress): self.bin_id = A('bin_id') self.bin_ids_file = A('bin_ids_file') self.metagenome_mode = True if A('metagenome_mode') else False + self.contigs_db_project_name = "Unknown" self.bin_ids_to_process = None if self.bin_id and self.bin_ids_file: @@ -715,18 +716,64 @@ def init_hits_and_splits(self): return kofam_hits, genes_in_splits - def estimate_for_genome(self): - """This is the metabolism estimation function for a contigs DB that contains a single genome. + def mark_kos_present_for_list_of_splits(kofam_hits_in_splits, split_list=None, bin_name=None): + """This function generates a bin-level dictionary of dictionary, which associates modules with the list of KOs + that are present in the bin for each module. - It returns the initial metabolism completion dictionary for that genome, wrapped in the superdict format. - This dictionary will contain the KOs that are present in the genome for each KEGG module. The dict can - be processed later to estimate the completion of each module. + The structure of the dictionary is like this: + {mnum: {present_kos: [knum1, knum2, ....]}} + Why do we need an inner dictionary with just one list? Well. This dictionary will be expanded later by other functions, not to worry. + + PARAMETERS + ========== + kofam_hits_in_splits list of KO numbers that are hits in the current list of splits + split_list list of splits we are considering, this is only for debugging output + bin_name name of the bin containing these splits, this is only for debugging output + + RETURNS + ======= + bin_level_module_dict dict of dicts that maps module number to dictionary of KOs present in the splits for that module """ - pass + bin_level_module_dict = {} + + if anvio.DEBUG: + self.run.info("Marking KOs present for bin", bin_name) + self.run.info("With splits", ",".join(split_list)) + # initialize all modules with empty presence list # for each kofam hit, get the modules it belongs to # for each module it belongs to, update the presence list + return bin_level_module_dict + + def estimate_for_genome(self, kofam_hits, genes_in_splits): + """This is the metabolism estimation function for a contigs DB that contains a single genome. 
+ + It returns the initial metabolism completion dictionary for that genome, wrapped in the superdict format. + This dictionary at first contains the KOs that are present in the genome for each KEGG module. It is later + processed to estimate the completion of each module. + + PARAMETERS + ========== + kofam_hits list of (gene_call_id, ko_num) tuples, all belong to this single genome + genes_in_splits list of (split, gene_call_id) tuples, all belong to this single genome <- MAYBE UNNECESSARY + + RETURNS + ======= + genome_metabolism_dict dictionary mapping genome name to its metabolism completeness dictionary + """ + + genome_metabolism_dict = {} + # get list of KOs only - since all splits belong to one genome, we can take all the hits + ko_in_genome = [tpl[1] for tpl in kofam_hits] + splits_in_genome = [tpl[0] for tpl in genes_in_splits] + # get KO presence in modules + genome_metabolism_dict[self.contigs_db_project_name] = self.mark_kos_present_for_list_of_splits(ko_in_genome, split_list=splits_in_genome, bin_name=self.contigs_db_project_name) + # TODO estimate module completeness + + return genome_metabolism_dict + + def estimate_metabolism(self): """This is the driver function for estimating metabolism. @@ -744,7 +791,7 @@ def estimate_metabolism(self): # isolate genome, with profiling #something like self.estimate_for_bins_in_collection() elif not self.profile_db_path and not self.metagenome_mode: - kegg_metabolism_superdict = self.estimate_for_genome() + kegg_metabolism_superdict = self.estimate_for_genome(hits_to_consider, splits_to_consider) elif self.profile_db_path and self.metagenome_mode: raise ConfigError("This class doesn't know how to deal with that yet :/") # metagenome, with profiling From f3812174837bb69028ff95ef52b7124e4ac92fdd Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 13 Mar 2020 12:44:59 -0500 Subject: [PATCH 230/400] accessor functions for all modules --- anvio/kegg.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index a34cedf2d2..ae9fac081a 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1045,7 +1045,7 @@ def create(self): self.touch() - self.progress.new("Loading KEGG modules into Modules DB...") + self.progress.new("Loading %s KEGG modules into Modules DB..." % len(self.module_dict.keys())) # sanity check that we setup the modules previously. # It shouldn't be a problem since this function should only be called during the setup process after modules download, but just in case. 
@@ -1094,9 +1094,14 @@ def create(self): # append_and_store will collect db entries and store every 10000 at a time mod_table.append_and_store(self.db, mnum, name, val, definition, line) - + f.close() num_modules_parsed += 1 + # once we are done parsing all modules, we store whatever db entries remain in the db_entries list + # this is necessary because append_and_store() above only stores every 10000 entries + self.progress.update("Storing final batch of module entries into DB") + mod_table.store(self.db) + self.progress.end() # warn user about parsing errors @@ -1168,6 +1173,10 @@ def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): return data_values_to_ret + def get_all_modules_as_list(self): + """This function returns a list of all modules in the DB.""" + return self.db.get_single_column_from_table(self.module_table_name, 'module', unique=True) + def get_modules_for_knum(self, knum): """This function returns a list of modules that the given KO belongs to.""" where_clause_string = "data_value = '%s'" % (knum) From 9867ca7d454ddae663e07f9f324940cb3f4d3bc4 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 13 Mar 2020 12:57:06 -0500 Subject: [PATCH 231/400] MAJOR BUG FIX add leftover db entries to modules DB after processing all to make sure all modules get stored --- anvio/kegg.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index a34cedf2d2..9dde341ff1 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1045,7 +1045,7 @@ def create(self): self.touch() - self.progress.new("Loading KEGG modules into Modules DB...") + self.progress.new("Loading %s KEGG modules into Modules DB..." % len(self.module_dict.keys())) # sanity check that we setup the modules previously. # It shouldn't be a problem since this function should only be called during the setup process after modules download, but just in case. 
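The fix shown above is the usual buffered-write pattern: entries are appended to a buffer and flushed to the database every 10,000 appends, so whatever is still sitting in the buffer when the parsing loop ends has to be stored with one final call. A self-contained sketch of the pattern (an illustrative class, not anvi'o's table API):

    class BatchedTable:
        def __init__(self, batch_size=10000):
            self.buffer = []
            self.batch_size = batch_size
            self.stored = 0

        def append_and_store(self, entry):
            self.buffer.append(entry)
            if len(self.buffer) >= self.batch_size:
                self.store()

        def store(self):
            self.stored += len(self.buffer)    # stand-in for the actual DB write
            self.buffer = []

    table = BatchedTable(batch_size=3)
    for entry in range(8):
        table.append_and_store(entry)
    table.store()           # without this final flush, the last two entries are lost
    print(table.stored)     # -> 8
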
@@ -1094,9 +1094,14 @@ def create(self): # append_and_store will collect db entries and store every 10000 at a time mod_table.append_and_store(self.db, mnum, name, val, definition, line) - + f.close() num_modules_parsed += 1 + # once we are done parsing all modules, we store whatever db entries remain in the db_entries list + # this is necessary because append_and_store() above only stores every 10000 entries + self.progress.update("Storing final batch of module entries into DB") + mod_table.store(self.db) + self.progress.end() # warn user about parsing errors From 0f2b39180635b13974f1d1c84fa6a4b6a3a8558d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 13 Mar 2020 12:57:21 -0500 Subject: [PATCH 232/400] accessor for all modules in db --- anvio/kegg.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 9dde341ff1..ae9fac081a 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1173,6 +1173,10 @@ def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): return data_values_to_ret + def get_all_modules_as_list(self): + """This function returns a list of all modules in the DB.""" + return self.db.get_single_column_from_table(self.module_table_name, 'module', unique=True) + def get_modules_for_knum(self, knum): """This function returns a list of modules that the given KO belongs to.""" where_clause_string = "data_value = '%s'" % (knum) From 05659e44c13a59c5e7403e018f2665cc65bd8ab1 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 13 Mar 2020 12:52:06 -0500 Subject: [PATCH 233/400] get the modules list --- anvio/kegg.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index ae9fac081a..fb8882809b 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -740,7 +740,9 @@ def mark_kos_present_for_list_of_splits(kofam_hits_in_splits, split_list=None, b if anvio.DEBUG: self.run.info("Marking KOs present for bin", bin_name) self.run.info("With splits", ",".join(split_list)) + # initialize all modules with empty presence list + modules = self.kegg_modules_db.get_all_modules_as_list() # for each kofam hit, get the modules it belongs to # for each module it belongs to, update the presence list From 0693d2b3cbff472a85e11cb018a1720c6360818b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 13 Mar 2020 13:48:33 -0500 Subject: [PATCH 234/400] mark KOs present for list of splits --- anvio/kegg.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index fb8882809b..ba20f4dec1 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -716,7 +716,7 @@ def init_hits_and_splits(self): return kofam_hits, genes_in_splits - def mark_kos_present_for_list_of_splits(kofam_hits_in_splits, split_list=None, bin_name=None): + def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=None, bin_name=None): """This function generates a bin-level dictionary of dictionary, which associates modules with the list of KOs that are present in the bin for each module. 
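The two accessors added above boil down to simple queries over the modules table. A rough sqlite3 sketch; the table name "modules" is an assumption standing in for self.module_table_name, while the 'module' and 'data_value' columns follow the code above:

    import sqlite3

    conn = sqlite3.connect("MODULES.db")             # placeholder path

    # get_all_modules_as_list(): every distinct module number in the table
    all_modules = [row[0] for row in conn.execute(
        "SELECT DISTINCT module FROM modules")]

    # get_modules_for_knum('K00873'): modules whose entries mention that KO
    modules_for_ko = [row[0] for row in conn.execute(
        "SELECT DISTINCT module FROM modules WHERE data_value = ?", ("K00873",))]

    conn.close()
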
@@ -739,12 +739,26 @@ def mark_kos_present_for_list_of_splits(kofam_hits_in_splits, split_list=None, b if anvio.DEBUG: self.run.info("Marking KOs present for bin", bin_name) - self.run.info("With splits", ",".join(split_list)) + self.run.info("Number of splits", len(split_list)) # initialize all modules with empty presence list modules = self.kegg_modules_db.get_all_modules_as_list() - # for each kofam hit, get the modules it belongs to - # for each module it belongs to, update the presence list + for mnum in modules: + bin_level_module_dict[mnum] = {"present_kos" : []} + + kos_not_in_modules = [] + for ko in kofam_hits_in_splits: + present_in_mods = self.kegg_modules_db.get_modules_for_knum(ko) + if not present_in_mods: + kos_not_in_modules.append(ko) + for m in present_in_mods: + bin_level_module_dict[m]["present_kos"].append(ko) + + if anvio.DEBUG: + self.run.info("KOs processed", "%d in bin" % len(kofam_hits_in_splits)) + if kos_not_in_modules: + self.run.warning("Just so you know, the following KOfam hits did not belong to any KEGG modules in the MODULES.db: %s" + % ", ".join(kos_not_in_modules)) return bin_level_module_dict @@ -770,9 +784,10 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): ko_in_genome = [tpl[1] for tpl in kofam_hits] splits_in_genome = [tpl[0] for tpl in genes_in_splits] # get KO presence in modules - genome_metabolism_dict[self.contigs_db_project_name] = self.mark_kos_present_for_list_of_splits(ko_in_genome, split_list=splits_in_genome, bin_name=self.contigs_db_project_name) + genome_metabolism_dict[self.contigs_db_project_name] = self.mark_kos_present_for_list_of_splits(ko_in_genome, split_list=splits_in_genome, + bin_name=self.contigs_db_project_name) # TODO estimate module completeness - + return genome_metabolism_dict From 84901d34a25d2245f200542d9d3cbc2af693099e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 15 Mar 2020 14:47:15 -0500 Subject: [PATCH 235/400] add module completeness threshold param --- anvio/__init__.py | 9 +++++++++ anvio/kegg.py | 1 + bin/anvi-estimate-kegg-metabolism | 3 +++ 3 files changed, 13 insertions(+) diff --git a/anvio/__init__.py b/anvio/__init__.py index fad20c8fca..846fe3db82 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -2185,6 +2185,15 @@ def get_args(parser): 'help': "Provide if working with INSeq/Tn-Seq genomic data. With this, all gene level " "coverage stats will be calculated using INSeq/Tn-Seq statistical methods."} ), + 'module-completion-threshold': ( + ['--module-completion-threshold'], + {'default': 0.75, + 'metavar': 'NUM', + 'type': float, + 'help': "This threshold defines the point at which we consider a KEGG module to be 'complete' or " + "'present' in a given genome or bin. It is the fraction of steps that must be complete in " + " in order for the entire module to be marked complete. The default is %(default)g."} + ), } # two functions that works with the dictionary above. 
diff --git a/anvio/kegg.py b/anvio/kegg.py index ba20f4dec1..07710b20fd 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -610,6 +610,7 @@ def __init__(self, args, run=run, progress=progress): self.bin_id = A('bin_id') self.bin_ids_file = A('bin_ids_file') self.metagenome_mode = True if A('metagenome_mode') else False + self.completeness_threshold = A('module-completion-threshold') or 0.75 self.contigs_db_project_name = "Unknown" self.bin_ids_to_process = None diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index bcc59b781d..c9a2fe959b 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -47,6 +47,9 @@ if __name__ == '__main__': groupP.add_argument(*anvio.A('bin-id'), **anvio.K('bin-id')) groupP.add_argument(*anvio.A('bin-ids-file'), **anvio.K('bin-ids-file')) + groupC = parser.add_argument_group('OUTPUT', "Parameters for controlling estimation output.") + groupC.add_argument(*anvio.A('module-completion-threshold'), **anvio.K('module-completion-threshold')) + args = anvio.get_args(parser) try: From 246fe710c28cb7b2650a4c5062edc365f9624d14 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 15 Mar 2020 14:50:08 -0500 Subject: [PATCH 236/400] typo --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 07710b20fd..66d3b19e0c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1163,7 +1163,7 @@ def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): """This function returns data_value elements from the modules table for the specified module and data_name pair. All elements corresponding to the pair (ie, M00001 and ORTHOLOGY) will be returned. - The function relies on the db.get_some_rows_from_table_as_dict() functino to first fetch all rows corresponding \ + The function relies on the db.get_some_rows_from_table_as_dict() function to first fetch all rows corresponding \ to a particular model, and then parses the resulting dictionary to find all the elements with the given data_name field. PARAMETERS From 9fdaf2fd97e26b4d254d1a120e796499a330595a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sun, 15 Mar 2020 19:08:29 -0500 Subject: [PATCH 237/400] one MASSIVE commit containing the code for estimating module completeness. estimation now works for isolate genomes, for most modules. yay! --- anvio/kegg.py | 227 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 224 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 66d3b19e0c..6a4b543be5 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -763,6 +763,204 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N return bin_level_module_dict + def compute_module_completeness(self, mnum, present_list_for_mnum): + """This function calculates the completeness of the specified module. + + This requires some parsing of the module DEFINITION fields. 
In these fields, we have the following: + "Kxxxxx" (KO numbers) indicating which enzyme contributes to a step in the module + " " (spaces) separating module steps; indicating an AND operation + "," (commas) separating alternatives (which can be singular KOs or entire pathways); indicating an OR operation + "()" (parentheses) enclosing comma-separated alternatives + "+" (plus sign) indicating the following KO is a necessary component of a complex; indicating an AND operation + "-" (minus sign) indicating the following KO is non-essential in a complex; so in other words we don't care if it is there + + What we will do is build a condition statement out of each step which will evaulate to True if the step can be considered present based + on the available KOs in the current genome/bin. + For example, suppose we have a step like: (K13937,((K00036,K19243) (K01057,K07404))) + This will be parsed into the condition statement: (K13937 OR ((K00036 OR K19243) AND (K01057 OR K07404))) + where the KOs will be replaced by True if they are present and False otherwise. + + While we are parsing, we save the individual module steps in lists (one for all steps, one for complete steps) for easy access later. + Afterwards we compute the completeness of the module based on the specified completion threshold. + Then, we return a bunch of information about the completeness of the module, which can then be placed into the module completeness dictionary. + + PARAMETERS + ========== + mnum string, module number to work on + present_list_for_mnum list of strings, the KOs found to be present in this module for the current genome/bin + + RETURNS + ======= + module_step_list list of strings, each string is an individual step in the module (may have sub-steps if there are alternate pathways) + module_complete_steps list of strings, each string is a step in the module that is considered complete based on KO availability + module_nonessential_steps list of strings, each string is a step in the module that doesn't count for completeness estimates + module_total_steps int, the total number of steps in the module + module_num_complete_steps int, the number of complete steps in the module + module_num_nonessential_steps int, the total number of nonessential steps in the module + module_num_complete_nonessential_steps int, the number of nonessential steps in the module that were found to be complete + module_completeness float, a decimal indicating the fraction of complete steps in the module + over_complete_threshold boolean, whether or not the module is considered "complete" overall based on the threshold fraction of completeness + """ + + if not present_list_for_mnum: + # no KOs in this module are present + if anvio.DEBUG: + self.run.warning("No KOs present for module %s. Parsing for completeness is still being done to obtain module steps." 
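To make the strategy described above concrete, here is a deliberately simplified, standalone sketch. It handles only KOs, commas, spaces and parentheses within a single step (no '+', '-', '--' or nested modules) and is not the anvi'o implementation itself; the final lines show how step counts then feed the completeness threshold:

    def step_is_complete(step_definition, present_kos):
        """Turn one DEFINITION step into a boolean expression and evaluate it."""
        expression = ""
        i = 0
        while i < len(step_definition):
            ch = step_definition[i]
            if ch == "K":                   # a KO accession such as K00036
                ko = step_definition[i:i + 6]
                expression += "True" if ko in present_kos else "False"
                i += 6
            elif ch == ",":                 # comma separates alternatives: OR
                expression += " or "
                i += 1
            elif ch == " ":                 # space separates required parts: AND
                expression += " and "
                i += 1
            else:                           # parentheses pass through unchanged
                expression += ch
                i += 1
        return eval(expression)

    present = {"K00036", "K01057"}
    print(step_is_complete("(K13937,((K00036,K19243) (K01057,K07404)))", present))  # -> True

    # completeness of a whole module is then just the fraction of complete steps,
    # compared against --module-completion-threshold (default 0.75)
    steps = ["K13937,((K00036,K19243) (K01057,K07404))", "K00001"]
    num_complete = sum(step_is_complete(s, present) for s in steps)
    completeness = num_complete / len(steps)
    print(completeness, completeness >= 0.75)   # -> 0.5 False
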
% mnum) + + module_step_list = [] # while we are at it, we'll remember what the steps are + module_complete_steps = [] # and what the complete steps are + module_nonessential_steps = [] # steps that aren't necessary for module completeness + module_total_steps = 0 + module_num_complete_steps = 0 + module_num_nonessential_steps = 0 + module_num_complete_nonessential_steps = 0 + + def_lines = self.kegg_modules_db.get_data_value_entries_for_module_by_data_name(mnum, "DEFINITION") + for d in def_lines: + d = d.strip() + cur_index = 0 # current position in the DEFINITION line + parens_level = 0 # how deep we are in nested parentheses + step_is_present_condition_statement = "" + last_step_end_index = 0 + + while cur_index < len(d): + if d[cur_index] == "K": # we have found a KO + ko = d[cur_index:cur_index+6] + if ko in present_list_for_mnum: + step_is_present_condition_statement += "True" + else: + step_is_present_condition_statement += "False" + cur_index += 6 + + elif d[cur_index] == "(": + parens_level += 1 + step_is_present_condition_statement += "(" + cur_index += 1 + + elif d[cur_index] == ")": + parens_level -= 1 + step_is_present_condition_statement += ")" + cur_index += 1 + + elif d[cur_index] == ",": + step_is_present_condition_statement += " or " + cur_index += 1 + + elif d[cur_index] == "+": + step_is_present_condition_statement += " and " + cur_index += 1 + + elif d[cur_index] == "-": + # either a singular KO or a set of KOs in parentheses can follow this character + # since the following KO(s) are non-essential in the complex, we skip over them to ignore them + # unless this is its own step, in which case we consider the whole step non-essential + + # singular nonessential KO + if d[cur_index+1] == "K": + nonessential_ko = d[cur_index+1:cur_index+7] + cur_index += 7 + """ + OKAY, SO HERE WE HAVE SOME POOPINESS THAT MAY NEED TO BE FIXED EVENTUALLY. + Basically, some DEFINITION lines have KOs that seem to be marked non-essential; + ie, "-K11024" in "K11023 -K11024 K11025 K11026 K11027". + It was difficult to decide whether we should consider only K11024, or K11024 and all following KOs, to be non-essential. + For instance, the module M00778 is a complex case that gave us pause - see Fiesta issue 955. + But for now, we have decided to just track only the one KO as a 'non-essential step', and to not include such steps in + the module completeness estimate. + """ + # if this is the first KO in the step and we find a space after this KO, then we have found a non-essential step + if step_is_present_condition_statement == "" and (cur_index == len(d) or d[cur_index] == " "): + module_nonessential_steps.append(d[last_step_end_index:cur_index]) + module_num_nonessential_steps += 1 + if not self.quiet: + self.run.warning("Just a note here - anvi'o found the following non-essential step in module %s: %s. " + "At this time, we are not counting this step in our completion estimates. If you have a problem with that, " + "then...! Well. Let us know. " % (mnum, d[last_step_end_index:cur_index])) + if nonessential_ko in present_list_for_mnum: + module_num_complete_nonessential_steps += 1 + # reset for next step + last_step_end_index = cur_index + 1 + cur_index += 1 + + # a whole set of nonessential KOs + elif d[cur_index+1] == "(": + while d[cur_index] != ")": + cur_index += 1 + cur_index += 1 # skip over the ')' + + # the '--' (no KO) situation + elif d[cur_index+1] == "-": + # when '--' in a DEFINITION line happens, it signifies a reaction step that has no associated KO. 
+ # we assume that such steps are not complete, because we really can't know if it is from the KOfam hits alone + step_is_present_condition_statement += "False" + cur_index += 2 # skip over both '-', the next character should be a space or end of DEFINITION line + if not self.quiet: + self.run.warning("Just so you know, while estimating the completeness of KEGG module %s, anvi'o saw " + "'--' in the module DEFINITION. This indicates a step in the pathway that has no " + "associated KO. So we really cannot know just based on KOfam hits whether or not this " + "step is present. By default, anvi'o is marking this step incomplete. But it may not be, " + "and as a result this module may be falsely considered incomplete. So it may be in your " + "interest to go back and take a look at this individual module to see if you can find the " + "missing enzyme in some other way. Best of luck to you." % (mnum)) + if cur_index < len(d) and d[cur_index] != " ": + raise ConfigError("Serious, serious parsing sadness is happening. We just processed a '--' in " + "a DEFINITION line for module %s, but did not see a space afterwards. Instead, we found %s. " + "WHAT DO WE DO NOW?" % (mnum, d[cur_index+1])) + # anything else that follows a '-' + else: + raise ConfigError("The following character follows a '-' in the DEFINITION line for module %s " + "and we just don't know what to do: %s" % (mnum, d[cur_index+1])) + + elif d[cur_index] == " ": + # if we are outside of parentheses, we are done processing the current step + if parens_level == 0: + module_step_list.append(d[last_step_end_index:cur_index]) + module_total_steps += 1 + step_is_present = eval(step_is_present_condition_statement) + if step_is_present: + module_complete_steps.append(d[last_step_end_index:cur_index]) + module_num_complete_steps += 1 + # reset for next step + step_is_present_condition_statement = "" + last_step_end_index = cur_index + 1 + cur_index += 1 + # otherwise, we are processing an alternative path so AND is required + else: + step_is_present_condition_statement += " and " + cur_index += 1 + + elif d[cur_index] == "M": + print("OH NO. We found a module (%s) defined by other modules. We don't know what to do about this, so we are just " + "giving up for now." % mnum) + # FIXME + # this happens when a module is defined by other modules + # for example, photosynthesis module M00611 is defined as (M00161,M00163) M00165 === (photosystem II or photosystem I) and calvin cycle + # I don't know what to do about this yet so we are just going to return empty things for now + # THIS WILL CAUSE ISSUES DOWN THE ROAD SO WATCH OUT! + return [], [], [], None, None, None, None, None, None + + else: + raise ConfigError("While parsing the DEFINITION field for module %s, (which is %s), anvi'o found the following character " + "that she didn't understand: %s. Unfortunately, this means we cannot determine the module " + "completeness. For context, here is the current index in the DEFINITION line: %s and the " + "surrounding characters: %s" % (mnum, d, d[cur_index], cur_index, d[cur_index-5:cur_index+6])) + + # once we have processed the whole line, we still need to eval the last step. 
Unless we already did (this can happen with non-essential steps) + if step_is_present_condition_statement != "": + module_step_list.append(d[last_step_end_index:cur_index]) + module_total_steps += 1 + step_is_present = eval(step_is_present_condition_statement) + if step_is_present: + module_complete_steps.append(d[last_step_end_index:cur_index]) + module_num_complete_steps += 1 + + # once we have processed all DEFINITION lines, we can compute the overall completeness + module_completeness = module_num_complete_steps / module_total_steps + over_complete_threshold = True if module_completeness > self.completeness_threshold else False + return module_step_list, module_complete_steps, module_nonessential_steps, module_total_steps, module_num_complete_steps, \ + module_num_nonessential_steps, module_num_complete_nonessential_steps, module_completeness, over_complete_threshold + + def estimate_for_genome(self, kofam_hits, genes_in_splits): """This is the metabolism estimation function for a contigs DB that contains a single genome. @@ -786,9 +984,32 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): splits_in_genome = [tpl[0] for tpl in genes_in_splits] # get KO presence in modules genome_metabolism_dict[self.contigs_db_project_name] = self.mark_kos_present_for_list_of_splits(ko_in_genome, split_list=splits_in_genome, - bin_name=self.contigs_db_project_name) - # TODO estimate module completeness - + bin_name=self.contigs_db_project_name) + num_complete_modules = 0 + # estimate completeness of each module + for mod in genome_metabolism_dict[self.contigs_db_project_name].keys(): + mod_steps, mod_complete_steps, mod_nonessential_steps, mod_num_steps, mod_num_complete_steps, mod_num_nonessential_steps, \ + mod_num_complete_nonessential_steps, mod_complete_fraction, mod_is_complete \ + = self.compute_module_completeness(mod, genome_metabolism_dict[self.contigs_db_project_name][mod]["present_kos"]) + # assign completeness info back to module dict + genome_metabolism_dict[self.contigs_db_project_name][mod]["step_list"] = mod_steps + genome_metabolism_dict[self.contigs_db_project_name][mod]["complete_step_list"] = mod_complete_steps + genome_metabolism_dict[self.contigs_db_project_name][mod]["nonessential_step_list"] = mod_nonessential_steps + genome_metabolism_dict[self.contigs_db_project_name][mod]["num_steps"] = mod_num_steps + genome_metabolism_dict[self.contigs_db_project_name][mod]["num_complete_steps"] = mod_num_complete_steps + genome_metabolism_dict[self.contigs_db_project_name][mod]["num_nonessential_steps"] = mod_num_nonessential_steps + genome_metabolism_dict[self.contigs_db_project_name][mod]["num_complete_nonessential_steps"] = mod_num_complete_nonessential_steps + genome_metabolism_dict[self.contigs_db_project_name][mod]["fraction_complete"] = mod_complete_fraction + genome_metabolism_dict[self.contigs_db_project_name][mod]["complete"] = mod_is_complete + + if mod_is_complete: + num_complete_modules += 1 + + genome_metabolism_dict[self.contigs_db_project_name]["num_complete_modules"] = num_complete_modules + + self.run.info("Module completion threshold", self.completeness_threshold) + self.run.info("Number of complete modules", num_complete_modules) + return genome_metabolism_dict From 8e38de6e3cf67d1dec463fab8d3a41a97fac8ab8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 10:35:34 -0500 Subject: [PATCH 238/400] add output file param --- anvio/kegg.py | 1 + bin/anvi-estimate-kegg-metabolism | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git 
a/anvio/kegg.py b/anvio/kegg.py index 6a4b543be5..4c0041e053 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -611,6 +611,7 @@ def __init__(self, args, run=run, progress=progress): self.bin_ids_file = A('bin_ids_file') self.metagenome_mode = True if A('metagenome_mode') else False self.completeness_threshold = A('module-completion-threshold') or 0.75 + self.output_file_path = A('output_file') or "kegg-metabolism.txt" self.contigs_db_project_name = "Unknown" self.bin_ids_to_process = None diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index c9a2fe959b..02d1b06aa7 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -47,8 +47,10 @@ if __name__ == '__main__': groupP.add_argument(*anvio.A('bin-id'), **anvio.K('bin-id')) groupP.add_argument(*anvio.A('bin-ids-file'), **anvio.K('bin-ids-file')) - groupC = parser.add_argument_group('OUTPUT', "Parameters for controlling estimation output.") + groupC = parser.add_argument_group('OUTPUT', "Parameters for controlling estimation output. The output will be a TAB-delimited file which by \ + default is called kegg-metabolism.txt, but you can of course change that name here.") groupC.add_argument(*anvio.A('module-completion-threshold'), **anvio.K('module-completion-threshold')) + groupC.add_argument(*anvio.A('output-file'), **anvio.K('output-file')) args = anvio.get_args(parser) From 6c27cfe83baba237c49e01536ef35524bb7ad905 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 11:42:40 -0500 Subject: [PATCH 239/400] change fraction to percent completeness --- anvio/kegg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 4c0041e053..37efe36e9f 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -956,7 +956,7 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): module_num_complete_steps += 1 # once we have processed all DEFINITION lines, we can compute the overall completeness - module_completeness = module_num_complete_steps / module_total_steps + module_completeness = module_num_complete_steps / module_total_steps * 100.0 over_complete_threshold = True if module_completeness > self.completeness_threshold else False return module_step_list, module_complete_steps, module_nonessential_steps, module_total_steps, module_num_complete_steps, \ module_num_nonessential_steps, module_num_complete_nonessential_steps, module_completeness, over_complete_threshold @@ -990,7 +990,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): # estimate completeness of each module for mod in genome_metabolism_dict[self.contigs_db_project_name].keys(): mod_steps, mod_complete_steps, mod_nonessential_steps, mod_num_steps, mod_num_complete_steps, mod_num_nonessential_steps, \ - mod_num_complete_nonessential_steps, mod_complete_fraction, mod_is_complete \ + mod_num_complete_nonessential_steps, mod_percent_complete, mod_is_complete \ = self.compute_module_completeness(mod, genome_metabolism_dict[self.contigs_db_project_name][mod]["present_kos"]) # assign completeness info back to module dict genome_metabolism_dict[self.contigs_db_project_name][mod]["step_list"] = mod_steps @@ -1000,7 +1000,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): genome_metabolism_dict[self.contigs_db_project_name][mod]["num_complete_steps"] = mod_num_complete_steps genome_metabolism_dict[self.contigs_db_project_name][mod]["num_nonessential_steps"] = mod_num_nonessential_steps 
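# An illustrative aside, not part of this patch: the percent form introduced just above is easy to
# trip over, because self.completeness_threshold defaults to 0.75, i.e. a fraction rather than a
# percentage. Comparing a 0-100 value against it lets almost any module with a single complete
# step count as "complete":
#
#     >>> 3 / 10 * 100.0 > 0.75      # percent compared against a fractional threshold
#     True
#     >>> 3 / 10 > 0.75              # fraction against fraction, the comparison PATCH 246 below returns to
#     False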
genome_metabolism_dict[self.contigs_db_project_name][mod]["num_complete_nonessential_steps"] = mod_num_complete_nonessential_steps - genome_metabolism_dict[self.contigs_db_project_name][mod]["fraction_complete"] = mod_complete_fraction + genome_metabolism_dict[self.contigs_db_project_name][mod]["percent_complete"] = mod_percent_complete genome_metabolism_dict[self.contigs_db_project_name][mod]["complete"] = mod_is_complete if mod_is_complete: From 4b2848b27e56f362ab4b6e9f4d2ea79085f986cb Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 16:46:04 -0500 Subject: [PATCH 240/400] keep track of complete nonessential steps in module --- anvio/kegg.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 37efe36e9f..ec0cffcd55 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -808,9 +808,10 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): if anvio.DEBUG: self.run.warning("No KOs present for module %s. Parsing for completeness is still being done to obtain module steps." % mnum) - module_step_list = [] # while we are at it, we'll remember what the steps are + module_step_list = [] # while we are at it, we'll remember what the (essential) steps are module_complete_steps = [] # and what the complete steps are module_nonessential_steps = [] # steps that aren't necessary for module completeness + module_complete_nonessential_steps = [] # and those nonessential steps which we find are complete module_total_steps = 0 module_num_complete_steps = 0 module_num_nonessential_steps = 0 @@ -878,7 +879,9 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): "At this time, we are not counting this step in our completion estimates. If you have a problem with that, " "then...! Well. Let us know. 
" % (mnum, d[last_step_end_index:cur_index])) if nonessential_ko in present_list_for_mnum: + module_complete_nonessential_steps.append(d[last_step_end_index:cur_index]) module_num_complete_nonessential_steps += 1 + # reset for next step last_step_end_index = cur_index + 1 cur_index += 1 @@ -958,7 +961,7 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): # once we have processed all DEFINITION lines, we can compute the overall completeness module_completeness = module_num_complete_steps / module_total_steps * 100.0 over_complete_threshold = True if module_completeness > self.completeness_threshold else False - return module_step_list, module_complete_steps, module_nonessential_steps, module_total_steps, module_num_complete_steps, \ + return module_step_list, module_complete_steps, module_nonessential_steps, module_complete_nonessential_steps, module_total_steps, module_num_complete_steps, \ module_num_nonessential_steps, module_num_complete_nonessential_steps, module_completeness, over_complete_threshold @@ -989,13 +992,14 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): num_complete_modules = 0 # estimate completeness of each module for mod in genome_metabolism_dict[self.contigs_db_project_name].keys(): - mod_steps, mod_complete_steps, mod_nonessential_steps, mod_num_steps, mod_num_complete_steps, mod_num_nonessential_steps, \ + mod_steps, mod_complete_steps, mod_nonessential_steps, mod_complete_nonessential_steps, mod_num_steps, mod_num_complete_steps, mod_num_nonessential_steps, \ mod_num_complete_nonessential_steps, mod_percent_complete, mod_is_complete \ = self.compute_module_completeness(mod, genome_metabolism_dict[self.contigs_db_project_name][mod]["present_kos"]) # assign completeness info back to module dict genome_metabolism_dict[self.contigs_db_project_name][mod]["step_list"] = mod_steps genome_metabolism_dict[self.contigs_db_project_name][mod]["complete_step_list"] = mod_complete_steps genome_metabolism_dict[self.contigs_db_project_name][mod]["nonessential_step_list"] = mod_nonessential_steps + genome_metabolism_dict[self.contigs_db_project_name][mod]["complete_nonessential_step_list"]= mod_complete_nonessential_steps genome_metabolism_dict[self.contigs_db_project_name][mod]["num_steps"] = mod_num_steps genome_metabolism_dict[self.contigs_db_project_name][mod]["num_complete_steps"] = mod_num_complete_steps genome_metabolism_dict[self.contigs_db_project_name][mod]["num_nonessential_steps"] = mod_num_nonessential_steps From 651ca1a69452a9982bc3a351cfb8cd33b9b6a221 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 18:14:19 -0500 Subject: [PATCH 241/400] fix bug in temporary return statment and prettify other returns --- anvio/kegg.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index ec0cffcd55..9daf305157 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -941,7 +941,7 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): # for example, photosynthesis module M00611 is defined as (M00161,M00163) M00165 === (photosystem II or photosystem I) and calvin cycle # I don't know what to do about this yet so we are just going to return empty things for now # THIS WILL CAUSE ISSUES DOWN THE ROAD SO WATCH OUT! 
- return [], [], [], None, None, None, None, None, None + return [], [], [], [], None, None, None, None, None, None else: raise ConfigError("While parsing the DEFINITION field for module %s, (which is %s), anvi'o found the following character " @@ -961,8 +961,10 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): # once we have processed all DEFINITION lines, we can compute the overall completeness module_completeness = module_num_complete_steps / module_total_steps * 100.0 over_complete_threshold = True if module_completeness > self.completeness_threshold else False - return module_step_list, module_complete_steps, module_nonessential_steps, module_complete_nonessential_steps, module_total_steps, module_num_complete_steps, \ - module_num_nonessential_steps, module_num_complete_nonessential_steps, module_completeness, over_complete_threshold + + return module_step_list, module_complete_steps, module_nonessential_steps, module_complete_nonessential_steps, \ + module_total_steps, module_num_complete_steps, module_num_nonessential_steps, module_num_complete_nonessential_steps, \ + module_completeness, over_complete_threshold def estimate_for_genome(self, kofam_hits, genes_in_splits): @@ -992,8 +994,8 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): num_complete_modules = 0 # estimate completeness of each module for mod in genome_metabolism_dict[self.contigs_db_project_name].keys(): - mod_steps, mod_complete_steps, mod_nonessential_steps, mod_complete_nonessential_steps, mod_num_steps, mod_num_complete_steps, mod_num_nonessential_steps, \ - mod_num_complete_nonessential_steps, mod_percent_complete, mod_is_complete \ + mod_steps, mod_complete_steps, mod_nonessential_steps, mod_complete_nonessential_steps, mod_num_steps, mod_num_complete_steps, \ + mod_num_nonessential_steps, mod_num_complete_nonessential_steps, mod_percent_complete, mod_is_complete \ = self.compute_module_completeness(mod, genome_metabolism_dict[self.contigs_db_project_name][mod]["present_kos"]) # assign completeness info back to module dict genome_metabolism_dict[self.contigs_db_project_name][mod]["step_list"] = mod_steps From e0b5826fae8b067d9ea657ef92982676c7951935 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 18:14:44 -0500 Subject: [PATCH 242/400] store dict as tab delimited output --- anvio/kegg.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 9daf305157..5877cf55dd 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1049,6 +1049,31 @@ def estimate_metabolism(self): else: raise ConfigError("This class doesn't know how to deal with that yet :/") + self.store_kegg_metabolism_superdict(kegg_metabolism_superdict) + + + def store_kegg_metabolism_superdict(self, kegg_superdict): + """This function writes the metabolism superdict to a tab-delimited file. + + The metabolism superdict is a three-level dictionary (genomes/bins, modules, and module completion information). + To distill this information into one line, we need to convert the dictionary on-the-fly to a dict of dicts, + where each genome/bin-module pair is keyed by an arbitrary integer. 
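For example (with a made-up bin name and module entry purely for illustration), an input like
{"my_bin": {"M00001": {...module completion info...}}} is flattened into
{0: {"bin_name": "my_bin", "kegg_module": "M00001", ...module completion info...}},
which is then handed to utils.store_dict_as_TAB_delimited_file().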
+ """ + + d = {} + i = 0 + for bin, mod_dict in kegg_superdict.items(): + for mnum, c_dict in mod_dict.items(): + if mnum == "num_complete_modules": + continue + d[i] = c_dict + d[i]["bin_name"] = bin + d[i]["kegg_module"] = mnum + i += 1 + + + utils.store_dict_as_TAB_delimited_file(d, self.output_file_path, key_header="unique_id") + self.run.info("Output file", self.output_file_path, nl_before=1) class KeggModulesDatabase(KeggContext): From a0a56d711a43e113e52365590f03953d2d5c35c7 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 18:48:28 -0500 Subject: [PATCH 243/400] condense module warnings --- anvio/kegg.py | 53 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 5877cf55dd..d11b3fc573 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -808,6 +808,7 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): if anvio.DEBUG: self.run.warning("No KOs present for module %s. Parsing for completeness is still being done to obtain module steps." % mnum) + # module information to return module_step_list = [] # while we are at it, we'll remember what the (essential) steps are module_complete_steps = [] # and what the complete steps are module_nonessential_steps = [] # steps that aren't necessary for module completeness @@ -816,6 +817,8 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): module_num_complete_steps = 0 module_num_nonessential_steps = 0 module_num_complete_nonessential_steps = 0 + has_nonessential_step = False + has_no_ko_step = False def_lines = self.kegg_modules_db.get_data_value_entries_for_module_by_data_name(mnum, "DEFINITION") for d in def_lines: @@ -872,12 +875,10 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): """ # if this is the first KO in the step and we find a space after this KO, then we have found a non-essential step if step_is_present_condition_statement == "" and (cur_index == len(d) or d[cur_index] == " "): + has_nonessential_step = True module_nonessential_steps.append(d[last_step_end_index:cur_index]) module_num_nonessential_steps += 1 - if not self.quiet: - self.run.warning("Just a note here - anvi'o found the following non-essential step in module %s: %s. " - "At this time, we are not counting this step in our completion estimates. If you have a problem with that, " - "then...! Well. Let us know. " % (mnum, d[last_step_end_index:cur_index])) + if nonessential_ko in present_list_for_mnum: module_complete_nonessential_steps.append(d[last_step_end_index:cur_index]) module_num_complete_nonessential_steps += 1 @@ -896,16 +897,10 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): elif d[cur_index+1] == "-": # when '--' in a DEFINITION line happens, it signifies a reaction step that has no associated KO. # we assume that such steps are not complete, because we really can't know if it is from the KOfam hits alone + has_no_ko_step = True step_is_present_condition_statement += "False" cur_index += 2 # skip over both '-', the next character should be a space or end of DEFINITION line - if not self.quiet: - self.run.warning("Just so you know, while estimating the completeness of KEGG module %s, anvi'o saw " - "'--' in the module DEFINITION. This indicates a step in the pathway that has no " - "associated KO. So we really cannot know just based on KOfam hits whether or not this " - "step is present. By default, anvi'o is marking this step incomplete. 
But it may not be, " - "and as a result this module may be falsely considered incomplete. So it may be in your " - "interest to go back and take a look at this individual module to see if you can find the " - "missing enzyme in some other way. Best of luck to you." % (mnum)) + if cur_index < len(d) and d[cur_index] != " ": raise ConfigError("Serious, serious parsing sadness is happening. We just processed a '--' in " "a DEFINITION line for module %s, but did not see a space afterwards. Instead, we found %s. " @@ -941,7 +936,7 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): # for example, photosynthesis module M00611 is defined as (M00161,M00163) M00165 === (photosystem II or photosystem I) and calvin cycle # I don't know what to do about this yet so we are just going to return empty things for now # THIS WILL CAUSE ISSUES DOWN THE ROAD SO WATCH OUT! - return [], [], [], [], None, None, None, None, None, None + return [], [], [], [], None, None, None, None, None, None, None, None else: raise ConfigError("While parsing the DEFINITION field for module %s, (which is %s), anvi'o found the following character " @@ -962,9 +957,10 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): module_completeness = module_num_complete_steps / module_total_steps * 100.0 over_complete_threshold = True if module_completeness > self.completeness_threshold else False + return module_step_list, module_complete_steps, module_nonessential_steps, module_complete_nonessential_steps, \ module_total_steps, module_num_complete_steps, module_num_nonessential_steps, module_num_complete_nonessential_steps, \ - module_completeness, over_complete_threshold + module_completeness, over_complete_threshold, has_nonessential_step, has_no_ko_step def estimate_for_genome(self, kofam_hits, genes_in_splits): @@ -992,10 +988,15 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): genome_metabolism_dict[self.contigs_db_project_name] = self.mark_kos_present_for_list_of_splits(ko_in_genome, split_list=splits_in_genome, bin_name=self.contigs_db_project_name) num_complete_modules = 0 + # modules to warn about + mods_with_unassociated_ko = [] # a list of modules that have "--" steps without an associated KO + mods_with_nonessential_steps = [] # a list of modules that have nonessential steps like "-K11024" + # estimate completeness of each module for mod in genome_metabolism_dict[self.contigs_db_project_name].keys(): mod_steps, mod_complete_steps, mod_nonessential_steps, mod_complete_nonessential_steps, mod_num_steps, mod_num_complete_steps, \ - mod_num_nonessential_steps, mod_num_complete_nonessential_steps, mod_percent_complete, mod_is_complete \ + mod_num_nonessential_steps, mod_num_complete_nonessential_steps, mod_percent_complete, \ + mod_is_complete, has_nonessential_step, has_no_ko_step \ = self.compute_module_completeness(mod, genome_metabolism_dict[self.contigs_db_project_name][mod]["present_kos"]) # assign completeness info back to module dict genome_metabolism_dict[self.contigs_db_project_name][mod]["step_list"] = mod_steps @@ -1011,9 +1012,31 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): if mod_is_complete: num_complete_modules += 1 + if has_nonessential_step: + mods_with_nonessential_steps.append(mod) + if has_no_ko_step: + mods_with_unassociated_ko.append(mod) genome_metabolism_dict[self.contigs_db_project_name]["num_complete_modules"] = num_complete_modules + # notify user of the modules that gave some fishy results + if not self.quiet: + if 
mods_with_nonessential_steps: + self.run.warning("Please note that anvi'o found one or more non-essential steps in the following KEGG modules: %s. " + "At this time, we are not counting these steps in our percent completion estimates. But we still kept track of which " + "of these non-essential steps were found to be complete. You can see this information in the output file." + % (", ".join(mods_with_nonessential_steps))) + + if mods_with_unassociated_ko: + self.run.warning("Just so you know, while estimating the completeness of some KEGG modules, anvi'o saw " + "'--' in the module DEFINITION. This indicates a step in the pathway that has no " + "associated KO. So we really cannot know just based on KOfam hits whether or not this " + "step is present. By default, anvi'o marks these steps incomplete. But they may not be, " + "and as a result their modules may be falsely considered incomplete. So it may be in your " + "interest to go back and take a look at these individual modules to see if you can find the " + "missing enzyme in some other way. Best of luck to you. Here is the list of modules to check out: %s" + % (", ".join(mods_with_unassociated_ko))) + self.run.info("Module completion threshold", self.completeness_threshold) self.run.info("Number of complete modules", num_complete_modules) From a3ee25b9f76ebcd0135326054bd9950e93ba899c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 22:46:45 -0500 Subject: [PATCH 244/400] update function deflines --- anvio/kegg.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index d11b3fc573..2695f915b1 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -769,6 +769,7 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): This requires some parsing of the module DEFINITION fields. In these fields, we have the following: "Kxxxxx" (KO numbers) indicating which enzyme contributes to a step in the module + "Mxxxxx" (module numbers) indicating that the module encompasses another module. This is rare. See note below. " " (spaces) separating module steps; indicating an AND operation "," (commas) separating alternatives (which can be singular KOs or entire pathways); indicating an OR operation "()" (parentheses) enclosing comma-separated alternatives @@ -781,10 +782,23 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): This will be parsed into the condition statement: (K13937 OR ((K00036 OR K19243) AND (K01057 OR K07404))) where the KOs will be replaced by True if they are present and False otherwise. - While we are parsing, we save the individual module steps in lists (one for all steps, one for complete steps) for easy access later. + While we are parsing, we save the individual module steps in lists (ie, one for all steps, one for complete steps) for easy access later. Afterwards we compute the completeness of the module based on the specified completion threshold. Then, we return a bunch of information about the completeness of the module, which can then be placed into the module completeness dictionary. + There are 3 special cases to consider here. + 1) Non-essential steps. These are steps that are marked with a preceding "-" to indicate that they are not required for the module to + be considered complete. They often occur in pathways with multiple forks. What we do with these is save and count them separately as + non-essential steps, but we do not use them in our module completeness calculations. 
Another thing we do is continue parsing the rest + of the module steps as normal, even though some of them may affect steps after the non-essential one. That may eventually change. + See comments in the code below. + 2) Steps without associated KOs. These are steps marked as "--". They may require an enzyme, but if so that enzyme is not in the KOfam + database, so we can't know whether they are complete or not from our KOfam hits. Therefore, we assume these steps are incomplete, and + warn the user to go back and check the module manually. + 3) Steps defined by entire modules. These steps have module numbers instead of KOs, so they require an entire module to be complete in + order to be complete. We can't figure this out until after we've evaluated all modules, so we simply parse these steps without marking + them complete, and later will go back to adjust the completeness score once all modules have been marked complete or not. + PARAMETERS ========== mnum string, module number to work on @@ -795,12 +809,16 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): module_step_list list of strings, each string is an individual step in the module (may have sub-steps if there are alternate pathways) module_complete_steps list of strings, each string is a step in the module that is considered complete based on KO availability module_nonessential_steps list of strings, each string is a step in the module that doesn't count for completeness estimates - module_total_steps int, the total number of steps in the module - module_num_complete_steps int, the number of complete steps in the module - module_num_nonessential_steps int, the total number of nonessential steps in the module - module_num_complete_nonessential_steps int, the number of nonessential steps in the module that were found to be complete + module_complete_nonessential_steps list of strings, each string is a non-essential step that is considered complete based on KO availability + module_total_steps int, the total number of steps in the module + module_num_complete_steps int, the number of complete steps in the module + module_num_nonessential_steps int, the total number of nonessential steps in the module + module_num_complete_nonessential_steps int, the number of nonessential steps in the module that were found to be complete module_completeness float, a decimal indicating the fraction of complete steps in the module over_complete_threshold boolean, whether or not the module is considered "complete" overall based on the threshold fraction of completeness + has_nonessential_step boolean, whether or not the module contains non-essential steps. Used for warning the user about these. + has_no_ko_step boolean, whether or not the module contains steps without associated KOs. Used for warning the user about these. + defined_by_modules boolean, whether or not the module contains steps defined by other modules. Used for going back to adjust completeness later. 
""" if not present_list_for_mnum: From d563afcc383f681706ac38ed0029a5e808a12621 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 22:47:44 -0500 Subject: [PATCH 245/400] controls for modules defined by other modules --- anvio/kegg.py | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 2695f915b1..86e640bd42 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -837,6 +837,7 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): module_num_complete_nonessential_steps = 0 has_nonessential_step = False has_no_ko_step = False + defined_by_modules = False def_lines = self.kegg_modules_db.get_data_value_entries_for_module_by_data_name(mnum, "DEFINITION") for d in def_lines: @@ -849,6 +850,7 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): while cur_index < len(d): if d[cur_index] == "K": # we have found a KO ko = d[cur_index:cur_index+6] + defined_by_modules = False # reset this flag just in case KO-defined step comes after a module-defined step if ko in present_list_for_mnum: step_is_present_condition_statement += "True" else: @@ -933,10 +935,12 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): if parens_level == 0: module_step_list.append(d[last_step_end_index:cur_index]) module_total_steps += 1 - step_is_present = eval(step_is_present_condition_statement) - if step_is_present: - module_complete_steps.append(d[last_step_end_index:cur_index]) - module_num_complete_steps += 1 + # we do not evaluate completeness of this step yet if it is defined by other modules + if not defined_by_modules: + step_is_present = eval(step_is_present_condition_statement) + if step_is_present: + module_complete_steps.append(d[last_step_end_index:cur_index]) + module_num_complete_steps += 1 # reset for next step step_is_present_condition_statement = "" last_step_end_index = cur_index + 1 @@ -947,14 +951,17 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): cur_index += 1 elif d[cur_index] == "M": - print("OH NO. We found a module (%s) defined by other modules. We don't know what to do about this, so we are just " - "giving up for now." % mnum) - # FIXME - # this happens when a module is defined by other modules - # for example, photosynthesis module M00611 is defined as (M00161,M00163) M00165 === (photosystem II or photosystem I) and calvin cycle - # I don't know what to do about this yet so we are just going to return empty things for now - # THIS WILL CAUSE ISSUES DOWN THE ROAD SO WATCH OUT! - return [], [], [], [], None, None, None, None, None, None, None, None + """ + This happens when a module is defined by other modules. For example, photosynthesis module M00611 is defined as + (M00161,M00163) M00165 === (photosystem II or photosystem I) and calvin cycle + + We need all the modules to have been evaluated before we can determine completeness of steps with module numbers. + So what we will do here is just add the step to the appropriate lists without evaluating completeness, and use a + flag variable to keep track of the modules that have this sort of definition in a list so we can go back and + evaluate completeness of steps with module numbers later. 
+ """ + defined_by_modules = True + cur_index += 6 else: raise ConfigError("While parsing the DEFINITION field for module %s, (which is %s), anvi'o found the following character " @@ -962,14 +969,17 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): "completeness. For context, here is the current index in the DEFINITION line: %s and the " "surrounding characters: %s" % (mnum, d, d[cur_index], cur_index, d[cur_index-5:cur_index+6])) - # once we have processed the whole line, we still need to eval the last step. Unless we already did (this can happen with non-essential steps) - if step_is_present_condition_statement != "": + # once we have processed the whole line, we still need to eval the last step. + # Unless we already did (this can happen with non-essential steps), which we check by seeing if the condition statement is empty + # However, if this step is defined by modules, the condition statement will be empty, but we still need to save the step + if step_is_present_condition_statement != "" or defined_by_modules: module_step_list.append(d[last_step_end_index:cur_index]) module_total_steps += 1 - step_is_present = eval(step_is_present_condition_statement) - if step_is_present: - module_complete_steps.append(d[last_step_end_index:cur_index]) - module_num_complete_steps += 1 + if not defined_by_modules: + step_is_present = eval(step_is_present_condition_statement) + if step_is_present: + module_complete_steps.append(d[last_step_end_index:cur_index]) + module_num_complete_steps += 1 # once we have processed all DEFINITION lines, we can compute the overall completeness module_completeness = module_num_complete_steps / module_total_steps * 100.0 From 5d7337c751bfcfebb9e6ba985261503af75efb0b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 22:49:02 -0500 Subject: [PATCH 246/400] use decimal for percent completion! 
making it into a percent got confusing and we over-estimated module completeness previously because the threshold was defined as a decimal --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 86e640bd42..0ccb9669e0 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -982,8 +982,8 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): module_num_complete_steps += 1 # once we have processed all DEFINITION lines, we can compute the overall completeness - module_completeness = module_num_complete_steps / module_total_steps * 100.0 - over_complete_threshold = True if module_completeness > self.completeness_threshold else False + module_completeness = module_num_complete_steps / module_total_steps + over_complete_threshold = True if module_completeness >= self.completeness_threshold else False return module_step_list, module_complete_steps, module_nonessential_steps, module_complete_nonessential_steps, \ From 9c681fd9ae90b5348f2b89807aa068f3a81b8b29 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 22:49:56 -0500 Subject: [PATCH 247/400] fix return to keep track of modules that need to be re-evaluated later --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 0ccb9669e0..2d2a77faaf 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -988,7 +988,7 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): return module_step_list, module_complete_steps, module_nonessential_steps, module_complete_nonessential_steps, \ module_total_steps, module_num_complete_steps, module_num_nonessential_steps, module_num_complete_nonessential_steps, \ - module_completeness, over_complete_threshold, has_nonessential_step, has_no_ko_step + module_completeness, over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules def estimate_for_genome(self, kofam_hits, genes_in_splits): From aa4e0e68a435a2c7b11c01140cf2580cab138bf9 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 22:50:59 -0500 Subject: [PATCH 248/400] this is code to re-evaluate completeness of modules defined by other modules. it needs to go in its own function but for now i am leaving it here. 
all those print statements need to be removed too --- anvio/kegg.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 2d2a77faaf..8c62b9671c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1019,12 +1019,13 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): # modules to warn about mods_with_unassociated_ko = [] # a list of modules that have "--" steps without an associated KO mods_with_nonessential_steps = [] # a list of modules that have nonessential steps like "-K11024" + mods_def_by_modules = [] # a list of modules that have module numbers in their definitions # estimate completeness of each module for mod in genome_metabolism_dict[self.contigs_db_project_name].keys(): mod_steps, mod_complete_steps, mod_nonessential_steps, mod_complete_nonessential_steps, mod_num_steps, mod_num_complete_steps, \ mod_num_nonessential_steps, mod_num_complete_nonessential_steps, mod_percent_complete, \ - mod_is_complete, has_nonessential_step, has_no_ko_step \ + mod_is_complete, has_nonessential_step, has_no_ko_step, defined_by_modules \ = self.compute_module_completeness(mod, genome_metabolism_dict[self.contigs_db_project_name][mod]["present_kos"]) # assign completeness info back to module dict genome_metabolism_dict[self.contigs_db_project_name][mod]["step_list"] = mod_steps @@ -1044,6 +1045,85 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): mods_with_nonessential_steps.append(mod) if has_no_ko_step: mods_with_unassociated_ko.append(mod) + if defined_by_modules: + mods_def_by_modules.append(mod) + + # go back and adjust completeness of modules that are defined by other modules + if mods_def_by_modules: + for mod in mods_def_by_modules: + print("Re-calculating for module %s with steps %s" % (mod, " ".join(genome_metabolism_dict[self.contigs_db_project_name][mod]["step_list"]))) + for step in genome_metabolism_dict[self.contigs_db_project_name][mod]["step_list"]: + print("Step: ", step) + cur_index = 0 # current position in the step definition + parens_level = 0 # how deep we are in nested parentheses + step_is_present_condition_statement = "" + is_ko_step = False + while cur_index < len(step): + # we have found a KO so we can ignore this step; it has already been counted as complete or not + if step[cur_index] == "K": + print("breaking from step") + is_ko_step = True + break + + # we have found a module so we must evaluate this steps's completeness by checking if the module is complete + elif step[cur_index] == "M": + mnum = step[cur_index:cur_index+6] + if genome_metabolism_dict[self.contigs_db_project_name][mnum]["complete"]: + print("module %s found complete" % mnum) + step_is_present_condition_statement += "True" + else: + print("module %s found INcomplete" % mnum) + step_is_present_condition_statement += "False" + cur_index += 6 + + elif step[cur_index] == "(": + parens_level += 1 + step_is_present_condition_statement += "(" + cur_index += 1 + + elif step[cur_index] == ")": + parens_level -= 1 + step_is_present_condition_statement += ")" + cur_index += 1 + + elif step[cur_index] == ",": + step_is_present_condition_statement += " or " + cur_index += 1 + + elif step[cur_index] == " ": + # if we are outside of parentheses, something is wrong because this should all be just one step + if parens_level == 0: + raise ConfigError("Much parsing sadness. 
We thought we were re-evaluating the completeness of just one step in " + "module %s (step: %s), but we found a space that seems to indicate another step. HALP." % (mod, step)) + # otherwise, we are processing an alternative path so AND is required + else: + step_is_present_condition_statement += " and " + cur_index += 1 + + else: + raise ConfigError("While correcting completeness for module %s, (step %s), anvi'o found the following character " + "that she didn't understand: %s. Unfortunately, this means we cannot determine the module " + "completeness. For context, here is the current index in the DEFINITION line: %s and the " + "surrounding characters: %s" % (mod, step, step[cur_index], cur_index, step[cur_index-5:cur_index+6])) + # once we have processed everything, we need to re-evaluate the step (provided its not a KO step that has already been evaluated) + if not is_ko_step: + print("condition statement: ", step_is_present_condition_statement) + step_is_present = eval(step_is_present_condition_statement) + print("evaluates to ", step_is_present) + if step_is_present: + genome_metabolism_dict[self.contigs_db_project_name][mod]["complete_step_list"].append(step) + genome_metabolism_dict[self.contigs_db_project_name][mod]["num_complete_steps"] += 1 + + # now, we recalculate module completeness + print("module previously had completeness ", genome_metabolism_dict[self.contigs_db_project_name][mod]["percent_complete"]) + genome_metabolism_dict[self.contigs_db_project_name][mod]["percent_complete"] = genome_metabolism_dict[self.contigs_db_project_name][mod]["num_complete_steps"] / genome_metabolism_dict[self.contigs_db_project_name][mod]["num_steps"] + now_complete = True if genome_metabolism_dict[self.contigs_db_project_name][mod]["percent_complete"] >= self.completeness_threshold else False + genome_metabolism_dict[self.contigs_db_project_name][mod]["complete"] = now_complete + if now_complete: + print("module %s is now COMPLETE" % mod) + num_complete_modules += 1 + print("module now has completeness ", genome_metabolism_dict[self.contigs_db_project_name][mod]["percent_complete"]) + genome_metabolism_dict[self.contigs_db_project_name]["num_complete_modules"] = num_complete_modules From c83d9963ce9130fd1530af1d6cbf12be16a68f0c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 16 Mar 2020 22:51:19 -0500 Subject: [PATCH 249/400] skeleton function for re-evaluating module completeness --- anvio/kegg.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 8c62b9671c..6d82404439 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -991,6 +991,14 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): module_completeness, over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules + def adjust_module_completeness(self, mod, meta_dict_for_bin): + """This function adjusts completeness of modules that are defined by other modules. + + This can only be done after all other modules have been evaluated for completeness. + """ + pass + + def estimate_for_genome(self, kofam_hits, genes_in_splits): """This is the metabolism estimation function for a contigs DB that contains a single genome. 
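An aside before the next patch: the completeness logic above (and the adjustment code that follows) boils down to translating one step of a module DEFINITION into a boolean expression and handing it to eval(). Below is a minimal, standalone sketch of that idea. It is not the anvi'o implementation: it assumes a simplified step grammar containing only KO accessions, parentheses, ',' for OR, and '+' or ' ' for AND, and it ignores the '-Kxxxxx', '--', and 'Mxxxxx' special cases handled above.

def step_is_complete(step_definition, present_kos):
    """Translate a single DEFINITION step into a boolean expression and evaluate it (simplified sketch)."""
    expression = ""
    i = 0
    while i < len(step_definition):
        ch = step_definition[i]
        if ch == "K":                       # a KO accession such as K00036 is 6 characters long
            ko = step_definition[i:i + 6]
            expression += "True" if ko in present_kos else "False"
            i += 6
        elif ch == ",":                     # alternatives -> OR
            expression += " or "
            i += 1
        elif ch in ("+", " "):              # complex components / nested sub-steps -> AND
            expression += " and "
            i += 1
        elif ch in ("(", ")"):
            expression += ch
            i += 1
        else:
            raise ValueError("unexpected character '%s' in step: %s" % (ch, step_definition))
    return eval(expression)

# the example step used in the PATCH 237 docstring:
step = "(K13937,((K00036,K19243) (K01057,K07404)))"
print(step_is_complete(step, {"K00036", "K01057"}))   # True:  "(False or ((True or False) and (True or False)))"
print(step_is_complete(step, {"K13937"}))             # True:  the single-enzyme alternative suffices
print(step_is_complete(step, {"K19243"}))             # False: the second branch of the AND is missing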
From 26b57810c57536d01581c88aea7b7fc9c05f4d08 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 09:46:47 -0500 Subject: [PATCH 250/400] move adjustment code into its own func --- anvio/kegg.py | 142 +++++++++++++++++++++++--------------------------- 1 file changed, 66 insertions(+), 76 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 6d82404439..3b2317e3bb 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -996,7 +996,69 @@ def adjust_module_completeness(self, mod, meta_dict_for_bin): This can only be done after all other modules have been evaluated for completeness. """ - pass + + for step in meta_dict_for_bin[mod]["step_list"]: + cur_index = 0 # current position in the step definition + parens_level = 0 # how deep we are in nested parentheses + step_is_present_condition_statement = "" + is_ko_step = False + while cur_index < len(step): + # we have found a KO so we can ignore this step; it has already been counted as complete or not + if step[cur_index] == "K": + is_ko_step = True + break + + # we have found a module so we must evaluate this steps's completeness by checking if the module is complete + elif step[cur_index] == "M": + mnum = step[cur_index:cur_index+6] + if meta_dict_for_bin[mnum]["complete"]: + step_is_present_condition_statement += "True" + else: + step_is_present_condition_statement += "False" + cur_index += 6 + + elif step[cur_index] == "(": + parens_level += 1 + step_is_present_condition_statement += "(" + cur_index += 1 + + elif step[cur_index] == ")": + parens_level -= 1 + step_is_present_condition_statement += ")" + cur_index += 1 + + elif step[cur_index] == ",": + step_is_present_condition_statement += " or " + cur_index += 1 + + elif step[cur_index] == " ": + # if we are outside of parentheses, something is wrong because this should all be just one step + if parens_level == 0: + raise ConfigError("Much parsing sadness. We thought we were re-evaluating the completeness of just one step in " + "module %s (step: %s), but we found a space that seems to indicate another step. HALP." % (mod, step)) + # otherwise, we are processing an alternative path so AND is required + else: + step_is_present_condition_statement += " and " + cur_index += 1 + + else: + raise ConfigError("While correcting completeness for module %s, (step %s), anvi'o found the following character " + "that she didn't understand: %s. Unfortunately, this means we cannot determine the module " + "completeness. 
For context, here is the current index in the DEFINITION line: %s and the " + "surrounding characters: %s" % (mod, step, step[cur_index], cur_index, step[cur_index-5:cur_index+6])) + # once we have processed everything, we need to re-evaluate the step (provided its not a KO step that has already been evaluated) + if not is_ko_step: + step_is_present = eval(step_is_present_condition_statement) + if step_is_present: + meta_dict_for_bin[mod]["complete_step_list"].append(step) + meta_dict_for_bin[mod]["num_complete_steps"] += 1 + + # now, we recalculate module completeness + meta_dict_for_bin[mod]["percent_complete"] = meta_dict_for_bin[mod]["num_complete_steps"] / meta_dict_for_bin[mod]["num_steps"] + now_complete = True if meta_dict_for_bin[mod]["percent_complete"] >= self.completeness_threshold else False + meta_dict_for_bin[mod]["complete"] = now_complete + if now_complete: + meta_dict_for_bin["num_complete_modules"] += 1 def estimate_for_genome(self, kofam_hits, genes_in_splits): @@ -1056,84 +1118,12 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): if defined_by_modules: mods_def_by_modules.append(mod) + genome_metabolism_dict[self.contigs_db_project_name]["num_complete_modules"] = num_complete_modules + # go back and adjust completeness of modules that are defined by other modules if mods_def_by_modules: for mod in mods_def_by_modules: - print("Re-calculating for module %s with steps %s" % (mod, " ".join(genome_metabolism_dict[self.contigs_db_project_name][mod]["step_list"]))) - for step in genome_metabolism_dict[self.contigs_db_project_name][mod]["step_list"]: - print("Step: ", step) - cur_index = 0 # current position in the step definition - parens_level = 0 # how deep we are in nested parentheses - step_is_present_condition_statement = "" - is_ko_step = False - while cur_index < len(step): - # we have found a KO so we can ignore this step; it has already been counted as complete or not - if step[cur_index] == "K": - print("breaking from step") - is_ko_step = True - break - - # we have found a module so we must evaluate this steps's completeness by checking if the module is complete - elif step[cur_index] == "M": - mnum = step[cur_index:cur_index+6] - if genome_metabolism_dict[self.contigs_db_project_name][mnum]["complete"]: - print("module %s found complete" % mnum) - step_is_present_condition_statement += "True" - else: - print("module %s found INcomplete" % mnum) - step_is_present_condition_statement += "False" - cur_index += 6 - - elif step[cur_index] == "(": - parens_level += 1 - step_is_present_condition_statement += "(" - cur_index += 1 - - elif step[cur_index] == ")": - parens_level -= 1 - step_is_present_condition_statement += ")" - cur_index += 1 - - elif step[cur_index] == ",": - step_is_present_condition_statement += " or " - cur_index += 1 - - elif step[cur_index] == " ": - # if we are outside of parentheses, something is wrong because this should all be just one step - if parens_level == 0: - raise ConfigError("Much parsing sadness. We thought we were re-evaluating the completeness of just one step in " - "module %s (step: %s), but we found a space that seems to indicate another step. HALP." % (mod, step)) - # otherwise, we are processing an alternative path so AND is required - else: - step_is_present_condition_statement += " and " - cur_index += 1 - - else: - raise ConfigError("While correcting completeness for module %s, (step %s), anvi'o found the following character " - "that she didn't understand: %s. 
Unfortunately, this means we cannot determine the module " - "completeness. For context, here is the current index in the DEFINITION line: %s and the " - "surrounding characters: %s" % (mod, step, step[cur_index], cur_index, step[cur_index-5:cur_index+6])) - # once we have processed everything, we need to re-evaluate the step (provided its not a KO step that has already been evaluated) - if not is_ko_step: - print("condition statement: ", step_is_present_condition_statement) - step_is_present = eval(step_is_present_condition_statement) - print("evaluates to ", step_is_present) - if step_is_present: - genome_metabolism_dict[self.contigs_db_project_name][mod]["complete_step_list"].append(step) - genome_metabolism_dict[self.contigs_db_project_name][mod]["num_complete_steps"] += 1 - - # now, we recalculate module completeness - print("module previously had completeness ", genome_metabolism_dict[self.contigs_db_project_name][mod]["percent_complete"]) - genome_metabolism_dict[self.contigs_db_project_name][mod]["percent_complete"] = genome_metabolism_dict[self.contigs_db_project_name][mod]["num_complete_steps"] / genome_metabolism_dict[self.contigs_db_project_name][mod]["num_steps"] - now_complete = True if genome_metabolism_dict[self.contigs_db_project_name][mod]["percent_complete"] >= self.completeness_threshold else False - genome_metabolism_dict[self.contigs_db_project_name][mod]["complete"] = now_complete - if now_complete: - print("module %s is now COMPLETE" % mod) - num_complete_modules += 1 - print("module now has completeness ", genome_metabolism_dict[self.contigs_db_project_name][mod]["percent_complete"]) - - - genome_metabolism_dict[self.contigs_db_project_name]["num_complete_modules"] = num_complete_modules + self.adjust_module_completeness(mod, genome_metabolism_dict[self.contigs_db_project_name]) # notify user of the modules that gave some fishy results if not self.quiet: From e7905331f5c0b0f0257a5f0bd27c78559135b849 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 09:47:04 -0500 Subject: [PATCH 251/400] print complete modules --- anvio/kegg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 3b2317e3bb..1f18ac0bac 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1086,6 +1086,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): genome_metabolism_dict[self.contigs_db_project_name] = self.mark_kos_present_for_list_of_splits(ko_in_genome, split_list=splits_in_genome, bin_name=self.contigs_db_project_name) num_complete_modules = 0 + complete_mods = [] # modules to warn about mods_with_unassociated_ko = [] # a list of modules that have "--" steps without an associated KO mods_with_nonessential_steps = [] # a list of modules that have nonessential steps like "-K11024" @@ -1111,6 +1112,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): if mod_is_complete: num_complete_modules += 1 + complete_mods.append(mod) if has_nonessential_step: mods_with_nonessential_steps.append(mod) if has_no_ko_step: @@ -1145,6 +1147,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): self.run.info("Module completion threshold", self.completeness_threshold) self.run.info("Number of complete modules", num_complete_modules) + self.run.info("Complete modules", ", ".join(complete_mods)) return genome_metabolism_dict From 7bec6bc76ce2cbd6a9e7d17bc9781d29c1b09498 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 09:54:08 -0500 Subject: [PATCH 252/400] rename func and update docstring --- anvio/kegg.py | 11 
+++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 1f18ac0bac..90f04d3b69 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -991,10 +991,17 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): module_completeness, over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules - def adjust_module_completeness(self, mod, meta_dict_for_bin): + def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): """This function adjusts completeness of modules that are defined by other modules. This can only be done after all other modules have been evaluated for completeness. + The function uses similar logic as compute_module_completeness_for_bin() to re-assess whether steps defined + by other modules are complete, and updates the metabolism completess dictionary accordingly. + + PARAMETERS + ========== + mod string, the module number to adjust + meta_dict_for_bin metabolism completeness dictionary for the current bin """ for step in meta_dict_for_bin[mod]["step_list"]: @@ -1125,7 +1132,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): # go back and adjust completeness of modules that are defined by other modules if mods_def_by_modules: for mod in mods_def_by_modules: - self.adjust_module_completeness(mod, genome_metabolism_dict[self.contigs_db_project_name]) + self.adjust_module_completeness_for_bin(mod, genome_metabolism_dict[self.contigs_db_project_name]) # notify user of the modules that gave some fishy results if not self.quiet: From 5ef7059679413e33792bab1e48cfe3a7a677cbde Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 11:39:40 -0500 Subject: [PATCH 253/400] typo --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 90f04d3b69..7e7a4b76e4 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -719,7 +719,7 @@ def init_hits_and_splits(self): return kofam_hits, genes_in_splits def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=None, bin_name=None): - """This function generates a bin-level dictionary of dictionary, which associates modules with the list of KOs + """This function generates a bin-level dictionary of dictionaries, which associates modules with the list of KOs that are present in the bin for each module. The structure of the dictionary is like this: From f000a96c10f16b239cfa543f01c1fef01fa70ff8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 11:40:47 -0500 Subject: [PATCH 254/400] refactor function to modify dict in place. it is much cleaner this way than returning a bunch of variables --- anvio/kegg.py | 50 +++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 7e7a4b76e4..e09cc00cd9 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -764,7 +764,7 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N return bin_level_module_dict - def compute_module_completeness(self, mnum, present_list_for_mnum): + def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): """This function calculates the completeness of the specified module. This requires some parsing of the module DEFINITION fields. 
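# Illustrative sketch (not from the patch series): how a single DEFINITION step can be turned into a
# boolean expression and evaluated, following the translation rules used in these patches -- each KO
# becomes True/False depending on whether it was found in the bin, ',' becomes 'or', and '+' (or a
# space inside parentheses) becomes 'and'. The KO numbers and the present_kos set below are made up,
# and the regex tokenizer is a simplification of the index-by-index parser in the real code, which
# also handles '-', '--', nested parentheses, and module numbers.
import re

def step_expression(step, present_kos):
    """Translate one DEFINITION step, e.g. 'K00033+K00036,K01057', into a boolean expression string."""
    expression = ""
    for token in re.findall(r"K\d{5}|[(),+ ]", step):
        if token.startswith("K"):
            expression += "True" if token in present_kos else "False"
        elif token == ",":
            expression += " or "
        elif token in ("+", " "):
            expression += " and "
        else:  # '(' and ')' are kept as-is to preserve grouping
            expression += token
    return expression

expr = step_expression("K00033+K00036,K01057", {"K00033", "K00036"})
print(expr)        # True and True or False
print(eval(expr))  # True: the K00033+K00036 complex is present, so this step counts as complete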
In these fields, we have the following: @@ -802,9 +802,9 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): PARAMETERS ========== mnum string, module number to work on - present_list_for_mnum list of strings, the KOs found to be present in this module for the current genome/bin + meta_dict_for_bin metabolism completeness dict for the current bin, to be modified in-place - RETURNS + VARIABLES FOR UPDATING METABOLISM COMPLETENESS DICT ======= module_step_list list of strings, each string is an individual step in the module (may have sub-steps if there are alternate pathways) module_complete_steps list of strings, each string is a step in the module that is considered complete based on KO availability @@ -815,12 +815,16 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): module_num_nonessential_steps int, the total number of nonessential steps in the module module_num_complete_nonessential_steps int, the number of nonessential steps in the module that were found to be complete module_completeness float, a decimal indicating the fraction of complete steps in the module + + RETURNS + ======= over_complete_threshold boolean, whether or not the module is considered "complete" overall based on the threshold fraction of completeness has_nonessential_step boolean, whether or not the module contains non-essential steps. Used for warning the user about these. has_no_ko_step boolean, whether or not the module contains steps without associated KOs. Used for warning the user about these. defined_by_modules boolean, whether or not the module contains steps defined by other modules. Used for going back to adjust completeness later. """ + present_list_for_mnum = meta_dict_for_bin[mnum]["present_kos"] if not present_list_for_mnum: # no KOs in this module are present if anvio.DEBUG: @@ -985,10 +989,21 @@ def compute_module_completeness(self, mnum, present_list_for_mnum): module_completeness = module_num_complete_steps / module_total_steps over_complete_threshold = True if module_completeness >= self.completeness_threshold else False + # instead of returning everything, we update the metabolism completeness dictionary in place + meta_dict_for_bin[mnum]["step_list"] = module_step_list + meta_dict_for_bin[mnum]["complete_step_list"] = module_complete_steps + meta_dict_for_bin[mnum]["nonessential_step_list"] = module_nonessential_steps + meta_dict_for_bin[mnum]["complete_nonessential_step_list"]= module_complete_nonessential_steps + meta_dict_for_bin[mnum]["num_steps"] = module_total_steps + meta_dict_for_bin[mnum]["num_complete_steps"] = module_num_complete_steps + meta_dict_for_bin[mnum]["num_nonessential_steps"] = module_num_nonessential_steps + meta_dict_for_bin[mnum]["num_complete_nonessential_steps"] = module_num_complete_nonessential_steps + meta_dict_for_bin[mnum]["percent_complete"] = module_completeness + meta_dict_for_bin[mnum]["complete"] = over_complete_threshold + if over_complete_threshold: + meta_dict_for_bin["num_complete_modules"] += 1 - return module_step_list, module_complete_steps, module_nonessential_steps, module_complete_nonessential_steps, \ - module_total_steps, module_num_complete_steps, module_num_nonessential_steps, module_num_complete_nonessential_steps, \ - module_completeness, over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules + return over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): @@ -1092,33 +1107,22 @@ 
def estimate_for_genome(self, kofam_hits, genes_in_splits): # get KO presence in modules genome_metabolism_dict[self.contigs_db_project_name] = self.mark_kos_present_for_list_of_splits(ko_in_genome, split_list=splits_in_genome, bin_name=self.contigs_db_project_name) - num_complete_modules = 0 + genome_metabolism_dict[self.contigs_db_project_name]["num_complete_modules"] = 0 complete_mods = [] + mods_def_by_modules = [] # a list of modules that have module numbers in their definitions # modules to warn about mods_with_unassociated_ko = [] # a list of modules that have "--" steps without an associated KO mods_with_nonessential_steps = [] # a list of modules that have nonessential steps like "-K11024" - mods_def_by_modules = [] # a list of modules that have module numbers in their definitions # estimate completeness of each module for mod in genome_metabolism_dict[self.contigs_db_project_name].keys(): - mod_steps, mod_complete_steps, mod_nonessential_steps, mod_complete_nonessential_steps, mod_num_steps, mod_num_complete_steps, \ - mod_num_nonessential_steps, mod_num_complete_nonessential_steps, mod_percent_complete, \ + if mod == "num_complete_modules": + continue mod_is_complete, has_nonessential_step, has_no_ko_step, defined_by_modules \ - = self.compute_module_completeness(mod, genome_metabolism_dict[self.contigs_db_project_name][mod]["present_kos"]) - # assign completeness info back to module dict - genome_metabolism_dict[self.contigs_db_project_name][mod]["step_list"] = mod_steps - genome_metabolism_dict[self.contigs_db_project_name][mod]["complete_step_list"] = mod_complete_steps - genome_metabolism_dict[self.contigs_db_project_name][mod]["nonessential_step_list"] = mod_nonessential_steps - genome_metabolism_dict[self.contigs_db_project_name][mod]["complete_nonessential_step_list"]= mod_complete_nonessential_steps - genome_metabolism_dict[self.contigs_db_project_name][mod]["num_steps"] = mod_num_steps - genome_metabolism_dict[self.contigs_db_project_name][mod]["num_complete_steps"] = mod_num_complete_steps - genome_metabolism_dict[self.contigs_db_project_name][mod]["num_nonessential_steps"] = mod_num_nonessential_steps - genome_metabolism_dict[self.contigs_db_project_name][mod]["num_complete_nonessential_steps"] = mod_num_complete_nonessential_steps - genome_metabolism_dict[self.contigs_db_project_name][mod]["percent_complete"] = mod_percent_complete - genome_metabolism_dict[self.contigs_db_project_name][mod]["complete"] = mod_is_complete + = self.compute_module_completeness_for_bin(mod, genome_metabolism_dict[self.contigs_db_project_name]) + if mod_is_complete: - num_complete_modules += 1 complete_mods.append(mod) if has_nonessential_step: mods_with_nonessential_steps.append(mod) From 8e5f9965984b8305a8eb7e633647743d17a08e2d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 11:41:33 -0500 Subject: [PATCH 255/400] adjustment func now returns completeness status so we can update the list of complete modules appropriately --- anvio/kegg.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index e09cc00cd9..3cde051625 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1017,6 +1017,10 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): ========== mod string, the module number to adjust meta_dict_for_bin metabolism completeness dictionary for the current bin + + RETURNS + ======= + now_complete boolean, whether or not the module is NOW considered "complete" overall based on the threshold 
fraction of completeness """ for step in meta_dict_for_bin[mod]["step_list"]: @@ -1082,6 +1086,8 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): if now_complete: meta_dict_for_bin["num_complete_modules"] += 1 + return now_complete + def estimate_for_genome(self, kofam_hits, genes_in_splits): """This is the metabolism estimation function for a contigs DB that contains a single genome. @@ -1131,12 +1137,13 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): if defined_by_modules: mods_def_by_modules.append(mod) - genome_metabolism_dict[self.contigs_db_project_name]["num_complete_modules"] = num_complete_modules - # go back and adjust completeness of modules that are defined by other modules if mods_def_by_modules: for mod in mods_def_by_modules: - self.adjust_module_completeness_for_bin(mod, genome_metabolism_dict[self.contigs_db_project_name]) + mod_is_complete = self.adjust_module_completeness_for_bin(mod, genome_metabolism_dict[self.contigs_db_project_name]) + + if mod_is_complete: + complete_mods.append(mod) # notify user of the modules that gave some fishy results if not self.quiet: From 7d6bf651f1c68dc3c35b361259004a6e02de94a7 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 11:42:01 -0500 Subject: [PATCH 256/400] fix complete modules output --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 3cde051625..f478c3c69e 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1164,7 +1164,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): % (", ".join(mods_with_unassociated_ko))) self.run.info("Module completion threshold", self.completeness_threshold) - self.run.info("Number of complete modules", num_complete_modules) + self.run.info("Number of complete modules", genome_metabolism_dict[self.contigs_db_project_name]["num_complete_modules"]) self.run.info("Complete modules", ", ".join(complete_mods)) return genome_metabolism_dict From ca50a26054c31211da4ebd6ae26017bf2fd2fbd5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 15:50:02 -0500 Subject: [PATCH 257/400] cosmetic updates --- anvio/kegg.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index f478c3c69e..a017580a1c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1099,7 +1099,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): PARAMETERS ========== kofam_hits list of (gene_call_id, ko_num) tuples, all belong to this single genome - genes_in_splits list of (split, gene_call_id) tuples, all belong to this single genome <- MAYBE UNNECESSARY + genes_in_splits list of (split, gene_call_id) tuples, all belong to this single genome RETURNS ======= @@ -1127,7 +1127,6 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): mod_is_complete, has_nonessential_step, has_no_ko_step, defined_by_modules \ = self.compute_module_completeness_for_bin(mod, genome_metabolism_dict[self.contigs_db_project_name]) - if mod_is_complete: complete_mods.append(mod) if has_nonessential_step: @@ -1221,7 +1220,6 @@ def store_kegg_metabolism_superdict(self, kegg_superdict): d[i]["kegg_module"] = mnum i += 1 - utils.store_dict_as_TAB_delimited_file(d, self.output_file_path, key_header="unique_id") self.run.info("Output file", self.output_file_path, nl_before=1) From b85b3298085584621f73340a4708ac3139a58af0 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 17:08:40 -0500 Subject: [PATCH 258/400] make atomic function to estimate for list of 
splits, and convert genome estimator to call that one --- anvio/kegg.py | 67 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index a017580a1c..902e7d7486 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1088,32 +1088,31 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): return now_complete + def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=None): + """This is the atomic metabolism estimator function, which builds a metabolism completeness dictionary for an arbitrary list of splits. - def estimate_for_genome(self, kofam_hits, genes_in_splits): - """This is the metabolism estimation function for a contigs DB that contains a single genome. - - It returns the initial metabolism completion dictionary for that genome, wrapped in the superdict format. - This dictionary at first contains the KOs that are present in the genome for each KEGG module. It is later - processed to estimate the completion of each module. + For example, the list of splits may represent a bin or a single isolate genome. + The metabolism completeness dictionary is first initialized to contain the KOs that are present in the genome for each KEGG module. + It is later updated with the individual steps and completion estimates for each module. PARAMETERS ========== - kofam_hits list of (gene_call_id, ko_num) tuples, all belong to this single genome - genes_in_splits list of (split, gene_call_id) tuples, all belong to this single genome + ko_hits_in_splits a list of KO numbers indicating the KOfam hits that have occurred in this list of splits + splits a list of splits identifiers + bin_name the name of the bin that we are working with RETURNS ======= - genome_metabolism_dict dictionary mapping genome name to its metabolism completeness dictionary + metabolism_dict_for_list_of_splits the metabolism completeness dictionary of dictionaries for this list of splits. It contains + one dictionary of module steps and completion information for each module (keyed by module number), + as well as one key num_complete_modules that tracks the number of complete modules found in these splits. + Calling functions should assign this dictionary to a metabolism superdict with the bin name as a key. 
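# Illustrative sketch (not from the patch series): how a caller is expected to assemble the per-bin
# metabolism superdict described above. The (ko_num, gene_call_id, split, contig) tuple structure and
# the tpl[2]-based filtering mirror later patches in this series; 'estimator' stands for the metabolism
# estimator object from anvio/kegg.py, and the function name below is an invention for the example.
def build_metabolism_superdict(estimator, kofam_gene_split_contig, bin_name_to_split_names):
    """Return {bin_name: metabolism completeness dict} for every bin in a collection."""
    superdict = {}
    for bin_name, splits_in_bin in bin_name_to_split_names.items():
        splits = set(splits_in_bin)
        # keep only the KOfam hits whose split belongs to this bin
        hits_in_bin = [tpl for tpl in kofam_gene_split_contig if tpl[2] in splits]
        superdict[bin_name] = estimator.estimate_for_list_of_splits(hits_in_bin,
                                                                    splits=splits_in_bin,
                                                                    bin_name=bin_name)
    return superdict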
""" - genome_metabolism_dict = {} - # get list of KOs only - since all splits belong to one genome, we can take all the hits - ko_in_genome = [tpl[1] for tpl in kofam_hits] - splits_in_genome = [tpl[0] for tpl in genes_in_splits] - # get KO presence in modules - genome_metabolism_dict[self.contigs_db_project_name] = self.mark_kos_present_for_list_of_splits(ko_in_genome, split_list=splits_in_genome, - bin_name=self.contigs_db_project_name) - genome_metabolism_dict[self.contigs_db_project_name]["num_complete_modules"] = 0 + metabolism_dict_for_list_of_splits = self.mark_kos_present_for_list_of_splits(ko_hits_in_splits, split_list=splits, + bin_name=bin_name) + metabolism_dict_for_list_of_splits["num_complete_modules"] = 0 + complete_mods = [] mods_def_by_modules = [] # a list of modules that have module numbers in their definitions # modules to warn about @@ -1121,11 +1120,11 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): mods_with_nonessential_steps = [] # a list of modules that have nonessential steps like "-K11024" # estimate completeness of each module - for mod in genome_metabolism_dict[self.contigs_db_project_name].keys(): + for mod in metabolism_dict_for_list_of_splits.keys(): if mod == "num_complete_modules": continue mod_is_complete, has_nonessential_step, has_no_ko_step, defined_by_modules \ - = self.compute_module_completeness_for_bin(mod, genome_metabolism_dict[self.contigs_db_project_name]) + = self.compute_module_completeness_for_bin(mod, metabolism_dict_for_list_of_splits) if mod_is_complete: complete_mods.append(mod) @@ -1139,7 +1138,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): # go back and adjust completeness of modules that are defined by other modules if mods_def_by_modules: for mod in mods_def_by_modules: - mod_is_complete = self.adjust_module_completeness_for_bin(mod, genome_metabolism_dict[self.contigs_db_project_name]) + mod_is_complete = self.adjust_module_completeness_for_bin(mod, metabolism_dict_for_list_of_splits) if mod_is_complete: complete_mods.append(mod) @@ -1163,9 +1162,35 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): % (", ".join(mods_with_unassociated_ko))) self.run.info("Module completion threshold", self.completeness_threshold) - self.run.info("Number of complete modules", genome_metabolism_dict[self.contigs_db_project_name]["num_complete_modules"]) + self.run.info("Number of complete modules", metabolism_dict_for_list_of_splits["num_complete_modules"]) self.run.info("Complete modules", ", ".join(complete_mods)) + return metabolism_dict_for_list_of_splits + + + def estimate_for_genome(self, kofam_hits, genes_in_splits): + """This is the metabolism estimation function for a contigs DB that contains a single genome. + + Assuming this contigs DB contains only one genome, it sends all of the splits and their kofam hits to the atomic + estimation function for processing. It then returns the metabolism completion dictionary for the genome, wrapped in the superdict format. 
+ + PARAMETERS + ========== + kofam_hits list of (gene_call_id, ko_num) tuples, all belong to this single genome + genes_in_splits list of (split, gene_call_id) tuples, all belong to this single genome + + RETURNS + ======= + genome_metabolism_dict dictionary mapping genome name to its metabolism completeness dictionary + """ + + genome_metabolism_dict = {} + # get list of KOs only - since all splits belong to one genome, we can take all the hits + ko_in_genome = [tpl[1] for tpl in kofam_hits] + splits_in_genome = [tpl[0] for tpl in genes_in_splits] + + genome_metabolism_dict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(ko_in_genome, splits=splits_in_genome, bin_name=self.contigs_db_project_name) + return genome_metabolism_dict From e4431376bdf7702de2669c0c6a364b9f487ca9a4 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 17:08:57 -0500 Subject: [PATCH 259/400] import ccolections --- anvio/kegg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 902e7d7486..b32f48072d 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -16,6 +16,7 @@ import anvio.terminal as terminal import anvio.filesnpaths as filesnpaths import anvio.tables as t +import anvio.ccollections as ccollections from anvio.errors import ConfigError, FilesNPathsError from anvio.drivers.hmmer import HMMer From 0ca10c78f60008bace1ca621876bb15ddf226fb8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 17:10:38 -0500 Subject: [PATCH 260/400] rename to superdict just to be clear on what is returned --- anvio/kegg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index b32f48072d..17e4a55698 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1185,14 +1185,14 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): genome_metabolism_dict dictionary mapping genome name to its metabolism completeness dictionary """ - genome_metabolism_dict = {} + genome_metabolism_superdict = {} # get list of KOs only - since all splits belong to one genome, we can take all the hits ko_in_genome = [tpl[1] for tpl in kofam_hits] splits_in_genome = [tpl[0] for tpl in genes_in_splits] - genome_metabolism_dict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(ko_in_genome, splits=splits_in_genome, bin_name=self.contigs_db_project_name) + genome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(ko_in_genome, splits=splits_in_genome, bin_name=self.contigs_db_project_name) - return genome_metabolism_dict + return genome_metabolism_superdict def estimate_metabolism(self): From bc05a15737ecb37518a2e108a9e5b2172da9ea7b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 17 Mar 2020 17:13:02 -0500 Subject: [PATCH 261/400] estimator for bins, not tested yet --- anvio/kegg.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 17e4a55698..c7077ce697 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1195,6 +1195,24 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): return genome_metabolism_superdict + def estimate_for_bins_in_collection(self, kofam_hits, genes_in_splits): + bins_metabolism_superdict = {} + + bin_name_to_split_names_dict = ccollections.GetSplitNamesInBins(self.args).get_dict() + self.run.info_single("%s split names associated with %s bins of in collection '%s' have been " + "successfully recovered 🎊" % (pp(sum([len(v) for v in bin_name_to_split_names_dict.values()])), + 
pp(len(bin_name_to_split_names_dict)), + self.collection_name), nl_before=1) + + for bin_name in bin_name_to_split_names_dict: + splits_in_bin = bin_name_to_split_names_dict[bin_name] + genes_in_bin = [tpl[1] for tpl in genes_in_splits if tpl[0] in splits_in_bin] + ko_in_bin = [tpl[1] for tpl in kofam_hits if tpl[0] in genes_in_bin] + bins_metabolism_superdict[bin_name] = self.estimate_for_list_of_splits(ko_in_bin, splits=splits_in_bin, bin_name=self.bin_name) + + return bins_metabolism_superdict + + def estimate_metabolism(self): """This is the driver function for estimating metabolism. @@ -1208,9 +1226,7 @@ def estimate_metabolism(self): kegg_metabolism_superdict = {} if self.profile_db_path and not self.metagenome_mode: - raise ConfigError("This class doesn't know how to deal with that yet :/") - # isolate genome, with profiling - #something like self.estimate_for_bins_in_collection() + kegg_metabolism_superdict = self.estimate_for_bins_in_collection(hits_to_consider, splits_to_consider) elif not self.profile_db_path and not self.metagenome_mode: kegg_metabolism_superdict = self.estimate_for_genome(hits_to_consider, splits_to_consider) elif self.profile_db_path and self.metagenome_mode: From 9947c4e61d1e8088f6bb6b4f171fca5d64202953 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 18 Mar 2020 12:49:17 -0500 Subject: [PATCH 262/400] remove rogue self --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index c7077ce697..2c42b11a01 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1208,7 +1208,7 @@ def estimate_for_bins_in_collection(self, kofam_hits, genes_in_splits): splits_in_bin = bin_name_to_split_names_dict[bin_name] genes_in_bin = [tpl[1] for tpl in genes_in_splits if tpl[0] in splits_in_bin] ko_in_bin = [tpl[1] for tpl in kofam_hits if tpl[0] in genes_in_bin] - bins_metabolism_superdict[bin_name] = self.estimate_for_list_of_splits(ko_in_bin, splits=splits_in_bin, bin_name=self.bin_name) + bins_metabolism_superdict[bin_name] = self.estimate_for_list_of_splits(ko_in_bin, splits=splits_in_bin, bin_name=bin_name) return bins_metabolism_superdict From 9db61ec39a7d51ae04d0f66da5838c2d008430a5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 18 Mar 2020 12:49:28 -0500 Subject: [PATCH 263/400] conditional output --- anvio/kegg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 2c42b11a01..f0b1e06f29 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -716,6 +716,8 @@ def init_hits_and_splits(self): self.run.info("KOfam hits", "%d found" % len(kofam_hits), quiet=self.quiet) self.run.info("Profile DB", self.profile_db_path, quiet=self.quiet) self.run.info('Metagenome mode', self.metagenome_mode) + if self.collection_name: + self.run.info('Collection', self.collection_name) return kofam_hits, genes_in_splits @@ -1164,7 +1166,8 @@ def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=N self.run.info("Module completion threshold", self.completeness_threshold) self.run.info("Number of complete modules", metabolism_dict_for_list_of_splits["num_complete_modules"]) - self.run.info("Complete modules", ", ".join(complete_mods)) + if complete_mods: + self.run.info("Complete modules", ", ".join(complete_mods)) return metabolism_dict_for_list_of_splits @@ -1189,7 +1192,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): # get list of KOs only - since all splits belong to one genome, we can take all the hits ko_in_genome = [tpl[1] for tpl 
in kofam_hits] splits_in_genome = [tpl[0] for tpl in genes_in_splits] - + genome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(ko_in_genome, splits=splits_in_genome, bin_name=self.contigs_db_project_name) return genome_metabolism_superdict From 25c9e9ccb68f19ac3cb4214de72f47ccaa0976c2 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 18 Mar 2020 13:06:05 -0500 Subject: [PATCH 264/400] clarify that number of genes refers to HMM model --- anvio/drivers/hmmer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index 41b01ae82f..0a51526600 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -93,7 +93,7 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.run.info('Context', context) self.run.info('Domain', domain if domain else 'N\\A') self.run.info('HMM model path', hmm) - self.run.info('Number of genes', num_genes_in_model) + self.run.info('Number of genes in HMM model', num_genes_in_model) self.run.info('Noise cutoff term(s)', noise_cutoff_terms) self.run.info('Number of CPUs will be used for search', self.num_threads_to_use) From 59668ab9ba26a76d3ad0a972c1a965468734be23 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 18 Mar 2020 16:36:59 -0500 Subject: [PATCH 265/400] fix separator between module class and name annotations. it will not put them on separate lines but at least they are distinct from each other now --- anvio/kegg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index f0b1e06f29..c7a4f81f55 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -547,12 +547,12 @@ def process_kofam_hmms(self): # FIXME? some KOs are not associated with modules. Should we report this? if mods: mod_annotation = "\n".join(mods) - mod_class_annotation = "\n".join(classes) + mod_class_annotation = "!!!".join(classes) # why do we split by '!!!'? Because that is how it is done in COGs. So so sorry. :'( mod_name_annotation = "" for mod in mods: if mod_name_annotation: - mod_name_annotation += "\n" + names[mod] + mod_name_annotation += "!!!" 
+ names[mod] else: mod_name_annotation = names[mod] @@ -1192,7 +1192,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): # get list of KOs only - since all splits belong to one genome, we can take all the hits ko_in_genome = [tpl[1] for tpl in kofam_hits] splits_in_genome = [tpl[0] for tpl in genes_in_splits] - + genome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(ko_in_genome, splits=splits_in_genome, bin_name=self.contigs_db_project_name) return genome_metabolism_superdict From 90d1ac286620973e11fb825d587b4a445e33c006 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 18 Mar 2020 16:44:19 -0500 Subject: [PATCH 266/400] bin output --- anvio/kegg.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index c7a4f81f55..2789f5a8c9 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -718,6 +718,10 @@ def init_hits_and_splits(self): self.run.info('Metagenome mode', self.metagenome_mode) if self.collection_name: self.run.info('Collection', self.collection_name) + if self.bin_id: + self.run.info('Bin ID', self.bin_id) + elif self.bin_ids_file: + self.run.info('Bin IDs file', self.bin_ids_file) return kofam_hits, genes_in_splits From 8a2a1258d48d925e1703e17e1f6686b6bba23812 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 18 Mar 2020 21:48:52 -0500 Subject: [PATCH 267/400] output bin name --- anvio/kegg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 2789f5a8c9..c36190af32 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1168,6 +1168,7 @@ def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=N "missing enzyme in some other way. Best of luck to you. Here is the list of modules to check out: %s" % (", ".join(mods_with_unassociated_ko))) + self.run.info("Bin name", bin_name) self.run.info("Module completion threshold", self.completeness_threshold) self.run.info("Number of complete modules", metabolism_dict_for_list_of_splits["num_complete_modules"]) if complete_mods: From bb747d67c5fe26545fd1bf9bd02b00eeb64a6ffa Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 18 Mar 2020 22:14:54 -0500 Subject: [PATCH 268/400] fetch genes in contigs --- anvio/kegg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index c36190af32..d589908b26 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -663,6 +663,7 @@ def init_hits_and_splits(self): self.contigs_db_project_name = contigs_db.meta['project_name'] self.progress.update('Splits') genes_in_splits = contigs_db.db.get_some_columns_from_table(t.genes_in_splits_table_name, "split, gene_callers_id") + genes_in_contigs = contigs_db.db.get_some_columns_from_table(t.genes_in_contigs_table_name, "contig, gene_callers_id") self.progress.update('KOfam hits') kofam_hits = contigs_db.db.get_some_columns_from_table(t.gene_function_calls_table_name, "gene_callers_id, accession", where_clause="source = 'KOfam'") From 6ca72b80dbf145c75656afdf6cc2c4edb0554d90 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 19 Mar 2020 16:07:05 -0500 Subject: [PATCH 269/400] modified init function to gather contig and gene call info as well as split and kofam hit --- anvio/kegg.py | 55 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index d589908b26..e06226543d 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -644,49 +644,47 @@ def __init__(self, args, run=run, progress=progress): self.kegg_modules_db = 
KeggModulesDatabase(os.path.join(self.kegg_data_dir, "MODULES.db"), args=self.args) def init_hits_and_splits(self): - """This function loads splits and KOfam hits from the contigs DB. + """This function loads KOfam hits, gene calls, splits, and contigs from the contigs DB. We will need the hits with their KO numbers (accessions) so that we can go through the MODULES.db and determine - which steps are present in each module. And we will need the splits so that we can determine which hits belong - to which genomes/bins when we are handling multiple of these. This function gets these hits and splits (as lists - of tuples), and it makes sure that these lists don't include hits/splits we shouldn't be considering. + which steps are present in each module. And we will need the other information so that we can determine which hits belong + to which genomes/bins when we are handling multiple of these, and for help in computing redundancy. + This function gets this info as a list of tuples (one tuple per kofam hit), and it makes sure that these lists don't include + hits that we shouldn't be considering. RETURNS ======= - kofam_hits list of (gene_call_id, ko_num) tuples - genes_in_splits list of (split, gene_call_id) tuples + kofam_gene_split_contig list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering """ - self.progress.new('Loading') - self.progress.update('Contigs DB') + self.progress.new('Loading data from Contigs DB') contigs_db = ContigsDatabase(self.contigs_db_path, run=self.run, progress=self.progress) self.contigs_db_project_name = contigs_db.meta['project_name'] - self.progress.update('Splits') - genes_in_splits = contigs_db.db.get_some_columns_from_table(t.genes_in_splits_table_name, "split, gene_callers_id") - genes_in_contigs = contigs_db.db.get_some_columns_from_table(t.genes_in_contigs_table_name, "contig, gene_callers_id") - self.progress.update('KOfam hits') + genes_in_splits = contigs_db.db.get_some_columns_from_table(t.genes_in_splits_table_name, "gene_callers_id, split") + genes_in_contigs = contigs_db.db.get_some_columns_from_table(t.genes_in_contigs_table_name, "gene_callers_id, contig") kofam_hits = contigs_db.db.get_some_columns_from_table(t.gene_function_calls_table_name, "gene_callers_id, accession", where_clause="source = 'KOfam'") min_contig_length_in_contigs_db = contigs_db.db.get_max_value_in_column(t.contigs_info_table_name, "length", return_min_instead=True) contigs_db.disconnect() - # get rid of gene calls in genes_in_splits that are not associated with KOfam hits. - # Perhaps this is not a necessary step. But it makes me feel clean. - all_gene_calls_in_splits = set([tpl[1] for tpl in genes_in_splits]) + # get rid of gene calls that are not associated with KOfam hits. 
+ all_gene_calls_in_splits = set([tpl[0] for tpl in genes_in_splits]) gene_calls_with_kofam_hits = set([tpl[0] for tpl in kofam_hits]) gene_calls_without_kofam_hits = all_gene_calls_in_splits.difference(gene_calls_with_kofam_hits) if gene_calls_without_kofam_hits: self.progress.update("Removing %s gene calls without KOfam hits" % len(gene_calls_without_kofam_hits)) - genes_in_splits = [tpl for tpl in genes_in_splits if tpl[1] not in gene_calls_without_kofam_hits] + genes_in_splits = [tpl for tpl in genes_in_splits if tpl[0] not in gene_calls_without_kofam_hits] + genes_in_contigs = [tpl for tpl in genes_in_contigs if tpl[0] not in gene_calls_without_kofam_hits] if anvio.DEBUG: self.run.warning("The following gene calls in your contigs DB were removed from consideration as they \ do not have any hits to the KOfam database: %s" % (gene_calls_without_kofam_hits)) - # get rid of splits (and their associated gene calls) that are not in the profile DB + + # get rid of splits and contigs (and their associated gene calls) that are not in the profile DB if self.profile_db_path: split_names_in_profile_db = set(utils.get_all_item_names_from_the_database(self.profile_db_path)) - split_names_in_contigs_db = set([tpl[0] for tpl in genes_in_splits]) + split_names_in_contigs_db = set([tpl[1] for tpl in genes_in_splits]) splits_missing_in_profile_db = split_names_in_contigs_db.difference(split_names_in_profile_db) min_contig_length_in_profile_db = ProfileDatabase(self.profile_db_path).meta['min_contig_length'] @@ -707,10 +705,25 @@ def init_hits_and_splits(self): pp(min_contig_length_in_profile_db))) self.progress.update("Removing %s splits (and associated gene calls) that were missing from the profile db" % pp(len(splits_missing_in_profile_db))) - genes_in_splits = [tpl for tpl in genes_in_splits if tpl[0] not in splits_missing_in_profile_db] - remaining_gene_calls = [tpl[1] for tpl in genes_in_splits] + genes_in_splits = [tpl for tpl in genes_in_splits if tpl[1] not in splits_missing_in_profile_db] + remaining_gene_calls = [tpl[0] for tpl in genes_in_splits] + genes_in_contigs = [tpl for tpl in genes_in_contigs if tpl[0] in remaining_gene_calls] kofam_hits = [tpl for tpl in kofam_hits if tpl[0] in remaining_gene_calls] + # combine the information for each gene call into neat tuples for returning + # each gene call is only on one split of one contig, so we can convert these lists of tuples into dictionaries for easy access + # but some gene calls have multiple kofam hits (and some kofams have multiple gene calls), so we must keep the tuple structure for those + self.progress.update("Organizing KOfam hit data") + gene_calls_splits_dict = {tpl[0] : tpl[1] for tpl in genes_in_splits} + gene_calls_contigs_dict = {tpl[0] : tpl[1] for tpl in genes_in_contigs} + assert len(gene_calls_splits_dict.keys()) == len(genes_in_splits) + assert len(gene_calls_splits_dict.keys()) == len(genes_in_contigs) + + kofam_gene_split_contig = [] + for gene_call_id, ko in kofam_hits: + kofam_gene_split_contig.append((ko, gene_call_id, gene_calls_splits_dict[gene_call_id], gene_calls_contigs_dict[gene_call_id])) + + self.progress.update("Done") self.progress.end() self.run.info("Contigs DB", self.contigs_db_path, quiet=self.quiet) @@ -724,7 +737,7 @@ def init_hits_and_splits(self): elif self.bin_ids_file: self.run.info('Bin IDs file', self.bin_ids_file) - return kofam_hits, genes_in_splits + return kofam_gene_split_contig def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=None, bin_name=None): """This 
function generates a bin-level dictionary of dictionaries, which associates modules with the list of KOs From a5942a1d38c69b1d006104cd794aa7fdfb9ae904 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 19 Mar 2020 17:17:14 -0500 Subject: [PATCH 270/400] update function docstrings --- anvio/kegg.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index e06226543d..24468b7c7b 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -654,7 +654,7 @@ def init_hits_and_splits(self): RETURNS ======= - kofam_gene_split_contig list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering + kofam_gene_split_contig list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering """ self.progress.new('Loading data from Contigs DB') @@ -1217,7 +1217,19 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): return genome_metabolism_superdict - def estimate_for_bins_in_collection(self, kofam_hits, genes_in_splits): + def estimate_for_bins_in_collection(self, kofam_gene_split_contig): + """ + This function calls metabolism estimation for every bin the user requests. + + PARAMETERS + ========== + kofam_gene_split_contig list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering + + RETURNS + ======= + bins_metabolism_superdict dictionary mapping bin name to its metabolism completeness dictionary + """ + bins_metabolism_superdict = {} bin_name_to_split_names_dict = ccollections.GetSplitNamesInBins(self.args).get_dict() From fcc0617799e43f05cca02fac87fdad1cd862331c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 19 Mar 2020 17:17:56 -0500 Subject: [PATCH 271/400] update estimate function and bin-specific estimate function to use new kofam hit tuple structure --- anvio/kegg.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 24468b7c7b..65f98ae74e 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1240,8 +1240,7 @@ def estimate_for_bins_in_collection(self, kofam_gene_split_contig): for bin_name in bin_name_to_split_names_dict: splits_in_bin = bin_name_to_split_names_dict[bin_name] - genes_in_bin = [tpl[1] for tpl in genes_in_splits if tpl[0] in splits_in_bin] - ko_in_bin = [tpl[1] for tpl in kofam_hits if tpl[0] in genes_in_bin] + ko_in_bin = [tpl for tpl in kofam_gene_split_contig if tpl[2] in splits_in_bin] bins_metabolism_superdict[bin_name] = self.estimate_for_list_of_splits(ko_in_bin, splits=splits_in_bin, bin_name=bin_name) return bins_metabolism_superdict @@ -1252,17 +1251,17 @@ def estimate_metabolism(self): It will decide what to do based on whether the input contigs DB is a genome or metagenome. It returns the metabolism superdict which contains a metabolism completion dictionary for each genome/bin in the contigs db. - The metabolism completion dictionary is keyed by KEGG module number. + The metabolism completion dictionary is keyed by KEGG module number, with a few exceptions for summary data (ie, 'num_complete_modules'). 
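# Illustrative sketch (not from the patch series): reading results out of the superdict returned by
# estimate_metabolism(), based on the structure described in these patches (bin name -> module number
# -> completeness data, plus the special 'num_complete_modules' summary key). The bin name, module
# number, and numbers in the example call are made up.
def report_complete_modules(kegg_metabolism_superdict, threshold=0.75):
    """Print every module at or above the completeness threshold, for each bin."""
    for bin_name, bin_dict in kegg_metabolism_superdict.items():
        print(f"{bin_name}: {bin_dict.get('num_complete_modules', 0)} complete modules")
        for mnum, mod_dict in bin_dict.items():
            if mnum == "num_complete_modules":  # summary key, not a module
                continue
            if mod_dict["percent_complete"] >= threshold:
                print(f"  {mnum} is {mod_dict['percent_complete']:.0%} complete")

report_complete_modules({"Bin_1": {"num_complete_modules": 1,
                                   "M00165": {"percent_complete": 0.8, "complete": True}}})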
""" - hits_to_consider, splits_to_consider = self.init_hits_and_splits() + kofam_hits_info = self.init_hits_and_splits() kegg_metabolism_superdict = {} if self.profile_db_path and not self.metagenome_mode: - kegg_metabolism_superdict = self.estimate_for_bins_in_collection(hits_to_consider, splits_to_consider) + kegg_metabolism_superdict = self.estimate_for_bins_in_collection(kofam_hits_info) elif not self.profile_db_path and not self.metagenome_mode: - kegg_metabolism_superdict = self.estimate_for_genome(hits_to_consider, splits_to_consider) + kegg_metabolism_superdict = self.estimate_for_genome(kofam_hits_info) elif self.profile_db_path and self.metagenome_mode: raise ConfigError("This class doesn't know how to deal with that yet :/") # metagenome, with profiling From 6a3c5695869ffc2aedb6dac927265a8df5e909df Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 19 Mar 2020 17:22:47 -0500 Subject: [PATCH 272/400] update genome estimate func to use new kofam hit tuple structure --- anvio/kegg.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 65f98ae74e..b61b08acf9 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1191,7 +1191,7 @@ def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=N return metabolism_dict_for_list_of_splits - def estimate_for_genome(self, kofam_hits, genes_in_splits): + def estimate_for_genome(self, kofam_gene_split_contig): """This is the metabolism estimation function for a contigs DB that contains a single genome. Assuming this contigs DB contains only one genome, it sends all of the splits and their kofam hits to the atomic @@ -1199,8 +1199,7 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): PARAMETERS ========== - kofam_hits list of (gene_call_id, ko_num) tuples, all belong to this single genome - genes_in_splits list of (split, gene_call_id) tuples, all belong to this single genome + kofam_gene_split_contig list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering RETURNS ======= @@ -1208,11 +1207,10 @@ def estimate_for_genome(self, kofam_hits, genes_in_splits): """ genome_metabolism_superdict = {} - # get list of KOs only - since all splits belong to one genome, we can take all the hits - ko_in_genome = [tpl[1] for tpl in kofam_hits] - splits_in_genome = [tpl[0] for tpl in genes_in_splits] + # since all hits belong to one genome, we can take the split info from all the hits + splits_in_genome = unique([tpl[2] for tpl in kofam_gene_split_contig]) - genome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(ko_in_genome, splits=splits_in_genome, bin_name=self.contigs_db_project_name) + genome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(kofam_gene_split_contig, splits=splits_in_genome, bin_name=self.contigs_db_project_name) return genome_metabolism_superdict From 0c139c29622117f06cf3784db460e8bb0bd018de Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 19 Mar 2020 17:39:29 -0500 Subject: [PATCH 273/400] now all functions used in estimation can handle the kofam hit tuples --- anvio/kegg.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index b61b08acf9..cd48c070d6 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -749,7 +749,7 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N PARAMETERS ========== - kofam_hits_in_splits list of KO numbers that are 
hits in the current list of splits + kofam_hits_in_splits list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering split_list list of splits we are considering, this is only for debugging output bin_name name of the bin containing these splits, this is only for debugging output @@ -770,7 +770,7 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N bin_level_module_dict[mnum] = {"present_kos" : []} kos_not_in_modules = [] - for ko in kofam_hits_in_splits: + for ko, gene_call_id, split, contig in kofam_hits_in_splits: present_in_mods = self.kegg_modules_db.get_modules_for_knum(ko) if not present_in_mods: kos_not_in_modules.append(ko) @@ -1118,7 +1118,7 @@ def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=N PARAMETERS ========== - ko_hits_in_splits a list of KO numbers indicating the KOfam hits that have occurred in this list of splits + ko_hits_in_splits list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering splits a list of splits identifiers bin_name the name of the bin that we are working with @@ -1208,7 +1208,7 @@ def estimate_for_genome(self, kofam_gene_split_contig): genome_metabolism_superdict = {} # since all hits belong to one genome, we can take the split info from all the hits - splits_in_genome = unique([tpl[2] for tpl in kofam_gene_split_contig]) + splits_in_genome = list(set([tpl[2] for tpl in kofam_gene_split_contig])) genome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(kofam_gene_split_contig, splits=splits_in_genome, bin_name=self.contigs_db_project_name) From 40bde328ceb15daaf276fd23c30582fdbaa0d3b8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 19 Mar 2020 17:40:31 -0500 Subject: [PATCH 274/400] update comment --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index cd48c070d6..7d58f337d8 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1207,7 +1207,7 @@ def estimate_for_genome(self, kofam_gene_split_contig): """ genome_metabolism_superdict = {} - # since all hits belong to one genome, we can take the split info from all the hits + # since all hits belong to one genome, we can take the UNIQUE splits from all the hits splits_in_genome = list(set([tpl[2] for tpl in kofam_gene_split_contig])) genome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(kofam_gene_split_contig, splits=splits_in_genome, bin_name=self.contigs_db_project_name) From 1992ffc5bb64c67785995dbabb8c9e3298aa95ac Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 20 Mar 2020 09:34:34 -0500 Subject: [PATCH 275/400] add genes, contigs, new style of kofam hits to module dictionaries --- anvio/kegg.py | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 7d58f337d8..33e31a62b8 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -740,12 +740,22 @@ def init_hits_and_splits(self): return kofam_gene_split_contig def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=None, bin_name=None): - """This function generates a bin-level dictionary of dictionaries, which associates modules with the list of KOs + """This function generates a bin-level dictionary of dictionaries, which associates modules with the KOs that are present in the bin for each module. 
- The structure of the dictionary is like this: - {mnum: {present_kos: [knum1, knum2, ....]}} - Why do we need an inner dictionary with just one list? Well. This dictionary will be expanded later by other functions, not to worry. + The structure of the dictionary is like this example: + {mnum: {"gene_caller_ids" : set([132, 133, 431, 6777]) + "kofam_hits" : {'K00033' : [431, 6777], + 'K01057' : [133], + 'K00036' : [132] }, + "genes_to_contigs": {132: 0, + 133: 0, + 431: 2, + 6777: 1 }, + "contigs_to_genes": { 0: set([132, 133]), + 1: set(6777), + 2: set(431) },}} + This dictionary will be expanded later by other functions. PARAMETERS ========== @@ -764,10 +774,15 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N self.run.info("Marking KOs present for bin", bin_name) self.run.info("Number of splits", len(split_list)) - # initialize all modules with empty presence list + # initialize all modules with empty lists and dicts for kos, gene calls modules = self.kegg_modules_db.get_all_modules_as_list() for mnum in modules: - bin_level_module_dict[mnum] = {"present_kos" : []} + bin_level_module_dict[mnum] = {"present_kos" : [], # TODO: get rid of this key eventually + "gene_caller_ids" : set(), + "kofam_hits" : {}, + "genes_to_contigs" : {}, + "contigs_to_genes" : {} + } kos_not_in_modules = [] for ko, gene_call_id, split, contig in kofam_hits_in_splits: @@ -775,7 +790,17 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N if not present_in_mods: kos_not_in_modules.append(ko) for m in present_in_mods: - bin_level_module_dict[m]["present_kos"].append(ko) + bin_level_module_dict[m]["present_kos"].append(ko) # TODO: get rid of this eventually + bin_level_module_dict[m]["gene_caller_ids"].add(gene_call_id) + if ko in bin_level_module_dict[m]["kofam_hits"]: + bin_level_module_dict[m]["kofam_hits"][ko].append(gene_call_id) + else: + bin_level_module_dict[m]["kofam_hits"][ko] = [gene_call_id] + bin_level_module_dict[m]["genes_to_contigs"][gene_call_id] = contig + if contig in bin_level_module_dict[m]["contigs_to_genes"]: + bin_level_module_dict[m]["contigs_to_genes"][contig].add(gene_call_id) + else: + bin_level_module_dict[m]["contigs_to_genes"][contig] = set([gene_call_id]) if anvio.DEBUG: self.run.info("KOs processed", "%d in bin" % len(kofam_hits_in_splits)) From 299b4a60b91824165894ecc1f427ef4e86f7a15d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 20 Mar 2020 10:02:55 -0500 Subject: [PATCH 276/400] add todo for later --- anvio/kegg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 33e31a62b8..ee77a0fc0e 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -802,6 +802,7 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N else: bin_level_module_dict[m]["contigs_to_genes"][contig] = set([gene_call_id]) + # TODO: at some point I think we should save these KOs somewhere so that the user can look at them manually if anvio.DEBUG: self.run.info("KOs processed", "%d in bin" % len(kofam_hits_in_splits)) if kos_not_in_modules: From 89b9b7baff960f282e0b47e42cb18a13f5ffa4e7 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sat, 21 Mar 2020 21:36:17 -0500 Subject: [PATCH 277/400] deprecate old completeness function and start replacement. 
replacement will not be finished until some changes are made to kegg setup --- anvio/kegg.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index ee77a0fc0e..d709ec941c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -811,7 +811,8 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N return bin_level_module_dict - def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): + + def compute_module_completeness_for_bin_DEPRECATED(self, mnum, meta_dict_for_bin): """This function calculates the completeness of the specified module. This requires some parsing of the module DEFINITION fields. In these fields, we have the following: @@ -1053,6 +1054,27 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): return over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules + def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): + """This calculates the completeness of the specified module within the given bin metabolism dictionary.""" + + present_list_for_mnum = meta_dict_for_bin[mnum]["kofam_hits"].keys() + if not present_list_for_mnum: + # no KOs in this module are present + if anvio.DEBUG: + self.run.warning("No KOs present for module %s. Parsing for completeness is still being done to obtain module steps." % mnum) + + # Now I have decided that we need to have all possible paths through a module unrolled during KEGG setup. + # That way it is done once for all modules, and we can just load the path list into memory during an init. + # Then, here in this function, we can just access the path list quickly for the purposes of computing completeness. + + # these are just here to remind myself what I need to be returning later + over_complete_threshold = False + has_nonessential_step = False + has_no_ko_step = False + defined_by_modules = False + return over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules + + def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): """This function adjusts completeness of modules that are defined by other modules. 
From 7fd4477cabdf01be3a2706db77d1043090e2db7a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sat, 21 Mar 2020 21:39:28 -0500 Subject: [PATCH 278/400] copy over code for DEFINITION parsing so it can be adapted for path unrolling --- anvio/kegg.py | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index d709ec941c..39efea10bc 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1774,6 +1774,146 @@ def get_kegg_module_class_dict(self, mnum): class_value = self.get_data_value_entries_for_module_by_data_name(mnum, "CLASS")[0] return self.parse_kegg_class_value(class_value) + def unroll_module_definition(self, mnum): + """This function accesses the DEFINITION of a module, unrolls it into all possible paths, and returns the list of all paths.""" + + def_lines = self.get_data_value_entries_for_module_by_data_name(mnum, "DEFINITION") + all_paths = [] + seed_path = [] + + for d in def_lines: + d = d.strip() + # cur_index = 0 # current position in the DEFINITION line + # parens_level = 0 # how deep we are in nested parentheses + # step_is_present_condition_statement = "" + # last_step_end_index = 0 + # + # while cur_index < len(d): + # if d[cur_index] == "K": # we have found a KO + # ko = d[cur_index:cur_index+6] + # defined_by_modules = False # reset this flag just in case KO-defined step comes after a module-defined step + # if ko in present_list_for_mnum: + # step_is_present_condition_statement += "True" + # else: + # step_is_present_condition_statement += "False" + # cur_index += 6 + # + # elif d[cur_index] == "(": + # parens_level += 1 + # step_is_present_condition_statement += "(" + # cur_index += 1 + # + # elif d[cur_index] == ")": + # parens_level -= 1 + # step_is_present_condition_statement += ")" + # cur_index += 1 + # + # elif d[cur_index] == ",": + # step_is_present_condition_statement += " or " + # cur_index += 1 + # + # elif d[cur_index] == "+": + # step_is_present_condition_statement += " and " + # cur_index += 1 + # + # elif d[cur_index] == "-": + # # either a singular KO or a set of KOs in parentheses can follow this character + # # since the following KO(s) are non-essential in the complex, we skip over them to ignore them + # # unless this is its own step, in which case we consider the whole step non-essential + # + # # singular nonessential KO + # if d[cur_index+1] == "K": + # nonessential_ko = d[cur_index+1:cur_index+7] + # cur_index += 7 + # """ + # OKAY, SO HERE WE HAVE SOME POOPINESS THAT MAY NEED TO BE FIXED EVENTUALLY. + # Basically, some DEFINITION lines have KOs that seem to be marked non-essential; + # ie, "-K11024" in "K11023 -K11024 K11025 K11026 K11027". + # It was difficult to decide whether we should consider only K11024, or K11024 and all following KOs, to be non-essential. + # For instance, the module M00778 is a complex case that gave us pause - see Fiesta issue 955. + # But for now, we have decided to just track only the one KO as a 'non-essential step', and to not include such steps in + # the module completeness estimate. 
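# Illustrative sketch (not from the patch series): one way the planned 'unrolling' of a DEFINITION into
# all possible KO paths could work. This toy version only handles space-separated steps whose
# alternatives are a comma list in one level of parentheses; real definitions also contain '+', '-',
# '--', nested parentheses, and module numbers, which is why the commit above defers the full
# implementation until KEGG setup changes. The example DEFINITION string is made up.
from itertools import product

def unroll_definition(definition):
    """Enumerate alternative KO paths of a simplified DEFINITION string."""
    step_alternatives = []
    for step in definition.split():
        step = step.strip("()")                      # drop one level of parentheses
        step_alternatives.append(step.split(","))    # comma-separated alternatives within the step
    # one alternative per step, in order, is one possible path through the module
    return [list(path) for path in product(*step_alternatives)]

print(unroll_definition("(K00033,K01057) K00036"))
# [['K00033', 'K00036'], ['K01057', 'K00036']]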
+ # """ + # # if this is the first KO in the step and we find a space after this KO, then we have found a non-essential step + # if step_is_present_condition_statement == "" and (cur_index == len(d) or d[cur_index] == " "): + # has_nonessential_step = True + # module_nonessential_steps.append(d[last_step_end_index:cur_index]) + # module_num_nonessential_steps += 1 + # + # if nonessential_ko in present_list_for_mnum: + # module_complete_nonessential_steps.append(d[last_step_end_index:cur_index]) + # module_num_complete_nonessential_steps += 1 + # + # # reset for next step + # last_step_end_index = cur_index + 1 + # cur_index += 1 + # + # # a whole set of nonessential KOs + # elif d[cur_index+1] == "(": + # while d[cur_index] != ")": + # cur_index += 1 + # cur_index += 1 # skip over the ')' + # + # # the '--' (no KO) situation + # elif d[cur_index+1] == "-": + # # when '--' in a DEFINITION line happens, it signifies a reaction step that has no associated KO. + # # we assume that such steps are not complete, because we really can't know if it is from the KOfam hits alone + # has_no_ko_step = True + # step_is_present_condition_statement += "False" + # cur_index += 2 # skip over both '-', the next character should be a space or end of DEFINITION line + # + # if cur_index < len(d) and d[cur_index] != " ": + # raise ConfigError("Serious, serious parsing sadness is happening. We just processed a '--' in " + # "a DEFINITION line for module %s, but did not see a space afterwards. Instead, we found %s. " + # "WHAT DO WE DO NOW?" % (mnum, d[cur_index+1])) + # # anything else that follows a '-' + # else: + # raise ConfigError("The following character follows a '-' in the DEFINITION line for module %s " + # "and we just don't know what to do: %s" % (mnum, d[cur_index+1])) + # + # elif d[cur_index] == " ": + # # if we are outside of parentheses, we are done processing the current step + # if parens_level == 0: + # module_step_list.append(d[last_step_end_index:cur_index]) + # module_total_steps += 1 + # # we do not evaluate completeness of this step yet if it is defined by other modules + # if not defined_by_modules: + # step_is_present = eval(step_is_present_condition_statement) + # if step_is_present: + # module_complete_steps.append(d[last_step_end_index:cur_index]) + # module_num_complete_steps += 1 + # # reset for next step + # step_is_present_condition_statement = "" + # last_step_end_index = cur_index + 1 + # cur_index += 1 + # # otherwise, we are processing an alternative path so AND is required + # else: + # step_is_present_condition_statement += " and " + # cur_index += 1 + # + # elif d[cur_index] == "M": + # """ + # This happens when a module is defined by other modules. For example, photosynthesis module M00611 is defined as + # (M00161,M00163) M00165 === (photosystem II or photosystem I) and calvin cycle + # + # We need all the modules to have been evaluated before we can determine completeness of steps with module numbers. + # So what we will do here is just add the step to the appropriate lists without evaluating completeness, and use a + # flag variable to keep track of the modules that have this sort of definition in a list so we can go back and + # evaluate completeness of steps with module numbers later. + # """ + # defined_by_modules = True + # cur_index += 6 + # + # else: + # raise ConfigError("While parsing the DEFINITION field for module %s, (which is %s), anvi'o found the following character " + # "that she didn't understand: %s. 
Unfortunately, this means we cannot determine the module " + # "completeness. For context, here is the current index in the DEFINITION line: %s and the " + # "surrounding characters: %s" % (mnum, d, d[cur_index], cur_index, d[cur_index-5:cur_index+6])) + + + + return all_paths + class KeggModulesTable: """This class defines operations for creating the KEGG Modules table in Modules.db""" From a2d052610a640f9fd05a1b7ea5e6ebdb49952173 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sat, 21 Mar 2020 22:06:31 -0500 Subject: [PATCH 279/400] add vars for kegg pathway download --- anvio/kegg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 2789f5a8c9..a7a8d421e1 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -46,6 +46,7 @@ def __init__(self, args): self.kegg_data_dir = A('kegg_data_dir') or os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') self.orphan_data_dir = os.path.join(self.kegg_data_dir, "orphan_data") self.module_data_dir = os.path.join(self.kegg_data_dir, "modules") + self.pathway_data_dir = os.path.join(self.kegg_data_dir, "pathways") self.quiet = A('quiet') or False self.just_do_it = A('just_do_it') @@ -53,6 +54,7 @@ def __init__(self, args): self.kofam_hmm_file_path = os.path.join(self.kegg_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms self.ko_list_file_path = os.path.join(self.kegg_data_dir, "ko_list") self.kegg_module_file = os.path.join(self.kegg_data_dir, "ko00002.keg") + self.kegg_pathway_file = os.path.join(self.kegg_data_dir, "br08901.keg") def setup_ko_dict(self): @@ -173,6 +175,7 @@ def __init__(self, args, run=run, progress=progress): # Kegg module text files self.kegg_module_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002.keg&format=htext&filedir=" + self.kegg_pathway_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=br08901.keg&format=htext&filedir=" self.kegg_rest_api_get = "http://rest.kegg.jp/get" From 85d5ccfcc41296f53bc54e57c803a79c13294763 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sat, 21 Mar 2020 22:07:05 -0500 Subject: [PATCH 280/400] funcs for processing and downloading pathways --- anvio/kegg.py | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index a7a8d421e1..2e031bc0b7 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -270,6 +270,76 @@ def process_module_file(self): made the file unparseable. Sad. :(" % (self.kegg_module_file, first_char)) self.progress.end() + + def process_pathway_file(self): + """This function reads the kegg pathway map file into a dictionary. It should be called during setup to get the KEGG pathway ids so the pathways can be downloaded. + + The structure of this file is like this: + + +C Map number + #
  KEGG Pathway Maps
+ ! + AMetabolism + B Global and overview maps + C 01100 Metabolic pathways + C 01110 Biosynthesis of secondary metabolites + C 01120 Microbial metabolism in diverse environments + C 01200 Carbon metabolism + C 01210 2-Oxocarboxylic acid metabolism + + Initial lines can be ignored and thereafter the line's information can be determined by the one-letter code at the start. + A = Category of Pathway Map + B = Sub-category of Pathway Map + C = Pathway Map identifier number and name + + We only want the Pathway files that have KOs, not any that are just maps or don't have associated KOs. We can ignore identifiers + that start with the following codes, as they belong to categories or sub-categories that won't have an ORTHOLOGY section: + 011 global map (lines linked to KOs) + 012 overview map (lines linked to KOs) + 07 drug structure map (no KO expansion) + + NOTE: this may change at some point. Global and overview maps may not have KOs, but they can be made up of MODULES. So we may eventually + want to integrate these with the Modules information at some point. + """ + + self.pathway_dict = {} + + filesnpaths.is_file_exists(self.kegg_pathway_file) + filesnpaths.is_file_plain_text(self.kegg_pathway_file) + + f = open(self.kegg_pathway_file, 'rU') + self.progress.new("Parsing KEGG Pathway file") + + current_category = None + current_subcategory = None + + for line in f.readlines(): + line = line.strip('\n') + first_char = line[0] + + # garbage lines + if first_char in ["+", "#", "!"]: + continue + else: + # Category + if first_char == "A": + fields = re.split('<[^>]*>', line) # we split by the html tag here + current_category = fields[1] + # Sub-category + elif first_char == "B": + fields = re.split('\s{2,}', line) # don't want to split the subcategory name, so we have to split at least 2 spaces + current_subcategory = fields[1] + elif first_char == "C": + fields = re.split('\s{2,}', line) + konum = fields[1] + self.pathway_dict[konum] = {"name" : fields[2], "category" : current_category, "subcategory" : current_subcategory} + # unknown code + else: + raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has \ + made the file unparseable. Sad. :(" % (self.kegg_pathway_file, first_char)) + self.progress.end() + + def download_modules(self): """This function downloads the KEGG modules. @@ -301,6 +371,38 @@ def download_modules(self): to be '///', but instead it was %s." % (file_path, last_line)) + def download_pathways(self): + """This function downloads the KEGG Pathways. + + To do so, it first processes a KEGG file containing pathway and map identifiers into a dictionary via the process_pathway_file() + function. To verify that each file has been downloaded properly, we check that the last line is '///'. + """ + + # note that this is the same as the REST API for modules - perhaps at some point this should be printed elsewhere so we don't repeat ourselves. 
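The verification described in the docstring just above (every KEGG flat file should end with a '///' line) amounts to a peek at the last few bytes of each downloaded file. A standalone sketch of that check; the file name below is only an example:

import os

def ends_with_kegg_terminator(file_path):
    """True if the last line of a downloaded KEGG flat file is the '///' record terminator."""
    with open(file_path, 'rb') as f:
        f.seek(0, os.SEEK_END)
        f.seek(max(f.tell() - 4, 0), os.SEEK_SET)    # '///' plus the trailing newline
        return f.read().decode(errors='ignore').strip() == '///'

# if not ends_with_kegg_terminator('ko00010'):
#     raise RuntimeError("KEGG file looks truncated; the download may have been interrupted")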
+ self.run.info("KEGG Pathway Database URL", self.kegg_rest_api_get) + + # download the kegg pathway file, which lists all modules + utils.download_file(self.kegg_pathway_download_path, self.kegg_pathway_file, progress=self.progress, run=self.run) + + # get pathway dict + self.process_pathway_file() + self.run.info("Number of KEGG Pathways", len(self.pathway_dict.keys())) + + # download all pathways + for konum in self.pathway_dict.keys(): + file_path = os.path.join(self.pathway_data_dir, konum) + utils.download_file(self.kegg_rest_api_get + '/' + konum, + file_path, progress=self.progress, run=self.run) + # verify entire file has been downloaded + f = open(file_path, 'rU') + f.seek(0, os.SEEK_END) + f.seek(f.tell() - 4, os.SEEK_SET) + last_line = f.readline().strip('\n') + if not last_line == '///': + raise ConfigError("The KEGG pathway file %s was not downloaded properly. We were expecting the last line in the file \ + to be '///', but instead it was %s." % (file_path, last_line)) + + def decompress_files(self): """This function decompresses the Kofam profiles.""" From 251544d4be6c936ba8632b72d991465c9ee57e30 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sat, 21 Mar 2020 22:15:39 -0500 Subject: [PATCH 281/400] ignore certain pathway files --- anvio/kegg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 2e031bc0b7..302999f699 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -332,7 +332,8 @@ def process_pathway_file(self): elif first_char == "C": fields = re.split('\s{2,}', line) konum = fields[1] - self.pathway_dict[konum] = {"name" : fields[2], "category" : current_category, "subcategory" : current_subcategory} + if konum[:2] != "07" and konum[:3] != "011" and konum[:3] != "012": + self.pathway_dict[konum] = {"name" : fields[2], "category" : current_category, "subcategory" : current_subcategory} # unknown code else: raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has \ From a826278c2619f0e80541285893d6cce31fad8de8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sat, 21 Mar 2020 22:25:47 -0500 Subject: [PATCH 282/400] pathway data dir and some sanity checks for pathway data --- anvio/kegg.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 302999f699..b371b60d25 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -166,6 +166,7 @@ def __init__(self, args, run=run, progress=progress): filesnpaths.gen_output_directory(self.kegg_data_dir, delete_if_exists=args.reset) filesnpaths.gen_output_directory(self.orphan_data_dir, delete_if_exists=args.reset) filesnpaths.gen_output_directory(self.module_data_dir, delete_if_exists=args.reset) + filesnpaths.gen_output_directory(self.pathway_data_dir, delete_if_exists=args.reset) # ftp path for HMM profiles and KO list # for ko list, add /ko_list.gz to end of url @@ -189,10 +190,18 @@ def is_database_exists(self): raise ConfigError("Interestingly, though KOfam HMM profiles are not installed on your system, KEGG module information seems to have been \ already downloaded in %s. Please use the --reset flag to re-download everything from scratch." % self.kegg_data_dir) + if os.path.exists(self.kegg_pathway_file): + raise ConfigError("Interestingly, though KOfam HMM profiles are not installed on your system, KEGG pathway information seems to have been \ + already downloaded in %s. Please use the --reset flag to re-download everything from scratch." 
% self.kegg_data_dir) + if os.path.exists(self.module_data_dir): raise ConfigError("It seems the KEGG module directory %s already exists on your system. This is even more strange because Kofam HMM \ profiles have not been downloaded. We suggest you to use the --reset flag to download everything from scratch." % self.module_data_dir) + if os.path.exists(self.pathway_data_dir): + raise ConfigError("It seems the KEGG pathway directory %s already exists on your system. This is even more strange because Kofam HMM \ + profiles have not been downloaded. We suggest you to use the --reset flag to download everything from scratch." % self.pathway_data_dir) + def download_profiles(self): """This function downloads the Kofam profiles.""" From ede613a216b14ab8b4be20e36bc4e0defc30d937 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sat, 21 Mar 2020 22:50:31 -0500 Subject: [PATCH 283/400] update exclusion of certain identifiers when downloading --- anvio/kegg.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index b371b60d25..1973edccc2 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -301,14 +301,11 @@ def process_pathway_file(self): B = Sub-category of Pathway Map C = Pathway Map identifier number and name - We only want the Pathway files that have KOs, not any that are just maps or don't have associated KOs. We can ignore identifiers - that start with the following codes, as they belong to categories or sub-categories that won't have an ORTHOLOGY section: - 011 global map (lines linked to KOs) - 012 overview map (lines linked to KOs) - 07 drug structure map (no KO expansion) - - NOTE: this may change at some point. Global and overview maps may not have KOs, but they can be made up of MODULES. So we may eventually - want to integrate these with the Modules information at some point. + Note that not all Pathway Maps that we download will have ORTHOLOGY fields. We don't exclude these here, but processing later + will have to be aware of the fact that not all pathways will have associated KOs. + + We do, however, exclude Pathway Maps that don't have existing `koXXXXX` identifiers (these yield 404 errors when attempting to + download them). For instance, we exclude those that start with the code 010 (chemical structure maps). """ self.pathway_dict = {} @@ -322,6 +319,7 @@ def process_pathway_file(self): current_category = None current_subcategory = None + for line in f.readlines(): line = line.strip('\n') first_char = line[0] @@ -340,8 +338,8 @@ def process_pathway_file(self): current_subcategory = fields[1] elif first_char == "C": fields = re.split('\s{2,}', line) - konum = fields[1] - if konum[:2] != "07" and konum[:3] != "011" and konum[:3] != "012": + konum = "ko" + fields[1] + if konum[:5] != "ko010": self.pathway_dict[konum] = {"name" : fields[2], "category" : current_category, "subcategory" : current_subcategory} # unknown code else: From 882e94c20a1bf23c0b8bd7303c06b71324979356 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Sat, 21 Mar 2020 22:53:32 -0500 Subject: [PATCH 284/400] exclude drug structure maps as well --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 1973edccc2..e43e664181 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -305,7 +305,7 @@ def process_pathway_file(self): will have to be aware of the fact that not all pathways will have associated KOs. 
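For reference, the 'C' line parsing that this docstring describes boils down to one regex split per line. A standalone sketch using the example lines quoted above; the exact spacing inside the real br08901.keg file is an assumption here, and the patches around this one further refine which identifiers are kept (prepending 'ko', skipping map types with no KO annotations):

import re

pathway_dict = {}
current_category = "Metabolism"                      # taken from the preceding 'A' line
current_subcategory = "Global and overview maps"     # taken from the preceding 'B' line

for line in ["C    01100  Metabolic pathways", "C    01200  Carbon metabolism"]:
    fields = re.split(r'\s{2,}', line)               # split on runs of 2+ spaces so names stay intact
    pathway_dict[fields[1]] = {"name": fields[2],
                               "category": current_category,
                               "subcategory": current_subcategory}

print(pathway_dict["01200"]["name"])                 # Carbon metabolism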
We do, however, exclude Pathway Maps that don't have existing `koXXXXX` identifiers (these yield 404 errors when attempting to - download them). For instance, we exclude those that start with the code 010 (chemical structure maps). + download them). For instance, we exclude those that start with the code 010 (chemical structure maps) or with 07 (drug structure maps). """ self.pathway_dict = {} @@ -339,7 +339,7 @@ def process_pathway_file(self): elif first_char == "C": fields = re.split('\s{2,}', line) konum = "ko" + fields[1] - if konum[:5] != "ko010": + if konum[:5] != "ko010" and konum[:4] != "ko07": self.pathway_dict[konum] = {"name" : fields[2], "category" : current_category, "subcategory" : current_subcategory} # unknown code else: From 4dfc70444bd395e4812a1b81a7b69f3e8ae4c456 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 25 Mar 2020 13:40:14 -0500 Subject: [PATCH 285/400] So here is a partial path unrolling function that I have been working on for waaay too long. It is not done or tested and is probably not working, but I feel quite bad about not committing anything for so long so I am just going to commit to save this partial state. I kind of hate the way it is going because the logic is complicated and there are way too many if statements and edge cases that will just break everything. How can something so easily done with your human brain be so difficult to make a computer do? There must be a better way. But this is what I have so far and I will run with it for now. That being said, I am now going to go on an actual run and stop thinking about DEFINITION lines for a bit. See you. --- anvio/kegg.py | 249 ++++++++++++++++++++++++++++---------------------- 1 file changed, 139 insertions(+), 110 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 39efea10bc..939bf58b2f 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -8,6 +8,7 @@ import requests import glob import re +import copy import anvio import anvio.db as db @@ -1775,121 +1776,149 @@ def get_kegg_module_class_dict(self, mnum): return self.parse_kegg_class_value(class_value) def unroll_module_definition(self, mnum): - """This function accesses the DEFINITION of a module, unrolls it into all possible paths, and returns the list of all paths.""" + """This function accesses the DEFINITION of a module, unrolls it into all possible paths, and returns the list of all paths. + + Here is how we handle some specific caveats of module definitions: + 1) Protein complexes are designated by a series of KOs separated by '+' (essential component) or '-' (non-essential component). + We keep the entire complex together (ie, as KXXXXX+KYYYYY-KZZZZZ) in each path. + 2) Some modules have non-essential steps, marked by a leading '-'. These steps are also placed into each path with the leading '-'. + 3) Some modules have steps without associated KOs, '--'. These steps are also placed into each path. + + Note that the above means that downstream processing of the paths through a module will require additional parsing. 
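To spell out what "all possible paths" is supposed to produce before the implementation attempt that follows, take the photosynthesis module mentioned in the parser comments, M00611, defined as (M00161,M00163) M00165. The expected outcome would be one path per way of satisfying the definition, something like:

# definition: (M00161,M00163) M00165
# unrolled:   [['M00161', 'M00165'],
#              ['M00163', 'M00165']]
# a protein complex stays together as a single string inside a path
# (e.g. 'KXXXXX+KYYYYY-KZZZZZ'), per the caveats listed above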
+ """ def_lines = self.get_data_value_entries_for_module_by_data_name(mnum, "DEFINITION") - all_paths = [] - seed_path = [] + all_paths = [[]] # a list of lists, where each inner list is a path + #seed_path = [] + num_paths = 1 for d in def_lines: d = d.strip() - # cur_index = 0 # current position in the DEFINITION line - # parens_level = 0 # how deep we are in nested parentheses - # step_is_present_condition_statement = "" - # last_step_end_index = 0 - # - # while cur_index < len(d): - # if d[cur_index] == "K": # we have found a KO - # ko = d[cur_index:cur_index+6] - # defined_by_modules = False # reset this flag just in case KO-defined step comes after a module-defined step - # if ko in present_list_for_mnum: - # step_is_present_condition_statement += "True" - # else: - # step_is_present_condition_statement += "False" - # cur_index += 6 - # - # elif d[cur_index] == "(": - # parens_level += 1 - # step_is_present_condition_statement += "(" - # cur_index += 1 - # - # elif d[cur_index] == ")": - # parens_level -= 1 - # step_is_present_condition_statement += ")" - # cur_index += 1 - # - # elif d[cur_index] == ",": - # step_is_present_condition_statement += " or " - # cur_index += 1 - # - # elif d[cur_index] == "+": - # step_is_present_condition_statement += " and " - # cur_index += 1 - # - # elif d[cur_index] == "-": - # # either a singular KO or a set of KOs in parentheses can follow this character - # # since the following KO(s) are non-essential in the complex, we skip over them to ignore them - # # unless this is its own step, in which case we consider the whole step non-essential - # - # # singular nonessential KO - # if d[cur_index+1] == "K": - # nonessential_ko = d[cur_index+1:cur_index+7] - # cur_index += 7 - # """ - # OKAY, SO HERE WE HAVE SOME POOPINESS THAT MAY NEED TO BE FIXED EVENTUALLY. - # Basically, some DEFINITION lines have KOs that seem to be marked non-essential; - # ie, "-K11024" in "K11023 -K11024 K11025 K11026 K11027". - # It was difficult to decide whether we should consider only K11024, or K11024 and all following KOs, to be non-essential. - # For instance, the module M00778 is a complex case that gave us pause - see Fiesta issue 955. - # But for now, we have decided to just track only the one KO as a 'non-essential step', and to not include such steps in - # the module completeness estimate. - # """ - # # if this is the first KO in the step and we find a space after this KO, then we have found a non-essential step - # if step_is_present_condition_statement == "" and (cur_index == len(d) or d[cur_index] == " "): - # has_nonessential_step = True - # module_nonessential_steps.append(d[last_step_end_index:cur_index]) - # module_num_nonessential_steps += 1 - # - # if nonessential_ko in present_list_for_mnum: - # module_complete_nonessential_steps.append(d[last_step_end_index:cur_index]) - # module_num_complete_nonessential_steps += 1 - # - # # reset for next step - # last_step_end_index = cur_index + 1 - # cur_index += 1 - # - # # a whole set of nonessential KOs - # elif d[cur_index+1] == "(": - # while d[cur_index] != ")": - # cur_index += 1 - # cur_index += 1 # skip over the ')' - # - # # the '--' (no KO) situation - # elif d[cur_index+1] == "-": - # # when '--' in a DEFINITION line happens, it signifies a reaction step that has no associated KO. 
- # # we assume that such steps are not complete, because we really can't know if it is from the KOfam hits alone - # has_no_ko_step = True - # step_is_present_condition_statement += "False" - # cur_index += 2 # skip over both '-', the next character should be a space or end of DEFINITION line - # - # if cur_index < len(d) and d[cur_index] != " ": - # raise ConfigError("Serious, serious parsing sadness is happening. We just processed a '--' in " - # "a DEFINITION line for module %s, but did not see a space afterwards. Instead, we found %s. " - # "WHAT DO WE DO NOW?" % (mnum, d[cur_index+1])) - # # anything else that follows a '-' - # else: - # raise ConfigError("The following character follows a '-' in the DEFINITION line for module %s " - # "and we just don't know what to do: %s" % (mnum, d[cur_index+1])) - # - # elif d[cur_index] == " ": - # # if we are outside of parentheses, we are done processing the current step - # if parens_level == 0: - # module_step_list.append(d[last_step_end_index:cur_index]) - # module_total_steps += 1 - # # we do not evaluate completeness of this step yet if it is defined by other modules - # if not defined_by_modules: - # step_is_present = eval(step_is_present_condition_statement) - # if step_is_present: - # module_complete_steps.append(d[last_step_end_index:cur_index]) - # module_num_complete_steps += 1 - # # reset for next step - # step_is_present_condition_statement = "" - # last_step_end_index = cur_index + 1 - # cur_index += 1 - # # otherwise, we are processing an alternative path so AND is required - # else: - # step_is_present_condition_statement += " and " - # cur_index += 1 + cur_index = 0 # current position in the DEFINITION line + parens_level = 0 # how deep we are in nested parentheses + ko_in_step = {} # dictionary keyed by parens_level, containing a list of KOs in compound step at that level + + + while cur_index < len(d): + if d[cur_index] == "K": # we have found a KO + ko = d[cur_index:cur_index+6] + cur_index += 6 + defined_by_modules = False # reset this flag just in case KO-defined step comes after a module-defined step + + # our heuristic for + and - is to keep the complex together as one "mega-KO" in the path. + # downstream completeness estimation can parse it (I'm sorry future me T.T ) + # so here we check if this is a complex + while d[cur_index] == "+" or d[cur_index] == "-": + # adding an entire set of alternative KOs into the complex + # Ex. K00239+K00240+K00241-(K00242,K18859,K18860) + if d[cur_index + 1] == "(": + ko += d[cur_index] # add the + or - to the end of the ko complex string + cur_index += 2 # skip over + or - and ( + parens_level += 1 + ko_in_step[parens_level] = [] + while d[cur_index] != ")": + if d[cur_index] == "K": + nonessential_alternative_ko = d[cur_index:cur_index+6] + cur_index += 6 + ko_copy_with_alternative = ko + nonessential_alternative_ko + ko_in_step[parens_level].append(ko_copy_with_alternative) + elif d[cur_index] == ",": + cur_index += 1 + else: + raise ConfigError("Another niche parsing error. We were parsing a non-essential complex in definition " + "line %s, and we found a character we couldn't handle there: %s" % (d, d[cur_index])) + # now we've found all alternatives at this level. 
Below, we will add the alternatives + # if this is a set of inner parentheses, we add these to alternatives at the lower parens level + cur_index += 1 # skip over the ) + + + else: # adding just one KO into the complex + ko += d[cur_index:cur_index+7] + cur_index += 7 + + # singular KO or KO-complex - add directly to end of each path + if parens_level == 0: + for i in range(num_paths): + all_paths[i].append(ko) + # parentheses encircle a compound step - kos must be collected and added to paths later + elif parens_level == 1: + ko_in_step[parens_level].append(ko) + elif parens_level == 2: + for alt in ko_in_step[parens_level]: + ko_in_step[parens_level - 1].append(alt) + parens_level -= 1 + + elif d[cur_index] == "(": + parens_level += 1 + ko_in_step[parens_level] = [] # initialize list to keep track of kos in this step + cur_index += 1 + + elif d[cur_index] == ")": + ko_at_level = ko_in_step[parens_level] + # now that we've ended a compound step, we distribute the kos to paths + for path in all_paths: + for i in range(len(ko_at_level) - 1): + new_path_copy = copy.copy(path) + new_path_copy.append(ko_at_level[i+1]) + all_paths.append(new_path_copy) + path.append(ko_at_level[0]) + cur_index += 1 + ko_in_step[parens_level] = None + parens_level -= 1 + + elif d[cur_index] == ",": + + cur_index += 1 + + + elif d[cur_index] == "+": + raise ConfigError("While unrolling a module definition, we found a rogue '+' character that was not part of a " + "KO complex. Not sure what happened here but this is the definition line: %s" % d) + + # here, outside of complexes, the '-' denotes either non-essential steps or -- steps that don't have associated KOs + elif d[cur_index] == "-": + # a nonessential KO + if d[cur_index+1] == "K": + nonessential_ko = d[cur_index+1:cur_index+7] + cur_index += 7 + + if parens_level == 0: + for i in range(num_paths): + all_paths[i].append(nonessential_ko) + else: + raise ConfigError("Found a nonessential KO that was within some parentheses. Definition line is %s." % d) + + # the '--' (no KO) situation + elif d[cur_index+1] == "-": + if parens_level == 0: + for i in range(num_paths): + all_paths[i].append("--") + else: + raise ConfigError("Found a '--' step that was within some parentheses. Definition line is %s." % d) + + + cur_index += 2 # skip over both '-', the next character should be a space or end of DEFINITION line + + if cur_index < len(d) and d[cur_index] != " ": + raise ConfigError("Serious, serious parsing sadness is happening. We just processed a '--' in " + "a DEFINITION line for module %s, but did not see a space afterwards. Instead, we found %s. " + "WHAT DO WE DO NOW?" % (mnum, d[cur_index+1])) + # anything else that follows a '-' + else: + raise ConfigError("The following character follows a '-' in the DEFINITION line for module %s " + "and we just don't know what to do: %s" % (mnum, d[cur_index+1])) + + ########################### FIXME STARTING FROM HERE + elif d[cur_index] == " ": + # if we are outside of parentheses, we are done processing the current step + if parens_level == 0: + # reset for next step + + cur_index += 1 + # otherwise, we are processing an alternative path so AND is required + else: + step_is_present_condition_statement += " and " + cur_index += 1 # # elif d[cur_index] == "M": # """ From ce69eda2dd1fc1ed15843d39ca87e350416800fc Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 25 Mar 2020 17:09:57 -0500 Subject: [PATCH 286/400] Revert "deprecate old completeness function and start replacement. 
replacement will not be finished until some changes are made to kegg setup" This reverts commit 89b9b7baff960f282e0b47e42cb18a13f5ffa4e7. --- anvio/kegg.py | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 939bf58b2f..9bacf27728 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -812,8 +812,7 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N return bin_level_module_dict - - def compute_module_completeness_for_bin_DEPRECATED(self, mnum, meta_dict_for_bin): + def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): """This function calculates the completeness of the specified module. This requires some parsing of the module DEFINITION fields. In these fields, we have the following: @@ -1055,27 +1054,6 @@ def compute_module_completeness_for_bin_DEPRECATED(self, mnum, meta_dict_for_bin return over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules - def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): - """This calculates the completeness of the specified module within the given bin metabolism dictionary.""" - - present_list_for_mnum = meta_dict_for_bin[mnum]["kofam_hits"].keys() - if not present_list_for_mnum: - # no KOs in this module are present - if anvio.DEBUG: - self.run.warning("No KOs present for module %s. Parsing for completeness is still being done to obtain module steps." % mnum) - - # Now I have decided that we need to have all possible paths through a module unrolled during KEGG setup. - # That way it is done once for all modules, and we can just load the path list into memory during an init. - # Then, here in this function, we can just access the path list quickly for the purposes of computing completeness. - - # these are just here to remind myself what I need to be returning later - over_complete_threshold = False - has_nonessential_step = False - has_no_ko_step = False - defined_by_modules = False - return over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules - - def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): """This function adjusts completeness of modules that are defined by other modules. From cfb96599b2a2cb3753664ebc97192f1a8a29a6f1 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 25 Mar 2020 17:10:56 -0500 Subject: [PATCH 287/400] Revert "So here is a partial path unrolling function that I have been working on for waaay too long. It is not done or tested and is probably not working," This reverts commit 4dfc70444bd395e4812a1b81a7b69f3e8ae4c456. --- anvio/kegg.py | 249 ++++++++++++++++++++++---------------------------- 1 file changed, 110 insertions(+), 139 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 9bacf27728..3cc99a5fe4 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -8,7 +8,6 @@ import requests import glob import re -import copy import anvio import anvio.db as db @@ -1754,149 +1753,121 @@ def get_kegg_module_class_dict(self, mnum): return self.parse_kegg_class_value(class_value) def unroll_module_definition(self, mnum): - """This function accesses the DEFINITION of a module, unrolls it into all possible paths, and returns the list of all paths. - - Here is how we handle some specific caveats of module definitions: - 1) Protein complexes are designated by a series of KOs separated by '+' (essential component) or '-' (non-essential component). - We keep the entire complex together (ie, as KXXXXX+KYYYYY-KZZZZZ) in each path. 
- 2) Some modules have non-essential steps, marked by a leading '-'. These steps are also placed into each path with the leading '-'. - 3) Some modules have steps without associated KOs, '--'. These steps are also placed into each path. - - Note that the above means that downstream processing of the paths through a module will require additional parsing. - """ + """This function accesses the DEFINITION of a module, unrolls it into all possible paths, and returns the list of all paths.""" def_lines = self.get_data_value_entries_for_module_by_data_name(mnum, "DEFINITION") - all_paths = [[]] # a list of lists, where each inner list is a path - #seed_path = [] - num_paths = 1 + all_paths = [] + seed_path = [] for d in def_lines: d = d.strip() - cur_index = 0 # current position in the DEFINITION line - parens_level = 0 # how deep we are in nested parentheses - ko_in_step = {} # dictionary keyed by parens_level, containing a list of KOs in compound step at that level - - - while cur_index < len(d): - if d[cur_index] == "K": # we have found a KO - ko = d[cur_index:cur_index+6] - cur_index += 6 - defined_by_modules = False # reset this flag just in case KO-defined step comes after a module-defined step - - # our heuristic for + and - is to keep the complex together as one "mega-KO" in the path. - # downstream completeness estimation can parse it (I'm sorry future me T.T ) - # so here we check if this is a complex - while d[cur_index] == "+" or d[cur_index] == "-": - # adding an entire set of alternative KOs into the complex - # Ex. K00239+K00240+K00241-(K00242,K18859,K18860) - if d[cur_index + 1] == "(": - ko += d[cur_index] # add the + or - to the end of the ko complex string - cur_index += 2 # skip over + or - and ( - parens_level += 1 - ko_in_step[parens_level] = [] - while d[cur_index] != ")": - if d[cur_index] == "K": - nonessential_alternative_ko = d[cur_index:cur_index+6] - cur_index += 6 - ko_copy_with_alternative = ko + nonessential_alternative_ko - ko_in_step[parens_level].append(ko_copy_with_alternative) - elif d[cur_index] == ",": - cur_index += 1 - else: - raise ConfigError("Another niche parsing error. We were parsing a non-essential complex in definition " - "line %s, and we found a character we couldn't handle there: %s" % (d, d[cur_index])) - # now we've found all alternatives at this level. 
Below, we will add the alternatives - # if this is a set of inner parentheses, we add these to alternatives at the lower parens level - cur_index += 1 # skip over the ) - - - else: # adding just one KO into the complex - ko += d[cur_index:cur_index+7] - cur_index += 7 - - # singular KO or KO-complex - add directly to end of each path - if parens_level == 0: - for i in range(num_paths): - all_paths[i].append(ko) - # parentheses encircle a compound step - kos must be collected and added to paths later - elif parens_level == 1: - ko_in_step[parens_level].append(ko) - elif parens_level == 2: - for alt in ko_in_step[parens_level]: - ko_in_step[parens_level - 1].append(alt) - parens_level -= 1 - - elif d[cur_index] == "(": - parens_level += 1 - ko_in_step[parens_level] = [] # initialize list to keep track of kos in this step - cur_index += 1 - - elif d[cur_index] == ")": - ko_at_level = ko_in_step[parens_level] - # now that we've ended a compound step, we distribute the kos to paths - for path in all_paths: - for i in range(len(ko_at_level) - 1): - new_path_copy = copy.copy(path) - new_path_copy.append(ko_at_level[i+1]) - all_paths.append(new_path_copy) - path.append(ko_at_level[0]) - cur_index += 1 - ko_in_step[parens_level] = None - parens_level -= 1 - - elif d[cur_index] == ",": - - cur_index += 1 - - - elif d[cur_index] == "+": - raise ConfigError("While unrolling a module definition, we found a rogue '+' character that was not part of a " - "KO complex. Not sure what happened here but this is the definition line: %s" % d) - - # here, outside of complexes, the '-' denotes either non-essential steps or -- steps that don't have associated KOs - elif d[cur_index] == "-": - # a nonessential KO - if d[cur_index+1] == "K": - nonessential_ko = d[cur_index+1:cur_index+7] - cur_index += 7 - - if parens_level == 0: - for i in range(num_paths): - all_paths[i].append(nonessential_ko) - else: - raise ConfigError("Found a nonessential KO that was within some parentheses. Definition line is %s." % d) - - # the '--' (no KO) situation - elif d[cur_index+1] == "-": - if parens_level == 0: - for i in range(num_paths): - all_paths[i].append("--") - else: - raise ConfigError("Found a '--' step that was within some parentheses. Definition line is %s." % d) - - - cur_index += 2 # skip over both '-', the next character should be a space or end of DEFINITION line - - if cur_index < len(d) and d[cur_index] != " ": - raise ConfigError("Serious, serious parsing sadness is happening. We just processed a '--' in " - "a DEFINITION line for module %s, but did not see a space afterwards. Instead, we found %s. " - "WHAT DO WE DO NOW?" 
% (mnum, d[cur_index+1])) - # anything else that follows a '-' - else: - raise ConfigError("The following character follows a '-' in the DEFINITION line for module %s " - "and we just don't know what to do: %s" % (mnum, d[cur_index+1])) - - ########################### FIXME STARTING FROM HERE - elif d[cur_index] == " ": - # if we are outside of parentheses, we are done processing the current step - if parens_level == 0: - # reset for next step - - cur_index += 1 - # otherwise, we are processing an alternative path so AND is required - else: - step_is_present_condition_statement += " and " - cur_index += 1 + # cur_index = 0 # current position in the DEFINITION line + # parens_level = 0 # how deep we are in nested parentheses + # step_is_present_condition_statement = "" + # last_step_end_index = 0 + # + # while cur_index < len(d): + # if d[cur_index] == "K": # we have found a KO + # ko = d[cur_index:cur_index+6] + # defined_by_modules = False # reset this flag just in case KO-defined step comes after a module-defined step + # if ko in present_list_for_mnum: + # step_is_present_condition_statement += "True" + # else: + # step_is_present_condition_statement += "False" + # cur_index += 6 + # + # elif d[cur_index] == "(": + # parens_level += 1 + # step_is_present_condition_statement += "(" + # cur_index += 1 + # + # elif d[cur_index] == ")": + # parens_level -= 1 + # step_is_present_condition_statement += ")" + # cur_index += 1 + # + # elif d[cur_index] == ",": + # step_is_present_condition_statement += " or " + # cur_index += 1 + # + # elif d[cur_index] == "+": + # step_is_present_condition_statement += " and " + # cur_index += 1 + # + # elif d[cur_index] == "-": + # # either a singular KO or a set of KOs in parentheses can follow this character + # # since the following KO(s) are non-essential in the complex, we skip over them to ignore them + # # unless this is its own step, in which case we consider the whole step non-essential + # + # # singular nonessential KO + # if d[cur_index+1] == "K": + # nonessential_ko = d[cur_index+1:cur_index+7] + # cur_index += 7 + # """ + # OKAY, SO HERE WE HAVE SOME POOPINESS THAT MAY NEED TO BE FIXED EVENTUALLY. + # Basically, some DEFINITION lines have KOs that seem to be marked non-essential; + # ie, "-K11024" in "K11023 -K11024 K11025 K11026 K11027". + # It was difficult to decide whether we should consider only K11024, or K11024 and all following KOs, to be non-essential. + # For instance, the module M00778 is a complex case that gave us pause - see Fiesta issue 955. + # But for now, we have decided to just track only the one KO as a 'non-essential step', and to not include such steps in + # the module completeness estimate. 
+ # """ + # # if this is the first KO in the step and we find a space after this KO, then we have found a non-essential step + # if step_is_present_condition_statement == "" and (cur_index == len(d) or d[cur_index] == " "): + # has_nonessential_step = True + # module_nonessential_steps.append(d[last_step_end_index:cur_index]) + # module_num_nonessential_steps += 1 + # + # if nonessential_ko in present_list_for_mnum: + # module_complete_nonessential_steps.append(d[last_step_end_index:cur_index]) + # module_num_complete_nonessential_steps += 1 + # + # # reset for next step + # last_step_end_index = cur_index + 1 + # cur_index += 1 + # + # # a whole set of nonessential KOs + # elif d[cur_index+1] == "(": + # while d[cur_index] != ")": + # cur_index += 1 + # cur_index += 1 # skip over the ')' + # + # # the '--' (no KO) situation + # elif d[cur_index+1] == "-": + # # when '--' in a DEFINITION line happens, it signifies a reaction step that has no associated KO. + # # we assume that such steps are not complete, because we really can't know if it is from the KOfam hits alone + # has_no_ko_step = True + # step_is_present_condition_statement += "False" + # cur_index += 2 # skip over both '-', the next character should be a space or end of DEFINITION line + # + # if cur_index < len(d) and d[cur_index] != " ": + # raise ConfigError("Serious, serious parsing sadness is happening. We just processed a '--' in " + # "a DEFINITION line for module %s, but did not see a space afterwards. Instead, we found %s. " + # "WHAT DO WE DO NOW?" % (mnum, d[cur_index+1])) + # # anything else that follows a '-' + # else: + # raise ConfigError("The following character follows a '-' in the DEFINITION line for module %s " + # "and we just don't know what to do: %s" % (mnum, d[cur_index+1])) + # + # elif d[cur_index] == " ": + # # if we are outside of parentheses, we are done processing the current step + # if parens_level == 0: + # module_step_list.append(d[last_step_end_index:cur_index]) + # module_total_steps += 1 + # # we do not evaluate completeness of this step yet if it is defined by other modules + # if not defined_by_modules: + # step_is_present = eval(step_is_present_condition_statement) + # if step_is_present: + # module_complete_steps.append(d[last_step_end_index:cur_index]) + # module_num_complete_steps += 1 + # # reset for next step + # step_is_present_condition_statement = "" + # last_step_end_index = cur_index + 1 + # cur_index += 1 + # # otherwise, we are processing an alternative path so AND is required + # else: + # step_is_present_condition_statement += " and " + # cur_index += 1 # # elif d[cur_index] == "M": # """ From 5ea36ad8f7c316aa5b17c920e6845296a6b41ab1 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 25 Mar 2020 17:12:53 -0500 Subject: [PATCH 288/400] Revert "Revert "deprecate old completeness function and start replacement. replacement will not be finished until some changes are made to kegg setup"" This reverts commit ce69eda2dd1fc1ed15843d39ca87e350416800fc. 
--- anvio/kegg.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 3cc99a5fe4..39efea10bc 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -811,7 +811,8 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N return bin_level_module_dict - def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): + + def compute_module_completeness_for_bin_DEPRECATED(self, mnum, meta_dict_for_bin): """This function calculates the completeness of the specified module. This requires some parsing of the module DEFINITION fields. In these fields, we have the following: @@ -1053,6 +1054,27 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): return over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules + def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): + """This calculates the completeness of the specified module within the given bin metabolism dictionary.""" + + present_list_for_mnum = meta_dict_for_bin[mnum]["kofam_hits"].keys() + if not present_list_for_mnum: + # no KOs in this module are present + if anvio.DEBUG: + self.run.warning("No KOs present for module %s. Parsing for completeness is still being done to obtain module steps." % mnum) + + # Now I have decided that we need to have all possible paths through a module unrolled during KEGG setup. + # That way it is done once for all modules, and we can just load the path list into memory during an init. + # Then, here in this function, we can just access the path list quickly for the purposes of computing completeness. + + # these are just here to remind myself what I need to be returning later + over_complete_threshold = False + has_nonessential_step = False + has_no_ko_step = False + defined_by_modules = False + return over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules + + def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): """This function adjusts completeness of modules that are defined by other modules. From 5d5e369df2b3a3f86c76c687ca0e02fb17437d27 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 25 Mar 2020 17:13:26 -0500 Subject: [PATCH 289/400] Revert "copy over code for DEFINITION parsing so it can be adapted for path unrolling" This reverts commit 7fd4477cabdf01be3a2706db77d1043090e2db7a. 
--- anvio/kegg.py | 140 -------------------------------------------------- 1 file changed, 140 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 39efea10bc..d709ec941c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1774,146 +1774,6 @@ def get_kegg_module_class_dict(self, mnum): class_value = self.get_data_value_entries_for_module_by_data_name(mnum, "CLASS")[0] return self.parse_kegg_class_value(class_value) - def unroll_module_definition(self, mnum): - """This function accesses the DEFINITION of a module, unrolls it into all possible paths, and returns the list of all paths.""" - - def_lines = self.get_data_value_entries_for_module_by_data_name(mnum, "DEFINITION") - all_paths = [] - seed_path = [] - - for d in def_lines: - d = d.strip() - # cur_index = 0 # current position in the DEFINITION line - # parens_level = 0 # how deep we are in nested parentheses - # step_is_present_condition_statement = "" - # last_step_end_index = 0 - # - # while cur_index < len(d): - # if d[cur_index] == "K": # we have found a KO - # ko = d[cur_index:cur_index+6] - # defined_by_modules = False # reset this flag just in case KO-defined step comes after a module-defined step - # if ko in present_list_for_mnum: - # step_is_present_condition_statement += "True" - # else: - # step_is_present_condition_statement += "False" - # cur_index += 6 - # - # elif d[cur_index] == "(": - # parens_level += 1 - # step_is_present_condition_statement += "(" - # cur_index += 1 - # - # elif d[cur_index] == ")": - # parens_level -= 1 - # step_is_present_condition_statement += ")" - # cur_index += 1 - # - # elif d[cur_index] == ",": - # step_is_present_condition_statement += " or " - # cur_index += 1 - # - # elif d[cur_index] == "+": - # step_is_present_condition_statement += " and " - # cur_index += 1 - # - # elif d[cur_index] == "-": - # # either a singular KO or a set of KOs in parentheses can follow this character - # # since the following KO(s) are non-essential in the complex, we skip over them to ignore them - # # unless this is its own step, in which case we consider the whole step non-essential - # - # # singular nonessential KO - # if d[cur_index+1] == "K": - # nonessential_ko = d[cur_index+1:cur_index+7] - # cur_index += 7 - # """ - # OKAY, SO HERE WE HAVE SOME POOPINESS THAT MAY NEED TO BE FIXED EVENTUALLY. - # Basically, some DEFINITION lines have KOs that seem to be marked non-essential; - # ie, "-K11024" in "K11023 -K11024 K11025 K11026 K11027". - # It was difficult to decide whether we should consider only K11024, or K11024 and all following KOs, to be non-essential. - # For instance, the module M00778 is a complex case that gave us pause - see Fiesta issue 955. - # But for now, we have decided to just track only the one KO as a 'non-essential step', and to not include such steps in - # the module completeness estimate. 
- # """ - # # if this is the first KO in the step and we find a space after this KO, then we have found a non-essential step - # if step_is_present_condition_statement == "" and (cur_index == len(d) or d[cur_index] == " "): - # has_nonessential_step = True - # module_nonessential_steps.append(d[last_step_end_index:cur_index]) - # module_num_nonessential_steps += 1 - # - # if nonessential_ko in present_list_for_mnum: - # module_complete_nonessential_steps.append(d[last_step_end_index:cur_index]) - # module_num_complete_nonessential_steps += 1 - # - # # reset for next step - # last_step_end_index = cur_index + 1 - # cur_index += 1 - # - # # a whole set of nonessential KOs - # elif d[cur_index+1] == "(": - # while d[cur_index] != ")": - # cur_index += 1 - # cur_index += 1 # skip over the ')' - # - # # the '--' (no KO) situation - # elif d[cur_index+1] == "-": - # # when '--' in a DEFINITION line happens, it signifies a reaction step that has no associated KO. - # # we assume that such steps are not complete, because we really can't know if it is from the KOfam hits alone - # has_no_ko_step = True - # step_is_present_condition_statement += "False" - # cur_index += 2 # skip over both '-', the next character should be a space or end of DEFINITION line - # - # if cur_index < len(d) and d[cur_index] != " ": - # raise ConfigError("Serious, serious parsing sadness is happening. We just processed a '--' in " - # "a DEFINITION line for module %s, but did not see a space afterwards. Instead, we found %s. " - # "WHAT DO WE DO NOW?" % (mnum, d[cur_index+1])) - # # anything else that follows a '-' - # else: - # raise ConfigError("The following character follows a '-' in the DEFINITION line for module %s " - # "and we just don't know what to do: %s" % (mnum, d[cur_index+1])) - # - # elif d[cur_index] == " ": - # # if we are outside of parentheses, we are done processing the current step - # if parens_level == 0: - # module_step_list.append(d[last_step_end_index:cur_index]) - # module_total_steps += 1 - # # we do not evaluate completeness of this step yet if it is defined by other modules - # if not defined_by_modules: - # step_is_present = eval(step_is_present_condition_statement) - # if step_is_present: - # module_complete_steps.append(d[last_step_end_index:cur_index]) - # module_num_complete_steps += 1 - # # reset for next step - # step_is_present_condition_statement = "" - # last_step_end_index = cur_index + 1 - # cur_index += 1 - # # otherwise, we are processing an alternative path so AND is required - # else: - # step_is_present_condition_statement += " and " - # cur_index += 1 - # - # elif d[cur_index] == "M": - # """ - # This happens when a module is defined by other modules. For example, photosynthesis module M00611 is defined as - # (M00161,M00163) M00165 === (photosystem II or photosystem I) and calvin cycle - # - # We need all the modules to have been evaluated before we can determine completeness of steps with module numbers. - # So what we will do here is just add the step to the appropriate lists without evaluating completeness, and use a - # flag variable to keep track of the modules that have this sort of definition in a list so we can go back and - # evaluate completeness of steps with module numbers later. - # """ - # defined_by_modules = True - # cur_index += 6 - # - # else: - # raise ConfigError("While parsing the DEFINITION field for module %s, (which is %s), anvi'o found the following character " - # "that she didn't understand: %s. 
Unfortunately, this means we cannot determine the module " - # "completeness. For context, here is the current index in the DEFINITION line: %s and the " - # "surrounding characters: %s" % (mnum, d, d[cur_index], cur_index, d[cur_index-5:cur_index+6])) - - - - return all_paths - class KeggModulesTable: """This class defines operations for creating the KEGG Modules table in Modules.db""" From 95014645a591c66422e22c402029128d6a7a6163 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 25 Mar 2020 17:23:36 -0500 Subject: [PATCH 290/400] starting over --- anvio/kegg.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index d709ec941c..fac0558a85 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1775,6 +1775,26 @@ def get_kegg_module_class_dict(self, mnum): return self.parse_kegg_class_value(class_value) + def unroll_module_definition(self, mnum): + """This function accesses the DEFINITION line of a KEGG Module, unrolls it into all possible paths through the module, and returns the list of all paths.""" + + all_paths = [] + # call recursive function here + + return all_paths + + def recursive_definition_unroller(self, step): + """This function recursively splits a module step into its components.""" + + # base case: step is a KO or module number, just return it + + # otherwise, if there are spaces, split by space (not in parens) and recurse on each + # otherwise, if there are commas, split by comma (not in parens) and recurse on each but put the result in copies of current path + # complexes? Keep them together? + # -- or -K0000s? Keep them as is? + # T.T + + class KeggModulesTable: """This class defines operations for creating the KEGG Modules table in Modules.db""" From b4ef56dc5aa891e859b8054dd64c2137930ee05e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 25 Mar 2020 18:15:39 -0500 Subject: [PATCH 291/400] more ideas after brainstorming --- anvio/kegg.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index fac0558a85..d17b182d64 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1784,15 +1784,30 @@ def unroll_module_definition(self, mnum): return all_paths def recursive_definition_unroller(self, step): - """This function recursively splits a module step into its components.""" - - # base case: step is a KO or module number, just return it - - # otherwise, if there are spaces, split by space (not in parens) and recurse on each - # otherwise, if there are commas, split by comma (not in parens) and recurse on each but put the result in copies of current path - # complexes? Keep them together? - # -- or -K0000s? Keep them as is? - # T.T + """This function recursively splits a module definition into its components.""" + + # first, split definition into steps by spaces outside parentheses + # establish a list to save the path + # for each step, + # base case: step is a ko, mnum, etc, so we extend the list with it + # parentheses case: step has alternative paths, so we call the split path function which will return the alternatives + # for each alternative, make a new copy of the path and extend() with the alternative + # complex case: could have alternatives so call a function to return those? 
+ + # split_path function: takes a step as input + # first, get rid of surrounding parentheses + # second, split by comma into substeps + # for each substep: + # make a copy of the path so far to append to + # call recursive_definition_unroller to extend() to the path from this step + # return all path lists + + # base case: step is a KO, module number, --, or -K0000, just return it + if (len(step) == 6 and step[0] == "K") or (len(step) == 6 and step[0] == "M") or (step == "--") or (len(step) == 7 and step[0] == "-"): + return step + + + # anyway we need to extend() each path list with the recursive return value class KeggModulesTable: From 1d7dfca066504063db6321157b1e1b336534dfce Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 25 Mar 2020 22:53:35 -0500 Subject: [PATCH 292/400] finally an unrolling strategy that works! --- anvio/kegg.py | 90 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 17 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index d17b182d64..cc0f07b7fb 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -8,6 +8,7 @@ import requests import glob import re +import copy import anvio import anvio.db as db @@ -1778,36 +1779,91 @@ def get_kegg_module_class_dict(self, mnum): def unroll_module_definition(self, mnum): """This function accesses the DEFINITION line of a KEGG Module, unrolls it into all possible paths through the module, and returns the list of all paths.""" - all_paths = [] - # call recursive function here + all_paths = [[]] + def_lines = self.get_data_value_entries_for_module_by_data_name(mnum, "DEFINITION") + for d in def_lines: + def_line_paths = self.recursive_definition_unroller(d) + new_paths_list = [] + # for each path we got back, make a new copy of each path so far and extend() + for a in def_line_paths: + for p in all_paths: + p_copy = copy.copy(p) + p_copy.extend(a) + new_paths_list.append(p_copy) + all_paths = new_paths_list return all_paths + def split_by_delim_not_within_parens(self, d, delim): + """Takes a string, and splits it on the given delimter as long as the delimeter is not within parentheses. 
Returns the list of strings.""" + parens_level = 0 + last_split_index = 0 + splits = [] + for i in range(len(d)): + # only split if not within parentheses + if d[i] == delim and parens_level == 0: + splits.append(d[last_split_index:i]) + last_split_index = i + 1 # we add 1 here to skip the space + elif d[i] == "(": + parens_level += 1 + elif d[i] == ")": + parens_level -= 1 + splits.append(d[last_split_index:len(d)]) + return splits + def recursive_definition_unroller(self, step): """This function recursively splits a module definition into its components.""" # first, split definition into steps by spaces outside parentheses - # establish a list to save the path - # for each step, - # base case: step is a ko, mnum, etc, so we extend the list with it + split_steps = self.split_by_delim_not_within_parens(step, " ") + # establish a list to save all paths in, with an initial empty list to extend from + paths_list = [[]] + for s in split_steps: + # base case: step is a ko, mnum, non-essential step, etc, so we extend each list with it + if (len(s) == 6 and s[0] == "K") or (len(s) == 6 and s[0] == "M") or (s == "--") or (len(s) == 7 and s[0] == "-"): + for p in paths_list: + p.extend([s]) # parentheses case: step has alternative paths, so we call the split path function which will return the alternatives - # for each alternative, make a new copy of the path and extend() with the alternative + elif s[0] == "(": + alts = self.split_path(s) + new_paths_list = [] + # for each alternative, make a new copy of each path and extend() with the alternative + for a in alts: + for p in paths_list: + p_copy = copy.copy(p) + p_copy.extend(a) + new_paths_list.append(p_copy) + paths_list = new_paths_list + # complex case: could have alternatives so call a function to return those? + else: + print("don't know what to do with this step of length %d: %s" % (len(s),s)) - # split_path function: takes a step as input - # first, get rid of surrounding parentheses - # second, split by comma into substeps - # for each substep: - # make a copy of the path so far to append to - # call recursive_definition_unroller to extend() to the path from this step - # return all path lists + # return list of list where each list is a path + return paths_list - # base case: step is a KO, module number, --, or -K0000, just return it - if (len(step) == 6 and step[0] == "K") or (len(step) == 6 and step[0] == "M") or (step == "--") or (len(step) == 7 and step[0] == "-"): - return step + def split_path(self, step): + """This function handles steps that should be split into multiple alternative paths. + It first splits the input step into substeps, and then since each substep could be its own mini-definition, + we recursively call the definition unrolling function to parse it. 
+ """ + # first, get rid of surrounding parentheses + step = step[1:-1] + # second, split by comma into substeps (not commas in parentheses) + substeps = self.split_by_delim_not_within_parens(step, ",") + # make a final list for returning + alt_path_list = [] + for s in substeps: + # call recursive_definition_unroller to extend() to the path from this step + alt_paths_from_substep = self.recursive_definition_unroller(s) + # this will pass back a list of lists where each list is an alternative path + # for each alternative, make copy of path and extend with alternative + for a in alt_paths_from_substep: + # stick all paths from this substep into final list for returning + alt_path_list.append(a) + return alt_path_list - # anyway we need to extend() each path list with the recursive return value class KeggModulesTable: From 7e3063c0d40cf02653c3a8df6f0fbda5f1aa51e9 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 26 Mar 2020 08:55:47 -0500 Subject: [PATCH 293/400] handle protein complexes in path unrolling --- anvio/kegg.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index cc0f07b7fb..57c98f4a0d 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1836,7 +1836,27 @@ def recursive_definition_unroller(self, step): paths_list = new_paths_list # complex case: could have alternatives so call a function to return those? + elif len(s) > 6 and s[6] == "+" or s[6] == "-": + # find out location of opening parentheses, if it has one + parens_loc = s.find('(') + if parens_loc == -1: # otherwise just extend with the whole complex as one element of a list + for p in paths_list: + p.extend([s]) + else: # if so, take out the parentheses section and send to split_path to get back alternatives list + prefix = s[:parens_loc] + alts = self.split_path(s[parens_loc:]) + new_paths_list = [] + # for each alternative, make a new copy of each path and extend() with the alternative + for a in alts: + extended_complex = prefix + a[0] + for p in paths_list: + p_copy = copy.copy(p) + p_copy.extend(extended_complex) + new_paths_list.append(p_copy) + paths_list = new_paths_list + else: + ### TODO FIXME print("don't know what to do with this step of length %d: %s" % (len(s),s)) # return list of list where each list is a path @@ -1865,7 +1885,6 @@ def split_path(self, step): return alt_path_list - class KeggModulesTable: """This class defines operations for creating the KEGG Modules table in Modules.db""" From 276927df055cefd63e44b6ad366870564538c2c4 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 26 Mar 2020 08:59:23 -0500 Subject: [PATCH 294/400] fix a little bug with extending the list --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 57c98f4a0d..8c151e6927 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1851,7 +1851,7 @@ def recursive_definition_unroller(self, step): extended_complex = prefix + a[0] for p in paths_list: p_copy = copy.copy(p) - p_copy.extend(extended_complex) + p_copy.extend([extended_complex]) new_paths_list.append(p_copy) paths_list = new_paths_list From a418fad157e09696b953f8c2b0d132576fd7bfca Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 26 Mar 2020 13:42:22 -0500 Subject: [PATCH 295/400] path unrolling now works on complexes, still some bugs tho --- anvio/kegg.py | 90 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 8c151e6927..619627f076 
100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1808,6 +1808,13 @@ def split_by_delim_not_within_parens(self, d, delim): parens_level += 1 elif d[i] == ")": parens_level -= 1 + + # we should catch the case when the parentheses are unbalanced, because this means we shouldn't have removed the outer set of parens + # this happens in cases like (K00963 (K00693+K00750,K16150,K16153,K13679,K20812)),(K00975 (K00703,K13679,K20812)) + # where both sides of the comma have balanced parentheses but there is no () around the whole thing + if parens_level < 0: + #print("found unbalanced parentheses at position %d in definition %s" % (i,d)) + return False splits.append(d[last_split_index:len(d)]) return splits @@ -1816,6 +1823,7 @@ def recursive_definition_unroller(self, step): # first, split definition into steps by spaces outside parentheses split_steps = self.split_by_delim_not_within_parens(step, " ") + #print("split definition into steps: ", split_steps) # establish a list to save all paths in, with an initial empty list to extend from paths_list = [[]] for s in split_steps: @@ -1823,41 +1831,57 @@ def recursive_definition_unroller(self, step): if (len(s) == 6 and s[0] == "K") or (len(s) == 6 and s[0] == "M") or (s == "--") or (len(s) == 7 and s[0] == "-"): for p in paths_list: p.extend([s]) - # parentheses case: step has alternative paths, so we call the split path function which will return the alternatives - elif s[0] == "(": - alts = self.split_path(s) - new_paths_list = [] - # for each alternative, make a new copy of each path and extend() with the alternative - for a in alts: - for p in paths_list: - p_copy = copy.copy(p) - p_copy.extend(a) - new_paths_list.append(p_copy) - paths_list = new_paths_list - - # complex case: could have alternatives so call a function to return those? 
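# (Worked example for the complex handling being reworked in this commit, illustrative only:
#  a step such as "K00004+(K00005,K00006)-K00007" should expand into the two alternative
#  complexes "K00004+K00005-K00007" and "K00004+K00006-K00007", each kept as a single atomic
#  element of a path, with the original +/- structure of the complex preserved.)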
- elif len(s) > 6 and s[6] == "+" or s[6] == "-": - # find out location of opening parentheses, if it has one - parens_loc = s.find('(') - if parens_loc == -1: # otherwise just extend with the whole complex as one element of a list - for p in paths_list: - p.extend([s]) - else: # if so, take out the parentheses section and send to split_path to get back alternatives list - prefix = s[:parens_loc] - alts = self.split_path(s[parens_loc:]) + #print("found base case: ", s) + else: + # try splitting to see if there are commas outside parentheses + # (the only way to figure this out is to try it because regex cannot handle nested parentheses) + substeps = self.split_by_delim_not_within_parens(s[1:-1], ",") + if not substeps: # if it doesn't work, try without removing surrounding parentheses + substeps = self.split_by_delim_not_within_parens(s, ",") + + # complex case: no commas outside parentheses so we are still at an atomic definition, but its a protein complex rather than a base case step + # complex cases taken care of by this block: A+B+C ; A+(B,C)+D ; A+B+C+(D,E) ; (A,B+C)+D+E + if len(substeps) == 1: + # remove external () if present + if s[0] == '(' and s[-1] == ')': + s = s[1:-1] + # find out location of parentheses, if it has them + open_parens_loc = s.find('(') + close_parens_loc = s.find(')') + if open_parens_loc == -1: # no () so just extend with the whole complex as one element of a list + for p in paths_list: + p.extend([s]) + #print("found complex case: ", s) + else: # if so, take out the parentheses section and send to split_path to get back alternatives list + prefix = s[:open_parens_loc] + suffix = s[close_parens_loc+1:] + alts = self.split_path(s[open_parens_loc:close_parens_loc+1]) + new_paths_list = [] + # for each alternative (should just be one in each list), make a new copy of each path and extend() with the alternative + for a in alts: + if len(a) > 1: + raise ConfigError("Uh oh. We found a protein complex with more than one KO per alternative option here: %s" % s) + extended_complex = prefix + a[0] + suffix + for p in paths_list: + p_copy = copy.copy(p) + p_copy.extend([extended_complex]) + new_paths_list.append(p_copy) + paths_list = new_paths_list + #print("after processing complex parentheses case, paths_list is now: ", paths_list) + + + # alternatives case: step has alternative paths, so we call the split path function which will return the alternatives + else: + alts = self.split_path(s) new_paths_list = [] # for each alternative, make a new copy of each path and extend() with the alternative for a in alts: - extended_complex = prefix + a[0] for p in paths_list: p_copy = copy.copy(p) - p_copy.extend([extended_complex]) + p_copy.extend(a) new_paths_list.append(p_copy) paths_list = new_paths_list - - else: - ### TODO FIXME - print("don't know what to do with this step of length %d: %s" % (len(s),s)) + #print("after processing parentheses case, paths_list is now: ", paths_list) # return list of list where each list is a path return paths_list @@ -1868,10 +1892,11 @@ def split_path(self, step): It first splits the input step into substeps, and then since each substep could be its own mini-definition, we recursively call the definition unrolling function to parse it. 
""" - # first, get rid of surrounding parentheses - step = step[1:-1] - # second, split by comma into substeps (not commas in parentheses) - substeps = self.split_by_delim_not_within_parens(step, ",") + # first, try to split after getting rid of surrounding parentheses + substeps = self.split_by_delim_not_within_parens(step[1:-1], ",") + if not substeps: # if it doesn't work, try without removing surrounding parentheses + substeps = self.split_by_delim_not_within_parens(step, ",") + #print("split path %s into substeps: %s" % (step, substeps)) # make a final list for returning alt_path_list = [] for s in substeps: @@ -1882,6 +1907,7 @@ def split_path(self, step): for a in alt_paths_from_substep: # stick all paths from this substep into final list for returning alt_path_list.append(a) + #print("alt_path_list: ", alt_path_list) return alt_path_list From c11eb89cb029e3bdc7a9f9430f1e6cddbbfc9519 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 26 Mar 2020 15:45:52 -0500 Subject: [PATCH 296/400] now unrolling works for real with complexes, but not with multiple definition lines --- anvio/kegg.py | 98 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 39 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 619627f076..be816bfbee 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1794,15 +1794,18 @@ def unroll_module_definition(self, mnum): return all_paths - def split_by_delim_not_within_parens(self, d, delim): - """Takes a string, and splits it on the given delimter as long as the delimeter is not within parentheses. Returns the list of strings.""" + def split_by_delim_not_within_parens(self, d, delims, return_delims=False): + """Takes a string, and splits it on the given delimeter(s) as long as the delimeter is not within parentheses. 
Returns the list of strings.""" + #print("splitting on delimiters: ", delims) parens_level = 0 last_split_index = 0 splits = [] + delim_list = [] for i in range(len(d)): # only split if not within parentheses - if d[i] == delim and parens_level == 0: + if d[i] in delims and parens_level == 0: splits.append(d[last_split_index:i]) + delim_list.append(d[i]) last_split_index = i + 1 # we add 1 here to skip the space elif d[i] == "(": parens_level += 1 @@ -1813,9 +1816,11 @@ def split_by_delim_not_within_parens(self, d, delim): # this happens in cases like (K00963 (K00693+K00750,K16150,K16153,K13679,K20812)),(K00975 (K00703,K13679,K20812)) # where both sides of the comma have balanced parentheses but there is no () around the whole thing if parens_level < 0: - #print("found unbalanced parentheses at position %d in definition %s" % (i,d)) + #print("found unbalanced parentheses at position %d in definition %s" % (i,d)) return False splits.append(d[last_split_index:len(d)]) + if return_delims: + return splits, delim_list return splits def recursive_definition_unroller(self, step): @@ -1823,7 +1828,7 @@ def recursive_definition_unroller(self, step): # first, split definition into steps by spaces outside parentheses split_steps = self.split_by_delim_not_within_parens(step, " ") - #print("split definition into steps: ", split_steps) + #print("split definition into steps: ", split_steps) # establish a list to save all paths in, with an initial empty list to extend from paths_list = [[]] for s in split_steps: @@ -1831,43 +1836,58 @@ def recursive_definition_unroller(self, step): if (len(s) == 6 and s[0] == "K") or (len(s) == 6 and s[0] == "M") or (s == "--") or (len(s) == 7 and s[0] == "-"): for p in paths_list: p.extend([s]) - #print("found base case: ", s) + #print("found base case: ", s) else: - # try splitting to see if there are commas outside parentheses + # try splitting to see if there are commas or spaces outside parentheses # (the only way to figure this out is to try it because regex cannot handle nested parentheses) - substeps = self.split_by_delim_not_within_parens(s[1:-1], ",") - if not substeps: # if it doesn't work, try without removing surrounding parentheses - substeps = self.split_by_delim_not_within_parens(s, ",") - - # complex case: no commas outside parentheses so we are still at an atomic definition, but its a protein complex rather than a base case step + comma_substeps = self.split_by_delim_not_within_parens(s[1:-1], ",") + if not comma_substeps: # if it doesn't work, try without removing surrounding parentheses + comma_substeps = self.split_by_delim_not_within_parens(s, ",") + space_substeps = self.split_by_delim_not_within_parens(s[1:-1], " ") + if not space_substeps: # if it doesn't work, try without removing surrounding parentheses + space_substeps = self.split_by_delim_not_within_parens(s, " ") + + # complex case: no commas OR spaces outside parentheses so we are still at an atomic definition, but its a protein complex rather than a base case step # complex cases taken care of by this block: A+B+C ; A+(B,C)+D ; A+B+C+(D,E) ; (A,B+C)+D+E - if len(substeps) == 1: - # remove external () if present - if s[0] == '(' and s[-1] == ')': - s = s[1:-1] - # find out location of parentheses, if it has them - open_parens_loc = s.find('(') - close_parens_loc = s.find(')') - if open_parens_loc == -1: # no () so just extend with the whole complex as one element of a list + if len(comma_substeps) == 1 and len(space_substeps) == 1: + # split on + or - + complex_components, delimiters = 
self.split_by_delim_not_within_parens(s, ["+","-"], return_delims=True) + #print("split complex into components: ", complex_components) + # for each component we need to reconstruct the complex (or alternate possible complexes) while keeping the +/- structure the same + complex_strs = [""] + for i in range(len(complex_components)): + c = complex_components[i] + if c[0] == '(': + alts = self.split_path(c) + #print("alts in complex are: ", alts) + new_complex_strs = [] + # for each alternative (should just be one in each list), make a new copy of the complex and extend with the alternative + for a in alts: + if len(a) > 1: + raise ConfigError("Uh oh. We found a protein complex with more than one KO per alternative option here: %s" % s) + for cs in complex_strs: + extended_complex = cs + a[0] + new_complex_strs.append(extended_complex) + complex_strs = new_complex_strs + else: + for j in range(len(complex_strs)): + complex_strs[j] += c + + if i < len(delimiters): + for j in range(len(complex_strs)): + complex_strs[j] += delimiters[i] + + # add all possible complexes to end of each path + #print("all possible complexes: ", complex_strs) + new_paths_list = [] + # for each alternative, make a new copy of each path and extend() with the alternative + for cs in complex_strs: for p in paths_list: - p.extend([s]) - #print("found complex case: ", s) - else: # if so, take out the parentheses section and send to split_path to get back alternatives list - prefix = s[:open_parens_loc] - suffix = s[close_parens_loc+1:] - alts = self.split_path(s[open_parens_loc:close_parens_loc+1]) - new_paths_list = [] - # for each alternative (should just be one in each list), make a new copy of each path and extend() with the alternative - for a in alts: - if len(a) > 1: - raise ConfigError("Uh oh. We found a protein complex with more than one KO per alternative option here: %s" % s) - extended_complex = prefix + a[0] + suffix - for p in paths_list: - p_copy = copy.copy(p) - p_copy.extend([extended_complex]) - new_paths_list.append(p_copy) - paths_list = new_paths_list - #print("after processing complex parentheses case, paths_list is now: ", paths_list) + p_copy = copy.copy(p) + p_copy.extend([cs]) + new_paths_list.append(p_copy) + paths_list = new_paths_list + #print("after processing complex case, paths_list is now: ", paths_list) # alternatives case: step has alternative paths, so we call the split path function which will return the alternatives @@ -1896,7 +1916,7 @@ def split_path(self, step): substeps = self.split_by_delim_not_within_parens(step[1:-1], ",") if not substeps: # if it doesn't work, try without removing surrounding parentheses substeps = self.split_by_delim_not_within_parens(step, ",") - #print("split path %s into substeps: %s" % (step, substeps)) + #print("split path %s into substeps: %s" % (step, substeps)) # make a final list for returning alt_path_list = [] for s in substeps: From d24f6fb1b6d99de97704f18f805ec52f6d140b82 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 26 Mar 2020 15:58:06 -0500 Subject: [PATCH 297/400] actually, the prior bug was caused by a space in the definition line. 
so now we strip those before unrolling --- anvio/kegg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index be816bfbee..30a9584eb6 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1782,6 +1782,7 @@ def unroll_module_definition(self, mnum): all_paths = [[]] def_lines = self.get_data_value_entries_for_module_by_data_name(mnum, "DEFINITION") for d in def_lines: + d = d.strip() def_line_paths = self.recursive_definition_unroller(d) new_paths_list = [] # for each path we got back, make a new copy of each path so far and extend() From 29276f10e8a94a9eac83bd9ebb34f17207368b12 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 26 Mar 2020 21:53:34 -0500 Subject: [PATCH 298/400] clean up comments and update function docstrings --- anvio/kegg.py | 94 +++++++++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 30a9584eb6..becb45cbb1 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1777,7 +1777,11 @@ def get_kegg_module_class_dict(self, mnum): def unroll_module_definition(self, mnum): - """This function accesses the DEFINITION line of a KEGG Module, unrolls it into all possible paths through the module, and returns the list of all paths.""" + """This function accesses the DEFINITION line of a KEGG Module, unrolls it into all possible paths through the module, and + returns the list of all paths. + + This is a driver for the recursive functions that do the actual unrolling of each definition line. + """ all_paths = [[]] def_lines = self.get_data_value_entries_for_module_by_data_name(mnum, "DEFINITION") @@ -1785,7 +1789,6 @@ def unroll_module_definition(self, mnum): d = d.strip() def_line_paths = self.recursive_definition_unroller(d) new_paths_list = [] - # for each path we got back, make a new copy of each path so far and extend() for a in def_line_paths: for p in all_paths: p_copy = copy.copy(p) @@ -1795,9 +1798,22 @@ def unroll_module_definition(self, mnum): return all_paths + def split_by_delim_not_within_parens(self, d, delims, return_delims=False): - """Takes a string, and splits it on the given delimeter(s) as long as the delimeter is not within parentheses. Returns the list of strings.""" - #print("splitting on delimiters: ", delims) + """Takes a string, and splits it on the given delimiter(s) as long as the delimeter is not within parentheses. 
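        For example (illustrative calls): splitting "K00001,(K00002 K00003)" on ',' returns
        ["K00001", "(K00002 K00003)"], while splitting "K00001),(K00002" on ',' returns False,
        because the parentheses are unbalanced; the caller can then retry on the string without
        stripping its outer parentheses.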
+ + PARAMETERS + ========== + d string + delims a single delimiter, or a list of delimiters + return_delims boolean, if this is true then the list of delimiters found at between each split is also returned + + RETURNS + ======= + splits list of strings that were split from d + delim_list list of delimiters that were ofund between each split + """ + parens_level = 0 last_split_index = 0 splits = [] @@ -1813,59 +1829,61 @@ def split_by_delim_not_within_parens(self, d, delims, return_delims=False): elif d[i] == ")": parens_level -= 1 - # we should catch the case when the parentheses are unbalanced, because this means we shouldn't have removed the outer set of parens - # this happens in cases like (K00963 (K00693+K00750,K16150,K16153,K13679,K20812)),(K00975 (K00703,K13679,K20812)) - # where both sides of the comma have balanced parentheses but there is no () around the whole thing + # if parentheses become unbalanced, return False to indicate this if parens_level < 0: - #print("found unbalanced parentheses at position %d in definition %s" % (i,d)) return False splits.append(d[last_split_index:len(d)]) + if return_delims: return splits, delim_list return splits + def recursive_definition_unroller(self, step): - """This function recursively splits a module definition into its components.""" + """This function recursively splits a module definition into its components. + + First, the definition is split into its component steps (separated by spaces). + Each step is either an atomic step (a single KO, module number '--', or nonessential KO starting with '-'), + a protein complex, or a compound step. + + Atomic steps are used to extend each path that has been found so far. Protein complexes are split into + their respective components, which may be split further by the split_paths() function to find all possible + alternative complexes, before being used to extend each path. Compound steps are split and recursively processed + by the split_paths() function before the resulting downstream paths are used to extend each path. 
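        For example (illustrative only): the definition "K00001 (K00002,K00003) K00004+K00005"
        unrolls into two paths, ['K00001', 'K00002', 'K00004+K00005'] and
        ['K00001', 'K00003', 'K00004+K00005']; the comma-separated alternatives fork the path,
        while the protein complex K00004+K00005 stays together as a single atomic step.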
+ """ - # first, split definition into steps by spaces outside parentheses split_steps = self.split_by_delim_not_within_parens(step, " ") - #print("split definition into steps: ", split_steps) - # establish a list to save all paths in, with an initial empty list to extend from - paths_list = [[]] + paths_list = [[]] # list to save all paths, with initial empty path list to extend from for s in split_steps: - # base case: step is a ko, mnum, non-essential step, etc, so we extend each list with it + # base case: step is a ko, mnum, non-essential step, or '--' if (len(s) == 6 and s[0] == "K") or (len(s) == 6 and s[0] == "M") or (s == "--") or (len(s) == 7 and s[0] == "-"): for p in paths_list: p.extend([s]) - #print("found base case: ", s) else: - # try splitting to see if there are commas or spaces outside parentheses + # here we try splitting to see if there are commas or spaces outside parentheses # (the only way to figure this out is to try it because regex cannot handle nested parentheses) comma_substeps = self.split_by_delim_not_within_parens(s[1:-1], ",") if not comma_substeps: # if it doesn't work, try without removing surrounding parentheses comma_substeps = self.split_by_delim_not_within_parens(s, ",") space_substeps = self.split_by_delim_not_within_parens(s[1:-1], " ") - if not space_substeps: # if it doesn't work, try without removing surrounding parentheses + if not space_substeps: space_substeps = self.split_by_delim_not_within_parens(s, " ") - # complex case: no commas OR spaces outside parentheses so we are still at an atomic definition, but its a protein complex rather than a base case step - # complex cases taken care of by this block: A+B+C ; A+(B,C)+D ; A+B+C+(D,E) ; (A,B+C)+D+E + # complex case: no commas OR spaces outside parentheses so this is a protein complex rather than a compound step if len(comma_substeps) == 1 and len(space_substeps) == 1: - # split on + or - complex_components, delimiters = self.split_by_delim_not_within_parens(s, ["+","-"], return_delims=True) - #print("split complex into components: ", complex_components) - # for each component we need to reconstruct the complex (or alternate possible complexes) while keeping the +/- structure the same complex_strs = [""] + + # reconstruct the complex (and any alternate possible complexes) while keeping the +/- structure the same for i in range(len(complex_components)): c = complex_components[i] if c[0] == '(': alts = self.split_path(c) - #print("alts in complex are: ", alts) new_complex_strs = [] - # for each alternative (should just be one in each list), make a new copy of the complex and extend with the alternative for a in alts: if len(a) > 1: - raise ConfigError("Uh oh. We found a protein complex with more than one KO per alternative option here: %s" % s) + raise ConfigError("Uh oh. recursive_definition_unroller() speaking. 
We found a protein complex with more " + "than one KO per alternative option here: %s" % s) for cs in complex_strs: extended_complex = cs + a[0] new_complex_strs.append(extended_complex) @@ -1878,57 +1896,45 @@ def recursive_definition_unroller(self, step): for j in range(len(complex_strs)): complex_strs[j] += delimiters[i] - # add all possible complexes to end of each path - #print("all possible complexes: ", complex_strs) new_paths_list = [] - # for each alternative, make a new copy of each path and extend() with the alternative for cs in complex_strs: for p in paths_list: p_copy = copy.copy(p) p_copy.extend([cs]) new_paths_list.append(p_copy) paths_list = new_paths_list - #print("after processing complex case, paths_list is now: ", paths_list) - - # alternatives case: step has alternative paths, so we call the split path function which will return the alternatives + # compound step case: else: alts = self.split_path(s) new_paths_list = [] - # for each alternative, make a new copy of each path and extend() with the alternative for a in alts: for p in paths_list: p_copy = copy.copy(p) p_copy.extend(a) new_paths_list.append(p_copy) paths_list = new_paths_list - #print("after processing parentheses case, paths_list is now: ", paths_list) - # return list of list where each list is a path return paths_list def split_path(self, step): - """This function handles steps that should be split into multiple alternative paths. + """This function handles compound steps that should be split into multiple alternative paths. It first splits the input step into substeps, and then since each substep could be its own mini-definition, - we recursively call the definition unrolling function to parse it. + it recursively calls the definition unrolling function to parse it. The list of all alternative paths + that can be made from this step is returned. 
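        For example (illustrative only): split_path("(K00001,K00002 K00003)") returns
        [['K00001'], ['K00002', 'K00003']], that is, one single-step alternative and one
        two-step alternative, each of which is then used to extend the paths found so far.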
""" - # first, try to split after getting rid of surrounding parentheses + substeps = self.split_by_delim_not_within_parens(step[1:-1], ",") if not substeps: # if it doesn't work, try without removing surrounding parentheses substeps = self.split_by_delim_not_within_parens(step, ",") - #print("split path %s into substeps: %s" % (step, substeps)) - # make a final list for returning + alt_path_list = [] for s in substeps: - # call recursive_definition_unroller to extend() to the path from this step alt_paths_from_substep = self.recursive_definition_unroller(s) - # this will pass back a list of lists where each list is an alternative path - # for each alternative, make copy of path and extend with alternative for a in alt_paths_from_substep: - # stick all paths from this substep into final list for returning alt_path_list.append(a) - #print("alt_path_list: ", alt_path_list) + return alt_path_list From f1cd2c05d01356c5edaf566999713dc75fd0ebb8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 31 Mar 2020 11:12:10 -0500 Subject: [PATCH 299/400] update function documentation --- anvio/kegg.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index becb45cbb1..1ea981645e 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -114,8 +114,8 @@ def get_ko_skip_list(self): K23749 - - - - 1 1 2266 2266 0.39 0.592 spectinabilin polyketide synthase system NorC [EC:2.3.1.290] Returns: - skip_list list of strings, each string is a KO number - no_threshold_list list of strings, each string is a KO number + skip_list list of strings, each string is a KO number that has no associated data (ie, RNAs) + no_threshold_list list of strings, each string is a KO number that has no scoring threshold """ col_names_to_check = ["threshold","score_type","profile_type","F-measure","nseq","nseq_used","alen","mlen","eff_nseq","re/pos"] @@ -428,7 +428,9 @@ def setup_modules_db(self): def setup_profiles(self): - """This is a driver function which executes the KEGG setup process by downloading, decompressing, and hmmpressing the profiles.""" + """This is a driver function which executes the KEGG setup process by downloading, decompressing, and hmmpressing the KOfam profiles. + It also downloads and processes the KEGG Module files into the MODULES.db. + """ self.download_profiles() self.decompress_files() From f59157c9c54c079f3241acba498a91c74ff8cd67 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 31 Mar 2020 11:12:35 -0500 Subject: [PATCH 300/400] change separator for kegg module annotation --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 1ea981645e..ad330b03ba 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -549,7 +549,7 @@ def process_kofam_hmms(self): # FIXME? some KOs are not associated with modules. Should we report this? if mods: - mod_annotation = "\n".join(mods) + mod_annotation = "!!!".join(mods) mod_class_annotation = "!!!".join(classes) # why do we split by '!!!'? Because that is how it is done in COGs. So so sorry. 
:'( mod_name_annotation = "" From 9488a84cfe76cf13cda60b05fb5b1665791f8366 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 31 Mar 2020 11:23:05 -0500 Subject: [PATCH 301/400] get rid of old present_kos list in module completeness dict --- anvio/kegg.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index ad330b03ba..515eed7128 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -780,8 +780,7 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N # initialize all modules with empty lists and dicts for kos, gene calls modules = self.kegg_modules_db.get_all_modules_as_list() for mnum in modules: - bin_level_module_dict[mnum] = {"present_kos" : [], # TODO: get rid of this key eventually - "gene_caller_ids" : set(), + bin_level_module_dict[mnum] = {"gene_caller_ids" : set(), "kofam_hits" : {}, "genes_to_contigs" : {}, "contigs_to_genes" : {} @@ -793,7 +792,6 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N if not present_in_mods: kos_not_in_modules.append(ko) for m in present_in_mods: - bin_level_module_dict[m]["present_kos"].append(ko) # TODO: get rid of this eventually bin_level_module_dict[m]["gene_caller_ids"].add(gene_call_id) if ko in bin_level_module_dict[m]["kofam_hits"]: bin_level_module_dict[m]["kofam_hits"][ko].append(gene_call_id) From fd37815ae92acaee40e0426e6bbcf4f15caa30a4 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 31 Mar 2020 14:59:26 -0500 Subject: [PATCH 302/400] bug fix to make interactive work with kegg module annotations. since kegg module annotations will not have an evalue, we need to check if one exists before doing a comparison. --- anvio/dbops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/dbops.py b/anvio/dbops.py index 6a762f3c5f..182e9e916b 100644 --- a/anvio/dbops.py +++ b/anvio/dbops.py @@ -504,7 +504,7 @@ def init_functions(self, requested_sources=[], dont_panic=False): if gene_callers_id not in self.gene_function_calls_dict: self.gene_function_calls_dict[gene_callers_id] = dict([(s, None) for s in self.gene_function_call_sources]) - if self.gene_function_calls_dict[gene_callers_id][source]: + if self.gene_function_calls_dict[gene_callers_id][source] and e_value: if self.gene_function_calls_dict[gene_callers_id][source][2] < e_value: # 'what we have:', self.gene_function_calls_dict[gene_callers_id][source] # 'rejected :', ('%s :: %s' % (function if function else 'unknown', accession), e_value) From 574626296eef400f6e4570c74386a996313e6c94 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 31 Mar 2020 18:24:39 -0500 Subject: [PATCH 303/400] a new function for estimating module completeness --- anvio/kegg.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 4 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 515eed7128..755ec6880a 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1064,15 +1064,99 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): if anvio.DEBUG: self.run.warning("No KOs present for module %s. Parsing for completeness is still being done to obtain module steps." % mnum) - # Now I have decided that we need to have all possible paths through a module unrolled during KEGG setup. - # That way it is done once for all modules, and we can just load the path list into memory during an init. - # Then, here in this function, we can just access the path list quickly for the purposes of computing completeness. 
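# (Illustration only: a minimal runnable sketch of the per-path completeness arithmetic used
#  below, with a hypothetical toy_path_completeness() helper. It assumes paths made only of
#  single KOs and '-Kxxxxx' nonessential steps; the real code below also handles protein
#  complexes, '--' steps with no KO, and steps defined by other modules.)
def toy_path_completeness(path, kos_present):
    essential = [s for s in path if not s.startswith("-")]
    num_complete = sum(1 for s in essential if s in kos_present)
    return num_complete / len(essential)

# toy_path_completeness(["K00001", "-K00002", "K00003", "K00004"], {"K00001", "K00003"})
# returns 0.666..., i.e. 2 of 3 essential steps present; the module completeness is then the
# maximum of this value over all unrolled paths.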
+ # stuff to put in the module's dictionary + module_nonessential_kos = [] # KOs that are present but unnecessary for module completeness - # these are just here to remind myself what I need to be returning later + # stuff that will be returned over_complete_threshold = False has_nonessential_step = False has_no_ko_step = False defined_by_modules = False + + # unroll the module definition to get all possible paths + meta_dict_for_bin[mnum]["paths"] = self.kegg_modules_db.unroll_module_definition(mnum) + meta_dict_for_bin[mnum]["pathway_completeness"] = [] + + for p in meta_dict_for_bin[mnum]["paths"]: + num_complete_steps_in_path = 0 + num_nonessential_steps_in_path = 0 # so that we don't count nonessential steps when computing completeness + for atomic_step in p: + # there are 5 types of atomic steps to take care of + # 1) regular old single KOs, ie Kxxxxx + if atomic_step[0] == "K" and len(atomic_step) == 6: + if atomic_step in present_list_for_mnum: + num_complete_steps_in_path += 1 + # 2) protein complexes, ie Kxxxxx+Kyyyyy-Kzzzzz (2 types of complex components - essential and nonessential) + elif atomic_step[0] == "K" and (atomic_step[6] == "+" or atomic_step[6] == "-"): + idx = 6 + essential_components = [atomic_step[0:idx]] + while idx < len(atomic_step): + component_ko = atomic_step[idx+1:idx+7] + if atomic_step[idx] == "+": + essential_components.append(component_ko) + else: + has_nonessential_step = True + if component_ko not in module_nonessential_kos: + module_nonessential_kos.append(component_ko) + idx += 7 + + num_present_components = 0 + for c in essential_components: + if c in present_list_for_mnum: + num_present_components += 1 + component_completeness = num_present_components / len(essential_components) + num_complete_steps_in_path += component_completeness + # 3) non-essential KOs, ie -Kxxxxx + elif atomic_step[0:2] == "-K" and len(atomic_step) == 7: + """ + OKAY, SO HERE WE HAVE SOME POOPINESS THAT MAY NEED TO BE FIXED EVENTUALLY. + Basically, some DEFINITION lines have KOs that seem to be marked non-essential; + ie, "-K11024" in "K11023 -K11024 K11025 K11026 K11027". + It was difficult to decide whether we should consider only K11024, or K11024 and all following KOs, to be non-essential. + For instance, the module M00778 is a complex case that gave us pause - see Fiesta issue 955. + But for now, we have decided to just track only the one KO as a 'non-essential step', and to not include such steps in + the module completeness estimate. + """ + if atomic_step[1:] not in module_nonessential_kos: + module_nonessential_kos.append(atomic_step[1:]) + num_nonessential_steps_in_path += 1 + has_nonessential_step = True + # 4) steps without associated KOs, ie -- + elif atomic_step == "--": + # when '--' in a DEFINITION line happens, it signifies a reaction step that has no associated KO. + # we assume that such steps are not complete, because we really can't know if it is from the KOfam hits alone + has_no_ko_step = True + # 5) Module numbers, ie Mxxxxx + elif atomic_step[0] == "M" and len(atomic_step) == 6: + """ + This happens when a module is defined by other modules. For example, photosynthesis module M00611 is defined as + (M00161,M00163) M00165 === (photosystem II or photosystem I) and calvin cycle + + We need all the modules to have been evaluated before we can determine completeness of steps with module numbers. 
+ So what we will do here is to use a flag variable to keep track of the modules that have this sort of definition + in a list so we can go back and evaluate completeness of steps with module numbers later. + """ + defined_by_modules = True + else: + raise ConfigError("Well. While estimating completeness for module %m, we found an atomic step in the pathway that we " + "are not quite sure what to do with. Here it is: %s" % (mnum, atomic_step)) + + + path_completeness = num_complete_steps_in_path / (len(p) - num_nonessential_steps_in_path) + meta_dict_for_bin[mnum]["pathway_completeness"].append(path_completeness) + + # once all paths have been evaluated, we find the path(s) of maximum completeness and set that as the overall module completeness + # this is not very efficient as it takes two passes over the list but okay + meta_dict_for_bin[mnum]["percent_complete"] = max(meta_dict_for_bin[mnum]["pathway_completeness"]) + meta_dict_for_bin[mnum]["most_complete_paths"] = [meta_dict_for_bin[mnum]["paths"][i] for i, pc in enumerate(meta_dict_for_bin[mnum]["pathway_completeness"]) if pc == meta_dict_for_bin[mnum]["percent_complete"]] + + # I am just printing this for now to see how often this happens + if len(meta_dict_for_bin[mnum]["most_complete_paths"]) > 1: + print("Found multiple complete paths for module %s. Here they are: %s" % (mnum, meta_dict_for_bin[mnum]["most_complete_paths"])) + over_complete_threshold = True if meta_dict_for_bin[mnum]["percent_complete"] >= self.completeness_threshold else False + meta_dict_for_bin[mnum]["complete"] = over_complete_threshold + meta_dict_for_bin[mnum]["present_nonessential_kos"] = module_nonessential_kos + return over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules From eded0435929683ec4db748c52068dc4d89834fd4 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 31 Mar 2020 18:24:50 -0500 Subject: [PATCH 304/400] updated function documentation --- anvio/kegg.py | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 755ec6880a..fd7f647bc2 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1056,7 +1056,49 @@ def compute_module_completeness_for_bin_DEPRECATED(self, mnum, meta_dict_for_bin def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): - """This calculates the completeness of the specified module within the given bin metabolism dictionary.""" + """This function calculates the completeness of the specified module within the given bin metabolism dictionary. + + To do this, it unrolls the module definition into a list of all possible paths, where each path is a list of atomic steps. + Atomic steps include singular KOs, protein complexes, modules, non-essential steps, and steps without associated KOs. + An atomic step (or parts of a protein complex) can be considered 'present' if the corresponding KO(s) has a hit in the bin. + For each path, the function computes the path completeness as the number of present (essential) steps divided by the number of total steps in the path. + The module completeness is simply the highest path completeness. + + There are some special cases to consider here. + 1) Non-essential steps. These are steps that are marked with a preceding "-" to indicate that they are not required for the module to + be considered complete. They often occur in pathways with multiple forks. 
What we do with these is save and count them separately as + non-essential steps, but we do not use them in our module completeness calculations. Another thing we do is continue parsing the rest + of the module steps as normal, even though some of them may affect steps after the non-essential one. That may eventually change. + See comments in the code below. + 2) Steps without associated KOs. These are steps marked as "--". They may require an enzyme, but if so that enzyme is not in the KOfam + database, so we can't know whether they are complete or not from our KOfam hits. Therefore, we assume these steps are incomplete, and + warn the user to go back and check the module manually. + 3) Steps defined by entire modules. These steps have module numbers instead of KOs, so they require an entire module to be complete in + order to be complete. We can't figure this out until after we've evaluated all modules, so we simply parse these steps without marking + them complete, and later will go back to adjust the completeness score once all modules have been marked complete or not. + + + PARAMETERS + ========== + mnum string, module number to work on + meta_dict_for_bin metabolism completeness dict for the current bin, to be modified in-place + + NEW KEYS ADDED TO METABOLISM COMPLETENESS DICT + ======= + "paths" a list of all possible paths (each is a list of atomic) through the module DEFINITION + "pathway_completeness" a list of the completeness of each pathway + "present_nonessential_kos" a list of non-essential KOs in the module that were found to be present + "most_complete_paths" a list of the paths with maximum completeness + "percent_complete" the completeness of the module, which is the maximum pathway completeness + "complete" whether the module completeness falls over the completeness threshold + + RETURNS + ======= + over_complete_threshold boolean, whether or not the module is considered "complete" overall based on the threshold fraction of completeness + has_nonessential_step boolean, whether or not the module contains non-essential steps. Used for warning the user about these. + has_no_ko_step boolean, whether or not the module contains steps without associated KOs. Used for warning the user about these. + defined_by_modules boolean, whether or not the module contains steps defined by other modules. Used for going back to adjust completeness later. + """ present_list_for_mnum = meta_dict_for_bin[mnum]["kofam_hits"].keys() if not present_list_for_mnum: From 70ff4943bc4efc3ba82d499856d800f96b8d8969 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 11:02:56 -0500 Subject: [PATCH 305/400] fix complete paths output for no KO hits case --- anvio/kegg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index fd7f647bc2..9b8a9998ba 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1104,7 +1104,7 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): if not present_list_for_mnum: # no KOs in this module are present if anvio.DEBUG: - self.run.warning("No KOs present for module %s. Parsing for completeness is still being done to obtain module steps." % mnum) + self.run.warning("No KOs present for module %s. Parsing for completeness is still being done to obtain module information." 
% mnum) # stuff to put in the module's dictionary module_nonessential_kos = [] # KOs that are present but unnecessary for module completeness @@ -1190,7 +1190,10 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): # once all paths have been evaluated, we find the path(s) of maximum completeness and set that as the overall module completeness # this is not very efficient as it takes two passes over the list but okay meta_dict_for_bin[mnum]["percent_complete"] = max(meta_dict_for_bin[mnum]["pathway_completeness"]) - meta_dict_for_bin[mnum]["most_complete_paths"] = [meta_dict_for_bin[mnum]["paths"][i] for i, pc in enumerate(meta_dict_for_bin[mnum]["pathway_completeness"]) if pc == meta_dict_for_bin[mnum]["percent_complete"]] + if meta_dict_for_bin[mnum]["percent_complete"] > 0: + meta_dict_for_bin[mnum]["most_complete_paths"] = [meta_dict_for_bin[mnum]["paths"][i] for i, pc in enumerate(meta_dict_for_bin[mnum]["pathway_completeness"]) if pc == meta_dict_for_bin[mnum]["percent_complete"]] + else: + meta_dict_for_bin[mnum]["most_complete_paths"] = [] # I am just printing this for now to see how often this happens if len(meta_dict_for_bin[mnum]["most_complete_paths"]) > 1: From 2bc13f001c8ff00a0dc17c625c7abc0055f74738 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 11:48:26 -0500 Subject: [PATCH 306/400] increment number of complete modules when appropriate --- anvio/kegg.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 9b8a9998ba..65670feded 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1201,6 +1201,8 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): over_complete_threshold = True if meta_dict_for_bin[mnum]["percent_complete"] >= self.completeness_threshold else False meta_dict_for_bin[mnum]["complete"] = over_complete_threshold meta_dict_for_bin[mnum]["present_nonessential_kos"] = module_nonessential_kos + if over_complete_threshold: + meta_dict_for_bin["num_complete_modules"] += 1 return over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules From 77d39dd333ee9eb11c4f1ce17c8079d7b7e4cc01 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 11:48:57 -0500 Subject: [PATCH 307/400] updated module completeness adjustment function for pathway unrolling strategy --- anvio/kegg.py | 89 ++++++++++++++++++--------------------------------- 1 file changed, 32 insertions(+), 57 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 65670feded..bd75654b5c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1224,71 +1224,46 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): now_complete boolean, whether or not the module is NOW considered "complete" overall based on the threshold fraction of completeness """ - for step in meta_dict_for_bin[mod]["step_list"]: - cur_index = 0 # current position in the step definition - parens_level = 0 # how deep we are in nested parentheses - step_is_present_condition_statement = "" - is_ko_step = False - while cur_index < len(step): - # we have found a KO so we can ignore this step; it has already been counted as complete or not - if step[cur_index] == "K": - is_ko_step = True - break - - # we have found a module so we must evaluate this steps's completeness by checking if the module is complete - elif step[cur_index] == "M": - mnum = step[cur_index:cur_index+6] - if meta_dict_for_bin[mnum]["complete"]: - step_is_present_condition_statement += "True" - else: - step_is_present_condition_statement += 
"False" - cur_index += 6 - - elif step[cur_index] == "(": - parens_level += 1 - step_is_present_condition_statement += "(" - cur_index += 1 - - elif step[cur_index] == ")": - parens_level -= 1 - step_is_present_condition_statement += ")" - cur_index += 1 + for p in meta_dict_for_bin[mnum]["paths"]: + num_essential_steps_in_path = 0 # note that the len(p) will include nonessential steps; we should count only essential ones + num_complete_module_steps = 0 + + for i in range(len(p)): + atomic_step = p[i] + # single KOs and protein complexes and '--' steps; were already counted as complete by previous function + if atomic_step[0] == "K" or atomic_step == "--":: + num_essential_steps_in_path += 1 + # non-essential KO, don't count as a step in the path + elif atomic_step[0:2] == "-K" and len(atomic_step) == 7: + pass + # module step; we need to count these based on previously computed module completeness + elif atomic_step[0] == "M" and len(atomic_step) == 6: + num_complete_module_steps += meta_dict_for_bin[atomic_step]["percent_complete"] + num_essential_steps_in_path += 1 + else: + raise ConfigError("Well. While adjusting completeness estimates for module %m, we found an atomic step in the pathway that we " + "are not quite sure what to do with. Here it is: %s" % (mnum, atomic_step)) - elif step[cur_index] == ",": - step_is_present_condition_statement += " or " - cur_index += 1 + # now we adjust the previous pathway completeness + old_complete_steps_in_path = meta_dict_for_bin[mnum]["pathway_completeness"][i] * num_essential_steps_in_path + adjusted_num_complete_steps_in_path = old_complete_steps_in_path + num_complete_module_steps + meta_dict_for_bin[mnum]["pathway_completeness"][i] = adjusted_num_complete_steps_in_path / num_essential_steps_in_path - elif step[cur_index] == " ": - # if we are outside of parentheses, something is wrong because this should all be just one step - if parens_level == 0: - raise ConfigError("Much parsing sadness. We thought we were re-evaluating the completeness of just one step in " - "module %s (step: %s), but we found a space that seems to indicate another step. HALP." % (mod, step)) - # otherwise, we are processing an alternative path so AND is required - else: - step_is_present_condition_statement += " and " - cur_index += 1 + # after adjusting for all paths, adjust overall module completeness + meta_dict_for_bin[mnum]["percent_complete"] = max(meta_dict_for_bin[mnum]["pathway_completeness"]) + if meta_dict_for_bin[mnum]["percent_complete"] > 0: + meta_dict_for_bin[mnum]["most_complete_paths"] = [meta_dict_for_bin[mnum]["paths"][i] for i, pc in enumerate(meta_dict_for_bin[mnum]["pathway_completeness"]) if pc == meta_dict_for_bin[mnum]["percent_complete"]] + else: + meta_dict_for_bin[mnum]["most_complete_paths"] = [] - else: - raise ConfigError("While correcting completeness for module %s, (step %s), anvi'o found the following character " - "that she didn't understand: %s. Unfortunately, this means we cannot determine the module " - "completeness. 
For context, here is the current index in the DEFINITION line: %s and the " - "surrounding characters: %s" % (mod, step, step[cur_index], cur_index, step[cur_index-5:cur_index+6])) - # once we have processed everything, we need to re-evaluate the step (provided its not a KO step that has already been evaluated) - if not is_ko_step: - step_is_present = eval(step_is_present_condition_statement) - if step_is_present: - meta_dict_for_bin[mod]["complete_step_list"].append(step) - meta_dict_for_bin[mod]["num_complete_steps"] += 1 - - # now, we recalculate module completeness - meta_dict_for_bin[mod]["percent_complete"] = meta_dict_for_bin[mod]["num_complete_steps"] / meta_dict_for_bin[mod]["num_steps"] - now_complete = True if meta_dict_for_bin[mod]["percent_complete"] >= self.completeness_threshold else False - meta_dict_for_bin[mod]["complete"] = now_complete + now_complete = True if meta_dict_for_bin[mnum]["percent_complete"] >= self.completeness_threshold else False + meta_dict_for_bin[mnum]["complete"] = now_complete if now_complete: meta_dict_for_bin["num_complete_modules"] += 1 return now_complete + def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=None): """This is the atomic metabolism estimator function, which builds a metabolism completeness dictionary for an arbitrary list of splits. From bbbd09408ca5f44dbc661dc8234e893ab1343dd5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 11:51:42 -0500 Subject: [PATCH 308/400] rename i var to unique_id just to be clear on where that id is coming from --- anvio/kegg.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index bd75654b5c..1803886694 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1438,15 +1438,15 @@ def store_kegg_metabolism_superdict(self, kegg_superdict): """ d = {} - i = 0 + unique_id = 0 for bin, mod_dict in kegg_superdict.items(): for mnum, c_dict in mod_dict.items(): if mnum == "num_complete_modules": continue - d[i] = c_dict - d[i]["bin_name"] = bin - d[i]["kegg_module"] = mnum - i += 1 + d[unique_id] = c_dict + d[unique_id]["bin_name"] = bin + d[unique_id]["kegg_module"] = mnum + unique_id += 1 utils.store_dict_as_TAB_delimited_file(d, self.output_file_path, key_header="unique_id") self.run.info("Output file", self.output_file_path, nl_before=1) From 1b61ad108acb6c1a3db6c225dc7a4f8afc9bf149 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 11:54:26 -0500 Subject: [PATCH 309/400] little syntax buggy bug --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 1803886694..d7f3bc7bbf 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1231,7 +1231,7 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): for i in range(len(p)): atomic_step = p[i] # single KOs and protein complexes and '--' steps; were already counted as complete by previous function - if atomic_step[0] == "K" or atomic_step == "--":: + if atomic_step[0] == "K" or atomic_step == "--": num_essential_steps_in_path += 1 # non-essential KO, don't count as a step in the path elif atomic_step[0:2] == "-K" and len(atomic_step) == 7: From e41eb107d8ff5253b43c19bbd1c2feb7c4bfecfa Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 12:10:55 -0500 Subject: [PATCH 310/400] fix indentation bug --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index d7f3bc7bbf..ef24dd3c30 100644 --- a/anvio/kegg.py +++ 
b/anvio/kegg.py @@ -1184,8 +1184,8 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): "are not quite sure what to do with. Here it is: %s" % (mnum, atomic_step)) - path_completeness = num_complete_steps_in_path / (len(p) - num_nonessential_steps_in_path) - meta_dict_for_bin[mnum]["pathway_completeness"].append(path_completeness) + path_completeness = num_complete_steps_in_path / (len(p) - num_nonessential_steps_in_path) + meta_dict_for_bin[mnum]["pathway_completeness"].append(path_completeness) # once all paths have been evaluated, we find the path(s) of maximum completeness and set that as the overall module completeness # this is not very efficient as it takes two passes over the list but okay From 6997b282dfa0d765f6ea57fa026e85a8d4faf6ca Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 12:16:34 -0500 Subject: [PATCH 311/400] fix string formatting bug --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index ef24dd3c30..53f1765664 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1180,7 +1180,7 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): """ defined_by_modules = True else: - raise ConfigError("Well. While estimating completeness for module %m, we found an atomic step in the pathway that we " + raise ConfigError("Well. While estimating completeness for module %s, we found an atomic step in the pathway that we " "are not quite sure what to do with. Here it is: %s" % (mnum, atomic_step)) @@ -1241,7 +1241,7 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): num_complete_module_steps += meta_dict_for_bin[atomic_step]["percent_complete"] num_essential_steps_in_path += 1 else: - raise ConfigError("Well. While adjusting completeness estimates for module %m, we found an atomic step in the pathway that we " + raise ConfigError("Well. While adjusting completeness estimates for module %s, we found an atomic step in the pathway that we " "are not quite sure what to do with. 
Here it is: %s" % (mnum, atomic_step)) # now we adjust the previous pathway completeness From d0e070112c5eeb432b29df0f3f7b3a519987e553 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 13:10:18 -0500 Subject: [PATCH 312/400] fix parsing bug in pathway unrolling --- anvio/kegg.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 53f1765664..1fe8705fe1 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1966,13 +1966,17 @@ def recursive_definition_unroller(self, step): for p in paths_list: p.extend([s]) else: - # here we try splitting to see if there are commas or spaces outside parentheses - # (the only way to figure this out is to try it because regex cannot handle nested parentheses) - comma_substeps = self.split_by_delim_not_within_parens(s[1:-1], ",") - if not comma_substeps: # if it doesn't work, try without removing surrounding parentheses + if s[0] == "(" and s[-1] == ")": + # here we try splitting to see if removing the outer parentheses will make the definition become unbalanced + # (the only way to figure this out is to try it because regex cannot handle nested parentheses) + comma_substeps = self.split_by_delim_not_within_parens(s[1:-1], ",") + if not comma_substeps: # if it doesn't work, try without removing surrounding parentheses + comma_substeps = self.split_by_delim_not_within_parens(s, ",") + space_substeps = self.split_by_delim_not_within_parens(s[1:-1], " ") + if not space_substeps: + space_substeps = self.split_by_delim_not_within_parens(s, " ") + else: comma_substeps = self.split_by_delim_not_within_parens(s, ",") - space_substeps = self.split_by_delim_not_within_parens(s[1:-1], " ") - if not space_substeps: space_substeps = self.split_by_delim_not_within_parens(s, " ") # complex case: no commas OR spaces outside parentheses so this is a protein complex rather than a compound step @@ -2030,9 +2034,11 @@ def split_path(self, step): it recursively calls the definition unrolling function to parse it. The list of all alternative paths that can be made from this step is returned. """ - - substeps = self.split_by_delim_not_within_parens(step[1:-1], ",") - if not substeps: # if it doesn't work, try without removing surrounding parentheses + if step[0] == "(" and step[-1] == ")": + substeps = self.split_by_delim_not_within_parens(step[1:-1], ",") + if not substeps: # if it doesn't work, try without removing surrounding parentheses + substeps = self.split_by_delim_not_within_parens(step, ",") + else: substeps = self.split_by_delim_not_within_parens(step, ",") alt_path_list = [] From b447184c62f6ee283a7941dbd720b3530948b2f6 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 13:15:50 -0500 Subject: [PATCH 313/400] fix variable name --- anvio/kegg.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 1fe8705fe1..2768788141 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1197,7 +1197,7 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): # I am just printing this for now to see how often this happens if len(meta_dict_for_bin[mnum]["most_complete_paths"]) > 1: - print("Found multiple complete paths for module %s. Here they are: %s" % (mnum, meta_dict_for_bin[mnum]["most_complete_paths"])) + print("Found %d complete paths for module %s with completeness %s. 
" % (len(meta_dict_for_bin[mnum]["most_complete_paths"]), mnum, meta_dict_for_bin[mnum]["percent_complete"])) over_complete_threshold = True if meta_dict_for_bin[mnum]["percent_complete"] >= self.completeness_threshold else False meta_dict_for_bin[mnum]["complete"] = over_complete_threshold meta_dict_for_bin[mnum]["present_nonessential_kos"] = module_nonessential_kos @@ -1224,7 +1224,7 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): now_complete boolean, whether or not the module is NOW considered "complete" overall based on the threshold fraction of completeness """ - for p in meta_dict_for_bin[mnum]["paths"]: + for p in meta_dict_for_bin[mod]["paths"]: num_essential_steps_in_path = 0 # note that the len(p) will include nonessential steps; we should count only essential ones num_complete_module_steps = 0 @@ -1242,22 +1242,22 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): num_essential_steps_in_path += 1 else: raise ConfigError("Well. While adjusting completeness estimates for module %s, we found an atomic step in the pathway that we " - "are not quite sure what to do with. Here it is: %s" % (mnum, atomic_step)) + "are not quite sure what to do with. Here it is: %s" % (mod, atomic_step)) # now we adjust the previous pathway completeness - old_complete_steps_in_path = meta_dict_for_bin[mnum]["pathway_completeness"][i] * num_essential_steps_in_path + old_complete_steps_in_path = meta_dict_for_bin[mod]["pathway_completeness"][i] * num_essential_steps_in_path adjusted_num_complete_steps_in_path = old_complete_steps_in_path + num_complete_module_steps - meta_dict_for_bin[mnum]["pathway_completeness"][i] = adjusted_num_complete_steps_in_path / num_essential_steps_in_path + meta_dict_for_bin[mod]["pathway_completeness"][i] = adjusted_num_complete_steps_in_path / num_essential_steps_in_path # after adjusting for all paths, adjust overall module completeness - meta_dict_for_bin[mnum]["percent_complete"] = max(meta_dict_for_bin[mnum]["pathway_completeness"]) - if meta_dict_for_bin[mnum]["percent_complete"] > 0: - meta_dict_for_bin[mnum]["most_complete_paths"] = [meta_dict_for_bin[mnum]["paths"][i] for i, pc in enumerate(meta_dict_for_bin[mnum]["pathway_completeness"]) if pc == meta_dict_for_bin[mnum]["percent_complete"]] + meta_dict_for_bin[mod]["percent_complete"] = max(meta_dict_for_bin[mod]["pathway_completeness"]) + if meta_dict_for_bin[mod]["percent_complete"] > 0: + meta_dict_for_bin[mod]["most_complete_paths"] = [meta_dict_for_bin[mod]["paths"][i] for i, pc in enumerate(meta_dict_for_bin[mod]["pathway_completeness"]) if pc == meta_dict_for_bin[mod]["percent_complete"]] else: - meta_dict_for_bin[mnum]["most_complete_paths"] = [] + meta_dict_for_bin[mod]["most_complete_paths"] = [] - now_complete = True if meta_dict_for_bin[mnum]["percent_complete"] >= self.completeness_threshold else False - meta_dict_for_bin[mnum]["complete"] = now_complete + now_complete = True if meta_dict_for_bin[mod]["percent_complete"] >= self.completeness_threshold else False + meta_dict_for_bin[mod]["complete"] = now_complete if now_complete: meta_dict_for_bin["num_complete_modules"] += 1 From f82e2f26ddb219c39197b8df460a2953774ff540 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 13:30:01 -0500 Subject: [PATCH 314/400] multiple complete paths per module is now debug output --- anvio/kegg.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 2768788141..6c6336cd57 100644 --- a/anvio/kegg.py +++ 
b/anvio/kegg.py @@ -1195,9 +1195,10 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): else: meta_dict_for_bin[mnum]["most_complete_paths"] = [] - # I am just printing this for now to see how often this happens - if len(meta_dict_for_bin[mnum]["most_complete_paths"]) > 1: - print("Found %d complete paths for module %s with completeness %s. " % (len(meta_dict_for_bin[mnum]["most_complete_paths"]), mnum, meta_dict_for_bin[mnum]["percent_complete"])) + + if anvio.DEBUG and len(meta_dict_for_bin[mnum]["most_complete_paths"]) > 1: + self.run.warning("Found %d complete paths for module %s with completeness %s. " % (len(meta_dict_for_bin[mnum]["most_complete_paths"]), mnum, meta_dict_for_bin[mnum]["percent_complete"]), + header='DEBUG OUTPUT', lc='yellow') over_complete_threshold = True if meta_dict_for_bin[mnum]["percent_complete"] >= self.completeness_threshold else False meta_dict_for_bin[mnum]["complete"] = over_complete_threshold meta_dict_for_bin[mnum]["present_nonessential_kos"] = module_nonessential_kos From 04efd7bab3848fa0c7f2249a07b074d9ce4d9433 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 13:30:24 -0500 Subject: [PATCH 315/400] fix index bug in module completeness adjustment func --- anvio/kegg.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 6c6336cd57..e78934616c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1225,12 +1225,12 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): now_complete boolean, whether or not the module is NOW considered "complete" overall based on the threshold fraction of completeness """ - for p in meta_dict_for_bin[mod]["paths"]: + for i in range(len(meta_dict_for_bin[mod]["paths"])): + p = meta_dict_for_bin[mod]["paths"][i] num_essential_steps_in_path = 0 # note that the len(p) will include nonessential steps; we should count only essential ones num_complete_module_steps = 0 - for i in range(len(p)): - atomic_step = p[i] + for atomic_step in p: # single KOs and protein complexes and '--' steps; were already counted as complete by previous function if atomic_step[0] == "K" or atomic_step == "--": num_essential_steps_in_path += 1 @@ -1245,10 +1245,10 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): raise ConfigError("Well. While adjusting completeness estimates for module %s, we found an atomic step in the pathway that we " "are not quite sure what to do with. 
Here it is: %s" % (mod, atomic_step)) - # now we adjust the previous pathway completeness - old_complete_steps_in_path = meta_dict_for_bin[mod]["pathway_completeness"][i] * num_essential_steps_in_path - adjusted_num_complete_steps_in_path = old_complete_steps_in_path + num_complete_module_steps - meta_dict_for_bin[mod]["pathway_completeness"][i] = adjusted_num_complete_steps_in_path / num_essential_steps_in_path + # now we adjust the previous pathway completeness + old_complete_steps_in_path = meta_dict_for_bin[mod]["pathway_completeness"][i] * num_essential_steps_in_path + adjusted_num_complete_steps_in_path = old_complete_steps_in_path + num_complete_module_steps + meta_dict_for_bin[mod]["pathway_completeness"][i] = adjusted_num_complete_steps_in_path / num_essential_steps_in_path # after adjusting for all paths, adjust overall module completeness meta_dict_for_bin[mod]["percent_complete"] = max(meta_dict_for_bin[mod]["pathway_completeness"]) From 394bf6260c6f7b38a4e44519dacd3e90f6ac9132 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 13:37:35 -0500 Subject: [PATCH 316/400] delete deprecated completeness function hehehehehe --- anvio/kegg.py | 242 -------------------------------------------------- 1 file changed, 242 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index e78934616c..1b9ada772d 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -813,248 +813,6 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N return bin_level_module_dict - def compute_module_completeness_for_bin_DEPRECATED(self, mnum, meta_dict_for_bin): - """This function calculates the completeness of the specified module. - - This requires some parsing of the module DEFINITION fields. In these fields, we have the following: - "Kxxxxx" (KO numbers) indicating which enzyme contributes to a step in the module - "Mxxxxx" (module numbers) indicating that the module encompasses another module. This is rare. See note below. - " " (spaces) separating module steps; indicating an AND operation - "," (commas) separating alternatives (which can be singular KOs or entire pathways); indicating an OR operation - "()" (parentheses) enclosing comma-separated alternatives - "+" (plus sign) indicating the following KO is a necessary component of a complex; indicating an AND operation - "-" (minus sign) indicating the following KO is non-essential in a complex; so in other words we don't care if it is there - - What we will do is build a condition statement out of each step which will evaulate to True if the step can be considered present based - on the available KOs in the current genome/bin. - For example, suppose we have a step like: (K13937,((K00036,K19243) (K01057,K07404))) - This will be parsed into the condition statement: (K13937 OR ((K00036 OR K19243) AND (K01057 OR K07404))) - where the KOs will be replaced by True if they are present and False otherwise. - - While we are parsing, we save the individual module steps in lists (ie, one for all steps, one for complete steps) for easy access later. - Afterwards we compute the completeness of the module based on the specified completion threshold. - Then, we return a bunch of information about the completeness of the module, which can then be placed into the module completeness dictionary. - - There are 3 special cases to consider here. - 1) Non-essential steps. These are steps that are marked with a preceding "-" to indicate that they are not required for the module to - be considered complete. 
They often occur in pathways with multiple forks. What we do with these is save and count them separately as - non-essential steps, but we do not use them in our module completeness calculations. Another thing we do is continue parsing the rest - of the module steps as normal, even though some of them may affect steps after the non-essential one. That may eventually change. - See comments in the code below. - 2) Steps without associated KOs. These are steps marked as "--". They may require an enzyme, but if so that enzyme is not in the KOfam - database, so we can't know whether they are complete or not from our KOfam hits. Therefore, we assume these steps are incomplete, and - warn the user to go back and check the module manually. - 3) Steps defined by entire modules. These steps have module numbers instead of KOs, so they require an entire module to be complete in - order to be complete. We can't figure this out until after we've evaluated all modules, so we simply parse these steps without marking - them complete, and later will go back to adjust the completeness score once all modules have been marked complete or not. - - PARAMETERS - ========== - mnum string, module number to work on - meta_dict_for_bin metabolism completeness dict for the current bin, to be modified in-place - - VARIABLES FOR UPDATING METABOLISM COMPLETENESS DICT - ======= - module_step_list list of strings, each string is an individual step in the module (may have sub-steps if there are alternate pathways) - module_complete_steps list of strings, each string is a step in the module that is considered complete based on KO availability - module_nonessential_steps list of strings, each string is a step in the module that doesn't count for completeness estimates - module_complete_nonessential_steps list of strings, each string is a non-essential step that is considered complete based on KO availability - module_total_steps int, the total number of steps in the module - module_num_complete_steps int, the number of complete steps in the module - module_num_nonessential_steps int, the total number of nonessential steps in the module - module_num_complete_nonessential_steps int, the number of nonessential steps in the module that were found to be complete - module_completeness float, a decimal indicating the fraction of complete steps in the module - - RETURNS - ======= - over_complete_threshold boolean, whether or not the module is considered "complete" overall based on the threshold fraction of completeness - has_nonessential_step boolean, whether or not the module contains non-essential steps. Used for warning the user about these. - has_no_ko_step boolean, whether or not the module contains steps without associated KOs. Used for warning the user about these. - defined_by_modules boolean, whether or not the module contains steps defined by other modules. Used for going back to adjust completeness later. - """ - - present_list_for_mnum = meta_dict_for_bin[mnum]["present_kos"] - if not present_list_for_mnum: - # no KOs in this module are present - if anvio.DEBUG: - self.run.warning("No KOs present for module %s. Parsing for completeness is still being done to obtain module steps." 
% mnum) - - # module information to return - module_step_list = [] # while we are at it, we'll remember what the (essential) steps are - module_complete_steps = [] # and what the complete steps are - module_nonessential_steps = [] # steps that aren't necessary for module completeness - module_complete_nonessential_steps = [] # and those nonessential steps which we find are complete - module_total_steps = 0 - module_num_complete_steps = 0 - module_num_nonessential_steps = 0 - module_num_complete_nonessential_steps = 0 - has_nonessential_step = False - has_no_ko_step = False - defined_by_modules = False - - def_lines = self.kegg_modules_db.get_data_value_entries_for_module_by_data_name(mnum, "DEFINITION") - for d in def_lines: - d = d.strip() - cur_index = 0 # current position in the DEFINITION line - parens_level = 0 # how deep we are in nested parentheses - step_is_present_condition_statement = "" - last_step_end_index = 0 - - while cur_index < len(d): - if d[cur_index] == "K": # we have found a KO - ko = d[cur_index:cur_index+6] - defined_by_modules = False # reset this flag just in case KO-defined step comes after a module-defined step - if ko in present_list_for_mnum: - step_is_present_condition_statement += "True" - else: - step_is_present_condition_statement += "False" - cur_index += 6 - - elif d[cur_index] == "(": - parens_level += 1 - step_is_present_condition_statement += "(" - cur_index += 1 - - elif d[cur_index] == ")": - parens_level -= 1 - step_is_present_condition_statement += ")" - cur_index += 1 - - elif d[cur_index] == ",": - step_is_present_condition_statement += " or " - cur_index += 1 - - elif d[cur_index] == "+": - step_is_present_condition_statement += " and " - cur_index += 1 - - elif d[cur_index] == "-": - # either a singular KO or a set of KOs in parentheses can follow this character - # since the following KO(s) are non-essential in the complex, we skip over them to ignore them - # unless this is its own step, in which case we consider the whole step non-essential - - # singular nonessential KO - if d[cur_index+1] == "K": - nonessential_ko = d[cur_index+1:cur_index+7] - cur_index += 7 - """ - OKAY, SO HERE WE HAVE SOME POOPINESS THAT MAY NEED TO BE FIXED EVENTUALLY. - Basically, some DEFINITION lines have KOs that seem to be marked non-essential; - ie, "-K11024" in "K11023 -K11024 K11025 K11026 K11027". - It was difficult to decide whether we should consider only K11024, or K11024 and all following KOs, to be non-essential. - For instance, the module M00778 is a complex case that gave us pause - see Fiesta issue 955. - But for now, we have decided to just track only the one KO as a 'non-essential step', and to not include such steps in - the module completeness estimate. 
- """ - # if this is the first KO in the step and we find a space after this KO, then we have found a non-essential step - if step_is_present_condition_statement == "" and (cur_index == len(d) or d[cur_index] == " "): - has_nonessential_step = True - module_nonessential_steps.append(d[last_step_end_index:cur_index]) - module_num_nonessential_steps += 1 - - if nonessential_ko in present_list_for_mnum: - module_complete_nonessential_steps.append(d[last_step_end_index:cur_index]) - module_num_complete_nonessential_steps += 1 - - # reset for next step - last_step_end_index = cur_index + 1 - cur_index += 1 - - # a whole set of nonessential KOs - elif d[cur_index+1] == "(": - while d[cur_index] != ")": - cur_index += 1 - cur_index += 1 # skip over the ')' - - # the '--' (no KO) situation - elif d[cur_index+1] == "-": - # when '--' in a DEFINITION line happens, it signifies a reaction step that has no associated KO. - # we assume that such steps are not complete, because we really can't know if it is from the KOfam hits alone - has_no_ko_step = True - step_is_present_condition_statement += "False" - cur_index += 2 # skip over both '-', the next character should be a space or end of DEFINITION line - - if cur_index < len(d) and d[cur_index] != " ": - raise ConfigError("Serious, serious parsing sadness is happening. We just processed a '--' in " - "a DEFINITION line for module %s, but did not see a space afterwards. Instead, we found %s. " - "WHAT DO WE DO NOW?" % (mnum, d[cur_index+1])) - # anything else that follows a '-' - else: - raise ConfigError("The following character follows a '-' in the DEFINITION line for module %s " - "and we just don't know what to do: %s" % (mnum, d[cur_index+1])) - - elif d[cur_index] == " ": - # if we are outside of parentheses, we are done processing the current step - if parens_level == 0: - module_step_list.append(d[last_step_end_index:cur_index]) - module_total_steps += 1 - # we do not evaluate completeness of this step yet if it is defined by other modules - if not defined_by_modules: - step_is_present = eval(step_is_present_condition_statement) - if step_is_present: - module_complete_steps.append(d[last_step_end_index:cur_index]) - module_num_complete_steps += 1 - # reset for next step - step_is_present_condition_statement = "" - last_step_end_index = cur_index + 1 - cur_index += 1 - # otherwise, we are processing an alternative path so AND is required - else: - step_is_present_condition_statement += " and " - cur_index += 1 - - elif d[cur_index] == "M": - """ - This happens when a module is defined by other modules. For example, photosynthesis module M00611 is defined as - (M00161,M00163) M00165 === (photosystem II or photosystem I) and calvin cycle - - We need all the modules to have been evaluated before we can determine completeness of steps with module numbers. - So what we will do here is just add the step to the appropriate lists without evaluating completeness, and use a - flag variable to keep track of the modules that have this sort of definition in a list so we can go back and - evaluate completeness of steps with module numbers later. - """ - defined_by_modules = True - cur_index += 6 - - else: - raise ConfigError("While parsing the DEFINITION field for module %s, (which is %s), anvi'o found the following character " - "that she didn't understand: %s. Unfortunately, this means we cannot determine the module " - "completeness. 
For context, here is the current index in the DEFINITION line: %s and the " - "surrounding characters: %s" % (mnum, d, d[cur_index], cur_index, d[cur_index-5:cur_index+6])) - - # once we have processed the whole line, we still need to eval the last step. - # Unless we already did (this can happen with non-essential steps), which we check by seeing if the condition statement is empty - # However, if this step is defined by modules, the condition statement will be empty, but we still need to save the step - if step_is_present_condition_statement != "" or defined_by_modules: - module_step_list.append(d[last_step_end_index:cur_index]) - module_total_steps += 1 - if not defined_by_modules: - step_is_present = eval(step_is_present_condition_statement) - if step_is_present: - module_complete_steps.append(d[last_step_end_index:cur_index]) - module_num_complete_steps += 1 - - # once we have processed all DEFINITION lines, we can compute the overall completeness - module_completeness = module_num_complete_steps / module_total_steps - over_complete_threshold = True if module_completeness >= self.completeness_threshold else False - - # instead of returning everything, we update the metabolism completeness dictionary in place - meta_dict_for_bin[mnum]["step_list"] = module_step_list - meta_dict_for_bin[mnum]["complete_step_list"] = module_complete_steps - meta_dict_for_bin[mnum]["nonessential_step_list"] = module_nonessential_steps - meta_dict_for_bin[mnum]["complete_nonessential_step_list"]= module_complete_nonessential_steps - meta_dict_for_bin[mnum]["num_steps"] = module_total_steps - meta_dict_for_bin[mnum]["num_complete_steps"] = module_num_complete_steps - meta_dict_for_bin[mnum]["num_nonessential_steps"] = module_num_nonessential_steps - meta_dict_for_bin[mnum]["num_complete_nonessential_steps"] = module_num_complete_nonessential_steps - meta_dict_for_bin[mnum]["percent_complete"] = module_completeness - meta_dict_for_bin[mnum]["complete"] = over_complete_threshold - if over_complete_threshold: - meta_dict_for_bin["num_complete_modules"] += 1 - - return over_complete_threshold, has_nonessential_step, has_no_ko_step, defined_by_modules - - def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): """This function calculates the completeness of the specified module within the given bin metabolism dictionary. From 87b67d7f4270d899e2a4c0929bbc8314bb6053d0 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 1 Apr 2020 13:49:52 -0500 Subject: [PATCH 317/400] update docstring for printing function --- anvio/kegg.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 1b9ada772d..c921e0da14 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1191,7 +1191,28 @@ def estimate_metabolism(self): def store_kegg_metabolism_superdict(self, kegg_superdict): """This function writes the metabolism superdict to a tab-delimited file. - The metabolism superdict is a three-level dictionary (genomes/bins, modules, and module completion information). + The metabolism superdict is a three-to-four-level dictionary. The first three levels are: genomes/bins, modules, and module completion information. + The module completion dictionary also has some dictionaries in it, and those make up the fourth level. 
+ The structure of the module completion dictionary is like this example: + {mnum: {"gene_caller_ids": set([132, 133, 431, 6777]) + "kofam_hits": {'K00033' : [431, 6777], + 'K01057' : [133], + 'K00036' : [132] }, + "genes_to_contigs": {132: 0, + 133: 0, + 431: 2, + 6777: 1 }, + "contigs_to_genes": { 0: set([132, 133]), + 1: set(6777), + 2: set(431) },} + "paths": [['K00033','K01057','K02222'], ['K00033','K01057','K00036'], ...] + "pathway_completeness": [0.66, 0.66, ...] + "present_nonessential_kos": [] + "most_complete_paths": [['K00033','K01057','K02222'], ['K00033','K01057','K00036'], ...] + "percent_complete": 0.66 + "complete": False + } + To distill this information into one line, we need to convert the dictionary on-the-fly to a dict of dicts, where each genome/bin-module pair is keyed by an arbitrary integer. """ From 096b0859c0a077958e568bcbd48c1d2d3ed30585 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 2 Apr 2020 10:55:21 -0500 Subject: [PATCH 318/400] change header for bin_name depending on context --- anvio/kegg.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index c921e0da14..e16e8c4fab 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1224,7 +1224,12 @@ def store_kegg_metabolism_superdict(self, kegg_superdict): if mnum == "num_complete_modules": continue d[unique_id] = c_dict - d[unique_id]["bin_name"] = bin + if self.profile_db_path and not self.metagenome_mode: + d[unique_id]["genome_name"] = bin + elif not self.profile_db_path and not self.metagenome_mode: + d[unique_id]["bin_name"] = bin + elif self.metagenome_mode: + d[unique_id]["metagenome_name"] = bin d[unique_id]["kegg_module"] = mnum unique_id += 1 From 3d3cd2af0ac4db5d36d4ebfcc24e021c689da1b6 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 2 Apr 2020 11:14:29 -0500 Subject: [PATCH 319/400] reset progress bar after any warnings --- anvio/kegg.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index e16e8c4fab..a0d37e8a93 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -371,6 +371,7 @@ def move_orphan_files(self): HMM profiles that did not have any matching KOfam entries. We have removed those HMM \ profiles from the final database. You can find them under the directory '%s'." % (len(no_kofam_file_list), self.orphan_data_dir)) + self.progress.reset() if no_threshold_file_list: utils.concatenate_files(no_threshold_path, no_threshold_file_list, remove_concatenated_files=remove_old_files) @@ -378,12 +379,14 @@ def move_orphan_files(self): KOfam entries that did not have any threshold to remove weak hits. We have removed those HMM \ profiles from the final database. You can find them under the directory '%s'." % (len(no_threshold_file_list), self.orphan_data_dir)) + self.progress.reset() if no_data_file_list: utils.concatenate_files(no_data_path, no_data_file_list, remove_concatenated_files=remove_old_files) self.run.warning("Please note that while anvi'o was building your databases, she found %d \ HMM profiles that did not have any associated data (besides an annotation) in their KOfam entries. \ We have removed those HMM profiles from the final database. You can find them under the directory '%s'." 
% (len(no_data_file_list), self.orphan_data_dir)) + self.progress.reset() def run_hmmpress(self): @@ -682,6 +685,7 @@ def init_hits_and_splits(self): if anvio.DEBUG: self.run.warning("The following gene calls in your contigs DB were removed from consideration as they \ do not have any hits to the KOfam database: %s" % (gene_calls_without_kofam_hits)) + self.progress.reset() # get rid of splits and contigs (and their associated gene calls) that are not in the profile DB @@ -1393,6 +1397,7 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu anvi'o will quietly ignore this issue and add the line to the MODULES.db anyway. Please be warned that this may break things downstream. \ In case you are interested, the line causing this issue has data name %s and data value %s" % (current_module_num, current_data_name, data_vals)) is_ok = True # let's pretend that everything is alright so that the next function will take the original parsed values + self.progress.reset() else: raise ConfigError("While parsing, anvi'o found an uncorrectable issue with a KEGG Module line in module %s. The current data name is %s, \ here is the incorrectly-formatted data value field: %s. If you think this is totally fine and want to ignore errors like this, please \ @@ -1404,6 +1409,7 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu if anvio.DEBUG and not self.quiet: self.run.warning("While parsing a KEGG Module line, we found an issue with the formatting. We did our very best to parse the line \ correctly, but please check that it looks right to you by examining the following values.") + self.progress.reset() self.run.info("Incorrectly parsed data value field", data_vals) self.run.info("Corrected data values", corrected_vals) self.run.info("Corrected data definition", corrected_def) From f6f1186069ea8a2d627b0ebd8334b225c0092680 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 2 Apr 2020 11:15:58 -0500 Subject: [PATCH 320/400] update estimate_for_list_of_splits documentation --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index a0d37e8a93..5c7b2ee31a 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1030,7 +1030,7 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=None): """This is the atomic metabolism estimator function, which builds a metabolism completeness dictionary for an arbitrary list of splits. - For example, the list of splits may represent a bin or a single isolate genome. + For example, the list of splits may represent a bin, a single isolate genome, or an entire metagenome. The metabolism completeness dictionary is first initialized to contain the KOs that are present in the genome for each KEGG module. It is later updated with the individual steps and completion estimates for each module. @@ -1038,7 +1038,7 @@ def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=N ========== ko_hits_in_splits list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering splits a list of splits identifiers - bin_name the name of the bin that we are working with + bin_name the name of the bin/genome/metagenome that we are working with RETURNS ======= From 1859659c4e88d927f8b4351ed1d99c18df1ef57d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 2 Apr 2020 11:17:57 -0500 Subject: [PATCH 321/400] add metagenome mode. 
that was suspiciously easy --- anvio/kegg.py | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 5c7b2ee31a..24ba1c792d 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1162,6 +1162,35 @@ def estimate_for_bins_in_collection(self, kofam_gene_split_contig): return bins_metabolism_superdict + def estimate_for_contigs_db_for_metagenome(self, kofam_gene_split_contig): + """This function handles metabolism estimation for an entire metagenome. + + Similar to isolate genomes, we treat the entire metagenome as one big 'bin'. This means that there + will be a large amount of redundancy (repeated pathways) due to the presence of multiple populations + in the metagenome. + + In fact, because we essentially consider the metagenome to be one big genome, this function is exactly the same + as estimate_for_genome(). Why is it a separate function? Well, because we may eventually want to do something + differently here. + + PARAMETERS + ========== + kofam_gene_split_contig list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering + + RETURNS + ======= + metagenome_metabolism_superdict dictionary mapping metagenome name to its metabolism completeness dictionary + """ + + metagenome_metabolism_superdict = {} + # since we consider all the hits in the metagenome collectively, we can take the UNIQUE splits from all the hits + splits_in_metagenome = list(set([tpl[2] for tpl in kofam_gene_split_contig])) + + metagenome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(kofam_gene_split_contig, splits=splits_in_metagenome, bin_name=self.contigs_db_project_name) + + return metagenome_metabolism_superdict + + def estimate_metabolism(self): """This is the driver function for estimating metabolism. 
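A minimal sketch (not part of the patches themselves) of what the new metagenome mode does with the hit tuples before estimation: it simply deduplicates the splits across every KOfam hit in the contigs database and treats the whole set as one big 'bin'. The tuple layout (ko_num, gene_callers_id, split, contig) follows the patch above; the hit data below are made up for illustration.

# toy list of KOfam hits, one tuple per hit: (ko_num, gene_callers_id, split, contig)
kofam_gene_split_contig = [
    ('K00033', 132, 'contig_0_split_00001', 'contig_0'),
    ('K01057', 133, 'contig_0_split_00001', 'contig_0'),
    ('K00036', 431, 'contig_2_split_00001', 'contig_2'),
]
# deduplicate the splits, as estimate_for_contigs_db_for_metagenome() does, then the whole
# list of hits would be passed to the single-bin estimator under the contigs DB project name
splits_in_metagenome = list(set(tpl[2] for tpl in kofam_gene_split_contig))
print(sorted(splits_in_metagenome))   # ['contig_0_split_00001', 'contig_2_split_00001']
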
@@ -1178,14 +1207,8 @@ def estimate_metabolism(self): kegg_metabolism_superdict = self.estimate_for_bins_in_collection(kofam_hits_info) elif not self.profile_db_path and not self.metagenome_mode: kegg_metabolism_superdict = self.estimate_for_genome(kofam_hits_info) - elif self.profile_db_path and self.metagenome_mode: - raise ConfigError("This class doesn't know how to deal with that yet :/") - # metagenome, with profiling - #self.estimate_for_contigs_db_for_metagenome() - elif not self.profile_db_path and self.metagenome_mode: - raise ConfigError("This class doesn't know how to deal with that yet :/") - # metagenome without profiling - #self.estimate_for_contigs_db_for_metagenome() + elif self.metagenome_mode: + kegg_metabolism_superdict = self.estimate_for_contigs_db_for_metagenome(kofam_hits_info) else: raise ConfigError("This class doesn't know how to deal with that yet :/") From 83eb61d8d54e887b78cf2a2bf6dc9eef11b16610 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 2 Apr 2020 14:04:11 -0500 Subject: [PATCH 322/400] typo --- anvio/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/utils.py b/anvio/utils.py index b8926c1eb4..28754fa9ad 100644 --- a/anvio/utils.py +++ b/anvio/utils.py @@ -3255,7 +3255,7 @@ def download_file(url, output_file_path, progress=progress, run=run): f.close() progress.end() - run.info('Downloaded succesfully', output_file_path) + run.info('Downloaded successfully', output_file_path) def get_remote_file_content(url, gzipped=False): From 320575a7955f96b1d7f81264182656027378a95e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 2 Apr 2020 14:06:13 -0500 Subject: [PATCH 323/400] fix kegg dir structure and outputs during kegg setup --- anvio/kegg.py | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 24ba1c792d..904dbb59da 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -47,13 +47,14 @@ def __init__(self, args): self.kegg_data_dir = A('kegg_data_dir') or os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG') self.orphan_data_dir = os.path.join(self.kegg_data_dir, "orphan_data") self.module_data_dir = os.path.join(self.kegg_data_dir, "modules") + self.hmm_data_dir = os.path.join(self.kegg_data_dir, "HMMs") self.quiet = A('quiet') or False self.just_do_it = A('just_do_it') # shared variables for all KEGG subclasses - self.kofam_hmm_file_path = os.path.join(self.kegg_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms - self.ko_list_file_path = os.path.join(self.kegg_data_dir, "ko_list") - self.kegg_module_file = os.path.join(self.kegg_data_dir, "ko00002.keg") + self.kofam_hmm_file_path = os.path.join(self.hmm_data_dir, "Kofam.hmm") # file containing concatenated KOfam hmms + self.ko_list_file_path = os.path.join(self.kegg_data_dir, "ko_list.txt") + self.kegg_module_file = os.path.join(self.kegg_data_dir, "modules.keg") def setup_ko_dict(self): @@ -163,6 +164,7 @@ def __init__(self, args, run=run, progress=progress): self.is_database_exists() filesnpaths.gen_output_directory(self.kegg_data_dir, delete_if_exists=args.reset) + filesnpaths.gen_output_directory(self.hmm_data_dir, delete_if_exists=args.reset) filesnpaths.gen_output_directory(self.orphan_data_dir, delete_if_exists=args.reset) filesnpaths.gen_output_directory(self.module_data_dir, delete_if_exists=args.reset) @@ -170,7 +172,8 @@ def __init__(self, args, run=run, progress=progress): # for ko list, add /ko_list.gz to end of url # for profiles, add 
/profiles.tar.gz to end of url self.database_url = "ftp://ftp.genome.jp/pub/db/kofam" - self.files = ['ko_list.gz', 'profiles.tar.gz'] + # dictionary mapping downloaded file name to final decompressed file name or folder location + self.files = {'ko_list.gz': self.ko_list_file_path, 'profiles.tar.gz': self.kegg_data_dir} # Kegg module text files self.kegg_module_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002.keg&format=htext&filedir=" @@ -196,7 +199,7 @@ def download_profiles(self): self.run.info("Kofam Profile Database URL", self.database_url) - for file_name in self.files: + for file_name in self.files.keys(): utils.download_file(self.database_url + '/' + file_name, os.path.join(self.kegg_data_dir, file_name), progress=self.progress, run=self.run) @@ -302,17 +305,18 @@ def download_modules(self): def decompress_files(self): """This function decompresses the Kofam profiles.""" - for file_name in self.files: - self.progress.new('Decompressing file %s' % file_name) + self.progress.new('Decompressing files') + for file_name in self.files.keys(): + self.progress.update('Decompressing file %s' % file_name) full_path = os.path.join(self.kegg_data_dir, file_name) if full_path.endswith("tar.gz"): - utils.tar_extract_file(full_path, output_file_path = self.kegg_data_dir, keep_original=False) + utils.tar_extract_file(full_path, output_file_path=self.files[file_name], keep_original=False) else: - utils.gzip_decompress_file(full_path, keep_original=False) + utils.gzip_decompress_file(full_path, output_file_path=self.files[file_name], keep_original=False) self.progress.update("File decompressed. Yay.") - self.progress.end() + self.progress.end() def confirm_downloaded_profiles(self): @@ -367,33 +371,34 @@ def move_orphan_files(self): remove_old_files = not anvio.DEBUG # if we are running in debug mode, we will not remove the individual hmm files after concatenation if no_kofam_file_list: utils.concatenate_files(no_kofam_path, no_kofam_file_list, remove_concatenated_files=remove_old_files) + self.progress.reset() self.run.warning("Please note that while anvi'o was building your databases, she found %d \ HMM profiles that did not have any matching KOfam entries. We have removed those HMM \ profiles from the final database. You can find them under the directory '%s'." % (len(no_kofam_file_list), self.orphan_data_dir)) - self.progress.reset() + if no_threshold_file_list: utils.concatenate_files(no_threshold_path, no_threshold_file_list, remove_concatenated_files=remove_old_files) + self.progress.reset() self.run.warning("Please note that while anvi'o was building your databases, she found %d \ KOfam entries that did not have any threshold to remove weak hits. We have removed those HMM \ profiles from the final database. You can find them under the directory '%s'." % (len(no_threshold_file_list), self.orphan_data_dir)) - self.progress.reset() + if no_data_file_list: utils.concatenate_files(no_data_path, no_data_file_list, remove_concatenated_files=remove_old_files) + self.progress.reset() self.run.warning("Please note that while anvi'o was building your databases, she found %d \ HMM profiles that did not have any associated data (besides an annotation) in their KOfam entries. \ We have removed those HMM profiles from the final database. You can find them under the directory '%s'." 
% (len(no_data_file_list), self.orphan_data_dir)) - self.progress.reset() def run_hmmpress(self): """This function concatenates the Kofam profiles and runs hmmpress on them.""" self.progress.new('Preparing Kofam HMM Profiles') - log_file_path = os.path.join(self.kegg_data_dir, '00_hmmpress_log.txt') self.progress.update('Verifying the Kofam directory %s contains all HMM profiles' % self.kegg_data_dir) self.confirm_downloaded_profiles() @@ -411,7 +416,7 @@ def run_hmmpress(self): self.progress.update('Running hmmpress...') cmd_line = ['hmmpress', self.kofam_hmm_file_path] - log_file_path = os.path.join(self.kegg_data_dir, '00_hmmpress_log.txt') + log_file_path = os.path.join(self.hmm_data_dir, '00_hmmpress_log.txt') ret_val = utils.run_command(cmd_line, log_file_path) if ret_val: @@ -683,9 +688,9 @@ def init_hits_and_splits(self): genes_in_splits = [tpl for tpl in genes_in_splits if tpl[0] not in gene_calls_without_kofam_hits] genes_in_contigs = [tpl for tpl in genes_in_contigs if tpl[0] not in gene_calls_without_kofam_hits] if anvio.DEBUG: + self.progress.reset() self.run.warning("The following gene calls in your contigs DB were removed from consideration as they \ do not have any hits to the KOfam database: %s" % (gene_calls_without_kofam_hits)) - self.progress.reset() # get rid of splits and contigs (and their associated gene calls) that are not in the profile DB @@ -1416,11 +1421,12 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu if not is_ok and not is_corrected: self.num_uncorrected_errors += 1 if self.just_do_it: + self.progress.reset() self.run.warning("While parsing, anvi'o found an uncorrectable issue with a KEGG Module line in module %s, but since you used the --just-do-it flag, \ anvi'o will quietly ignore this issue and add the line to the MODULES.db anyway. Please be warned that this may break things downstream. \ In case you are interested, the line causing this issue has data name %s and data value %s" % (current_module_num, current_data_name, data_vals)) is_ok = True # let's pretend that everything is alright so that the next function will take the original parsed values - self.progress.reset() + else: raise ConfigError("While parsing, anvi'o found an uncorrectable issue with a KEGG Module line in module %s. The current data name is %s, \ here is the incorrectly-formatted data value field: %s. If you think this is totally fine and want to ignore errors like this, please \ @@ -1430,9 +1436,9 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu if is_corrected: self.num_corrected_errors += 1 if anvio.DEBUG and not self.quiet: + self.progress.reset() self.run.warning("While parsing a KEGG Module line, we found an issue with the formatting. 
We did our very best to parse the line \ correctly, but please check that it looks right to you by examining the following values.") - self.progress.reset() self.run.info("Incorrectly parsed data value field", data_vals) self.run.info("Corrected data values", corrected_vals) self.run.info("Corrected data definition", corrected_def) From eb46630b9010e334368080061f42b3dfec9bbc62 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 2 Apr 2020 16:24:09 -0500 Subject: [PATCH 324/400] output is fixed :) --- anvio/kegg.py | 49 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 904dbb59da..176ccde254 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1246,26 +1246,51 @@ def store_kegg_metabolism_superdict(self, kegg_superdict): } To distill this information into one line, we need to convert the dictionary on-the-fly to a dict of dicts, - where each genome/bin-module pair is keyed by an arbitrary integer. + where each bin-module-path-kofam_hit-gene_caller_id is keyed by an arbitrary integer. There will be a lot of redundant information + in the rows. """ + name_header = None + if self.profile_db_path and not self.metagenome_mode: + name_header = "genome_name" + elif not self.profile_db_path and not self.metagenome_mode: + name_header = "bin_name" + elif self.metagenome_mode: + name_header = "metagenome_name" + + header_list = [name_header, "kegg_module", "module_is_complete", "module_completeness", + "path_id", "path", "path_completeness", "kofam_hit_in_path", "gene_caller_id", "contig"] + d = {} unique_id = 0 for bin, mod_dict in kegg_superdict.items(): for mnum, c_dict in mod_dict.items(): if mnum == "num_complete_modules": continue - d[unique_id] = c_dict - if self.profile_db_path and not self.metagenome_mode: - d[unique_id]["genome_name"] = bin - elif not self.profile_db_path and not self.metagenome_mode: - d[unique_id]["bin_name"] = bin - elif self.metagenome_mode: - d[unique_id]["metagenome_name"] = bin - d[unique_id]["kegg_module"] = mnum - unique_id += 1 - - utils.store_dict_as_TAB_delimited_file(d, self.output_file_path, key_header="unique_id") + + for p_index in range(len(c_dict['paths'])): + p = c_dict['paths'][p_index] + + for ko in c_dict['kofam_hits']: + if ko not in p: + continue + + for gc_id in c_dict["kofam_hits"][ko]: + d[unique_id] = {} + d[unique_id][name_header] = bin + d[unique_id]["kegg_module"] = mnum + d[unique_id]["module_is_complete"] = c_dict["complete"] + d[unique_id]["module_completeness"] = c_dict["percent_complete"] + d[unique_id]["path_id"] = p_index + d[unique_id]["path"] = ",".join(p) + d[unique_id]["path_completeness"] = c_dict["pathway_completeness"][p_index] + d[unique_id]["kofam_hit_in_path"] = ko + d[unique_id]["gene_caller_id"] = gc_id + d[unique_id]["contig"] = c_dict["genes_to_contigs"][gc_id] + + unique_id += 1 + + utils.store_dict_as_TAB_delimited_file(d, self.output_file_path, key_header="unique_id", headers=header_list) self.run.info("Output file", self.output_file_path, nl_before=1) From 5ee9fa1204ac4e54ae5d30787ec3c7e04812209b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 2 Apr 2020 16:48:40 -0500 Subject: [PATCH 325/400] fix output headers --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 176ccde254..7c43fa8b7b 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1258,7 +1258,7 @@ def store_kegg_metabolism_superdict(self, kegg_superdict): elif self.metagenome_mode: name_header = 
"metagenome_name" - header_list = [name_header, "kegg_module", "module_is_complete", "module_completeness", + header_list = ["unique_id", name_header, "kegg_module", "module_is_complete", "module_completeness", "path_id", "path", "path_completeness", "kofam_hit_in_path", "gene_caller_id", "contig"] d = {} From 57af100bf2722678c65a50cf97bfecb278471c3c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 2 Apr 2020 17:16:32 -0500 Subject: [PATCH 326/400] switch headers for bin and genome --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 7c43fa8b7b..b0ad2c3338 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1252,9 +1252,9 @@ def store_kegg_metabolism_superdict(self, kegg_superdict): name_header = None if self.profile_db_path and not self.metagenome_mode: - name_header = "genome_name" - elif not self.profile_db_path and not self.metagenome_mode: name_header = "bin_name" + elif not self.profile_db_path and not self.metagenome_mode: + name_header = "genome_name" elif self.metagenome_mode: name_header = "metagenome_name" From 75cdf83ead17b783770a30ce866b5498b33483a6 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 6 Apr 2020 13:31:17 -0500 Subject: [PATCH 327/400] now we generate a summary file for complete modules in addition to the main completion dictionary output --- anvio/kegg.py | 37 ++++++++++++++++++++++++++----- bin/anvi-estimate-kegg-metabolism | 6 ++--- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 5997f06d01..cc69a1d4cc 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -735,8 +735,8 @@ def __init__(self, args, run=run, progress=progress): self.bin_id = A('bin_id') self.bin_ids_file = A('bin_ids_file') self.metagenome_mode = True if A('metagenome_mode') else False - self.completeness_threshold = A('module-completion-threshold') or 0.75 - self.output_file_path = A('output_file') or "kegg-metabolism.txt" + self.completeness_threshold = A('module_completion_threshold') or 0.75 + self.output_file_prefix = A('output_file_prefix') or "kegg-metabolism" self.contigs_db_project_name = "Unknown" self.bin_ids_to_process = None @@ -1334,7 +1334,7 @@ def estimate_metabolism(self): def store_kegg_metabolism_superdict(self, kegg_superdict): - """This function writes the metabolism superdict to a tab-delimited file. + """This function writes the metabolism superdict to a tab-delimited file, and also generates a file summarizing the complete modules. The metabolism superdict is a three-to-four-level dictionary. The first three levels are: genomes/bins, modules, and module completion information. The module completion dictionary also has some dictionaries in it, and those make up the fourth level. @@ -1361,8 +1361,15 @@ def store_kegg_metabolism_superdict(self, kegg_superdict): To distill this information into one line, we need to convert the dictionary on-the-fly to a dict of dicts, where each bin-module-path-kofam_hit-gene_caller_id is keyed by an arbitrary integer. There will be a lot of redundant information in the rows. + + The complete modules summary file includes only a portion of the information in the metabolism dictionary. Its purpose is to give the user + quick access to the complete modules in each bin. Every bin-module pair in this file is keyed by an arbitrary integer (with no relation to the + id in the other file). 
""" + hits_output_path = self.output_file_prefix + "-all_kofam_hits.txt" + complete_module_summary_path = self.output_file_prefix + "-complete_modules.txt" + name_header = None if self.profile_db_path and not self.metagenome_mode: name_header = "bin_name" @@ -1373,14 +1380,32 @@ def store_kegg_metabolism_superdict(self, kegg_superdict): header_list = ["unique_id", name_header, "kegg_module", "module_is_complete", "module_completeness", "path_id", "path", "path_completeness", "kofam_hit_in_path", "gene_caller_id", "contig"] + summary_header_list = ["unique_id", name_header, "kegg_module","module_completeness", "module_name", "module_class", + "module_category", "module_subcategory"] d = {} + cm_summary = {} unique_id = 0 + summary_unique_id = 0 for bin, mod_dict in kegg_superdict.items(): for mnum, c_dict in mod_dict.items(): if mnum == "num_complete_modules": continue + + if c_dict["complete"]: + cm_summary[summary_unique_id] = {} + cm_summary[summary_unique_id][name_header] = bin + cm_summary[summary_unique_id]["kegg_module"] = mnum + cm_summary[summary_unique_id]["module_completeness"] = c_dict["percent_complete"] + cm_summary[summary_unique_id]["module_name"] = self.kegg_modules_db.get_module_name(mnum) + mnum_class_dict = self.kegg_modules_db.get_kegg_module_class_dict(mnum) + cm_summary[summary_unique_id]["module_class"] = mnum_class_dict["class"] + cm_summary[summary_unique_id]["module_category"] = mnum_class_dict["category"] + cm_summary[summary_unique_id]["module_subcategory"] = mnum_class_dict["subcategory"] + + summary_unique_id += 1 + for p_index in range(len(c_dict['paths'])): p = c_dict['paths'][p_index] @@ -1403,8 +1428,10 @@ def store_kegg_metabolism_superdict(self, kegg_superdict): unique_id += 1 - utils.store_dict_as_TAB_delimited_file(d, self.output_file_path, key_header="unique_id", headers=header_list) - self.run.info("Output file", self.output_file_path, nl_before=1) + utils.store_dict_as_TAB_delimited_file(d, hits_output_path, key_header="unique_id", headers=header_list) + self.run.info("Kofam hits output file", hits_output_path, nl_before=1) + utils.store_dict_as_TAB_delimited_file(cm_summary, complete_module_summary_path, key_header="unique_id", headers=summary_header_list) + self.run.info("Complete modules summary file", complete_module_summary_path) class KeggModulesDatabase(KeggContext): diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index 02d1b06aa7..8d5a9e8082 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -47,10 +47,10 @@ if __name__ == '__main__': groupP.add_argument(*anvio.A('bin-id'), **anvio.K('bin-id')) groupP.add_argument(*anvio.A('bin-ids-file'), **anvio.K('bin-ids-file')) - groupC = parser.add_argument_group('OUTPUT', "Parameters for controlling estimation output. The output will be a TAB-delimited file which by \ - default is called kegg-metabolism.txt, but you can of course change that name here.") + groupC = parser.add_argument_group('OUTPUT', "Parameters for controlling estimation output. 
The output will be TAB-delimited files which by \ + default are prefixed with 'kegg-metabolism', but you can of course change that name here.") groupC.add_argument(*anvio.A('module-completion-threshold'), **anvio.K('module-completion-threshold')) - groupC.add_argument(*anvio.A('output-file'), **anvio.K('output-file')) + groupC.add_argument(*anvio.A('output-file-prefix'), **anvio.K('output-file-prefix')) args = anvio.get_args(parser) From e7cf5aceaa28f75d691b3679ad9cbb21de54c9a3 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 8 Apr 2020 13:44:19 -0500 Subject: [PATCH 328/400] redundancy estimatiom skeleton, plus func for naive redundancy --- anvio/kegg.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index cc69a1d4cc..820901ba6e 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1145,6 +1145,49 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): return now_complete + def compute_naive_redundancy_for_path(self, num_ko_hits_in_path_dict): + """This function computes a naive redundancy measure for a module path, given the number of hits per KO in the path. + + naive redundancy = # extra hits / len(path) where a hit is "extra" if it is not the first hit to the KO. + """ + extra_hits = [num_ko_hits_in_path_dict[ko] - 1 for ko in num_ko_hits_in_path_dict if num_ko_hits_in_path_dict[ko] > 1 ] + print("extra hits: ", extra_hits) + return sum(extra_hits)/len(num_ko_hits_in_path_dict.keys()) + + + def compute_module_redundancy_for_bin(self, mnum, meta_dict_for_bin): + """This function calculates the redundancy of the specified module within the given bin metabolism dictionary. + + Each module can have multiple paths, but we only compute redundancy on the paths with the highest completeness + (stored under the "most_complete_paths" key). If there are no paths in this list (which only happens when there + are 0 KOfam hits to the module), then we do not compute redundancy. + + PARAMETERS + ========== + mnum string, module number to work on + meta_dict_for_bin metabolism completeness dict for the current bin, to be modified in-place + + """ + + paths_of_highest_completeness = meta_dict_for_bin[mnum]["most_complete_paths"] + if not paths_of_highest_completeness: + # put zero values in dict wherever necessary + return + + for p in paths_of_highest_completeness: + kofam_hits_in_path = { ko : meta_dict_for_bin[mnum]["kofam_hits"][ko] for ko in meta_dict_for_bin[mnum]["kofam_hits"].keys() if ko in p } + num_hits_per_kofam = { ko : len(kofam_hits_in_path[ko]) for ko in kofam_hits_in_path.keys() } + for ko in p: + if ko not in num_hits_per_kofam: + num_hits_per_kofam[ko] = 0 + + # for now, we will try a bunch of different redundancy calculations and put them all into the dictionary until we find the ones we like + naive = self.compute_naive_redundancy_for_path(num_hits_per_kofam) + print("naive redundancy = ", naive) + + return + + def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=None): """This is the atomic metabolism estimator function, which builds a metabolism completeness dictionary for an arbitrary list of splits. 
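A small worked example of the naive redundancy measure introduced above, using toy hit counts for a three-KO path (a hit is "extra" if it is not the first hit to its KO):

# one entry per KO in the path, value = number of KOfam hits to that KO
num_hits_per_kofam = {'K00033': 3, 'K01057': 1, 'K00036': 0}
extra_hits = [n - 1 for n in num_hits_per_kofam.values() if n > 1]
naive_redundancy = sum(extra_hits) / len(num_hits_per_kofam)   # 2 / 3 = 0.67
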
@@ -1200,6 +1243,15 @@ def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=N if mod_is_complete: complete_mods.append(mod) + + # estimate redundancy of each module + for mod in metabolism_dict_for_list_of_splits.keys(): + if mod == "num_complete_modules": + continue + + # redundancy estimation GOES HERE + + # notify user of the modules that gave some fishy results if not self.quiet: if mods_with_nonessential_steps: From 788d3e792f8ceb2181ca86dcc23a46a48307403c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 8 Apr 2020 13:45:39 -0500 Subject: [PATCH 329/400] call redundancy estimation for each module --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 820901ba6e..214df8f81a 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1249,7 +1249,7 @@ def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=N if mod == "num_complete_modules": continue - # redundancy estimation GOES HERE + self.compute_module_redundancy_for_bin(mod, metabolism_dict_for_list_of_splits) # notify user of the modules that gave some fishy results From f71a088c993f946c1c1ca0e16d4d437b32f8e0bd Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 9 Apr 2020 13:05:01 -0500 Subject: [PATCH 330/400] import stats --- anvio/kegg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 214df8f81a..fea1949133 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -9,6 +9,7 @@ import glob import re import copy +import statistics as stats import anvio import anvio.db as db From 33e7f3eeea89b48e67f28440b4cf770978090664 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 9 Apr 2020 13:05:47 -0500 Subject: [PATCH 331/400] function for copywise redundancy computation --- anvio/kegg.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index fea1949133..175cced202 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1151,11 +1151,43 @@ def compute_naive_redundancy_for_path(self, num_ko_hits_in_path_dict): naive redundancy = # extra hits / len(path) where a hit is "extra" if it is not the first hit to the KO. """ + extra_hits = [num_ko_hits_in_path_dict[ko] - 1 for ko in num_ko_hits_in_path_dict if num_ko_hits_in_path_dict[ko] > 1 ] - print("extra hits: ", extra_hits) return sum(extra_hits)/len(num_ko_hits_in_path_dict.keys()) + def compute_copywise_redundancy_for_path(self, num_ko_hits_in_path_dict, aggregation_measure="average"): + """This function computes redundancy based on the completeness of each extra copy of a path. + + The 'base' redundancy score is determined by the number of extra copies with 100% completeness. + The completeness measurements of all other extra copies are aggregated (using the aggregation_measure) and + added to this 'base' redundancy to get the overall path redundancy. 
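As an illustration of the copy-wise idea described in this docstring, a self-contained sketch with invented hit counts (again, not part of the patch; the numbers are purely illustrative) could look like this:

    import statistics as stats

    # hypothetical hit counts for the three KOs of one path
    num_hits = {"K00010": 3, "K00011": 2, "K00012": 1}

    extra_hits = [n - 1 if n > 1 else 0 for n in num_hits.values()]   # [2, 1, 0]
    base_redundancy = min(extra_hits)                                  # no extra copy is 100% complete here

    # completeness of every remaining extra copy of the path
    extra_copy_completeness = []
    for i in range(base_redundancy + 1, max(extra_hits) + 1):
        num_present = len([n for n in extra_hits if n >= i])
        extra_copy_completeness.append(num_present / len(num_hits))    # -> [2/3, 1/3]

    aggregated = stats.mean(extra_copy_completeness) if extra_copy_completeness else 0
    copywise_average_redundancy = base_redundancy + aggregated         # 0 + 0.5 = 0.5

The same skeleton works for the median aggregation, and for the weighted-sum variant added a couple of commits later, where the c-th extra copy is down-weighted by a factor of 1/(c+1).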
+ """ + + extra_hits = [num_ko_hits_in_path_dict[ko] - 1 if num_ko_hits_in_path_dict[ko] > 1 else 0 for ko in num_ko_hits_in_path_dict] + base_redundancy = min(extra_hits) # number of extra copies of path that are 100% complete + extra_copy_completeness = [] + # here we get the completeness of every extra copy of the path + for i in range((base_redundancy+1), max(extra_hits) + 1): + num_present_kos_in_copy = len([num_hits for num_hits in extra_hits if num_hits >= i]) + extra_copy_completeness.append(num_present_kos_in_copy/len(num_ko_hits_in_path_dict.keys())) + + aggregated_completeness = None + if not extra_copy_completeness: # this handles the case when ALL extra copies are 100% complete + aggregated_completeness = 0 + else: + if aggregation_measure == "average": + aggregated_completeness = stats.mean(extra_copy_completeness) + elif aggregation_measure == "median": + aggregated_completeness = stats.median(extra_copy_completeness) + elif aggregation_measure == "knee": + raise ConfigError("aggregation measure 'knee' not implemented yet") + else: + raise ConfigError("The function compute_copywise_redundancy_for_path() doesn't know how to handle the aggregation measure '%s'", aggregation_measure) + + return (base_redundancy + aggregated_completeness), extra_copy_completeness + + def compute_module_redundancy_for_bin(self, mnum, meta_dict_for_bin): """This function calculates the redundancy of the specified module within the given bin metabolism dictionary. From f685e18db0b99253573e5c47951361e987a879f3 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 9 Apr 2020 13:06:12 -0500 Subject: [PATCH 332/400] store redundancy outputs in module completeness dict --- anvio/kegg.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 175cced202..5d35154f24 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1202,6 +1202,12 @@ def compute_module_redundancy_for_bin(self, mnum, meta_dict_for_bin): """ + meta_dict_for_bin[mnum]["naive_redundancy"] = [] + meta_dict_for_bin[mnum]["copywise_average"] = [] + meta_dict_for_bin[mnum]["copywise_average_completeness_distributions"] = [] + meta_dict_for_bin[mnum]["copywise_median"] = [] + meta_dict_for_bin[mnum]["copywise_median_completeness_distributions"] = [] + paths_of_highest_completeness = meta_dict_for_bin[mnum]["most_complete_paths"] if not paths_of_highest_completeness: # put zero values in dict wherever necessary @@ -1215,8 +1221,14 @@ def compute_module_redundancy_for_bin(self, mnum, meta_dict_for_bin): num_hits_per_kofam[ko] = 0 # for now, we will try a bunch of different redundancy calculations and put them all into the dictionary until we find the ones we like - naive = self.compute_naive_redundancy_for_path(num_hits_per_kofam) - print("naive redundancy = ", naive) + meta_dict_for_bin[mnum]["naive_redundancy"].append(self.compute_naive_redundancy_for_path(num_hits_per_kofam)) + cw_avg_redundancy, copy_completeness_distribution = self.compute_copywise_redundancy_for_path(num_hits_per_kofam, aggregation_measure="average") + meta_dict_for_bin[mnum]["copywise_average"].append(cw_avg_redundancy) + meta_dict_for_bin[mnum]["copywise_average_completeness_distributions"].append(copy_completeness_distribution) + cw_med_redundancy, copy_completeness_distribution = self.compute_copywise_redundancy_for_path(num_hits_per_kofam, aggregation_measure="median") + meta_dict_for_bin[mnum]["copywise_median"].append(cw_med_redundancy) + 
meta_dict_for_bin[mnum]["copywise_median_completeness_distributions"].append(copy_completeness_distribution) + return From 840aeabffbccf41e13c6915abae7e3885ce44c50 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 9 Apr 2020 13:33:13 -0500 Subject: [PATCH 333/400] add weighted sum aggregation measure --- anvio/kegg.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 5d35154f24..79fd74da25 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1180,6 +1180,10 @@ def compute_copywise_redundancy_for_path(self, num_ko_hits_in_path_dict, aggrega aggregated_completeness = stats.mean(extra_copy_completeness) elif aggregation_measure == "median": aggregated_completeness = stats.median(extra_copy_completeness) + elif aggregation_measure == "weighted_sum": + aggregated_completeness = 0 + for c in range(len(extra_copy_completeness)): + aggregated_completeness += 1/(c+1) * extra_copy_completeness[c] elif aggregation_measure == "knee": raise ConfigError("aggregation measure 'knee' not implemented yet") else: @@ -1207,6 +1211,8 @@ def compute_module_redundancy_for_bin(self, mnum, meta_dict_for_bin): meta_dict_for_bin[mnum]["copywise_average_completeness_distributions"] = [] meta_dict_for_bin[mnum]["copywise_median"] = [] meta_dict_for_bin[mnum]["copywise_median_completeness_distributions"] = [] + meta_dict_for_bin[mnum]["copywise_weighted-sum"] = [] + meta_dict_for_bin[mnum]["copywise_weighted-sum_completeness_distributions"] = [] paths_of_highest_completeness = meta_dict_for_bin[mnum]["most_complete_paths"] if not paths_of_highest_completeness: @@ -1228,6 +1234,9 @@ def compute_module_redundancy_for_bin(self, mnum, meta_dict_for_bin): cw_med_redundancy, copy_completeness_distribution = self.compute_copywise_redundancy_for_path(num_hits_per_kofam, aggregation_measure="median") meta_dict_for_bin[mnum]["copywise_median"].append(cw_med_redundancy) meta_dict_for_bin[mnum]["copywise_median_completeness_distributions"].append(copy_completeness_distribution) + cw_ws_redundancy, copy_completeness_distribution = self.compute_copywise_redundancy_for_path(num_hits_per_kofam, aggregation_measure="weighted_sum") + meta_dict_for_bin[mnum]["copywise_weighted-sum"].append(cw_ws_redundancy) + meta_dict_for_bin[mnum]["copywise_weighted-sum_completeness_distributions"].append(copy_completeness_distribution) return From 48ecf24e4479d3b4e696b68920760fbf464d221e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 9 Apr 2020 13:36:28 -0500 Subject: [PATCH 334/400] we only add completeness distribution once because it is the same regardless of aggregation measure --- anvio/kegg.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 79fd74da25..73c55f52d8 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1208,11 +1208,9 @@ def compute_module_redundancy_for_bin(self, mnum, meta_dict_for_bin): meta_dict_for_bin[mnum]["naive_redundancy"] = [] meta_dict_for_bin[mnum]["copywise_average"] = [] - meta_dict_for_bin[mnum]["copywise_average_completeness_distributions"] = [] + meta_dict_for_bin[mnum]["copywise_completeness_distributions"] = [] meta_dict_for_bin[mnum]["copywise_median"] = [] - meta_dict_for_bin[mnum]["copywise_median_completeness_distributions"] = [] meta_dict_for_bin[mnum]["copywise_weighted-sum"] = [] - meta_dict_for_bin[mnum]["copywise_weighted-sum_completeness_distributions"] = [] paths_of_highest_completeness = meta_dict_for_bin[mnum]["most_complete_paths"] if not paths_of_highest_completeness: @@ -1230,13 +1228,11 
@@ def compute_module_redundancy_for_bin(self, mnum, meta_dict_for_bin): meta_dict_for_bin[mnum]["naive_redundancy"].append(self.compute_naive_redundancy_for_path(num_hits_per_kofam)) cw_avg_redundancy, copy_completeness_distribution = self.compute_copywise_redundancy_for_path(num_hits_per_kofam, aggregation_measure="average") meta_dict_for_bin[mnum]["copywise_average"].append(cw_avg_redundancy) - meta_dict_for_bin[mnum]["copywise_average_completeness_distributions"].append(copy_completeness_distribution) + meta_dict_for_bin[mnum]["copywise_completeness_distributions"].append(copy_completeness_distribution) cw_med_redundancy, copy_completeness_distribution = self.compute_copywise_redundancy_for_path(num_hits_per_kofam, aggregation_measure="median") meta_dict_for_bin[mnum]["copywise_median"].append(cw_med_redundancy) - meta_dict_for_bin[mnum]["copywise_median_completeness_distributions"].append(copy_completeness_distribution) cw_ws_redundancy, copy_completeness_distribution = self.compute_copywise_redundancy_for_path(num_hits_per_kofam, aggregation_measure="weighted_sum") meta_dict_for_bin[mnum]["copywise_weighted-sum"].append(cw_ws_redundancy) - meta_dict_for_bin[mnum]["copywise_weighted-sum_completeness_distributions"].append(copy_completeness_distribution) return From 92eaebffdee249662ec4977a0003829e8434f543 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 14 Apr 2020 10:11:06 -0500 Subject: [PATCH 335/400] add option to use hmmsearch --- anvio/drivers/hmmer.py | 13 ++++++++++--- anvio/kegg.py | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index 0a51526600..531c43c94b 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -31,15 +31,21 @@ class HMMer: - def __init__(self, target_files_dict, num_threads_to_use=1, progress=progress, run=run): + def __init__(self, target_files_dict, num_threads_to_use=1, program_to_use="hmmscan", progress=progress, run=run): """A class to streamline HMM runs.""" self.num_threads_to_use = num_threads_to_use + self.program_to_use = program_to_use self.progress = progress self.run = run self.tmp_dirs = [] self.target_files_dict = {} + acceptable_programs = ["hmmscan", "hmmsearch"] + if self.program_to_use not in acceptable_programs: + raise ConfigError("HMMer class here. You are attemptimg to use the program %s to run HMMs, but we don't recognize it. 
The currently" + " supported programs are: %s" % (self.program_to_use, ", ".join(acceptable_programs))) + for source in target_files_dict: tmp_dir = filesnpaths.get_temp_directory_path() self.tmp_dirs.append(tmp_dir) @@ -96,6 +102,7 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.run.info('Number of genes in HMM model', num_genes_in_model) self.run.info('Noise cutoff term(s)', noise_cutoff_terms) self.run.info('Number of CPUs will be used for search', self.num_threads_to_use) + self.run.info('HMMer program used for search', self.program_to_use) tmp_dir = os.path.dirname(self.target_files_dict[target][0]) log_file_path = os.path.join(tmp_dir, '*_log') @@ -129,13 +136,13 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode shitty_file = part_file + '_shitty' if noise_cutoff_terms: - cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan', + cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use, '-o', output_file, *noise_cutoff_terms.split(), '--cpu', cores_per_process, '--tblout', shitty_file, hmm, part_file] else: # if we didn't pass any noise cutoff terms, here we don't include them in the command line - cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan', + cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use, '-o', output_file, '--cpu', cores_per_process, '--tblout', shitty_file, diff --git a/anvio/kegg.py b/anvio/kegg.py index 73c55f52d8..de7e3fb8a7 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -625,7 +625,7 @@ def process_kofam_hmms(self): report_aa_sequences=True) # run hmmscan - hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads) + hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads, program_to_use="hmmsearch") hmm_hits_file = hmmer.run_hmmscan('KOfam', 'AA', 'GENE', None, None, len(self.ko_dict), self.kofam_hmm_file_path, None, None) # get an instance of gene functions table From 40dce58e8a715852f3ec3b645e68cab8cd374e81 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 14 Apr 2020 17:29:40 -0500 Subject: [PATCH 336/400] add hmmpress step to fix anvi-run-hmms, which broke after i removed hmmpress from the hmmer module --- anvio/tables/hmmhits.py | 43 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/anvio/tables/hmmhits.py b/anvio/tables/hmmhits.py index 6cb8dacc01..5e96c908b1 100644 --- a/anvio/tables/hmmhits.py +++ b/anvio/tables/hmmhits.py @@ -3,6 +3,8 @@ import os import hashlib +import gzip +import shutil import anvio import anvio.db as db @@ -83,6 +85,39 @@ def check_sources(self, sources): "to remove them first, or run this program with `--just-do-it` flag so anvi'o would remove all " "for you. Here are the list of HMM sources that need to be removed: '%s'." % (', '.join(sources_need_to_be_removed))) + def hmmpress_sources(self, sources, tmp_dir, in_place=False): + """This function checks if the hmm files have been hmmpressed, and if not, it runs hmmpress. + + If in_place is False, we assume that the model should be unpacked and compressed in the temp directory. + Otherwise, we do it in the directory where the model is stored so that it only has to be done once. + + Returns the locations of each hmmpressed file path in a dictionary keyed by the source. 
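For orientation, the decompress-then-hmmpress step this docstring describes boils down to something like the following standalone sketch (the file name is hypothetical, and the hmmpress binary from the HMMER suite is assumed to be on the PATH):

    import gzip
    import os
    import shutil
    import subprocess
    import tempfile

    tmp_dir = tempfile.mkdtemp()
    hmm_file_path = os.path.join(tmp_dir, "MySource.hmm")

    # unpack the gzipped model into the temporary directory
    with gzip.open("MySource.hmm.gz", "rb") as gz, open(hmm_file_path, "wb") as out:
        shutil.copyfileobj(gz, out)

    # hmmpress writes the .h3f/.h3i/.h3m/.h3p binaries next to the plain-text profile
    subprocess.run(["hmmpress", hmm_file_path], check=True)

Pressing into a temporary directory leaves the original model directory untouched, at the cost of redoing the work on every run, which is apparently what the in_place option is meant to avoid eventually.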
+ """ + hmmpressed_file_paths = {} + for source in sources: + model_file = sources[source]['model'] + hmm_file_path = None + + if in_place: + pass + #hmm_file_path = model_file + # check here if already hmmpressed and if so return + else: + hmm_file_path = os.path.join(tmp_dir, source + '.hmm') + hmm_file = open(hmm_file_path, 'wb') + hmm_file.write(gzip.open(model_file, 'rb').read()) + hmm_file.close() + + log_file_path = log_file_path = os.path.join(tmp_dir, 'hmmpress.log') + cmd_line = ['hmmpress', hmm_file_path] + ret_val = utils.run_command(cmd_line, log_file_path) + + hmmpressed_file_paths[source] = hmm_file_path + + if ret_val: + raise ConfigError("Sadly, anvi'o failed while attempting to compress the HMM model for source %s. You can check out the log file (%s) for " + "more detailed information on why this happened." % (source, log_file_path)) + return hmmpressed_file_paths def populate_search_tables(self, sources={}): # make sure the output file is OK to write. @@ -102,6 +137,8 @@ def populate_search_tables(self, sources={}): tmp_directory_path = filesnpaths.get_temp_directory_path() + hmmpressed_files = self.hmmpress_sources(sources, tmp_directory_path) + # here we will go through targets and populate target_files_dict based on what we find among them. targets = set([s['target'] for s in list(sources.values())]) for target in targets: @@ -146,7 +183,7 @@ class Args: pass kind_of_search = sources[source]['kind'] domain = sources[source]['domain'] all_genes_searched_against = sources[source]['genes'] - hmm_model = sources[source]['model'] + hmm_model = hmmpressed_files[source] reference = sources[source]['ref'] noise_cutoff_terms = sources[source]['noise_cutoff_terms'] @@ -208,6 +245,7 @@ class Args: pass self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict) + # FIXME: I have no clue why importing the anvio module is necessary at this point, # but without this, mini test fails becasue "`anvio.DEBUG` is being used # before initialization". nonsense. @@ -217,6 +255,8 @@ class Args: pass for v in list(target_files_dict.values()): os.remove(v) + shutil.rmtree(tmp_directory_path) + def add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(self, source, search_results_dict, skip_amino_acid_sequences=False): """Add new gene calls to the contigs database and update the HMM `search_results_dict`. @@ -410,4 +450,3 @@ def process_splits(self, search_results_dict): db_entries_for_splits.append(db_entry) return db_entries_for_splits - From fb83f0d06f0d025e2704383b91ffa91890779730 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 15 Apr 2020 16:27:55 -0500 Subject: [PATCH 337/400] parser handles hmmsearch output --- anvio/parsers/hmmscan.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index 39399b9a95..f5c90c592c 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -20,20 +20,31 @@ class HMMScan(Parser): - def __init__(self, hmm_scan_hits_txt, alphabet='AA', context='GENE'): + def __init__(self, hmm_scan_hits_txt, alphabet='AA', context='GENE', program='hmmscan'): self.alphabet = alphabet self.context = context + self.program = program self.run = run files_expected = {'hits': hmm_scan_hits_txt} if self.context == "GENE": - # see the HMMER user guide for details of the fields for AA sequence search, and DNA sequence search. 
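The column swap this parser change accounts for can be illustrated with a single fabricated --tblout line (a rough sketch, not part of the patch; all values below are made up). With hmmsearch the profiles are the queries and the genes are the targets, so the gene caller id comes first; hmmscan reports the reverse:

    line = "1234 - K00001 - 1.2e-30 105.3 0.1 1.5e-30 104.9 0.1 1.0 1 0 0 1 1 1 1 -"
    fields = line.split()

    program = "hmmsearch"          # or "hmmscan"
    if program == "hmmsearch":
        gene_callers_id, gene_name = int(fields[0]), fields[2]
    else:
        gene_name, gene_callers_id = fields[0], int(fields[2])

    e_value, bit_score = float(fields[4]), float(fields[5])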
- # --- full sequence ---- --- best 1 domain ---- --- domain number estimation ---- - # target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description - col_names = ['gene_name', 'gene_hmm_id', 'gene_callers_id', 'f', 'e_value', 'bit_score', 'f', 'f', 'dom_bit_score', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f'] - col_mapping = [str, str, int, str, float, float, str, str, float, str, str, str, str, str, str, str, str, str] + if self.program == 'hmmscan': + # see the HMMER user guide for details of the fields for AA sequence search, and DNA sequence search. + # --- full sequence ---- --- best 1 domain ---- --- domain number estimation ---- + # target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description + col_names = ['gene_name', 'gene_hmm_id', 'gene_callers_id', 'f', 'e_value', 'bit_score', 'f', 'f', 'dom_bit_score', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f'] + col_mapping = [str, str, int, str, float, float, str, str, float, str, str, str, str, str, str, str, str, str] + elif self.program == 'hmmsearch': + # --- full sequence ---- --- best 1 domain ---- --- domain number estimation ---- + # target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description of target + #------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ ----- --- --- --- --- --- --- --- --- --------------------- + col_names = ['gene_callers_id', 'f', 'gene_name', 'gene_hmm_id', 'e_value', 'bit_score', 'f', 'f', 'dom_bit_score', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f'] + col_mapping = [int, str, str, str, float, float, str, str, float, str, str, str, str, str, str, str, str, str] + else: + raise ConfigError("The HMMScan Parser class is not sure if you know what you are doing. You told it that you wanted to " + "parse HMM hits from the program %s, but this class doesn't know how to handle those." 
% (self.program)) elif self.context == "CONTIG" and (self.alphabet == "DNA" or self.alphabet == "RNA"): # 'hmm_target', 'hmm_acc', 'query_id', 'query_acc', 'hmm_from', 'hmm_to', 'alignment_from', 'alignment_to', 'envelope_from', 'envelope_to', 'seq_len', 'strand', 'e_value', 'score', 'bias', 'desc'] col_names = ['gene_name', 'gene_hmm_id', 'contig_name', 'f', 'hmm_from', 'hmm_to', 'alignment_from', 'alignment_to', 'envelope_from', 'envelope_to', 'f', 'f', 'e_value', 'f', 'f', 'f'] From 65e480dbe270de700bd79658e1308685a6d770ea Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 15 Apr 2020 16:50:31 -0500 Subject: [PATCH 338/400] add option to choose which hmmer program to run --- anvio/__init__.py | 6 ++++++ anvio/kegg.py | 11 ++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/anvio/__init__.py b/anvio/__init__.py index 0e9f210f9d..a5819b4d08 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -709,6 +709,12 @@ def get_args(parser): "up using this flag) (plus, there may or may not be some historical data on this here: " "https://github.com/meren/anvio/issues/309)."} ), + 'hmmer-program': ( + ['--hmmer-program'], + {'type': str, + 'required': False, + 'help': "Which of the HMMER programs to use to run HMMs (ie, hmmscan, hmmsearch)"} + ), 'hmm-source': ( ['--hmm-source'], {'metavar': 'SOURCE NAME', diff --git a/anvio/kegg.py b/anvio/kegg.py index de7e3fb8a7..9f3ecfae0c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -575,12 +575,13 @@ def __init__(self, args, run=run, progress=progress): self.progress = progress self.contigs_db_path = args.contigs_db self.num_threads = args.num_threads + self.hmm_program = args.hmmer_program or 'hmmsearch' self.ko_dict = None # should be set up by setup_ko_dict() # init the base class KeggContext.__init__(self, self.args) - filesnpaths.is_program_exists('hmmscan') + filesnpaths.is_program_exists(self.hmm_program) # verify that Kofam HMM profiles have been set up if not os.path.exists(self.kofam_hmm_file_path): @@ -606,8 +607,8 @@ def get_annotation_from_ko_dict(self, knum, ok_if_missing_from_dict=False): if ok_if_missing_from_dict: return "Unknown function with KO num %s" % knum else: - raise ConfigError("It seems hmmscan found a KO number that does not exist\ - in the KOfam ko_list file: %s" % knum) + raise ConfigError("It seems %s found a KO number that does not exist\ + in the KOfam ko_list file: %s" % (self.hmm_program, knum)) return self.ko_dict[knum]['definition'] @@ -625,7 +626,7 @@ def process_kofam_hmms(self): report_aa_sequences=True) # run hmmscan - hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads, program_to_use="hmmsearch") + hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads, program_to_use=self.hmm_program) hmm_hits_file = hmmer.run_hmmscan('KOfam', 'AA', 'GENE', None, None, len(self.ko_dict), self.kofam_hmm_file_path, None, None) # get an instance of gene functions table @@ -646,7 +647,7 @@ def process_kofam_hmms(self): return # parse hmmscan output - parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE') + parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE', program=self.hmm_program) search_results_dict = parser.get_search_results(ko_list_dict=self.ko_dict) # add functions and KEGG modules info to database From 5e1bc9afab3aedd8ca17f5b6a63db9cc74ae8d7a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 15 Apr 2020 17:26:38 -0500 Subject: [PATCH 339/400] add parameter for hmmer program to kofams 
script --- bin/anvi-run-kegg-kofams | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams index dbebaf8dea..c41bf5f27d 100755 --- a/bin/anvi-run-kegg-kofams +++ b/bin/anvi-run-kegg-kofams @@ -36,6 +36,7 @@ if __name__ == '__main__': groupR.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db')) groupO.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) groupO.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads')) + groupO.add_argument(*anvio.A('hmmer-program'), **anvio.K('hmmer-program')) args = anvio.get_args(parser) From 0029606a18d6286c8bc575c0bc2f8b468e3fcd5a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 10:46:57 -0500 Subject: [PATCH 340/400] write metabolism dict to json format --- anvio/__init__.py | 7 +++++++ anvio/kegg.py | 17 +++++++++++++++++ bin/anvi-estimate-kegg-metabolism | 3 +++ 3 files changed, 27 insertions(+) diff --git a/anvio/__init__.py b/anvio/__init__.py index 846fe3db82..f40eba3dd5 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -2194,6 +2194,13 @@ def get_args(parser): "'present' in a given genome or bin. It is the fraction of steps that must be complete in " " in order for the entire module to be marked complete. The default is %(default)g."} ), + 'get-raw-data-as-json': ( + ['--get-raw-data-as-json'], + {'default': None, + 'metavar': 'FILE_PATH', + 'type': str, + 'help': "If you want the raw metabolism estimation data dictionary in JSON-format, provide a file path to this argument."} + ), } # two functions that works with the dictionary above. diff --git a/anvio/kegg.py b/anvio/kegg.py index 73c55f52d8..ec4337ca26 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -10,6 +10,7 @@ import re import copy import statistics as stats +import json import anvio import anvio.db as db @@ -739,6 +740,8 @@ def __init__(self, args, run=run, progress=progress): self.completeness_threshold = A('module_completion_threshold') or 0.75 self.output_file_prefix = A('output_file_prefix') or "kegg-metabolism" self.contigs_db_project_name = "Unknown" + self.write_dict_to_json = True if A('get_raw_data_as_json') else False + self.json_output_file_path = A('get_raw_data_as_json') self.bin_ids_to_process = None if self.bin_id and self.bin_ids_file: @@ -1433,6 +1436,8 @@ def estimate_metabolism(self): raise ConfigError("This class doesn't know how to deal with that yet :/") self.store_kegg_metabolism_superdict(kegg_metabolism_superdict) + if self.write_dict_to_json: + self.store_metabolism_superdict_as_json(kegg_metabolism_superdict) def store_kegg_metabolism_superdict(self, kegg_superdict): @@ -1536,6 +1541,18 @@ def store_kegg_metabolism_superdict(self, kegg_superdict): self.run.info("Complete modules summary file", complete_module_summary_path) + def store_metabolism_superdict_as_json(self, kegg_superdict): + """This function writes the metabolism superdict into one json file.""" + + def set_to_list(obj): + if isinstance(obj, set): + return list(obj) + + filesnpaths.is_output_file_writable(self.json_output_file_path) + open(self.json_output_file_path, 'w').write(json.dumps(kegg_superdict, indent=4, default=set_to_list)) + self.run.info("JSON Output", self.json_output_file_path) + + class KeggModulesDatabase(KeggContext): """To create or access a Modules DB. 
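As a side note, the set_to_list hook above exists because the json module cannot encode Python sets. A tiny self-contained sketch of the behavior, with an invented dictionary, is:

    import json

    superdict = {"bin_1": {"M00001": {"gene_caller_ids": {10, 11, 42}}}}   # made-up example data

    def set_to_list(obj):
        if isinstance(obj, set):
            return list(obj)

    # without default=set_to_list this call raises TypeError: Object of type set is not JSON serializable
    text = json.dumps(superdict, indent=4, default=set_to_list)

    # the round trip returns lists, which is why the JSON-loading path later has to rebuild the sets
    assert isinstance(json.loads(text)["bin_1"]["M00001"]["gene_caller_ids"], list)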
diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index 8d5a9e8082..dfd2f6cc80 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -52,6 +52,9 @@ if __name__ == '__main__': groupC.add_argument(*anvio.A('module-completion-threshold'), **anvio.K('module-completion-threshold')) groupC.add_argument(*anvio.A('output-file-prefix'), **anvio.K('output-file-prefix')) + groupD = parser.add_argument_group('DEBUG', "Parameters to use if you think something fishy is going on or otherwise want to exert more control. Go for it.") + groupD.add_argument(*anvio.A('get-raw-data-as-json'), **anvio.K('get-raw-data-as-json')) + args = anvio.get_args(parser) try: From b046683d0f54807742ef3e29a846f1feddb097d5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 12:27:08 -0500 Subject: [PATCH 341/400] more params for JSON data --- anvio/__init__.py | 17 +++++++++++++++++ anvio/kegg.py | 2 ++ bin/anvi-estimate-kegg-metabolism | 2 ++ 3 files changed, 21 insertions(+) diff --git a/anvio/__init__.py b/anvio/__init__.py index f40eba3dd5..15cdac314a 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -2201,6 +2201,23 @@ def get_args(parser): 'type': str, 'help': "If you want the raw metabolism estimation data dictionary in JSON-format, provide a file path to this argument."} ), + 'store-json-before-estimation': ( + ['--store-json-before-estimation'], + {'default': False, + 'action': 'store_true', + 'help': "This flag is used to control when the metabolism data dictionary is stored. When provided alongside the " + "--get-raw-data-as-json flag, the JSON file will be created before metabolism estimation is run, and " + "that file will consequently include only information about KOfam hits and gene calls. The idea is that you can " + "then modify this file as you like and re-run this program using the flag --estimate-from-json."} + ), + 'estimate-from-json': ( + ['--estimate-from-json'], + {'default': False, + 'action': 'store_true', + 'help': "If you have a JSON file containing KOfam hits and gene call information from your contigs database " + "(such as a file produced using the --get-raw-data-as-json flag), you can provide that file to this flag " + "and KEGG metabolism estimates will be computed from the information within instead of from a contigs database."} + ), } # two functions that works with the dictionary above. diff --git a/anvio/kegg.py b/anvio/kegg.py index ec4337ca26..cd08b95795 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -742,6 +742,8 @@ def __init__(self, args, run=run, progress=progress): self.contigs_db_project_name = "Unknown" self.write_dict_to_json = True if A('get_raw_data_as_json') else False self.json_output_file_path = A('get_raw_data_as_json') + self.store_json_before_estimation = True if A('store_json_before_estimation') else False + self.estimate_from_json = True if A('estimate_from_json') else False self.bin_ids_to_process = None if self.bin_id and self.bin_ids_file: diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index dfd2f6cc80..07be176b76 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -54,6 +54,8 @@ if __name__ == '__main__': groupD = parser.add_argument_group('DEBUG', "Parameters to use if you think something fishy is going on or otherwise want to exert more control. 
Go for it.") groupD.add_argument(*anvio.A('get-raw-data-as-json'), **anvio.K('get-raw-data-as-json')) + groupD.add_argument(*anvio.A('store-json-before-estimation'), **anvio.K('store-json-before-estimation')) + groupD.add_argument(*anvio.A('estimate-from-json'), **anvio.K('estimate-from-json')) args = anvio.get_args(parser) From 4513de74e9aa25c1903fbc36f0ef5757116bc718 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 12:57:33 -0500 Subject: [PATCH 342/400] fix estimate from json flag to take a file path --- anvio/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/anvio/__init__.py b/anvio/__init__.py index 15cdac314a..82beabfb37 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -2212,8 +2212,9 @@ def get_args(parser): ), 'estimate-from-json': ( ['--estimate-from-json'], - {'default': False, - 'action': 'store_true', + {'default': None, + 'metavar': 'FILE_PATH', + 'type': str, 'help': "If you have a JSON file containing KOfam hits and gene call information from your contigs database " "(such as a file produced using the --get-raw-data-as-json flag), you can provide that file to this flag " "and KEGG metabolism estimates will be computed from the information within instead of from a contigs database."} From f123b7556a8807e6f2d4b30235819469bc71493b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 12:58:06 -0500 Subject: [PATCH 343/400] sanity check for json params --- anvio/kegg.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index cd08b95795..36b0efc06e 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -743,7 +743,7 @@ def __init__(self, args, run=run, progress=progress): self.write_dict_to_json = True if A('get_raw_data_as_json') else False self.json_output_file_path = A('get_raw_data_as_json') self.store_json_before_estimation = True if A('store_json_before_estimation') else False - self.estimate_from_json = True if A('estimate_from_json') else False + self.estimate_from_json = A('estimate_from_json') or None self.bin_ids_to_process = None if self.bin_id and self.bin_ids_file: @@ -758,6 +758,10 @@ def __init__(self, args, run=run, progress=progress): if self.profile_db_path and not self.collection_name: raise ConfigError("If you provide a profiles DB, you should also provide a collection name.") + if self.store_json_before_estimation and not self.json_output_file_path: + raise ConfigError("Whoops. You seem to want to store the metabolism dictionary in a JSON file, but you haven't provided the name of that file. 
" + "Please use the --get-raw-data-as-json flag to do so.") + # init the base class KeggContext.__init__(self, self.args) From 0227f85d3c93c886c831a09da867b43650ed710a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 14:42:17 -0500 Subject: [PATCH 344/400] change filename to file prefix for json output --- anvio/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anvio/__init__.py b/anvio/__init__.py index 82beabfb37..927c3b3768 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -2197,9 +2197,12 @@ def get_args(parser): 'get-raw-data-as-json': ( ['--get-raw-data-as-json'], {'default': None, - 'metavar': 'FILE_PATH', + 'metavar': 'FILENAME_PREFIX', 'type': str, - 'help': "If you want the raw metabolism estimation data dictionary in JSON-format, provide a file path to this argument."} + 'help': "If you want the raw metabolism estimation data dictionary in JSON-format, provide a filename prefix to this argument." + "The program will then output one or more files with the .json extension containing this data. " + "P.S. The only time you will see multiple output JSON files is when you run this program on multiple bins using --store-json-before-estimation" + ", in which case the bin names will become part of each file name."} ), 'store-json-before-estimation': ( ['--store-json-before-estimation'], From 33d9ad8633c438349b5c5975746e8b2c4c33fbd6 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 14:42:42 -0500 Subject: [PATCH 345/400] sanity check for profiles.db --- anvio/kegg.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 36b0efc06e..413af5f659 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -755,6 +755,10 @@ def __init__(self, args, run=run, progress=progress): filesnpaths.is_file_exists(self.bin_ids_file) self.bin_ids_to_process = [line.strip() for line in open(self.bin_ids_file).readlines()] + if self.bin_id or self.bin_ids_file or self.collection_name and not self.profile_db_path: + raise ConfigError("You have requested metabolism estimation for a bin or set of bins, but you haven't provided " + "a profiles database. Unfortunately, this just does not work. 
Please try again.") + if self.profile_db_path and not self.collection_name: raise ConfigError("If you provide a profiles DB, you should also provide a collection name.") From 9d0e7eb34ccfe7dc4dc4f0ffce6105a3977439fd Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 14:48:18 -0500 Subject: [PATCH 346/400] json storage function now takes a file path --- anvio/kegg.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 413af5f659..dde95b10ee 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1446,8 +1446,8 @@ def estimate_metabolism(self): raise ConfigError("This class doesn't know how to deal with that yet :/") self.store_kegg_metabolism_superdict(kegg_metabolism_superdict) - if self.write_dict_to_json: - self.store_metabolism_superdict_as_json(kegg_metabolism_superdict) + if self.write_dict_to_json and not self.store_json_before_estimation: + self.store_metabolism_superdict_as_json(kegg_metabolism_superdict, self.json_output_file_path + ".json") def store_kegg_metabolism_superdict(self, kegg_superdict): @@ -1551,16 +1551,16 @@ def store_kegg_metabolism_superdict(self, kegg_superdict): self.run.info("Complete modules summary file", complete_module_summary_path) - def store_metabolism_superdict_as_json(self, kegg_superdict): + def store_metabolism_superdict_as_json(self, kegg_superdict, file_path): """This function writes the metabolism superdict into one json file.""" def set_to_list(obj): if isinstance(obj, set): return list(obj) - filesnpaths.is_output_file_writable(self.json_output_file_path) - open(self.json_output_file_path, 'w').write(json.dumps(kegg_superdict, indent=4, default=set_to_list)) - self.run.info("JSON Output", self.json_output_file_path) + filesnpaths.is_output_file_writable(file_path) + open(file_path, 'w').write(json.dumps(kegg_superdict, indent=4, default=set_to_list)) + self.run.info("JSON Output", file_path) class KeggModulesDatabase(KeggContext): From d336b8eb79d45561f32c2c3d6f5945bc52d89e92 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 14:48:49 -0500 Subject: [PATCH 347/400] store json before running estimation --- anvio/kegg.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index dde95b10ee..75fe0d2240 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1274,6 +1274,11 @@ def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=N metabolism_dict_for_list_of_splits = self.mark_kos_present_for_list_of_splits(ko_hits_in_splits, split_list=splits, bin_name=bin_name) + + if self.store_json_before_estimation: + bin_level_metabolism_dict_for_json = { bin_name: metabolism_dict_for_list_of_splits } + self.store_metabolism_superdict_as_json(bin_level_metabolism_dict_for_json, self.json_output_file_path + "_" + bin_name + ".json") + metabolism_dict_for_list_of_splits["num_complete_modules"] = 0 complete_mods = [] From 15ecb7bc5bc435d7a593dd2751bb03137fd88763 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 15:06:39 -0500 Subject: [PATCH 348/400] do not run estimation when we do not want that info in the JSON output --- anvio/__init__.py | 12 +++++------- anvio/kegg.py | 14 +++++++------- bin/anvi-estimate-kegg-metabolism | 2 +- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/anvio/__init__.py b/anvio/__init__.py index 927c3b3768..33076507eb 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -2200,16 +2200,14 @@ def get_args(parser): 'metavar': 'FILENAME_PREFIX', 'type': str, 'help': 
"If you want the raw metabolism estimation data dictionary in JSON-format, provide a filename prefix to this argument." - "The program will then output one or more files with the .json extension containing this data. " - "P.S. The only time you will see multiple output JSON files is when you run this program on multiple bins using --store-json-before-estimation" - ", in which case the bin names will become part of each file name."} + "The program will then output a file with the .json extension containing this data."} ), - 'store-json-before-estimation': ( - ['--store-json-before-estimation'], + 'store-json-without-estimation': ( + ['--store-json-without-estimation'], {'default': False, 'action': 'store_true', - 'help': "This flag is used to control when the metabolism data dictionary is stored. When provided alongside the " - "--get-raw-data-as-json flag, the JSON file will be created before metabolism estimation is run, and " + 'help': "This flag is used to control what is stored in the JSON-formatted metabolism data dictionary. When this flag is provided alongside the " + "--get-raw-data-as-json flag, the JSON file will be created without running metabolism estimation, and " "that file will consequently include only information about KOfam hits and gene calls. The idea is that you can " "then modify this file as you like and re-run this program using the flag --estimate-from-json."} ), diff --git a/anvio/kegg.py b/anvio/kegg.py index 75fe0d2240..db3d23b4bb 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -742,7 +742,7 @@ def __init__(self, args, run=run, progress=progress): self.contigs_db_project_name = "Unknown" self.write_dict_to_json = True if A('get_raw_data_as_json') else False self.json_output_file_path = A('get_raw_data_as_json') - self.store_json_before_estimation = True if A('store_json_before_estimation') else False + self.store_json_without_estimation = True if A('store_json_without_estimation') else False self.estimate_from_json = A('estimate_from_json') or None self.bin_ids_to_process = None @@ -762,7 +762,7 @@ def __init__(self, args, run=run, progress=progress): if self.profile_db_path and not self.collection_name: raise ConfigError("If you provide a profiles DB, you should also provide a collection name.") - if self.store_json_before_estimation and not self.json_output_file_path: + if self.store_json_without_estimation and not self.json_output_file_path: raise ConfigError("Whoops. You seem to want to store the metabolism dictionary in a JSON file, but you haven't provided the name of that file. 
" "Please use the --get-raw-data-as-json flag to do so.") @@ -1275,9 +1275,8 @@ def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=N metabolism_dict_for_list_of_splits = self.mark_kos_present_for_list_of_splits(ko_hits_in_splits, split_list=splits, bin_name=bin_name) - if self.store_json_before_estimation: - bin_level_metabolism_dict_for_json = { bin_name: metabolism_dict_for_list_of_splits } - self.store_metabolism_superdict_as_json(bin_level_metabolism_dict_for_json, self.json_output_file_path + "_" + bin_name + ".json") + if self.store_json_without_estimation: + return metabolism_dict_for_list_of_splits metabolism_dict_for_list_of_splits["num_complete_modules"] = 0 @@ -1450,8 +1449,9 @@ def estimate_metabolism(self): else: raise ConfigError("This class doesn't know how to deal with that yet :/") - self.store_kegg_metabolism_superdict(kegg_metabolism_superdict) - if self.write_dict_to_json and not self.store_json_before_estimation: + if not self.store_json_without_estimation: + self.store_kegg_metabolism_superdict(kegg_metabolism_superdict) + if self.write_dict_to_json: self.store_metabolism_superdict_as_json(kegg_metabolism_superdict, self.json_output_file_path + ".json") diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index 07be176b76..1e18caea82 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -54,7 +54,7 @@ if __name__ == '__main__': groupD = parser.add_argument_group('DEBUG', "Parameters to use if you think something fishy is going on or otherwise want to exert more control. Go for it.") groupD.add_argument(*anvio.A('get-raw-data-as-json'), **anvio.K('get-raw-data-as-json')) - groupD.add_argument(*anvio.A('store-json-before-estimation'), **anvio.K('store-json-before-estimation')) + groupD.add_argument(*anvio.A('store-json-without-estimation'), **anvio.K('store-json-without-estimation')) groupD.add_argument(*anvio.A('estimate-from-json'), **anvio.K('estimate-from-json')) args = anvio.get_args(parser) From 78527e1f9ec400456b9116ed8ae5e791d73de020 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 16:30:39 -0500 Subject: [PATCH 349/400] allow either contigs db or json file as input --- anvio/kegg.py | 10 +++++++++- bin/anvi-estimate-kegg-metabolism | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index db3d23b4bb..c54ce2fa4b 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -745,6 +745,10 @@ def __init__(self, args, run=run, progress=progress): self.store_json_without_estimation = True if A('store_json_without_estimation') else False self.estimate_from_json = A('estimate_from_json') or None + if not self.estimate_from_json and not self.contigs_db_path: + raise ConfigError("NO INPUT PROVIDED. You must provide (at least) a contigs database to this program, unless you are using the --estimate-from-json " + "flag, in which case you must provide a JSON-formatted file.") + self.bin_ids_to_process = None if self.bin_id and self.bin_ids_file: raise ConfigError("You have provided anvi'o with both the individual bin id %s and a file with bin ids (%s). \ @@ -765,12 +769,16 @@ def __init__(self, args, run=run, progress=progress): if self.store_json_without_estimation and not self.json_output_file_path: raise ConfigError("Whoops. You seem to want to store the metabolism dictionary in a JSON file, but you haven't provided the name of that file. 
" "Please use the --get-raw-data-as-json flag to do so.") + if self.store_json_without_estimation and self.estimate_from_json: + raise ConfigError("It is impossible to both estimate metabolism from JSON data and produce a JSON file without estimation at the same time... " + "anvi'o is judging you SO hard right now.") # init the base class KeggContext.__init__(self, self.args) - utils.is_contigs_db(self.contigs_db_path) + if not self.estimate_from_json: + utils.is_contigs_db(self.contigs_db_path) # load existing kegg modules db if not os.path.exists(os.path.join(self.kegg_data_dir, "MODULES.db")): diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index 1e18caea82..ce552ad3cc 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -34,7 +34,7 @@ if __name__ == '__main__': anvi'o will attempt to estimate metabolism for all contigs in it, assuming that\ the contigs database represents a single genome. If the contigs database is actually\ a metagenome, you should use the `--metagenome` flag to explicitly declare that.") - groupI.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db', {'required': True})) + groupI.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db', {'required': False})) groupI.add_argument(*anvio.A('metagenome-mode'), **anvio.K('metagenome-mode')) groupI.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) From 22e82dd77d04886e72aded5671459c2c75406fdb Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 16:31:04 -0500 Subject: [PATCH 350/400] add function for estimating from json file --- anvio/kegg.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index c54ce2fa4b..915bf0dbfa 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1436,6 +1436,12 @@ def estimate_for_contigs_db_for_metagenome(self, kofam_gene_split_contig): return metagenome_metabolism_superdict + def estimate_metabolism_from_json_data(self): + """This function runs the estimation functions on data obtained from a provided JSON file""" + + self.run.info("JSON input file", self.estimate_from_json) + + def estimate_metabolism(self): """This is the driver function for estimating metabolism. @@ -1444,18 +1450,22 @@ def estimate_metabolism(self): The metabolism completion dictionary is keyed by KEGG module number, with a few exceptions for summary data (ie, 'num_complete_modules'). 
""" - kofam_hits_info = self.init_hits_and_splits() - kegg_metabolism_superdict = {} - if self.profile_db_path and not self.metagenome_mode: - kegg_metabolism_superdict = self.estimate_for_bins_in_collection(kofam_hits_info) - elif not self.profile_db_path and not self.metagenome_mode: - kegg_metabolism_superdict = self.estimate_for_genome(kofam_hits_info) - elif self.metagenome_mode: - kegg_metabolism_superdict = self.estimate_for_contigs_db_for_metagenome(kofam_hits_info) + if self.estimate_from_json: + self.estimate_metabolism_from_json_data() else: - raise ConfigError("This class doesn't know how to deal with that yet :/") + + kofam_hits_info = self.init_hits_and_splits() + + if self.profile_db_path and not self.metagenome_mode: + kegg_metabolism_superdict = self.estimate_for_bins_in_collection(kofam_hits_info) + elif not self.profile_db_path and not self.metagenome_mode: + kegg_metabolism_superdict = self.estimate_for_genome(kofam_hits_info) + elif self.metagenome_mode: + kegg_metabolism_superdict = self.estimate_for_contigs_db_for_metagenome(kofam_hits_info) + else: + raise ConfigError("This class doesn't know how to deal with that yet :/") if not self.store_json_without_estimation: self.store_kegg_metabolism_superdict(kegg_metabolism_superdict) From 00fa53a82af4816ac88479b533ca93eed28e9951 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 19:17:00 -0500 Subject: [PATCH 351/400] sanity checks for keys in JSON file --- anvio/kegg.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 915bf0dbfa..892084c6af 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1441,6 +1441,40 @@ def estimate_metabolism_from_json_data(self): self.run.info("JSON input file", self.estimate_from_json) + filesnpaths.is_file_json_formatted(self.estimate_from_json) + kegg_metabolism_superdict = json.load(open(self.estimate_from_json)) + + expected_keys_for_module = {"gene_caller_ids", "kofam_hits", "genes_to_contigs", "contigs_to_genes"} + bins_found = [] + additional_keys = set([]) + + for bin_name, meta_dict_for_bin in kegg_metabolism_superdict.items(): + bins_found.append(bin_name) + for mod, mod_dict in meta_dict_for_bin.items(): + if mod == "num_complete_modules": + self.run.warning("Your JSON file appears to have been generated from data that already contains metabolic module completeness information. " + "We say this because the key 'num_complete_modules' was found. This isn't a problem; however you should know that anvi'o " + "won't take any of the existing estimation information into account. The only module-level keys that will be used from this file " + "are: %s" % (expected_keys_for_module)) + continue + # verify that dict contains the necessary keys for estimation + if not expected_keys_for_module.issubset(set(mod_dict.keys())): + missing_keys = expected_keys_for_module.difference(set(mod_dict.keys())) + raise ConfigError("Your JSON file is incorrectly formatted for metabolism estimation. We expect the following keys: %s. " + "However, we didn't find some of them for module %s in %s. 
Here are the missing keys: %s" + % (expected_keys_for_module, mod, bin_name, missing_keys)) + + additional_keys = additional_keys.union(set(mod_dict.keys()).difference(expected_keys_for_module)) + + if not self.quiet and additional_keys: + self.run.warning("Just to let you know, we found the following module-level keys in your JSON file that were totally ignored during metabolism estimation " + "(no harm was done by including them): %s" % (additional_keys)) + + + # convert lists back to sets + + self.run.info("Bins/genomes/metagenomes found", ", ".join(bins_found)) + def estimate_metabolism(self): """This is the driver function for estimating metabolism. From 5978ada8bc33ed77fa6c6d576f26c98a24bff675 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 19:23:25 -0500 Subject: [PATCH 352/400] convert lists to sets --- anvio/kegg.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 892084c6af..6665272484 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1466,13 +1466,16 @@ def estimate_metabolism_from_json_data(self): additional_keys = additional_keys.union(set(mod_dict.keys()).difference(expected_keys_for_module)) + # convert gene_caller_ids and contigs_to_genes lists to sets + mod_dict['gene_caller_ids'] = set(mod_dict['gene_caller_ids']) + for contig, gene_list in mod_dict['contigs_to_genes'].items(): + mod_dict['contigs_to_genes'][contig] = set(gene_list) + + if not self.quiet and additional_keys: self.run.warning("Just to let you know, we found the following module-level keys in your JSON file that were totally ignored during metabolism estimation " "(no harm was done by including them): %s" % (additional_keys)) - - # convert lists back to sets - self.run.info("Bins/genomes/metagenomes found", ", ".join(bins_found)) From cac24835976a2cb9dbfe1afeff42a57cd00c45b6 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 19:59:28 -0500 Subject: [PATCH 353/400] change structure to separate kofam hit marking from module estimation --- anvio/kegg.py | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 6665272484..7973705602 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1255,37 +1255,26 @@ def compute_module_redundancy_for_bin(self, mnum, meta_dict_for_bin): cw_ws_redundancy, copy_completeness_distribution = self.compute_copywise_redundancy_for_path(num_hits_per_kofam, aggregation_measure="weighted_sum") meta_dict_for_bin[mnum]["copywise_weighted-sum"].append(cw_ws_redundancy) - return - def estimate_for_list_of_splits(self, ko_hits_in_splits, splits=None, bin_name=None): - """This is the atomic metabolism estimator function, which builds a metabolism completeness dictionary for an arbitrary list of splits. + def estimate_for_list_of_splits(self, metabolism_dict_for_list_of_splits, bin_name=None): + """This is the atomic metabolism estimator function, which builds up the metabolism completeness dictionary for an arbitrary list of splits. For example, the list of splits may represent a bin, a single isolate genome, or an entire metagenome. - The metabolism completeness dictionary is first initialized to contain the KOs that are present in the genome for each KEGG module. - It is later updated with the individual steps and completion estimates for each module. 
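Because JSON has neither sets nor non-string keys, conversions like the ones above are what turn a freshly loaded file back into the structure the estimator expects. A rough standalone sketch of that normalization (the file name is hypothetical; the module-level keys mirror the ones checked above):

    import json

    with open("kegg-metabolism.json") as f:
        superdict = json.load(f)

    for bin_name, bin_dict in superdict.items():
        for module, mod_dict in bin_dict.items():
            if module == "num_complete_modules":
                continue
            # sets came back as lists, and integer gene caller ids came back as string keys
            mod_dict["gene_caller_ids"] = set(mod_dict["gene_caller_ids"])
            mod_dict["genes_to_contigs"] = {int(g): c for g, c in mod_dict["genes_to_contigs"].items()}
            mod_dict["contigs_to_genes"] = {c: set(genes) for c, genes in mod_dict["contigs_to_genes"].items()}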
+ + The function takes in a metabolism completeness dictionary already initialized with the relevant KOfam hits per module, and updates it + with the individual steps and completion estimates for each module. PARAMETERS ========== - ko_hits_in_splits list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering - splits a list of splits identifiers - bin_name the name of the bin/genome/metagenome that we are working with - - RETURNS - ======= metabolism_dict_for_list_of_splits the metabolism completeness dictionary of dictionaries for this list of splits. It contains one dictionary of module steps and completion information for each module (keyed by module number), as well as one key num_complete_modules that tracks the number of complete modules found in these splits. Calling functions should assign this dictionary to a metabolism superdict with the bin name as a key. + bin_name the name of the bin/genome/metagenome that we are working with """ - metabolism_dict_for_list_of_splits = self.mark_kos_present_for_list_of_splits(ko_hits_in_splits, split_list=splits, - bin_name=bin_name) - - if self.store_json_without_estimation: - return metabolism_dict_for_list_of_splits - metabolism_dict_for_list_of_splits["num_complete_modules"] = 0 complete_mods = [] @@ -1372,8 +1361,12 @@ def estimate_for_genome(self, kofam_gene_split_contig): genome_metabolism_superdict = {} # since all hits belong to one genome, we can take the UNIQUE splits from all the hits splits_in_genome = list(set([tpl[2] for tpl in kofam_gene_split_contig])) - - genome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(kofam_gene_split_contig, splits=splits_in_genome, bin_name=self.contigs_db_project_name) + metabolism_dict_for_genome = self.mark_kos_present_for_list_of_splits(kofam_gene_split_contig, split_list=splits_in_genome, + bin_name=self.contigs_db_project_name) + if not self.store_json_without_estimation: + genome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(metabolism_dict_for_genome, bin_name=self.contigs_db_project_name) + else: + genome_metabolism_superdict[self.contigs_db_project_name] = metabolism_dict_for_genome return genome_metabolism_superdict @@ -1402,7 +1395,12 @@ def estimate_for_bins_in_collection(self, kofam_gene_split_contig): for bin_name in bin_name_to_split_names_dict: splits_in_bin = bin_name_to_split_names_dict[bin_name] ko_in_bin = [tpl for tpl in kofam_gene_split_contig if tpl[2] in splits_in_bin] - bins_metabolism_superdict[bin_name] = self.estimate_for_list_of_splits(ko_in_bin, splits=splits_in_bin, bin_name=bin_name) + metabolism_dict_for_bin = self.mark_kos_present_for_list_of_splits(ko_in_bin, split_list=splits_in_bin, bin_name=bin_name) + + if not self.store_json_without_estimation: + bins_metabolism_superdict[bin_name] = self.estimate_for_list_of_splits(metabolism_dict_for_bin, bin_name=bin_name) + else: + bins_metabolism_superdict[bin_name] = metabolism_dict_for_bin return bins_metabolism_superdict @@ -1430,8 +1428,12 @@ def estimate_for_contigs_db_for_metagenome(self, kofam_gene_split_contig): metagenome_metabolism_superdict = {} # since we consider all the hits in the metagenome collectively, we can take the UNIQUE splits from all the hits splits_in_metagenome = list(set([tpl[2] for tpl in kofam_gene_split_contig])) - - metagenome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(kofam_gene_split_contig, splits=splits_in_metagenome, 
bin_name=self.contigs_db_project_name) + metabolism_dict_for_metagenome = self.mark_kos_present_for_list_of_splits(kofam_gene_split_contig, split_list=splits_in_metagenome, + bin_name=self.contigs_db_project_name) + if not self.store_json_without_estimation: + metagenome_metabolism_superdict[self.contigs_db_project_name] = self.estimate_for_list_of_splits(metabolism_dict_for_metagenome, bin_name=self.contigs_db_project_name) + else: + metagenome_metabolism_superdict[self.contigs_db_project_name] = metabolism_dict_for_metagenome return metagenome_metabolism_superdict From bf4d9d491ab82c6ccfedd1c6fd84662d70b6cacc Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Apr 2020 20:19:37 -0500 Subject: [PATCH 354/400] add function to estimate from json --- anvio/kegg.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 7973705602..93540d5762 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1444,7 +1444,8 @@ def estimate_metabolism_from_json_data(self): self.run.info("JSON input file", self.estimate_from_json) filesnpaths.is_file_json_formatted(self.estimate_from_json) - kegg_metabolism_superdict = json.load(open(self.estimate_from_json)) + kegg_metabolism_superdict = json.load(open(self.estimate_from_json), parse_int=int) + new_kegg_metabolism_superdict = {} expected_keys_for_module = {"gene_caller_ids", "kofam_hits", "genes_to_contigs", "contigs_to_genes"} bins_found = [] @@ -1472,6 +1473,9 @@ def estimate_metabolism_from_json_data(self): mod_dict['gene_caller_ids'] = set(mod_dict['gene_caller_ids']) for contig, gene_list in mod_dict['contigs_to_genes'].items(): mod_dict['contigs_to_genes'][contig] = set(gene_list) + mod_dict['genes_to_contigs'] = {int(g):c for g,c in mod_dict['genes_to_contigs'].items()} + + new_kegg_metabolism_superdict[bin_name] = self.estimate_for_list_of_splits(meta_dict_for_bin, bin_name=bin_name) if not self.quiet and additional_keys: @@ -1479,6 +1483,7 @@ def estimate_metabolism_from_json_data(self): "(no harm was done by including them): %s" % (additional_keys)) self.run.info("Bins/genomes/metagenomes found", ", ".join(bins_found)) + return new_kegg_metabolism_superdict def estimate_metabolism(self): @@ -1492,7 +1497,7 @@ def estimate_metabolism(self): kegg_metabolism_superdict = {} if self.estimate_from_json: - self.estimate_metabolism_from_json_data() + kegg_metabolism_superdict = self.estimate_metabolism_from_json_data() else: kofam_hits_info = self.init_hits_and_splits() From fc80c8a6968f7718c8e654a514d5e2455591c27e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Apr 2020 15:51:18 -0500 Subject: [PATCH 355/400] warning message if you try to use hmmsearch on a nucleotide alphabet because the program used is still nhmmscan --- anvio/drivers/hmmer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index 531c43c94b..fd3c8f57a4 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -129,6 +129,10 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode "Anvi'o will use %s process with %s cores each instead. I hope thats okay for you. " % (str(self.num_threads_to_use), str(num_parts), target, str(num_parts), cores_per_process)) + if alphabet in ['DNA', 'RNA'] and self.program_to_use == 'hmmsearch': + self.run.warning("You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. " + "We hope that is alright." 
% (self.program_to_use, alphabet)) + for part_file in self.target_files_dict[target]: log_file = part_file + '_log' From 482c5e5d15367b2e6be5f6fca841aa1a67128e17 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Apr 2020 15:51:57 -0500 Subject: [PATCH 356/400] add hmmsearch as default program to pfams --- anvio/pfam.py | 7 ++++--- bin/anvi-run-pfams | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index ef3f148f8f..315727b46b 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -166,12 +166,13 @@ def __init__(self, args, run=run, progress=progress): self.progress = progress self.contigs_db_path = args.contigs_db self.num_threads = args.num_threads + self.hmm_program = args.hmmer_program or 'hmmsearch' self.pfam_data_dir = args.pfam_data_dir # load_catalog will populate this self.function_catalog = {} - filesnpaths.is_program_exists('hmmscan') + filesnpaths.is_program_exists(self.hmm_program) utils.is_contigs_db(self.contigs_db_path) if not self.pfam_data_dir: @@ -273,7 +274,7 @@ class Args: pass report_aa_sequences=True) # run hmmscan - hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads) + hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads, program_to_use=self.hmm_program) hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None, len(self.function_catalog), hmm_file, None, '--cut_ga') if not hmm_hits_file: @@ -286,7 +287,7 @@ class Args: pass return # parse hmmscan output - parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE') + parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE', program=self.hmm_program) search_results_dict = parser.get_search_results() # add functions to database diff --git a/bin/anvi-run-pfams b/bin/anvi-run-pfams index 7969fec4d8..5f3929033e 100755 --- a/bin/anvi-run-pfams +++ b/bin/anvi-run-pfams @@ -34,6 +34,7 @@ if __name__ == '__main__': parser.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db')) parser.add_argument(*anvio.A('pfam-data-dir'), **anvio.K('pfam-data-dir')) parser.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads')) + parser.add_argument(*anvio.A('hmmer-program'), **anvio.K('hmmer-program')) args = anvio.get_args(parser) From 7d74e680a9a230774d43e21251769d1fa5b39a5b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Apr 2020 15:52:31 -0500 Subject: [PATCH 357/400] description update --- bin/anvi-run-kegg-kofams | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams index c41bf5f27d..ef94d7f9fe 100755 --- a/bin/anvi-run-kegg-kofams +++ b/bin/anvi-run-kegg-kofams @@ -31,7 +31,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description=__description__) groupR = parser.add_argument_group('REQUIRED INPUT', 'The stuff you need for this to work.') - groupO = parser.add_argument_group('OPTIONAL INPUT', "The stuff you (probably) don't need.") + groupO = parser.add_argument_group('OPTIONAL INPUT', "Optional params for a custom experience.") groupR.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db')) groupO.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) From ec7c34c3e7ad23560f8823c705eba8f570d9a73f Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Apr 2020 16:07:48 -0500 Subject: [PATCH 358/400] update provides --- bin/anvi-estimate-kegg-metabolism | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/anvi-estimate-kegg-metabolism 
b/bin/anvi-estimate-kegg-metabolism index ce552ad3cc..99fe0c2842 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -16,7 +16,7 @@ __version__ = anvio.__version__ __maintainer__ = "Iva Veseli" __email__ = "iveseli@uchicago.edu" __requires__ = ["contigs-db", "kofam-data", "kegg-modules-db", "kegg-functions",] -#__provides__ = ["genome-metabolism", "genome-metabolism-txt",] #TODO: update when finished +__provides__ = ["kegg-metabolism",] __description__ = "Reconstructs metabolic pathways and estimates pathway completeness for a given set of contigs." From 8022862652fe10354341cdeba4a427b7b6f7b5e1 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Apr 2020 16:09:03 -0500 Subject: [PATCH 359/400] output correct program used for nucleotide alphabets --- anvio/drivers/hmmer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index fd3c8f57a4..cb07beae8a 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -102,7 +102,10 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.run.info('Number of genes in HMM model', num_genes_in_model) self.run.info('Noise cutoff term(s)', noise_cutoff_terms) self.run.info('Number of CPUs will be used for search', self.num_threads_to_use) - self.run.info('HMMer program used for search', self.program_to_use) + if alphabet in ['DNA', 'RNA']: + self.run.info('HMMer program used for search', 'nhmmscan') + else: + self.run.info('HMMer program used for search', self.program_to_use) tmp_dir = os.path.dirname(self.target_files_dict[target][0]) log_file_path = os.path.join(tmp_dir, '*_log') From 0bb6a90a31e2f6bd4fdc7b7db7a9ebc63b6d9dc6 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Apr 2020 16:09:41 -0500 Subject: [PATCH 360/400] allow choice of hmm program (hmmscan is still default) --- anvio/tables/hmmhits.py | 8 +++++--- bin/anvi-run-hmms | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/anvio/tables/hmmhits.py b/anvio/tables/hmmhits.py index 5e96c908b1..c38e1831e4 100644 --- a/anvio/tables/hmmhits.py +++ b/anvio/tables/hmmhits.py @@ -38,12 +38,14 @@ class TablesForHMMHits(Table): - def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress, initializing_for_deletion=False, just_do_it=False): + def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress, initializing_for_deletion=False, just_do_it=False, hmm_program_to_use='hmmscan'): self.num_threads_to_use = num_threads_to_use self.db_path = db_path self.just_do_it = just_do_it + self.hmm_program = hmm_program_to_use or 'hmmscan' utils.is_contigs_db(self.db_path) + filesnpaths.is_program_exists(self.hmm_program) self.contigs_db_hash = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path)).get_meta_value('contigs_db_hash') @@ -175,7 +177,7 @@ class Args: pass target_files_dict['%s:CONTIG' % alphabet], rna_alphabet=True if alphabet=='RNA' else False) - commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use) + commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use, program_to_use=self.hmm_program) for source in sources: alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target']) @@ -200,7 +202,7 @@ class Args: pass if not hmm_scan_hits_txt: search_results_dict = {} else: - parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context) + parser = 
parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context, program=self.hmm_program) search_results_dict = parser.get_search_results() if not len(search_results_dict): diff --git a/bin/anvi-run-hmms b/bin/anvi-run-hmms index 7aa67527a8..26b7365879 100755 --- a/bin/anvi-run-hmms +++ b/bin/anvi-run-hmms @@ -67,7 +67,7 @@ def main(args): # sources will be loaded from defaults. pass - search_tables = TablesForHMMHits(args.contigs_db, num_threads_to_use=args.num_threads, just_do_it=args.just_do_it) + search_tables = TablesForHMMHits(args.contigs_db, num_threads_to_use=args.num_threads, just_do_it=args.just_do_it, hmm_program_to_use=args.hmmer_program) search_tables.populate_search_tables(sources) if not args.hmm_profile_dir and not args.installed_hmm_profile and args.also_scan_trnas: @@ -100,6 +100,7 @@ if __name__ == '__main__': groupD = parser.add_argument_group("PERFORMANCE", "Stuff everyone forgets to set and then get upset with how slow " "science goes.") groupD.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads')) + groupD.add_argument(*anvio.A('hmmer-program'), **anvio.K('hmmer-program')) groupE = parser.add_argument_group("AUTHORITY", "Because you are the boss.") groupE.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) From 82628b597fc979d52737f7a45e1339a01838b6de Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 21 Apr 2020 11:24:22 -0500 Subject: [PATCH 361/400] add creation date meta value and warning for old db --- anvio/constants.py | 10 +++++++--- anvio/kegg.py | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/anvio/constants.py b/anvio/constants.py index 0a542a8d4f..35f3d80c37 100644 --- a/anvio/constants.py +++ b/anvio/constants.py @@ -62,7 +62,7 @@ max_num_items_for_hierarchical_clustering = 20000 -# max coverage depth to read from BAM files using pysam. +# max coverage depth to read from BAM files using pysam. # this parameter also can be set later using command line parameters # we use uint16 as dtype for numpy arrays when we work on & store coverages # which has limit of 65536, so this constant needs to be smaller than that. @@ -163,8 +163,8 @@ 'Val': {"C":5, "H":11, "N":1, "O":2, "S":0}}) # taken from http://prowl.rockefeller.edu/aainfo/volume.htm -# volume reference: A.A. Zamyatin, Protein Volume in Solution, Prog. Biophys. Mol. Biol. 24(1972)107-123. -# surface area reference: C. Chotia, The Nature of the Accessible and Buried Surfaces in Proteins, J. Mol. Biol., 105(1975)1-14. +# volume reference: A.A. Zamyatin, Protein Volume in Solution, Prog. Biophys. Mol. Biol. 24(1972)107-123. +# surface area reference: C. Chotia, The Nature of the Accessible and Buried Surfaces in Proteins, J. Mol. Biol., 105(1975)1-14. 
AA_geometry = Counter({'Ala': {"volume":88.6, "area":115}, 'Arg': {"volume":173.4, "area":225}, 'Asn': {"volume":111.1, "area":150}, @@ -369,3 +369,7 @@ def get_codon_to_num_lookup(reverse_complement=False): nt_to_RC_num_lookup = get_nt_to_num_lookup({'A': 3, 'C': 2, 'G': 1, 'T': 0, 'N': 4}) codon_to_num_lookup = get_codon_to_num_lookup(reverse_complement=False) codon_to_RC_num_lookup = get_codon_to_num_lookup(reverse_complement=True) + + +# KEGG setup constant - used to warn user that the KEGG MODULES.db data may need to be updated +KEGG_SETUP_INTERVAL = 90 # days since last MODULES.db creation diff --git a/anvio/kegg.py b/anvio/kegg.py index 860db906ec..7ca9d62b52 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -11,6 +11,7 @@ import copy import statistics as stats import json +import time import anvio import anvio.db as db @@ -26,6 +27,7 @@ from anvio.parsers import parser_modules from anvio.tables.genefunctions import TableForGeneFunctions from anvio.dbops import ContigsSuperclass, ContigsDatabase, ProfileSuperclass, ProfileDatabase +from anvio.constants import KEGG_SETUP_INTERVAL __author__ = "Developers of anvi'o (see AUTHORS.txt)" @@ -1661,6 +1663,16 @@ def __init__(self, db_path, args, module_dictionary=None, run=run, progress=prog self.run.info('Modules database', 'An existing database, %s, has been loaded.' % self.db_path, quiet=self.quiet) self.run.info('Kegg Modules', '%d found' % self.db.get_meta_value('num_modules'), quiet=self.quiet) + + days_since_created = self.get_days_since_creation() + if not self.quiet and days_since_created >= KEGG_SETUP_INTERVAL: + self.run.warning("Just a friendly PSA here: it has been at least %s days since the MODULES.db was created (%s days to be exact). " + "It is entirely possible that KEGG has been updated since then, so perhaps it is a good idea to re-run " + "anvi-setup-kegg-kofams to be sure that you are working with the latest KEGG data. No pressure, though. If you do " + "want to reset your KEGG setup, we STRONGLY encourage saving a copy of your current KEGG data directory, just " + "in case there was an update that breaks everything and you need to go back to your previous KEGG setup. Don't say we " + "didn't warn you. 
And we will even be so nice as to tell you that your current KEGG data directory is %s" + % (KEGG_SETUP_INTERVAL, days_since_created, self.kegg_data_dir)) else: # if self.module_dict is None, then we tried to initialize the DB outside of setup if not self.module_dict: @@ -1964,10 +1976,16 @@ def create(self): self.db.set_meta_value('db_type', 'modules') self.db.set_meta_value('num_modules', num_modules_parsed) self.db.set_meta_value('total_entries', mod_table.get_total_entries()) + self.db.set_meta_value('creation_date', time.time()) self.db.disconnect() + def get_days_since_creation(self): + """Returns the time (in days) since MODULES.db was created""" + return (time.time() - float(self.db.get_meta_value('creation_date'))) / 3600 + + # KEGG Modules Table functions for data access and parsing start below # ==================================================================== def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): From 8a456e6e5ee084fbac675f5fd811667eafb31051 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Tue, 21 Apr 2020 11:24:45 -0500 Subject: [PATCH 362/400] add accessor function for all knums in modules table --- anvio/kegg.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 7ca9d62b52..6313b28c72 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -2024,6 +2024,11 @@ def get_all_modules_as_list(self): """This function returns a list of all modules in the DB.""" return self.db.get_single_column_from_table(self.module_table_name, 'module', unique=True) + def get_all_knums_as_list(self): + """This function returns a list of all KO numbers in the DB.""" + where_clause_string = "data_name = 'ORTHOLOGY'" + return self.db.get_single_column_from_table(self.module_table_name, 'data_value', unique=True, where_clause=where_clause_string) + def get_modules_for_knum(self, knum): """This function returns a list of modules that the given KO belongs to.""" where_clause_string = "data_value = '%s'" % (knum) From 5e6f975ceb1e52d112a83290b0c8927b1fb12d6d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 10:58:31 -0500 Subject: [PATCH 363/400] ignore modules in orthology lines when building modules table --- anvio/kegg.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 6313b28c72..1b538fbc43 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1932,8 +1932,13 @@ def create(self): prev_data_name_field = entries_tuple_list[0][0] for name, val, definition, line in entries_tuple_list: - # append_and_store will collect db entries and store every 10000 at a time - mod_table.append_and_store(self.db, mnum, name, val, definition, line) + # there is one situation in which we want to ignore the entry, and that is Modules appearing in the ORTHOLOGY category, like so: + # (M00531 Assimilatory nitrate reduction, nitrate => ammonia) + if not (name == "ORTHOLOGY" and val[0] == '('): + # append_and_store will collect db entries and store every 10000 at a time + mod_table.append_and_store(self.db, mnum, name, val, definition, line) + else: + line -= 1 f.close() From 36987e864d895028bc9f9e2dc666b73814146fdd Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 10:59:32 -0500 Subject: [PATCH 364/400] add hash of content to modules self table --- anvio/kegg.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 1b538fbc43..db48fac3e9 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1977,11 +1977,13 @@ def 
create(self): self.run.info('Number of parsing errors (corrected)', self.num_corrected_errors, quiet=self.quiet) self.run.info('Number of parsing errors (uncorrected)', self.num_uncorrected_errors, quiet=self.quiet) + # record some useful metadata self.db.set_meta_value('db_type', 'modules') self.db.set_meta_value('num_modules', num_modules_parsed) self.db.set_meta_value('total_entries', mod_table.get_total_entries()) self.db.set_meta_value('creation_date', time.time()) + self.db.set_meta_value('hash', self.get_db_content_hash()) self.db.disconnect() @@ -1991,6 +1993,15 @@ def get_days_since_creation(self): return (time.time() - float(self.db.get_meta_value('creation_date'))) / 3600 + def get_db_content_hash(self): + """Compute hash of all KOs and module numbers present in the db (used for tracking major changes to db content with future KEGG updates)""" + mods_and_orths = self.db.get_all_modules_as_list() + mods_and_orths.append(self.db.get_all_knums_as_list()) + mods_and_orths = tuple(mods_and_orths) + return hash(mods_and_orths) + + + # KEGG Modules Table functions for data access and parsing start below # ==================================================================== def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): From 00ea4182a3542cfefe036230d2a5d0e293f8ca0e Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 11:17:15 -0500 Subject: [PATCH 365/400] bug fix in hashing function --- anvio/kegg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index db48fac3e9..adce2cf3e5 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1995,8 +1995,8 @@ def get_days_since_creation(self): def get_db_content_hash(self): """Compute hash of all KOs and module numbers present in the db (used for tracking major changes to db content with future KEGG updates)""" - mods_and_orths = self.db.get_all_modules_as_list() - mods_and_orths.append(self.db.get_all_knums_as_list()) + mods_and_orths = self.get_all_modules_as_list() + mods_and_orths.append(self.get_all_knums_as_list()) mods_and_orths = tuple(mods_and_orths) return hash(mods_and_orths) From 69b6237b59938a4582eb2c0bb2a6c37ebfdaadb6 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 11:44:32 -0500 Subject: [PATCH 366/400] bug fix in hashing function --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index adce2cf3e5..265258805b 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1996,7 +1996,7 @@ def get_days_since_creation(self): def get_db_content_hash(self): """Compute hash of all KOs and module numbers present in the db (used for tracking major changes to db content with future KEGG updates)""" mods_and_orths = self.get_all_modules_as_list() - mods_and_orths.append(self.get_all_knums_as_list()) + mods_and_orths += self.get_all_knums_as_list() mods_and_orths = tuple(mods_and_orths) return hash(mods_and_orths) From 93952d44cf36b977f1907fbb307f82fa6b5e95da Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 12:00:24 -0500 Subject: [PATCH 367/400] add modules db hash to contigs db self table before annotating with kegg --- anvio/kegg.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 265258805b..d88bc1fb3c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -600,6 +600,17 @@ def __init__(self, args, run=run, progress=progress): # load existing kegg modules db self.kegg_modules_db = 
KeggModulesDatabase(os.path.join(self.kegg_data_dir, "MODULES.db"), args=self.args) + + def set_hash_in_contigs_db(self): + """Modify the contigs DB self table to indicate which MODULES.db has been used to annotate it""" + A = lambda x: self.args.__dict__[x] if x in self.args.__dict__ else None + self.contigs_db_path = A('contigs_db') + + contigs_db = ContigsDatabase(self.contigs_db_path) + contigs_db.db.set_meta_value('modules_db_hash', self.kegg_modules_db.db.get_meta_value('hash')) + contigs_db.disconnect() + + def get_annotation_from_ko_dict(self, knum, ok_if_missing_from_dict=False): if not self.ko_dict: raise ConfigError("Oops! The ko_list file has not been properly loaded, so get_annotation_from_ko_dict() is \ @@ -621,6 +632,9 @@ def process_kofam_hmms(self): tmp_directory_path = filesnpaths.get_temp_directory_path() contigs_db = ContigsSuperclass(self.args) # initialize contigs db + # mark contigs db with hash of modules.db content for version tracking + self.set_hash_in_contigs_db() + # get AA sequences as FASTA target_files_dict = {'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')} contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['AA:GENE'], From d159f2da2c9e47d7d40ba0432344129e74c826d5 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 12:02:48 -0500 Subject: [PATCH 368/400] move setting hash to after annotation --- anvio/kegg.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index d88bc1fb3c..9b51da0623 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -632,8 +632,6 @@ def process_kofam_hmms(self): tmp_directory_path = filesnpaths.get_temp_directory_path() contigs_db = ContigsSuperclass(self.args) # initialize contigs db - # mark contigs db with hash of modules.db content for version tracking - self.set_hash_in_contigs_db() # get AA sequences as FASTA target_files_dict = {'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')} @@ -725,6 +723,9 @@ def process_kofam_hmms(self): a functional source.") gene_function_calls_table.add_empty_sources_to_functional_sources({'KOfam'}) + # mark contigs db with hash of modules.db content for version tracking + self.set_hash_in_contigs_db() + if anvio.DEBUG: run.warning("The temp directories, '%s' and '%s' are kept. 
Please don't forget to clean those up\ later" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)), header="Debug") From fd8a33039818f19a27691559e116f456ea1fe6eb Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 12:19:56 -0500 Subject: [PATCH 369/400] sanity check for different modules db hashes before estimation --- anvio/kegg.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 9b51da0623..bb9fb2ddf3 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -824,6 +824,19 @@ def init_hits_and_splits(self): self.progress.new('Loading data from Contigs DB') contigs_db = ContigsDatabase(self.contigs_db_path, run=self.run, progress=self.progress) self.contigs_db_project_name = contigs_db.meta['project_name'] + + # sanity check that contigs db was annotated with same version of MODULES.db that will be used for metabolism estimation + contigs_db_mod_hash = contigs_db.meta['modules_db_hash'] + mod_db_hash = self.kegg_modules_db.db.get_meta_value('hash') + if contigs_db_mod_hash != mod_db_hash: + raise ConfigError("The contigs DB that you are working with has been annotated with a different version of the MODULES.db than you are working with now. " + "Perhaps you updated your KEGG setup after running `anvi-run-kegg-kofams` on this contigs DB? Or maybe you have multiple KEGG data " + "directories set up on your computer, and the one you are using now is different from the one that you used for `anvi-run-kegg-kofams`? " + "Well. The solution to the first problem is to re-run `anvi-run-kegg-kofams` on the contigs DB (%s) using the updated MODULES.db " + "(located in the KEGG data directory %s). The solution to the second problem is to specify the appropriate KEGG data directory using " + "the --kegg-data-dir flag. If neither of those things make this work, then you should contact the developers to see if they can help you " + "figure this out." 
% (self.contigs_db_path, self.kegg_data_dir)) + genes_in_splits = contigs_db.db.get_some_columns_from_table(t.genes_in_splits_table_name, "gene_callers_id, split") genes_in_contigs = contigs_db.db.get_some_columns_from_table(t.genes_in_contigs_table_name, "gene_callers_id, contig") kofam_hits = contigs_db.db.get_some_columns_from_table(t.gene_function_calls_table_name, "gene_callers_id, accession", From de630ac267719ece942158d27b3ace12f2747bd9 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 13:12:11 -0500 Subject: [PATCH 370/400] change hashing function to sort lists and convert to string to ensure identical content means identical hash --- anvio/kegg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index bb9fb2ddf3..711c3f72d4 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -2023,9 +2023,9 @@ def get_days_since_creation(self): def get_db_content_hash(self): """Compute hash of all KOs and module numbers present in the db (used for tracking major changes to db content with future KEGG updates)""" - mods_and_orths = self.get_all_modules_as_list() - mods_and_orths += self.get_all_knums_as_list() - mods_and_orths = tuple(mods_and_orths) + mods_and_orths = self.get_all_modules_as_list().sort() + mods_and_orths += self.get_all_knums_as_list().sort() + mods_and_orths = "".join(mods_and_orths) return hash(mods_and_orths) From e827eb2c3f3bcae5fcebeb5bacb7ce71806818dc Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 14:09:29 -0500 Subject: [PATCH 371/400] fix another bug in hashing function --- anvio/kegg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 711c3f72d4..3d99a3b3e9 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -2023,8 +2023,11 @@ def get_days_since_creation(self): def get_db_content_hash(self): """Compute hash of all KOs and module numbers present in the db (used for tracking major changes to db content with future KEGG updates)""" - mods_and_orths = self.get_all_modules_as_list().sort() - mods_and_orths += self.get_all_knums_as_list().sort() + mods = self.get_all_modules_as_list() + mods.sort() + orths = self.get_all_knums_as_list() + orths.sort() + mods_and_orths = mods + orths mods_and_orths = "".join(mods_and_orths) return hash(mods_and_orths) From 53c473cb58ca2bec1dd5b214fcdc58d57049f927 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 14:09:48 -0500 Subject: [PATCH 372/400] add geometric mean aggregation measure to copywise redundancy function --- anvio/kegg.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 3d99a3b3e9..1802e7cf22 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1236,6 +1236,8 @@ def compute_copywise_redundancy_for_path(self, num_ko_hits_in_path_dict, aggrega aggregated_completeness = 0 for c in range(len(extra_copy_completeness)): aggregated_completeness += 1/(c+1) * extra_copy_completeness[c] + elif aggregation_measure == "geometric_mean" + aggregated_completeness = stats.geometric_mean(extra_copy_completeness) elif aggregation_measure == "knee": raise ConfigError("aggregation measure 'knee' not implemented yet") else: @@ -1285,6 +1287,8 @@ def compute_module_redundancy_for_bin(self, mnum, meta_dict_for_bin): meta_dict_for_bin[mnum]["copywise_median"].append(cw_med_redundancy) cw_ws_redundancy, copy_completeness_distribution = self.compute_copywise_redundancy_for_path(num_hits_per_kofam, aggregation_measure="weighted_sum") 
meta_dict_for_bin[mnum]["copywise_weighted-sum"].append(cw_ws_redundancy) + cw_gm_redundancy, copy_completeness_distribution = self.compute_copywise_redundancy_for_path(num_hits_per_kofam, aggregation_measure="geometric_mean") + meta_dict_for_bin[mnum]["copywise_weighted-sum"].append(cw_gm_redundancy) return From fa97bbfe37de69e150e1e6621495009bfbb6dfd7 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 14:12:24 -0500 Subject: [PATCH 373/400] : --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 1802e7cf22..4ee99179ca 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -1236,7 +1236,7 @@ def compute_copywise_redundancy_for_path(self, num_ko_hits_in_path_dict, aggrega aggregated_completeness = 0 for c in range(len(extra_copy_completeness)): aggregated_completeness += 1/(c+1) * extra_copy_completeness[c] - elif aggregation_measure == "geometric_mean" + elif aggregation_measure == "geometric_mean": aggregated_completeness = stats.geometric_mean(extra_copy_completeness) elif aggregation_measure == "knee": raise ConfigError("aggregation measure 'knee' not implemented yet") From 209d3d1d28e40502c41c0232f8d9f807f454a624 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 14:27:31 -0500 Subject: [PATCH 374/400] use hashlib instead of hash to get consistent hashes between runs --- anvio/kegg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 4ee99179ca..38bd78cad2 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -12,6 +12,7 @@ import statistics as stats import json import time +import hashlib import anvio import anvio.db as db @@ -2033,7 +2034,7 @@ def get_db_content_hash(self): orths.sort() mods_and_orths = mods + orths mods_and_orths = "".join(mods_and_orths) - return hash(mods_and_orths) + return str(hashlib.sha224(mods_and_orths.encode('utf-8')).hexdigest()) From e7372b971e58a36548d7f56facc95f1c26521e4c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 15:01:48 -0500 Subject: [PATCH 375/400] only use first 12 digits of hash --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 38bd78cad2..aee66ad5ee 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -2034,7 +2034,7 @@ def get_db_content_hash(self): orths.sort() mods_and_orths = mods + orths mods_and_orths = "".join(mods_and_orths) - return str(hashlib.sha224(mods_and_orths.encode('utf-8')).hexdigest()) + return str(hashlib.sha224(mods_and_orths.encode('utf-8')).hexdigest())[0:12] From 197d6012367a0dff6739838595e475b494ffb50a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 15:02:27 -0500 Subject: [PATCH 376/400] calculate geometric mean using math because our stats package version does not have it --- anvio/kegg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index aee66ad5ee..3e362b9940 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -13,6 +13,7 @@ import json import time import hashlib +import math import anvio import anvio.db as db @@ -1238,7 +1239,7 @@ def compute_copywise_redundancy_for_path(self, num_ko_hits_in_path_dict, aggrega for c in range(len(extra_copy_completeness)): aggregated_completeness += 1/(c+1) * extra_copy_completeness[c] elif aggregation_measure == "geometric_mean": - aggregated_completeness = stats.geometric_mean(extra_copy_completeness) + aggregated_completeness = 
math.prod(extra_copy_completeness)**len(extra_copy_completeness) elif aggregation_measure == "knee": raise ConfigError("aggregation measure 'knee' not implemented yet") else: From 87c794df106fa9eeb96524a73028ecfd471fbb9b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 15:36:05 -0500 Subject: [PATCH 377/400] okay. fine. we will use scipy for geometric mean. --- anvio/kegg.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 3e362b9940..43a84f2a68 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -9,11 +9,11 @@ import glob import re import copy -import statistics as stats +import statistics +from scipy import stats import json import time import hashlib -import math import anvio import anvio.db as db @@ -1231,15 +1231,15 @@ def compute_copywise_redundancy_for_path(self, num_ko_hits_in_path_dict, aggrega aggregated_completeness = 0 else: if aggregation_measure == "average": - aggregated_completeness = stats.mean(extra_copy_completeness) + aggregated_completeness = statistics.mean(extra_copy_completeness) elif aggregation_measure == "median": - aggregated_completeness = stats.median(extra_copy_completeness) + aggregated_completeness = statistics.median(extra_copy_completeness) elif aggregation_measure == "weighted_sum": aggregated_completeness = 0 for c in range(len(extra_copy_completeness)): aggregated_completeness += 1/(c+1) * extra_copy_completeness[c] elif aggregation_measure == "geometric_mean": - aggregated_completeness = math.prod(extra_copy_completeness)**len(extra_copy_completeness) + aggregated_completeness = stats.gmean(extra_copy_completeness) elif aggregation_measure == "knee": raise ConfigError("aggregation measure 'knee' not implemented yet") else: From f712e54240f58b1534131731cc7f8eeba0d68f87 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 16:01:48 -0500 Subject: [PATCH 378/400] remove in_place hmmpress option that I did not implement (yet) --- anvio/tables/hmmhits.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/anvio/tables/hmmhits.py b/anvio/tables/hmmhits.py index c38e1831e4..a79209ec14 100644 --- a/anvio/tables/hmmhits.py +++ b/anvio/tables/hmmhits.py @@ -87,28 +87,18 @@ def check_sources(self, sources): "to remove them first, or run this program with `--just-do-it` flag so anvi'o would remove all " "for you. Here are the list of HMM sources that need to be removed: '%s'." % (', '.join(sources_need_to_be_removed))) - def hmmpress_sources(self, sources, tmp_dir, in_place=False): - """This function checks if the hmm files have been hmmpressed, and if not, it runs hmmpress. + def hmmpress_sources(self, sources, tmp_dir): + """This function runs hmmpress on the hmm profiles. - If in_place is False, we assume that the model should be unpacked and compressed in the temp directory. - Otherwise, we do it in the directory where the model is stored so that it only has to be done once. - - Returns the locations of each hmmpressed file path in a dictionary keyed by the source. + It returns the locations of each hmmpressed file path in a dictionary keyed by the source. 
""" hmmpressed_file_paths = {} for source in sources: model_file = sources[source]['model'] - hmm_file_path = None - - if in_place: - pass - #hmm_file_path = model_file - # check here if already hmmpressed and if so return - else: - hmm_file_path = os.path.join(tmp_dir, source + '.hmm') - hmm_file = open(hmm_file_path, 'wb') - hmm_file.write(gzip.open(model_file, 'rb').read()) - hmm_file.close() + hmm_file_path = os.path.join(tmp_dir, source + '.hmm') + hmm_file = open(hmm_file_path, 'wb') + hmm_file.write(gzip.open(model_file, 'rb').read()) + hmm_file.close() log_file_path = log_file_path = os.path.join(tmp_dir, 'hmmpress.log') cmd_line = ['hmmpress', hmm_file_path] From 2bd3efdfe9c93244e22c1050fd92ccf91645e28d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Wed, 22 Apr 2020 21:14:04 -0500 Subject: [PATCH 379/400] remove extra pfams line --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 63de99a0cf..2253bbed24 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,3 @@ anvio/data/misc/SCG_TAXONOMY/GTDB/SCG_SEARCH_DATABASES/*.dmnd anvio/tests/sandbox/test_visualize_split_coverages/TEST_OUTDIR anvio/data/misc/KEGG/ -anvio/data/misc/Pfam/ From eea5713f51b5e06e21d2720d4dbeb21f830747a3 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 23 Apr 2020 10:47:42 -0500 Subject: [PATCH 380/400] cosmetic updates to hmmer.py --- anvio/drivers/hmmer.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index cb07beae8a..5efbd8a8ca 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -5,8 +5,9 @@ import io import gzip import shutil -from threading import Thread, Lock import glob +from threading import Thread, Lock + import anvio import anvio.utils as utils @@ -33,6 +34,7 @@ class HMMer: def __init__(self, target_files_dict, num_threads_to_use=1, program_to_use="hmmscan", progress=progress, run=run): """A class to streamline HMM runs.""" + self.num_threads_to_use = num_threads_to_use self.program_to_use = program_to_use self.progress = progress @@ -43,8 +45,8 @@ def __init__(self, target_files_dict, num_threads_to_use=1, program_to_use="hmms acceptable_programs = ["hmmscan", "hmmsearch"] if self.program_to_use not in acceptable_programs: - raise ConfigError("HMMer class here. You are attemptimg to use the program %s to run HMMs, but we don't recognize it. The currently" - " supported programs are: %s" % (self.program_to_use, ", ".join(acceptable_programs))) + raise ConfigError("HMMer class here. You are attemptimg to use the program %s to run HMMs, but we don't recognize it. The currently " + "supported programs are: %s" % (self.program_to_use, ", ".join(acceptable_programs))) for source in target_files_dict: tmp_dir = filesnpaths.get_temp_directory_path() @@ -74,10 +76,10 @@ def verify_hmmpress_output(self, hmm_path): expected_extensions = ['h3f', 'h3i', 'h3m', 'h3p'] for ext in expected_extensions: if not os.path.exists(base_path + ext): - raise ConfigError("It appears that hmmpress was not properly run on the hmm profiles at %s. The \ - file %s does not exist. It is likely that you will have to set up your profiles \ - again by running a program such as `anvi-setup-pfams` or `anvi-setup-kegg-kofams`. \ - We are very sorry about this." % (hmm_path, base_path + ext)) + raise ConfigError("It appears that hmmpress was not properly run on the hmm profiles at %s. The " + "file %s does not exist. 
It is likely that you will have to set up your profiles " + "again by running a program such as `anvi-setup-pfams` or `anvi-setup-kegg-kofams`. " + "We are very sorry about this." % (hmm_path, base_path + ext)) def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms): @@ -116,7 +118,6 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode # check if all hmmpress files are in the HMM directory self.verify_hmmpress_output(hmm) - # we may want to throw a more descriptive error *here* instead of failing in the verify function workers = [] @@ -129,12 +130,12 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode cores_per_process = self.num_threads_to_use // num_parts self.run.warning("You requested %s cores but there were only %s entries in the fasta for the target '%s'. " - "Anvi'o will use %s process with %s cores each instead. I hope thats okay for you. " % + "Anvi'o will use %s process with %s cores each instead. I hope thats okay for you. " % (str(self.num_threads_to_use), str(num_parts), target, str(num_parts), cores_per_process)) if alphabet in ['DNA', 'RNA'] and self.program_to_use == 'hmmsearch': self.run.warning("You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. " - "We hope that is alright." % (self.program_to_use, alphabet)) + "We hope that is alright." % (self.program_to_use, alphabet)) for part_file in self.target_files_dict[target]: From 68d56f832c26f0642dbdbf2182a4092f85a6b064 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 23 Apr 2020 11:23:31 -0500 Subject: [PATCH 381/400] update hmmer-program param help --- anvio/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/anvio/__init__.py b/anvio/__init__.py index 1c60d4ce71..1c9f66ad13 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -713,7 +713,13 @@ def get_args(parser): ['--hmmer-program'], {'type': str, 'required': False, - 'help': "Which of the HMMER programs to use to run HMMs (ie, hmmscan, hmmsearch)"} + 'help': "Which of the HMMER programs to use to run HMMs (hmmscan or hmmsearch). By default " + "anvi'o will use hmmscan for typical HMM operations like those in anvi-run-hmms (as these " + "tend to scan a very large number of genes against a relatively small number of HMMs), " + "but if you are using this program to scan a very large number of HMMs, hmmsearch might " + "be a better choice for performance. For this reason, hmmsearch is the default in operations like " + "anvi-run-pfams and anvi-run-kegg-kofams. 
See this article for a discussion on the performance " + "of these two programs: https://cryptogenomicon.org/2011/05/27/hmmscan-vs-hmmsearch-speed-the-numerology/"} ), 'hmm-source': ( ['--hmm-source'], From 1b292c1aadeab12481a63b159a056d28f7cfb654 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 23 Apr 2020 11:26:08 -0500 Subject: [PATCH 382/400] capitalize KEGG --- anvio/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/__init__.py b/anvio/__init__.py index 1c9f66ad13..e861fc65a1 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -2368,7 +2368,7 @@ def get_version_tuples(): ("Pan DB version", __pan__version__), ("Genome data storage version", __genomes_storage_version__), ("Structure DB version", __structure__version__), - ("Kegg Modules DB version", __kegg_modules_version__)] + ("KEGG Modules DB version", __kegg_modules_version__)] def print_version(): From 493606ac918d0ed79081db69458950b5e3762b0b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 23 Apr 2020 12:02:48 -0500 Subject: [PATCH 383/400] fix docstring format --- anvio/db.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/anvio/db.py b/anvio/db.py index 49a63f9020..8ee057c846 100644 --- a/anvio/db.py +++ b/anvio/db.py @@ -601,8 +601,25 @@ def get_some_rows_from_table_as_dict(self, table_name, where_clause, error_if_no by using `WHERE column = value` notation, which is not possible with the more generalized function. - row_num_as_key bool added as parameter so this function works for KEGG MODULES.db, which does not have unique IDs in the - first column. If True, the returned dictionary will be keyed by integers from 0 to (# rows returned - 1) + Parameters + ========== + table_name: str + which table to get rows from + where_clause: str + SQL-style where clause for row selection + error_if_no_data: bool + if true, this function will raise an error if no data is selected from the table. otherwise, it will + quietly return the empty dictionary + string_the_key: bool + if true, the row number will be converted to a string before being used as a key in the dictionary + row_num_as_key: bool + added as parameter so this function works for KEGG MODULES.db, which does not have unique IDs in the + first column. 
If True, the returned dictionary will be keyed by integers from 0 to (# rows returned - 1) + + Returns + ======= + results_dict: dictionary + contains the requested rows from the table """ results_dict = {} From a2533fec72533b095323a20406bedb3257311fd8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 23 Apr 2020 12:31:07 -0500 Subject: [PATCH 384/400] explicit log file path printed per thread --- anvio/drivers/hmmer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index 5efbd8a8ca..9898a6a7b8 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -110,10 +110,7 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.run.info('HMMer program used for search', self.program_to_use) tmp_dir = os.path.dirname(self.target_files_dict[target][0]) - log_file_path = os.path.join(tmp_dir, '*_log') - self.run.info('Temporary work dir', tmp_dir) - self.run.info('Log files', log_file_path) # check if all hmmpress files are in the HMM directory @@ -137,12 +134,15 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode self.run.warning("You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. " "We hope that is alright." % (self.program_to_use, alphabet)) - + thread_num = 0 for part_file in self.target_files_dict[target]: log_file = part_file + '_log' output_file = part_file + '_output' shitty_file = part_file + '_shitty' + self.run.info('Log file for thread %s' % thread_num, log_file) + thread_num += 1 + if noise_cutoff_terms: cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use, '-o', output_file, *noise_cutoff_terms.split(), From c5ae53865bbbdc55fed65c34588784a13f9a97bc Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 23 Apr 2020 13:28:20 -0500 Subject: [PATCH 385/400] reorder imports --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 43a84f2a68..5b690dac3d 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -10,10 +10,10 @@ import re import copy import statistics -from scipy import stats import json import time import hashlib +from scipy import stats import anvio import anvio.db as db From 49422a846be416c1e903d5b0f72857da8c2f3910 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 11:53:53 -0500 Subject: [PATCH 386/400] format docstring --- anvio/drivers/hmmer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index 9898a6a7b8..8bced34a30 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -67,8 +67,8 @@ def verify_hmmpress_output(self, hmm_path): PARAMETERS ========== - hmm_path string, the path at which the HMM profiles are located - + hmm_path: string + the path at which the HMM profiles are located """ for file_path in glob.glob(os.path.join(hmm_path, '*.hmm')): From 924ba518765a1d788b0719f8e83fe82c118dad58 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 11:54:26 -0500 Subject: [PATCH 387/400] formatting, structure, and output clarification changes --- anvio/kegg.py | 482 +++++++++++++++++++++++++++++++------------------- 1 file changed, 297 insertions(+), 185 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 5b690dac3d..a76545f9b0 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -63,6 +63,7 @@ def __init__(self, args): self.ko_list_file_path = 
os.path.join(self.kegg_data_dir, "ko_list.txt") self.kegg_module_file = os.path.join(self.kegg_data_dir, "modules.keg") self.kegg_pathway_file = os.path.join(self.kegg_data_dir, "pathways.keg") + self.kegg_modules_db_path = os.path.join(self.kegg_data_dir, "MODULES.db") def setup_ko_dict(self): @@ -95,8 +96,8 @@ def setup_ko_dict(self): orphan_ko_dict.update({ko:self.ko_dict[ko] for ko in self.ko_no_threshold_list}) if not os.path.exists(self.orphan_data_dir): # should not happen but we check just in case - raise ConfigError("Hmm. Something is out of order. The orphan data directory %s does not exist \ - yet, but it needs to in order for the setup_ko_dict() function to work." % self.orphan_data_dir) + raise ConfigError("Hmm. Something is out of order. The orphan data directory %s does not exist " + "yet, but it needs to in order for the setup_ko_dict() function to work." % self.orphan_data_dir) orphan_ko_path = os.path.join(self.orphan_data_dir, "01_ko_fams_with_no_threshold.txt") orphan_ko_headers = ["threshold","score_type","profile_type","F-measure","nseq","nseq_used","alen","mlen","eff_nseq","re/pos", "definition"] utils.store_dict_as_TAB_delimited_file(orphan_ko_dict, orphan_ko_path, key_header="knum", headers=orphan_ko_headers) @@ -104,6 +105,7 @@ def setup_ko_dict(self): [self.ko_dict.pop(ko) for ko in self.ko_skip_list] [self.ko_dict.pop(ko) for ko in self.ko_no_threshold_list] + def get_ko_skip_list(self): """The purpose of this function is to determine which KO numbers have no associated data or just no score threshold in the ko_list file. @@ -146,6 +148,7 @@ def get_ko_skip_list(self): no_threshold_list.append(k) return skip_list, no_threshold_list + class KeggSetup(KeggContext): """Class for setting up KEGG Kofam HMM profiles and modules. @@ -194,23 +197,29 @@ def is_database_exists(self): """This function determines whether the user has already downloaded the Kofam HMM profiles and KEGG modules.""" if os.path.exists(self.kofam_hmm_file_path): - raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag if you want to re-download it." % self.kegg_data_dir) + raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag " + "if you want to re-download it." % self.kegg_data_dir) if os.path.exists(self.kegg_module_file): - raise ConfigError("Interestingly, though KOfam HMM profiles are not installed on your system, KEGG module information seems to have been \ - already downloaded in %s. Please use the --reset flag to re-download everything from scratch." % self.kegg_data_dir) + raise ConfigError("Interestingly, though KOfam HMM profiles are not installed on your system, KEGG module " + "information seems to have been already downloaded in %s. Please use the --reset flag to " + "re-download everything from scratch." % self.kegg_data_dir) if os.path.exists(self.kegg_pathway_file): - raise ConfigError("Interestingly, though KOfam HMM profiles are not installed on your system, KEGG pathway information seems to have been \ - already downloaded in %s. Please use the --reset flag to re-download everything from scratch." % self.kegg_data_dir) + raise ConfigError("Interestingly, though KOfam HMM profiles are not installed on your system, KEGG pathway " + "information seems to have been already downloaded in %s. Please use the --reset flag to " + "re-download everything from scratch." 
% self.kegg_data_dir) if os.path.exists(self.module_data_dir): - raise ConfigError("It seems the KEGG module directory %s already exists on your system. This is even more strange because Kofam HMM \ - profiles have not been downloaded. We suggest you to use the --reset flag to download everything from scratch." % self.module_data_dir) + raise ConfigError("It seems the KEGG module directory %s already exists on your system. This is even more " + "strange because Kofam HMM profiles have not been downloaded. We suggest you to use the " + "--reset flag to download everything from scratch." % self.module_data_dir) if os.path.exists(self.pathway_data_dir): - raise ConfigError("It seems the KEGG pathway directory %s already exists on your system. This is even more strange because Kofam HMM \ - profiles have not been downloaded. We suggest you to use the --reset flag to download everything from scratch." % self.pathway_data_dir) + raise ConfigError("It seems the KEGG pathway directory %s already exists on your system. This is even more " + "strange because Kofam HMM profiles have not been downloaded. We suggest you to use the " + "--reset flag to download everything from scratch." % self.pathway_data_dir) + def download_profiles(self): """This function downloads the Kofam profiles.""" @@ -221,6 +230,7 @@ def download_profiles(self): utils.download_file(self.database_url + '/' + file_name, os.path.join(self.kegg_data_dir, file_name), progress=self.progress, run=self.run) + def process_module_file(self): """This function reads the kegg module file into a dictionary. It should be called during setup to get the KEGG module numbers so that KEGG modules can be downloaded. @@ -285,8 +295,11 @@ def process_module_file(self): self.module_dict[mnum] = {"name" : fields[2], "type" : current_module_type, "category" : current_category, "subcategory" : current_subcategory} # unknown code else: - raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has \ - made the file unparseable. Sad. :(" % (self.kegg_module_file, first_char)) + raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has " + "made the file unparseable. It is likely that an update to KEGG has broken " + "things such that anvi'o doesn't know what is going on anymore. Sad, we know. :( " + "Please contact the developers to see if this is a fixable issue, and in the " + "meantime use an older version of the KEGG data directory (if you have one)." % (self.kegg_module_file, first_char)) self.progress.end() @@ -353,16 +366,19 @@ def process_pathway_file(self): self.pathway_dict[konum] = {"name" : fields[2], "category" : current_category, "subcategory" : current_subcategory} # unknown code else: - raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has \ - made the file unparseable. Sad. :(" % (self.kegg_pathway_file, first_char)) + raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has " + "made the file unparseable. It is likely that an update to KEGG has broken " + "things such that anvi'o doesn't know what is going on anymore. Sad, we know. :( " + "Please contact the developers to see if this is a fixable issue, and in the " + "meantime use an older version of the KEGG data directory (if you have one)." % (self.kegg_pathway_file, first_char)) self.progress.end() def download_modules(self): """This function downloads the KEGG modules. 
- To do so, it also processes the KEGG module file into a dictionary via the - process_module_file() function. To verify that each file has been downloaded properly, we check that the last line is '///'. + To do so, it also processes the KEGG module file into a dictionary via the process_module_file() function. + To verify that each file has been downloaded properly, we check that the last line is '///'. """ self.run.info("KEGG Module Database URL", self.kegg_rest_api_get) @@ -385,8 +401,8 @@ def download_modules(self): f.seek(f.tell() - 4, os.SEEK_SET) last_line = f.readline().strip('\n') if not last_line == '///': - raise ConfigError("The KEGG module file %s was not downloaded properly. We were expecting the last line in the file \ - to be '///', but instead it was %s." % (file_path, last_line)) + raise ConfigError("The KEGG module file %s was not downloaded properly. We were expecting the last line in the file " + "to be '///', but instead it was %s." % (file_path, last_line)) def download_pathways(self): @@ -417,8 +433,8 @@ def download_pathways(self): f.seek(f.tell() - 4, os.SEEK_SET) last_line = f.readline().strip('\n') if not last_line == '///': - raise ConfigError("The KEGG pathway file %s was not downloaded properly. We were expecting the last line in the file \ - to be '///', but instead it was %s." % (file_path, last_line)) + raise ConfigError("The KEGG pathway file %s was not downloaded properly. We were expecting the last line in the file " + "to be '///', but instead it was %s." % (file_path, last_line)) def decompress_files(self): @@ -450,9 +466,10 @@ def confirm_downloaded_profiles(self): if k not in self.ko_skip_list: hmm_path = os.path.join(self.kegg_data_dir, "profiles/%s.hmm" % k) if not os.path.exists(hmm_path): - raise ConfigError("The KOfam HMM profile at %s does not exist. This probably means that something went wrong \ - while downloading the KOfam database. Please run `anvi-setup-kegg-kofams` with the --reset \ - flag." % (hmm_path)) + raise ConfigError("The KOfam HMM profile at %s does not exist. This probably means that something went wrong " + "while downloading the KOfam database. Please run `anvi-setup-kegg-kofams` with the --reset " + "flag." % (hmm_path)) + def move_orphan_files(self): """This function moves the following to the orphan files directory: @@ -460,13 +477,13 @@ def move_orphan_files(self): - profiles that do not have ko_list entries - profiles whose ko_list entries have no scoring threshold (in ko_no_threshold_list) - And, the following profiles should not have been downloaded, but we check if they exist and move any that do: + And, the following profiles should not have been downloaded, but if they were then we move them, too: - profiles whose ko_list entries have no data at all (in ko_skip_list) """ if not os.path.exists(self.orphan_data_dir): # should not happen but we check just in case - raise ConfigError("Hmm. Something is out of order. The orphan data directory %s does not exist \ - yet, but it needs to in order for the move_orphan_files() function to work." % self.orphan_data_dir) + raise ConfigError("Hmm. Something is out of order. The orphan data directory %s does not exist " + "yet, but it needs to in order for the move_orphan_files() function to work." 
% self.orphan_data_dir) no_kofam_path = os.path.join(self.orphan_data_dir, "00_hmm_profiles_with_no_ko_fams.hmm") no_kofam_file_list = [] @@ -491,27 +508,26 @@ def move_orphan_files(self): if no_kofam_file_list: utils.concatenate_files(no_kofam_path, no_kofam_file_list, remove_concatenated_files=remove_old_files) self.progress.reset() - self.run.warning("Please note that while anvi'o was building your databases, she found %d \ - HMM profiles that did not have any matching KOfam entries. We have removed those HMM \ - profiles from the final database. You can find them under the directory '%s'." - % (len(no_kofam_file_list), self.orphan_data_dir)) - + self.run.warning("Please note that while anvi'o was building your databases, she found %d " + "HMM profiles that did not have any matching KOfam entries. We have removed those HMM " + "profiles from the final database. You can find them under the directory '%s'." + % (len(no_kofam_file_list), self.orphan_data_dir)) if no_threshold_file_list: utils.concatenate_files(no_threshold_path, no_threshold_file_list, remove_concatenated_files=remove_old_files) self.progress.reset() - self.run.warning("Please note that while anvi'o was building your databases, she found %d \ - KOfam entries that did not have any threshold to remove weak hits. We have removed those HMM \ - profiles from the final database. You can find them under the directory '%s'." - % (len(no_threshold_file_list), self.orphan_data_dir)) + self.run.warning("Please note that while anvi'o was building your databases, she found %d " + "KOfam entries that did not have any threshold to remove weak hits. We have removed those HMM " + "profiles from the final database. You can find them under the directory '%s'." + % (len(no_threshold_file_list), self.orphan_data_dir)) if no_data_file_list: utils.concatenate_files(no_data_path, no_data_file_list, remove_concatenated_files=remove_old_files) self.progress.reset() - self.run.warning("Please note that while anvi'o was building your databases, she found %d \ - HMM profiles that did not have any associated data (besides an annotation) in their KOfam entries. \ - We have removed those HMM profiles from the final database. You can find them under the directory '%s'." - % (len(no_data_file_list), self.orphan_data_dir)) + self.run.warning("Please note that while anvi'o was building your databases, she found %d " + "HMM profiles that did not have any associated data (besides an annotation) in their KOfam entries. " + "We have removed those HMM profiles from the final database. You can find them under the directory '%s'." + % (len(no_data_file_list), self.orphan_data_dir)) def run_hmmpress(self): @@ -539,23 +555,26 @@ def run_hmmpress(self): ret_val = utils.run_command(cmd_line, log_file_path) if ret_val: - raise ConfigError("Hmm. There was an error while running `hmmpress` on the Kofam HMM profiles. \ - Check out the log file ('%s') to see what went wrong." % (log_file_path)) + raise ConfigError("Hmm. There was an error while running `hmmpress` on the Kofam HMM profiles. " + "Check out the log file ('%s') to see what went wrong." % (log_file_path)) else: # getting rid of the log file because hmmpress was successful os.remove(log_file_path) self.progress.end() + def setup_modules_db(self): - """This function creates the Modules DB from the Kegg Module files. 
""" + """This function creates the Modules DB from the Kegg Module files.""" - mod_db = KeggModulesDatabase(os.path.join(self.kegg_data_dir, "MODULES.db"), args=self.args, module_dictionary=self.module_dict, run=run, progress=progress) + mod_db = KeggModulesDatabase(self.kegg_modules_db_path, args=self.args, module_dictionary=self.module_dict, run=run, progress=progress) mod_db.create() def setup_profiles(self): - """This is a driver function which executes the KEGG setup process by downloading, decompressing, and hmmpressing the KOfam profiles. + """This is a driver function which executes the KEGG setup process. + + It downloads, decompresses, and hmmpresses the KOfam profiles. It also downloads and processes the KEGG Module files into the MODULES.db. """ @@ -566,6 +585,7 @@ def setup_profiles(self): self.run_hmmpress() self.setup_modules_db() + class KeggRunHMMs(KeggContext): """ Class for running `hmmscan` against the KOfam database and adding the resulting hits to contigs DB for later metabolism prediction. @@ -590,21 +610,22 @@ def __init__(self, args, run=run, progress=progress): # verify that Kofam HMM profiles have been set up if not os.path.exists(self.kofam_hmm_file_path): - raise ConfigError("Anvi'o is unable to find the Kofam.hmm file at %s. This can happen one of two ways. Either you \ - didn't specify the correct KEGG data directory using the flag --kegg-data-dir, or you haven't \ - yet set up the Kofam data by running `anvi-setup-kegg-kofams`. Hopefully you now know what to do \ - to fix this problem. :) " % self.kegg_data_dir) + raise ConfigError("Anvi'o is unable to find the Kofam.hmm file at %s. This can happen one of two ways. Either you " + "didn't specify the correct KEGG data directory using the flag --kegg-data-dir, or you haven't " + "yet set up the Kofam data by running `anvi-setup-kegg-kofams`. Hopefully you now know what to do " + "to fix this problem. :) " % self.kegg_data_dir) utils.is_contigs_db(self.contigs_db_path) self.setup_ko_dict() # read the ko_list file into self.ko_dict # load existing kegg modules db - self.kegg_modules_db = KeggModulesDatabase(os.path.join(self.kegg_data_dir, "MODULES.db"), args=self.args) + self.kegg_modules_db = KeggModulesDatabase(self.kegg_modules_db_path, args=self.args) def set_hash_in_contigs_db(self): - """Modify the contigs DB self table to indicate which MODULES.db has been used to annotate it""" + """Modifies the contigs DB self table to indicate which MODULES.db has been used to annotate it.""" + A = lambda x: self.args.__dict__[x] if x in self.args.__dict__ else None self.contigs_db_path = A('contigs_db') @@ -614,22 +635,37 @@ def set_hash_in_contigs_db(self): def get_annotation_from_ko_dict(self, knum, ok_if_missing_from_dict=False): + """Returns the functional annotation of the provided KO number. + + Parameters + ========== + knum : str + The KO number for which to get an annotation for + ok_if_missing_from_dict : bool + If false, not finding the KO will raise an error. If true, the function will quietly return an "Unknown" annotation string for the missing KO + + Returns + ======= + annotation : str + """ + if not self.ko_dict: - raise ConfigError("Oops! The ko_list file has not been properly loaded, so get_annotation_from_ko_dict() is \ - extremely displeased and unable to function properly. Please refrain from calling this \ - function until after setup_ko_dict() has been called.") + raise ConfigError("Oops! 
The ko_list file has not been properly loaded, so get_annotation_from_ko_dict() is " + "extremely displeased and unable to function properly. Please refrain from calling this " + "function until after setup_ko_dict() has been called.") if not knum in self.ko_dict: if ok_if_missing_from_dict: return "Unknown function with KO num %s" % knum else: - raise ConfigError("It seems %s found a KO number that does not exist\ - in the KOfam ko_list file: %s" % (self.hmm_program, knum)) + raise ConfigError("It seems %s found a KO number that does not exist " + "in the KOfam ko_list file: %s" % (self.hmm_program, knum)) return self.ko_dict[knum]['definition'] + def process_kofam_hmms(self): - """This is a driver function for running HMMs against the KOfam database and processing the hits into the provided contigs DB""" + """This is a driver function for running HMMs against the KOfam database and processing the hits into the provided contigs DB.""" tmp_directory_path = filesnpaths.get_temp_directory_path() contigs_db = ContigsSuperclass(self.args) # initialize contigs db @@ -650,16 +686,16 @@ def process_kofam_hmms(self): gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress) if not hmm_hits_file: - run.info_single("The HMM search returned no hits :/ So there is nothing to add to the contigs database. But\ - now anvi'o will add KOfam as a functional source with no hits, clean the temporary directories\ - and gracefully quit.", nl_before=1, nl_after=1) + run.info_single("The HMM search returned no hits :/ So there is nothing to add to the contigs database. But " + "now anvi'o will add KOfam as a functional source with no hits, clean the temporary directories " + "and gracefully quit.", nl_before=1, nl_after=1) if not anvio.DEBUG: shutil.rmtree(tmp_directory_path) hmmer.clean_tmp_dirs() else: - self.run.warning("Because you ran this script with the --debug flag, anvi'o will not clean up the temporary\ - directories located at %s and %s. Please be responsible for cleaning up this directory yourself \ - after you are finished debugging :)" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)), header="Debug") + self.run.warning("Because you ran this script with the --debug flag, anvi'o will not clean up the temporary " + "directories located at %s and %s. Please be responsible for cleaning up this directory yourself " + "after you are finished debugging :)" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)), header="Debug") gene_function_calls_table.add_empty_sources_to_functional_sources({'KOfam'}) return @@ -721,22 +757,23 @@ def process_kofam_hmms(self): gene_function_calls_table.create(kegg_module_names_dict) gene_function_calls_table.create(kegg_module_classes_dict) else: - self.run.warning("KOfam class has no hits to process. Returning empty handed, but still adding KOfam as \ - a functional source.") + self.run.warning("KOfam class has no hits to process. Returning empty handed, but still adding KOfam as " + "a functional source.") gene_function_calls_table.add_empty_sources_to_functional_sources({'KOfam'}) # mark contigs db with hash of modules.db content for version tracking self.set_hash_in_contigs_db() if anvio.DEBUG: - run.warning("The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up\ - later" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)), header="Debug") + run.warning("The temp directories, '%s' and '%s' are kept. 
Please don't forget to clean those up " + "later" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)), header="Debug") else: - run.info_single('Cleaning up the temp directory (you can use `--debug` if you would\ - like to keep it for testing purposes)', nl_before=1, nl_after=1) + run.info_single("Cleaning up the temp directory (you can use `--debug` if you would " + "like to keep it for testing purposes)", nl_before=1, nl_after=1) shutil.rmtree(tmp_directory_path) hmmer.clean_tmp_dirs() + class KeggMetabolismEstimator(KeggContext): """ Class for reconstructing/estimating metabolism based on hits to KEGG databases. @@ -767,12 +804,12 @@ def __init__(self, args, run=run, progress=progress): if not self.estimate_from_json and not self.contigs_db_path: raise ConfigError("NO INPUT PROVIDED. You must provide (at least) a contigs database to this program, unless you are using the --estimate-from-json " - "flag, in which case you must provide a JSON-formatted file.") + "flag, in which case you must provide a JSON-formatted file.") self.bin_ids_to_process = None if self.bin_id and self.bin_ids_file: - raise ConfigError("You have provided anvi'o with both the individual bin id %s and a file with bin ids (%s). \ - Please make up your mind. Which one do you want an estimate for? :)" % (self.bin_id, self.bin_ids_file)) + raise ConfigError("You have provided anvi'o with both the individual bin id %s and a file with bin ids (%s). " + "Please make up your mind. Which one do you want an estimate for? :)" % (self.bin_id, self.bin_ids_file)) elif self.bin_id: self.bin_ids_to_process = [self.bin_id] elif self.bin_ids_file: @@ -781,17 +818,17 @@ def __init__(self, args, run=run, progress=progress): if self.bin_id or self.bin_ids_file or self.collection_name and not self.profile_db_path: raise ConfigError("You have requested metabolism estimation for a bin or set of bins, but you haven't provided " - "a profiles database. Unfortunately, this just does not work. Please try again.") + "a profiles database. Unfortunately, this just does not work. Please try again.") if self.profile_db_path and not self.collection_name: raise ConfigError("If you provide a profiles DB, you should also provide a collection name.") if self.store_json_without_estimation and not self.json_output_file_path: raise ConfigError("Whoops. You seem to want to store the metabolism dictionary in a JSON file, but you haven't provided the name of that file. " - "Please use the --get-raw-data-as-json flag to do so.") + "Please use the --get-raw-data-as-json flag to do so.") if self.store_json_without_estimation and self.estimate_from_json: raise ConfigError("It is impossible to both estimate metabolism from JSON data and produce a JSON file without estimation at the same time... " - "anvi'o is judging you SO hard right now.") + "anvi'o is judging you SO hard right now.") # init the base class @@ -801,13 +838,13 @@ def __init__(self, args, run=run, progress=progress): utils.is_contigs_db(self.contigs_db_path) # load existing kegg modules db - if not os.path.exists(os.path.join(self.kegg_data_dir, "MODULES.db")): - raise ConfigError("It appears that a modules database (%s) does not exist in the KEGG data directory %s. \ - Perhaps you need to specify a different KEGG directory using --kegg-data-dir. Or perhaps you didn't run \ - `anvi-setup-kegg-kofams`, though we are not sure how you got to this point in that case \ - since you also cannot run `anvi-run-kegg-kofams` without first having run KEGG setup. But fine. 
Hopefully \ - you now know what you need to do to make this message go away." % ("MODULES.db", self.kegg_data_dir)) - self.kegg_modules_db = KeggModulesDatabase(os.path.join(self.kegg_data_dir, "MODULES.db"), args=self.args) + if not os.path.exists(self.kegg_modules_db_path): + raise ConfigError("It appears that a modules database (%s) does not exist in the KEGG data directory %s. " + "Perhaps you need to specify a different KEGG directory using --kegg-data-dir. Or perhaps you didn't run " + "`anvi-setup-kegg-kofams`, though we are not sure how you got to this point in that case " + "since you also cannot run `anvi-run-kegg-kofams` without first having run KEGG setup. But fine. Hopefully " + "you now know what you need to do to make this message go away." % ("MODULES.db", self.kegg_data_dir)) + self.kegg_modules_db = KeggModulesDatabase(self.kegg_modules_db_path, args=self.args) def init_hits_and_splits(self): """This function loads KOfam hits, gene calls, splits, and contigs from the contigs DB. @@ -820,7 +857,8 @@ def init_hits_and_splits(self): RETURNS ======= - kofam_gene_split_contig list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering + kofam_gene_split_contig : list + (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering """ self.progress.new('Loading data from Contigs DB') @@ -842,7 +880,7 @@ def init_hits_and_splits(self): genes_in_splits = contigs_db.db.get_some_columns_from_table(t.genes_in_splits_table_name, "gene_callers_id, split") genes_in_contigs = contigs_db.db.get_some_columns_from_table(t.genes_in_contigs_table_name, "gene_callers_id, contig") kofam_hits = contigs_db.db.get_some_columns_from_table(t.gene_function_calls_table_name, "gene_callers_id, accession", - where_clause="source = 'KOfam'") + where_clause="source = 'KOfam'") min_contig_length_in_contigs_db = contigs_db.db.get_max_value_in_column(t.contigs_info_table_name, "length", return_min_instead=True) contigs_db.disconnect() @@ -857,8 +895,8 @@ def init_hits_and_splits(self): genes_in_contigs = [tpl for tpl in genes_in_contigs if tpl[0] not in gene_calls_without_kofam_hits] if anvio.DEBUG: self.progress.reset() - self.run.warning("The following gene calls in your contigs DB were removed from consideration as they \ - do not have any hits to the KOfam database: %s" % (gene_calls_without_kofam_hits)) + self.run.warning("The following gene calls in your contigs DB were removed from consideration as they " + "do not have any hits to the KOfam database: %s" % (gene_calls_without_kofam_hits)) # get rid of splits and contigs (and their associated gene calls) that are not in the profile DB @@ -919,6 +957,7 @@ def init_hits_and_splits(self): return kofam_gene_split_contig + def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=None, bin_name=None): """This function generates a bin-level dictionary of dictionaries, which associates modules with the KOs that are present in the bin for each module. 
@@ -939,13 +978,17 @@ def mark_kos_present_for_list_of_splits(self, kofam_hits_in_splits, split_list=N PARAMETERS ========== - kofam_hits_in_splits list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering - split_list list of splits we are considering, this is only for debugging output - bin_name name of the bin containing these splits, this is only for debugging output + kofam_hits_in_splits : list + (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering + split_list : list + splits we are considering, this is only for debugging output + bin_name : str + name of the bin containing these splits, this is only for debugging output RETURNS ======= - bin_level_module_dict dict of dicts that maps module number to dictionary of KOs present in the splits for that module + bin_level_module_dict : dictionary of dictionaries + initialized metabolism completeness dictionary for the list of splits (genome, metagenome, or bin) provided """ bin_level_module_dict = {} @@ -1015,8 +1058,10 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): PARAMETERS ========== - mnum string, module number to work on - meta_dict_for_bin metabolism completeness dict for the current bin, to be modified in-place + mnum : string + module number to work on + meta_dict_for_bin : dictionary of dictionaries + metabolism completeness dict for the current bin, to be modified in-place NEW KEYS ADDED TO METABOLISM COMPLETENESS DICT ======= @@ -1029,10 +1074,14 @@ def compute_module_completeness_for_bin(self, mnum, meta_dict_for_bin): RETURNS ======= - over_complete_threshold boolean, whether or not the module is considered "complete" overall based on the threshold fraction of completeness - has_nonessential_step boolean, whether or not the module contains non-essential steps. Used for warning the user about these. - has_no_ko_step boolean, whether or not the module contains steps without associated KOs. Used for warning the user about these. - defined_by_modules boolean, whether or not the module contains steps defined by other modules. Used for going back to adjust completeness later. + over_complete_threshold : boolean + whether or not the module is considered "complete" overall based on the threshold fraction of completeness + has_nonessential_step : boolean + whether or not the module contains non-essential steps. Used for warning the user about these. + has_no_ko_step : boolean + whether or not the module contains steps without associated KOs. Used for warning the user about these. + defined_by_modules : boolean + whether or not the module contains steps defined by other modules. Used for going back to adjust completeness later. 
""" present_list_for_mnum = meta_dict_for_bin[mnum]["kofam_hits"].keys() @@ -1152,12 +1201,15 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): PARAMETERS ========== - mod string, the module number to adjust - meta_dict_for_bin metabolism completeness dictionary for the current bin + mod : string + the module number to adjust + meta_dict_for_bin : dictionary of dictionaries + metabolism completeness dictionary for the current bin RETURNS ======= - now_complete boolean, whether or not the module is NOW considered "complete" overall based on the threshold fraction of completeness + now_complete : boolean + whether or not the module is NOW considered "complete" overall based on the threshold fraction of completeness """ for i in range(len(meta_dict_for_bin[mod]["paths"])): @@ -1178,7 +1230,7 @@ def adjust_module_completeness_for_bin(self, mod, meta_dict_for_bin): num_essential_steps_in_path += 1 else: raise ConfigError("Well. While adjusting completeness estimates for module %s, we found an atomic step in the pathway that we " - "are not quite sure what to do with. Here it is: %s" % (mod, atomic_step)) + "are not quite sure what to do with. Here it is: %s" % (mod, atomic_step)) # now we adjust the previous pathway completeness old_complete_steps_in_path = meta_dict_for_bin[mod]["pathway_completeness"][i] * num_essential_steps_in_path @@ -1257,8 +1309,10 @@ def compute_module_redundancy_for_bin(self, mnum, meta_dict_for_bin): PARAMETERS ========== - mnum string, module number to work on - meta_dict_for_bin metabolism completeness dict for the current bin, to be modified in-place + mnum : string + module number to work on + meta_dict_for_bin : dictionary of dictionaries + metabolism completeness dict for the current bin, to be modified in-place """ @@ -1305,11 +1359,13 @@ def estimate_for_list_of_splits(self, metabolism_dict_for_list_of_splits, bin_na PARAMETERS ========== - metabolism_dict_for_list_of_splits the metabolism completeness dictionary of dictionaries for this list of splits. It contains - one dictionary of module steps and completion information for each module (keyed by module number), - as well as one key num_complete_modules that tracks the number of complete modules found in these splits. - Calling functions should assign this dictionary to a metabolism superdict with the bin name as a key. - bin_name the name of the bin/genome/metagenome that we are working with + metabolism_dict_for_list_of_splits : dictionary of dictionaries + the metabolism completeness dictionary of dictionaries for this list of splits. It contains + one dictionary of module steps and completion information for each module (keyed by module number), + as well as one key num_complete_modules that tracks the number of complete modules found in these splits. + Calling functions should assign this dictionary to a metabolism superdict with the bin name as a key. + bin_name : str + the name of the bin/genome/metagenome that we are working with """ metabolism_dict_for_list_of_splits["num_complete_modules"] = 0 @@ -1357,9 +1413,9 @@ def estimate_for_list_of_splits(self, metabolism_dict_for_list_of_splits, bin_na if not self.quiet: if mods_with_nonessential_steps: self.run.warning("Please note that anvi'o found one or more non-essential steps in the following KEGG modules: %s. " - "At this time, we are not counting these steps in our percent completion estimates. But we still kept track of which " - "of these non-essential steps were found to be complete. 
You can see this information in the output file." - % (", ".join(mods_with_nonessential_steps))) + "At this time, we are not counting these steps in our percent completion estimates. But we still kept track of which " + "of these non-essential steps were found to be complete. You can see this information in the output file." + % (", ".join(mods_with_nonessential_steps))) if mods_with_unassociated_ko: self.run.warning("Just so you know, while estimating the completeness of some KEGG modules, anvi'o saw " @@ -1388,11 +1444,13 @@ def estimate_for_genome(self, kofam_gene_split_contig): PARAMETERS ========== - kofam_gene_split_contig list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering + kofam_gene_split_contig : list + (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering RETURNS ======= - genome_metabolism_dict dictionary mapping genome name to its metabolism completeness dictionary + genome_metabolism_dict : dictionary of dictionary of dictionaries + dictionary mapping genome name to its metabolism completeness dictionary """ genome_metabolism_superdict = {} @@ -1414,11 +1472,13 @@ def estimate_for_bins_in_collection(self, kofam_gene_split_contig): PARAMETERS ========== - kofam_gene_split_contig list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering + kofam_gene_split_contig : list + (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering RETURNS ======= - bins_metabolism_superdict dictionary mapping bin name to its metabolism completeness dictionary + bins_metabolism_superdict : dictionary of dictionary of dictionaries + dictionary mapping bin name to its metabolism completeness dictionary """ bins_metabolism_superdict = {} @@ -1455,11 +1515,13 @@ def estimate_for_contigs_db_for_metagenome(self, kofam_gene_split_contig): PARAMETERS ========== - kofam_gene_split_contig list of (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering + kofam_gene_split_contig : list + (ko_num, gene_call_id, split, contig) tuples, one per KOfam hit in the splits we are considering RETURNS ======= - metagenome_metabolism_superdict dictionary mapping metagenome name to its metabolism completeness dictionary + metagenome_metabolism_superdict : dictionary of dictionary of dictionaries + dictionary mapping metagenome name to its metabolism completeness dictionary """ metagenome_metabolism_superdict = {} @@ -1493,9 +1555,9 @@ def estimate_metabolism_from_json_data(self): for mod, mod_dict in meta_dict_for_bin.items(): if mod == "num_complete_modules": self.run.warning("Your JSON file appears to have been generated from data that already contains metabolic module completeness information. " - "We say this because the key 'num_complete_modules' was found. This isn't a problem; however you should know that anvi'o " - "won't take any of the existing estimation information into account. The only module-level keys that will be used from this file " - "are: %s" % (expected_keys_for_module)) + "We say this because the key 'num_complete_modules' was found. This isn't a problem; however you should know that anvi'o " + "won't take any of the existing estimation information into account. 
The only module-level keys that will be used from this file " + "are: %s" % (expected_keys_for_module)) continue # verify that dict contains the necessary keys for estimation if not expected_keys_for_module.issubset(set(mod_dict.keys())): @@ -1557,7 +1619,7 @@ def estimate_metabolism(self): def store_kegg_metabolism_superdict(self, kegg_superdict): """This function writes the metabolism superdict to a tab-delimited file, and also generates a file summarizing the complete modules. - The metabolism superdict is a three-to-four-level dictionary. The first three levels are: genomes/bins, modules, and module completion information. + The metabolism superdict is a three-to-four-level dictionary. The first three levels are: genomes/metagenomes/bins, modules, and module completion information. The module completion dictionary also has some dictionaries in it, and those make up the fourth level. The structure of the module completion dictionary is like this example: {mnum: {"gene_caller_ids": set([132, 133, 431, 6777]) @@ -1701,17 +1763,17 @@ def __init__(self, db_path, args, module_dictionary=None, run=run, progress=prog days_since_created = self.get_days_since_creation() if not self.quiet and days_since_created >= KEGG_SETUP_INTERVAL: self.run.warning("Just a friendly PSA here: it has been at least %s days since the MODULES.db was created (%s days to be exact). " - "It is entirely possible that KEGG has been updated since then, so perhaps it is a good idea to re-run " - "anvi-setup-kegg-kofams to be sure that you are working with the latest KEGG data. No pressure, though. If you do " - "want to reset your KEGG setup, we STRONGLY encourage saving a copy of your current KEGG data directory, just " - "in case there was an update that breaks everything and you need to go back to your previous KEGG setup. Don't say we " - "didn't warn you. And we will even be so nice as to tell you that your current KEGG data directory is %s" - % (KEGG_SETUP_INTERVAL, days_since_created, self.kegg_data_dir)) + "It is entirely possible that KEGG has been updated since then, so perhaps it is a good idea to re-run " + "anvi-setup-kegg-kofams to be sure that you are working with the latest KEGG data. No pressure, though. If you do " + "want to reset your KEGG setup, we STRONGLY encourage saving a copy of your current KEGG data directory, just " + "in case there was an update that breaks everything and you need to go back to your previous KEGG setup. Don't say we " + "didn't warn you. And we will even be so nice as to tell you that your current KEGG data directory is %s" + % (KEGG_SETUP_INTERVAL, days_since_created, self.kegg_data_dir)) else: # if self.module_dict is None, then we tried to initialize the DB outside of setup if not self.module_dict: - raise ConfigError("ERROR - a new KeggModulesDatabase() cannot be initialized without providing a modules dictionary. This \ - usually happens when you try to access a Modules DB before one has been setup. Running `anvi-setup-kegg-kofams` may fix this.") + raise ConfigError("ERROR - a new KeggModulesDatabase() cannot be initialized without providing a modules dictionary. This " + "usually happens when you try to access a Modules DB before one has been setup. Running `anvi-setup-kegg-kofams` may fix this.") def touch(self): """Creates an empty Modules database on disk, and sets `self.db` to access to it. 
@@ -1722,14 +1784,15 @@ def touch(self): # sanity check to avoid overriding previous Modules DB # this will probably never happen as long as this function is called through the setup script, but we check just in case if os.path.exists(self.db_path): - raise ConfigError("A modules database at %s already exists. Please use the --reset flag when you restart the setup \ - if you really want to get rid of this one and make a new one." % (self.db_path)) + raise ConfigError("A modules database at %s already exists. Please use the --reset flag when you restart the setup " + "if you really want to get rid of this one and make a new one." % (self.db_path)) self.db = db.DB(self.db_path, anvio.__kegg_modules_version__, new_database=True) self.db.create_table(self.module_table_name, self.module_table_structure, self.module_table_types) + def data_vals_sanity_check(self, data_vals, current_data_name, current_module_num): """This function checks if the data values were correctly parsed from a line in a KEGG module file. @@ -1743,13 +1806,17 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu PARAMETERS ========== - data_vals str, the data values field (split from the kegg module line) - current_data_name str, which data name we are working on. It should never be None because we should have already figured this out by parsing the line. - current_module_num str, which module we are working on. We need this to keep track of which modules throw parsing errors. + data_vals : str + the data values field (split from the kegg module line) + current_data_name : str + which data name we are working on. It should never be None because we should have already figured this out by parsing the line. + current_module_num : str + which module we are working on. We need this to keep track of which modules throw parsing errors. RETURNS ======= - is_ok bool, whether the values look correctly formatted or not + is_ok : bool + whether the values look correctly formatted or not """ is_ok = True @@ -1758,8 +1825,8 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu corrected_def = None if not current_data_name: - raise ConfigError("data_vals_sanity_check() cannot be performed when the current data name is None. Something was not right when parsing the KEGG \ - module line.") + raise ConfigError("data_vals_sanity_check() cannot be performed when the current data name is None. Something was not right " + "when parsing the KEGG module line.") elif current_data_name == "ENTRY": # example format: M00175 if data_vals[0] != 'M' or len(data_vals) != 6: @@ -1830,23 +1897,26 @@ def data_vals_sanity_check(self, data_vals, current_data_name, current_module_nu self.num_uncorrected_errors += 1 if self.just_do_it: self.progress.reset() - self.run.warning("While parsing, anvi'o found an uncorrectable issue with a KEGG Module line in module %s, but since you used the --just-do-it flag, \ - anvi'o will quietly ignore this issue and add the line to the MODULES.db anyway. Please be warned that this may break things downstream. \ - In case you are interested, the line causing this issue has data name %s and data value %s" % (current_module_num, current_data_name, data_vals)) + self.run.warning("While parsing, anvi'o found an uncorrectable issue with a KEGG Module line in module %s, but " + "since you used the --just-do-it flag, anvi'o will quietly ignore this issue and add the line " + "to the MODULES.db anyway. Please be warned that this may break things downstream. 
In case you " + "are interested, the line causing this issue has data name %s and data value %s." + % (current_module_num, current_data_name, data_vals)) is_ok = True # let's pretend that everything is alright so that the next function will take the original parsed values else: - raise ConfigError("While parsing, anvi'o found an uncorrectable issue with a KEGG Module line in module %s. The current data name is %s, \ - here is the incorrectly-formatted data value field: %s. If you think this is totally fine and want to ignore errors like this, please \ - re-run the setup with the --just-do-it flag. But if you choose to do that of course we are obliged to inform you that things may eventually \ - break as a result." % (current_module_num, current_data_name, data_vals)) + raise ConfigError("While parsing, anvi'o found an uncorrectable issue with a KEGG Module line in module %s. The " + "current data name is %s, here is the incorrectly-formatted data value field: %s. If you think " + "this is totally fine and want to ignore errors like this, please re-run the setup with the " + "--just-do-it flag. But if you choose to do that of course we are obliged to inform you that things " + "may eventually break as a result." % (current_module_num, current_data_name, data_vals)) if is_corrected: self.num_corrected_errors += 1 if anvio.DEBUG and not self.quiet: self.progress.reset() - self.run.warning("While parsing a KEGG Module line, we found an issue with the formatting. We did our very best to parse the line \ - correctly, but please check that it looks right to you by examining the following values.") + self.run.warning("While parsing a KEGG Module line, we found an issue with the formatting. We did our very best to parse " + "the line correctly, but please check that it looks right to you by examining the following values.") self.run.info("Incorrectly parsed data value field", data_vals) self.run.info("Corrected data values", corrected_vals) self.run.info("Corrected data definition", corrected_def) @@ -1865,16 +1935,21 @@ def parse_kegg_modules_line(self, line, current_module, line_num=None, current_d PARAMETERS ========== - line str, the line to parse - current_module str, which module we are working on. We need this to keep track of which modules throw parsing errors - line_num int, which line number we are working on. We need this to keep track of which entities come from the same line of the file. - current_data_name str, which data name we are working on. If this is None, we need to parse this info from the first field in the line. + line : str + the line to parse + current_module : str + which module we are working on. We need this to keep track of which modules throw parsing errors + line_num : int + which line number we are working on. We need this to keep track of which entities come from the same line of the file. + current_data_name : str + which data name we are working on. If this is None, we need to parse this info from the first field in the line. RETURNS ======= - line_entries a list of tuples, each containing information for one db entry, namely data name, data value, data definition, and line number. - Not all parts of the db entry will be included (module num, for instance), so this information must be parsed and combined with - the missing information before being added to the database. + line_entries : list + tuples, each containing information for one db entry, namely data name, data value, data definition, and line number. 
+ Not all parts of the db entry will be included (module num, for instance), so this information must be parsed and combined with + the missing information before being added to the database. """ fields = re.split('\s{2,}', line) @@ -1886,9 +1961,9 @@ def parse_kegg_modules_line(self, line, current_module, line_num=None, current_d if not current_data_name: # sanity check: if line starts with space then there is no data name field and we should have passed a current_data_name if line[0] == ' ': - raise ConfigError("Oh, please. Some silly developer (you know who you are) has tried to call parse_kegg_modules_line() on \ - a line without a data name field, and forgot to give it the current data name. Shame on you, go fix this. (For reference here \ - is the line: %s)" % (line)) + raise ConfigError("Oh, please. Some silly developer (you know who you are) has tried to call parse_kegg_modules_line() on " + "a line without a data name field, and forgot to give it the current data name. Shame on you, go fix " + "this. (For reference here is the line: %s)" % (line)) current_data_name = fields[0] # note that if data name is known, first field still exists but is actually the empty string '' @@ -1925,11 +2000,12 @@ def create(self): # sanity check that we setup the modules previously. # It shouldn't be a problem since this function should only be called during the setup process after modules download, but just in case. if not os.path.exists(self.module_data_dir) or len(self.module_dict.keys()) == 0: - raise ConfigError("Appparently, the Kegg Modules were not correctly setup and now all sorts of things are broken. The \ - Modules DB cannot be created from broken things. BTW, this error is not supposed to happen to anyone except maybe developers, so \ - if you do not fall into that category you are likely in deep doo-doo. Maybe re-running setup with --reset will work? (if not, you \ - probably should email/Slack/telepathically cry out for help to the developers). Anyway, if this helps make things any clearer, \ - the number of modules in the module dictionary is currently %s" % len(self.module_dict.keys())) + raise ConfigError("Appparently, the Kegg Modules were not correctly setup and now all sorts of things are broken. The " + "Modules DB cannot be created from broken things. BTW, this error is not supposed to happen to anyone " + "except maybe developers, so if you do not fall into that category you are likely in deep doo-doo. " + "Maybe re-running setup with --reset will work? (if not, you probably should email/Slack/telepathically " + "cry out for help to the developers). Anyway, if this helps make things any clearer, the number of modules " + "in the module dictionary is currently %s" % len(self.module_dict.keys())) # init the Modules table mod_table = KeggModulesTable(self.module_table_name) @@ -1986,21 +2062,21 @@ def create(self): # warn user about parsing errors if anvio.DEBUG: - self.run.warning("Several parsing errors were encountered while building the KEGG Modules DB. \ - Below you will see which modules threw each type of parsing error. Note that modules which threw multiple \ - errors will occur in the list as many times as it threw each error.") + self.run.warning("Several parsing errors were encountered while building the KEGG Modules DB. " + "Below you will see which modules threw each type of parsing error. 
Note that modules which " + "threw multiple errors will occur in the list as many times as it threw each error.") self.run.info("Bad line splitting (usually due to rogue or missing spaces)", self.parsing_error_dict["bad_line_splitting"]) self.run.info("Bad KEGG code format (not corrected; possibly problematic)", self.parsing_error_dict["bad_kegg_code_format"]) else: # less verbose - self.run.warning("First things first - don't panic. Several parsing errors were encountered while building the KEGG Modules DB. But that \ - is probably okay, because if you got to this point it is likely that we already fixed all of them ourselves. So don't worry too much. \ - Below you will see how many of each type of error was encountered. If you would like to see which modules threw these errors, please \ - re-run the setup using the --debug flag (you will also probably need the --reset flag). When doing so, you will also see which lines \ - caused issues; this can be a lot of output, so you can suppress the line-specific output with the --quiet flag if that makes things easier to read. \ - So, in summary: You can probably ignore this warning. But if you want more info: \ - run setup again with --reset --debug --quiet to see exactly which modules had issues, or \ - run --reset --debug to see exactly which lines in which modules had issues. \ - Now, here is a kiss for you because you have been so patient and good with anvi'o 😚") + self.run.warning("First things first - don't panic. Several parsing errors were encountered while building the KEGG Modules DB. " + "But that is probably okay, because if you got to this point it is likely that we already fixed all of them " + "ourselves. So don't worry too much. Below you will see how many of each type of error was encountered. If " + "you would like to see which modules threw these errors, please re-run the setup using the --debug flag (you " + "will also probably need the --reset flag). When doing so, you will also see which lines caused issues; this " + "can be a lot of output, so you can suppress the line-specific output with the --quiet flag if that makes things " + "easier to read. So, in summary: You can probably ignore this warning. But if you want more info: run setup again " + "with --reset --debug --quiet to see exactly which modules had issues, or run with --reset --debug to see exactly " + "which lines in which modules had issues. 
Now, here is a kiss for you because you have been so patient and good with anvi'o 😚") self.run.info("Bad line splitting (usually due to rogue or missing spaces)", len(self.parsing_error_dict["bad_line_splitting"])) self.run.info("Bad KEGG code format (usually not correctable)", len(self.parsing_error_dict["bad_kegg_code_format"])) @@ -2011,7 +2087,6 @@ def create(self): self.run.info('Number of parsing errors (corrected)', self.num_corrected_errors, quiet=self.quiet) self.run.info('Number of parsing errors (uncorrected)', self.num_uncorrected_errors, quiet=self.quiet) - # record some useful metadata self.db.set_meta_value('db_type', 'modules') self.db.set_meta_value('num_modules', num_modules_parsed) @@ -2038,7 +2113,6 @@ def get_db_content_hash(self): return str(hashlib.sha224(mods_and_orths.encode('utf-8')).hexdigest())[0:12] - # KEGG Modules Table functions for data access and parsing start below # ==================================================================== def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): @@ -2050,12 +2124,15 @@ def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): PARAMETERS ========== - module_num str, the module to fetch data for - data_name str, which data_name field we want + module_num : str + the module to fetch data for + data_name : str + which data_name field we want RETURNS ======= - data_values_to_ret list of str, the data_values corresponding to the module/data_name pair + data_values_to_ret : list of str + the data_values corresponding to the module/data_name pair """ where_clause_string = "module = '%s'" % (module_num) @@ -2068,25 +2145,30 @@ def get_data_value_entries_for_module_by_data_name(self, module_num, data_name): data_values_to_ret.append(dict_from_mod_table[key]['data_value']) if not data_values_to_ret: - self.run.warning("Just so you know, we tried to fetch data from the KEGG Modules database for the data_name field %s and KEGG module %s, \ - but didn't come up with anything, so an empty list is being returned. This may cause errors down the line, and if so we're very sorry for that.") + self.run.warning("Just so you know, we tried to fetch data from the KEGG Modules database for the data_name field %s " + "and KEGG module %s, but didn't come up with anything, so an empty list is being returned. 
This may " + "cause errors down the line, and if so we're very sorry for that.") return data_values_to_ret + def get_all_modules_as_list(self): """This function returns a list of all modules in the DB.""" return self.db.get_single_column_from_table(self.module_table_name, 'module', unique=True) + def get_all_knums_as_list(self): """This function returns a list of all KO numbers in the DB.""" where_clause_string = "data_name = 'ORTHOLOGY'" return self.db.get_single_column_from_table(self.module_table_name, 'data_value', unique=True, where_clause=where_clause_string) + def get_modules_for_knum(self, knum): """This function returns a list of modules that the given KO belongs to.""" where_clause_string = "data_value = '%s'" % (knum) return self.db.get_single_column_from_table(self.module_table_name, 'module', unique=True, where_clause=where_clause_string) + def get_module_classes_for_knum_as_dict(self, knum): """This function returns the classes for the modules that a given KO belongs to in a dictionary of dictionaries keyed by module number.""" mods = self.get_modules_for_knum(knum) @@ -2095,6 +2177,7 @@ def get_module_classes_for_knum_as_dict(self, knum): all_mods_classes_dict[mnum] = self.get_kegg_module_class_dict(mnum) return all_mods_classes_dict + def get_module_classes_for_knum_as_list(self, knum): """This function returns the classes for the modules that a given KO belongs to as a list of strings.""" mods = self.get_modules_for_knum(knum) @@ -2104,12 +2187,14 @@ def get_module_classes_for_knum_as_list(self, knum): all_mods_classes_list.append(mod_class) return all_mods_classes_list + def get_module_name(self, mnum): """This function returns the name of the specified KEGG module.""" where_clause_string = "module = '%s'" % (mnum) # there should only be one NAME per module, so we return the first list element return self.get_data_value_entries_for_module_by_data_name(mnum, "NAME")[0] + def get_module_names_for_knum(self, knum): """This function returns all names of each KEGG module that the given KO belongs to in a dictionary keyed by module number.""" mods = self.get_modules_for_knum(knum) @@ -2118,6 +2203,7 @@ def get_module_names_for_knum(self, knum): module_names[mnum] = self.get_module_name(mnum) return module_names + def parse_kegg_class_value(self, class_data_val): """This function takes a data_value string for the CLASS field in the modules table and parses it into a dictionary. @@ -2129,6 +2215,7 @@ def parse_kegg_class_value(self, class_data_val): class_dict = {"class" : fields[0], "category" : fields[1], "subcategory" : fields[2] if len(fields) > 2 else None} return class_dict + def get_kegg_module_class_dict(self, mnum): """This function returns a dictionary of values in the CLASS field for a specific module @@ -2166,16 +2253,29 @@ def unroll_module_definition(self, mnum): def split_by_delim_not_within_parens(self, d, delims, return_delims=False): """Takes a string, and splits it on the given delimiter(s) as long as the delimeter is not within parentheses. + This function exists because regular expressions don't handle nested parentheses very well. It is used in the + recursive module definition unrolling functions to split module steps, but it is generically written in case + it could have other uses in the future. 
+ + The function can also be used to determine if the parentheses in the string are unbalanced (it will return False + instead of the list of splits in this situation) + PARAMETERS ========== - d string - delims a single delimiter, or a list of delimiters - return_delims boolean, if this is true then the list of delimiters found at between each split is also returned + d : str + string to split + delims : str or list of str + a single delimiter, or a list of delimiters, to split on + return_delims : boolean + if this is true then the list of delimiters found between each split is also returned RETURNS ======= - splits list of strings that were split from d - delim_list list of delimiters that were ofund between each split + If parentheses are unbalanced in the string, this function returns False. Otherwise: + splits : list + strings that were split from d + delim_list : list + delimiters that were found between each split (only returned if return_delims is True) """ parens_level = 0 @@ -2214,6 +2314,16 @@ def recursive_definition_unroller(self, step): their respective components, which may be split further by the split_paths() function to find all possible alternative complexes, before being used to extend each path. Compound steps are split and recursively processed by the split_paths() function before the resulting downstream paths are used to extend each path. + + PARAMETERS + ========== + step : str + step definition to split into component steps as necessary + + RETURNS + ======= + paths_list : list + all paths that the input step has been unrolled into """ split_steps = self.split_by_delim_not_within_parens(step, " ") @@ -2251,7 +2361,7 @@ def recursive_definition_unroller(self, step): for a in alts: if len(a) > 1: raise ConfigError("Uh oh. recursive_definition_unroller() speaking. We found a protein complex with more " - "than one KO per alternative option here: %s" % s) + "than one KO per alternative option here: %s" % s) for cs in complex_strs: extended_complex = cs + a[0] new_complex_strs.append(extended_complex) @@ -2285,6 +2395,7 @@ def recursive_definition_unroller(self, step): return paths_list + def split_path(self, step): """This function handles compound steps that should be split into multiple alternative paths. @@ -2292,6 +2403,7 @@ def split_path(self, step): it recursively calls the definition unrolling function to parse it. The list of all alternative paths that can be made from this step is returned. 
""" + if step[0] == "(" and step[-1] == ")": substeps = self.split_by_delim_not_within_parens(step[1:-1], ",") if not substeps: # if it doesn't work, try without removing surrounding parentheses From 10e8b564b187fe750c70b8ddab19c25693167135 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 13:38:25 -0500 Subject: [PATCH 388/400] sanity check to avoid overwriting user-specified directories with --reset, and check for writability of dir --- anvio/kegg.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index a76545f9b0..26dcd00ffb 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -65,6 +65,15 @@ def __init__(self, args): self.kegg_pathway_file = os.path.join(self.kegg_data_dir, "pathways.keg") self.kegg_modules_db_path = os.path.join(self.kegg_data_dir, "MODULES.db") + # sanity check to prevent automatic overwriting of non-default kegg data dir + if A('reset') and A('kegg_data_dir'): + raise ConfigError("You are attempting to run KEGG setup on a non-default data directory (%s) using the --reset flag. " + "To avoid automatically deleting a directory that may be important to you, anvi'o refuses to reset " + "directories that have been specified with --kegg-data-dir. If you really want to get rid of this " + "directory and regenerate it with KEGG data inside, then please remove the directory yourself using " + "a command like `rm -r %s`. We are sorry to make you go through this extra trouble, but it really is " + "the safest way to handle things." % (self.kegg_data_dir, self.kegg_data dir)) + def setup_ko_dict(self): """The purpose of this function is to process the ko_list file into usable form by KEGG sub-classes. @@ -171,6 +180,8 @@ def __init__(self, args, run=run, progress=progress): filesnpaths.is_program_exists('hmmpress') + filesnpaths.is_output_dir_writable(self.kegg_data_dir) + if not args.reset and not anvio.DEBUG: self.is_database_exists() @@ -197,28 +208,32 @@ def is_database_exists(self): """This function determines whether the user has already downloaded the Kofam HMM profiles and KEGG modules.""" if os.path.exists(self.kofam_hmm_file_path): - raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use --reset flag " - "if you want to re-download it." % self.kegg_data_dir) + raise ConfigError("It seems you already have KOfam HMM profiles installed in '%s', please use the --reset flag " + "or delete this directory manually if you want to re-download it." % self.kegg_data_dir) if os.path.exists(self.kegg_module_file): raise ConfigError("Interestingly, though KOfam HMM profiles are not installed on your system, KEGG module " - "information seems to have been already downloaded in %s. Please use the --reset flag to " - "re-download everything from scratch." % self.kegg_data_dir) + "information seems to have been already downloaded in %s. Please use the --reset flag or " + "delete this directory manually to let this script re-download everything from scratch." + % self.kegg_data_dir) if os.path.exists(self.kegg_pathway_file): raise ConfigError("Interestingly, though KOfam HMM profiles are not installed on your system, KEGG pathway " - "information seems to have been already downloaded in %s. Please use the --reset flag to " - "re-download everything from scratch." % self.kegg_data_dir) + "information seems to have been already downloaded in %s. 
Please use the --reset flag or " + "delete this directory manually to let this script re-download everything from scratch." + % self.kegg_data_dir) if os.path.exists(self.module_data_dir): raise ConfigError("It seems the KEGG module directory %s already exists on your system. This is even more " "strange because Kofam HMM profiles have not been downloaded. We suggest you to use the " - "--reset flag to download everything from scratch." % self.module_data_dir) + "--reset flag or delete the KEGG directory (%s) manually to download everything from scratch." + % (self.module_data_dir, self.kegg_data_dir)) if os.path.exists(self.pathway_data_dir): raise ConfigError("It seems the KEGG pathway directory %s already exists on your system. This is even more " "strange because Kofam HMM profiles have not been downloaded. We suggest you to use the " - "--reset flag to download everything from scratch." % self.pathway_data_dir) + "--reset flag or delete the KEGG directory (%s) manually to download everything from scratch." + % (self.pathway_data_dir, self.kegg_data_dir)) def download_profiles(self): @@ -2403,7 +2418,7 @@ def split_path(self, step): it recursively calls the definition unrolling function to parse it. The list of all alternative paths that can be made from this step is returned. """ - + if step[0] == "(" and step[-1] == ")": substeps = self.split_by_delim_not_within_parens(step[1:-1], ",") if not substeps: # if it doesn't work, try without removing surrounding parentheses From d348a0de9e5795a311dcf8490353251d37fb7514 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 15:06:33 -0500 Subject: [PATCH 389/400] bug fix --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 26dcd00ffb..09a6b9ac38 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -72,7 +72,7 @@ def __init__(self, args): "directories that have been specified with --kegg-data-dir. If you really want to get rid of this " "directory and regenerate it with KEGG data inside, then please remove the directory yourself using " "a command like `rm -r %s`. We are sorry to make you go through this extra trouble, but it really is " - "the safest way to handle things." % (self.kegg_data_dir, self.kegg_data dir)) + "the safest way to handle things." % (self.kegg_data_dir, self.kegg_data_dir)) def setup_ko_dict(self): From f5d4cab53c9f067d0cae07f85c9ec0fb1ed57ae7 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 15:07:58 -0500 Subject: [PATCH 390/400] missing space --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 09a6b9ac38..7ff27995a7 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -925,7 +925,7 @@ def init_hits_and_splits(self): if len(splits_missing_in_profile_db): self.progress.reset() self.run.warning("Please note that anvi'o found %s splits in your contigs database with KOfam hits. But only %s of them " - "appear in the profile database. As a result, anvi'o will now remove the %s splits with KOfam hits" + "appear in the profile database. As a result, anvi'o will now remove the %s splits with KOfam hits " "that occur only in the contigs db from all downstream analyses. Where is this difference coming from though? " "Well. This is often the case because the 'minimum contig length parameter' set during the `anvi-profile` " "step can exclude many contigs from downstream analyses (often for good reasons, too). 
For " From 5b1c2e870a9625b862d1674d3c506804a99043d3 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 15:12:13 -0500 Subject: [PATCH 391/400] remove erroneous explanation --- anvio/parsers/hmmscan.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index f5c90c592c..675c338c4e 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -130,7 +130,6 @@ def get_search_results(self, ko_list_dict = None): 'e_value': hit['e_value']} else: - # but in Pfams, we don't care, we just keep all hits entry = {'entry_id': entry_id, 'gene_name': hit['gene_name'], 'gene_hmm_id': hit['gene_hmm_id'], From a408a5cad7fd4ceed48f0e1b6422b95f4c4e1e8b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 15:19:13 -0500 Subject: [PATCH 392/400] refactor variable names and error messages to be agnostic to kegg --- anvio/kegg.py | 2 +- anvio/parsers/hmmscan.py | 34 +++++++++++++++++----------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 7ff27995a7..ffb095d230 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -716,7 +716,7 @@ def process_kofam_hmms(self): # parse hmmscan output parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE', program=self.hmm_program) - search_results_dict = parser.get_search_results(ko_list_dict=self.ko_dict) + search_results_dict = parser.get_search_results(noise_cutoff_dict=self.ko_dict) # add functions and KEGG modules info to database functions_dict = {} diff --git a/anvio/parsers/hmmscan.py b/anvio/parsers/hmmscan.py index 675c338c4e..c6d7b256ec 100644 --- a/anvio/parsers/hmmscan.py +++ b/anvio/parsers/hmmscan.py @@ -66,14 +66,14 @@ def __init__(self, hmm_scan_hits_txt, alphabet='AA', context='GENE', program='hm Parser.__init__(self, 'HMMScan', [hmm_scan_hits_txt], files_expected, files_structure) - def get_search_results(self, ko_list_dict = None): + def get_search_results(self, noise_cutoff_dict = None): """This function goes through the hits provided by `hmmscan` and generates an annotation dictionary with the relevant information about each hit. - If we are parsing Kofam hits, then this function makes sure only hits with a high enough bit score make it into the annotation dictionary. + This function makes sure only hits with a high enough bit score make it into the annotation dictionary. Parameters ========== - ko_list_dict dictionary of the ko_list file; see setup_ko_dict in kofam.py for more details + noise_cutoff_dict dictionary of noise cutoff terms; see setup_ko_dict in kofam.py for an example Returns ======= @@ -90,12 +90,12 @@ def get_search_results(self, ko_list_dict = None): for hit in list(self.dicts['hits'].values()): entry = None if self.context == 'GENE': - # This is for KEGG Kofams. 
Here we only add the hit to the annotations_dict if the appropriate bit score is above the - # threshold set in ko_list_dict (which is indexed by ko num, aka gene_name in the hits dict) - if ko_list_dict and hit['gene_name'] in ko_list_dict.keys(): - knum = hit['gene_name'] - score_type = ko_list_dict[knum]['score_type'] - threshold = ko_list_dict[knum]['threshold'] + # Here we only add the hit to the annotations_dict if the appropriate bit score is above the + # threshold set in noise_cutoff_dict (which is indexed by profile name (aka gene_name in the hits dict) + if noise_cutoff_dict and hit['gene_name'] in noise_cutoff_dict.keys(): + hmm_entry_name = hit['gene_name'] + score_type = noise_cutoff_dict[hmm_entry_name]['score_type'] + threshold = noise_cutoff_dict[hmm_entry_name]['threshold'] keep = True if score_type == 'full': if hit['bit_score'] < float(threshold): @@ -104,9 +104,9 @@ def get_search_results(self, ko_list_dict = None): if hit['dom_bit_score'] < float(threshold): keep = False else: - self.run.warning("Oh dear. The Kofam profile %s has a strange score_type value: %s. The only accepted values \ - for this type are 'full' or 'domain', so anvi'o cannot parse the hits to this profile. All hits will be kept \ - regardless of bit score. You have been warned." % (hit['gene_name'], score_type)) + self.run.warning("Oh dear. The HMM profile %s has a strange score_type value: %s. The only accepted values " + "for this type are 'full' or 'domain', so anvi'o cannot parse the hits to this profile. All hits " + "will be kept regardless of bit score. You have been warned." % (hit['gene_name'], score_type)) if keep: entry = {'entry_id': entry_id, @@ -117,12 +117,12 @@ def get_search_results(self, ko_list_dict = None): else: num_hits_removed += 1 - elif ko_list_dict and hit['gene_name'] not in ko_list_dict.keys(): + elif noise_cutoff_dict and hit['gene_name'] not in noise_cutoff_dict.keys(): # this should never happen, in an ideal world where everything is filled with butterflies and happiness - self.run.warning("Hmm. While parsing your Kofam hits, it seems the Kofam profile %s was not found in the ko_list dictionary. \ - This should probably not ever happen, and you should contact a developer as soon as possible to figure out what \ - is going on. But for now, anvi'o is going to keep all hits to this profile. Consider those hits with a grain of salt, \ - as not all of them may be good." % hit['gene_name']) + self.run.warning("Hmm. While parsing your HMM hits, it seems the HMM profile %s was not found in the noise cutoff dictionary. " + "This should probably not ever happen, and you should contact a developer as soon as possible to figure out what " + "is going on. But for now, anvi'o is going to keep all hits to this profile. Consider those hits with a grain of salt, " + "as not all of them may be good." 
% hit['gene_name']) entry = {'entry_id': entry_id, 'gene_name': hit['gene_name'], 'gene_hmm_id': hit['gene_hmm_id'], From a399268cfe3cde261db843d53615825a7dcb153f Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 15:36:55 -0500 Subject: [PATCH 393/400] cosmetic updates :) --- anvio/pfam.py | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/anvio/pfam.py b/anvio/pfam.py index 315727b46b..434779a64e 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -129,21 +129,21 @@ def confirm_downloaded_files(self): def decompress_files(self): - + """Decompresses and runs hmmpress on Pfam HMM profiles.""" for file_name in self.files: full_path = os.path.join(self.pfam_data_dir, file_name) if full_path.endswith('.gz'): if not os.path.exists(full_path) and os.path.exists(full_path[:-3]): - self.run.warning("It seems the file at %s is already decompressed. You are probably seeing \ - this message because Pfams was set up previously on this computer. Hakuna Matata. Anvi'o will \ - simply skip decompressing this file at this time. But if you think there is an issue, you can \ - re-do the Pfam setup by running `anvi-setup-pfams` again and using the --reset flag." \ - % (full_path[:-3])) + self.run.warning("It seems the file at %s is already decompressed. You are probably seeing " + "this message because Pfams was set up previously on this computer. Hakuna Matata. Anvi'o will " + "simply skip decompressing this file at this time. But if you think there is an issue, you can " + "re-do the Pfam setup by running `anvi-setup-pfams` again and using the --reset flag." + % (full_path[:-3])) continue elif not os.path.exists(full_path): - raise ConfigError("Oh no. The file at %s does not exist. Something is terribly wrong. :( Anvi'o suggests re-running \ - `anvi-setup-pfams` using the --reset flag." % (full_path)) + raise ConfigError("Oh no. The file at %s does not exist. Something is terribly wrong. :( Anvi'o suggests re-running " + "`anvi-setup-pfams` using the --reset flag." % (full_path)) utils.gzip_decompress_file(full_path) os.remove(full_path) @@ -153,8 +153,8 @@ def decompress_files(self): ret_val = utils.run_command(cmd_line, log_file_path) if ret_val: - raise ConfigError("Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. \ - Check out the log file ('%s') to see what went wrong." % (log_file_path)) + raise ConfigError("Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. " + "Check out the log file ('%s') to see what went wrong." % (log_file_path)) else: # getting rid of the log file because hmmpress was successful os.remove(log_file_path) @@ -191,18 +191,16 @@ def __init__(self, args, run=run, progress=progress): def is_database_exists(self): """ This function verifies that pfam_data_dir contains the Pfam hmm profiles and checks whether they are compressed or not. - If they are compressed, we decompress them and run hmmpress. - - PARAMETERS: N/A - RETURNS: N/A + If they are compressed, we decompress them and run hmmpress. 
""" + if not (os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')) or os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm'))): raise ConfigError("It seems you do not have Pfam database installed, please run 'anvi-setup-pfams' to download it.") # here we check if the HMM profile is compressed so we can decompress it for next time if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')): - self.run.warning("Anvi'o has detected that your Pfam database is currently compressed. It will now be unpacked before \ - running HMMs.") + self.run.warning("Anvi'o has detected that your Pfam database is currently compressed. It will now be unpacked before " + "running HMMs.") utils.gzip_decompress_file(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz'), keep_original=False) cmd_line = ['hmmpress', os.path.join(self.pfam_data_dir, 'Pfam-A.hmm')] @@ -210,8 +208,8 @@ def is_database_exists(self): ret_val = utils.run_command(cmd_line, log_file_path) if ret_val: - raise ConfigError("Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. \ - Check out the log file ('%s') to see what went wrong." % (log_file_path)) + raise ConfigError("Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. " + "Check out the log file ('%s') to see what went wrong." % (log_file_path)) else: # getting rid of the log file because hmmpress was successful os.remove(log_file_path) From 99917d32a5a3261ec5d068a1f8b2ffd1d2cd7201 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 15:40:39 -0500 Subject: [PATCH 394/400] add safety check for writable pfams dir and avoid overwriting user-specified directory with --reset --- anvio/pfam.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/anvio/pfam.py b/anvio/pfam.py index 434779a64e..4318c854b1 100644 --- a/anvio/pfam.py +++ b/anvio/pfam.py @@ -59,9 +59,19 @@ def __init__(self, args, run=run, progress=progress): filesnpaths.is_program_exists('hmmpress') + if self.pfam_data_dir and args.reset: + raise ConfigError("You are attempting to run Pfam setup on a non-default data directory (%s) using the --reset flag. " + "To avoid automatically deleting a directory that may be important to you, anvi'o refuses to reset " + "directories that have been specified with --pfam-data-dir. If you really want to get rid of this " + "directory and regenerate it with Pfam data inside, then please remove the directory yourself using " + "a command like `rm -r %s`. We are sorry to make you go through this extra trouble, but it really is " + "the safest way to handle things." 
% (self.pfam_data_dir, self.pfam_data_dir)) + if not self.pfam_data_dir: self.pfam_data_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/Pfam') + filesnpaths.is_output_dir_writable(self.pfam_data_dir) + if not args.reset and not anvio.DEBUG: self.is_database_exists() From 2d22ab3e21f5c9f81cfc46b2b3a077d049408e7d Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 15:41:17 -0500 Subject: [PATCH 395/400] move check for tarfile from utils to filesnpaths --- anvio/filesnpaths.py | 13 +++++++++++++ anvio/utils.py | 13 ++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/anvio/filesnpaths.py b/anvio/filesnpaths.py index 9c27a48ba9..0f7a361279 100644 --- a/anvio/filesnpaths.py +++ b/anvio/filesnpaths.py @@ -7,6 +7,7 @@ import time import shutil import tempfile +import tarfile import anvio import anvio.fastalib as u @@ -187,6 +188,18 @@ def is_file_plain_text(file_path, dont_raise=False): return True +def is_file_tar_file(file_path, dont_raise=False): + is_file_exists(file_path) + + is_tarfile = tarfile.is_tarfile(file_path) + if dont_raise: + return False + else: + raise FilesNPathsError("The file at '%s' does not seem to be a tarfile." % file_path) + + return True + + def is_program_exists(program): """adapted from http://stackoverflow.com/a/377028""" def is_exe(fpath): diff --git a/anvio/utils.py b/anvio/utils.py index 87b6a4bdac..8bc075d928 100755 --- a/anvio/utils.py +++ b/anvio/utils.py @@ -383,16 +383,12 @@ def gzip_decompress_file(input_file_path, output_file_path=None, keep_original=T return output_file_path def tar_extract_file(input_file_path, output_file_path=None, keep_original=True): - filesnpaths.is_file_exists(input_file_path) - - if not tarfile.is_tarfile(input_file_path): - raise ConfigError("the tar_extract_file function is terribly upset because your input file ('%s') is\ - apparently not a tar file 🤷") + filesnpaths.is_file_tar_file(input_file_path) if not output_file_path: - raise ConfigError("the tar_extract_file function is displeased because an output file path has not been specified.\ - If you are seeing this message, you are probably a developer, so go fix your code please, and \ - everyone will be happy then.") + raise ConfigError("The tar_extract_file function is displeased because an output file path has not been specified. " + "If you are seeing this message, you are probably a developer, so go fix your code please, and " + "everyone will be happy then.") tf = tarfile.open(input_file_path) tf.extractall(path = output_file_path) @@ -401,7 +397,6 @@ def tar_extract_file(input_file_path, output_file_path=None, keep_original=True) os.remove(input_file_path) - class RunInDirectory(object): """ Run any block of code in a specified directory. Return to original directory From d2475a92f9e95e41e42ede48fd5c1ab3be9dfca4 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 16:05:35 -0500 Subject: [PATCH 396/400] add option to return none if a key is not in the self table --- anvio/db.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anvio/db.py b/anvio/db.py index 8ee057c846..d142ec7102 100644 --- a/anvio/db.py +++ b/anvio/db.py @@ -185,10 +185,13 @@ def get_max_value_in_column(self, table_name, column_name, value_if_empty=None, return val - def get_meta_value(self, key, try_as_type_int=True): + def get_meta_value(self, key, try_as_type_int=True, return_none_if_not_in_table=False): """if try_as_type_int, value is attempted to be converted to integer. 
If it fails, no harm no foul.""" + response = self._exec("""SELECT value FROM self WHERE key='%s'""" % key) rows = response.fetchall() + if not rows and return_none_if_not_in_table: + return None if not rows: raise ConfigError("A value for '%s' does not seem to be set in table 'self'." % key) @@ -611,7 +614,7 @@ def get_some_rows_from_table_as_dict(self, table_name, where_clause, error_if_no if true, this function will raise an error if no data is selected from the table. otherwise, it will quietly return the empty dictionary string_the_key: bool - if true, the row number will be converted to a string before being used as a key in the dictionary + if true, the row number will be converted to a string before being used as a key in the dictionary row_num_as_key: bool added as parameter so this function works for KEGG MODULES.db, which does not have unique IDs in the first column. If True, the returned dictionary will be keyed by integers from 0 to (# rows returned - 1) From aa1855f68ca3a6668093b9cc1c1e7867f9ae2134 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 16:06:11 -0500 Subject: [PATCH 397/400] sanity check for already-annotated contigs db --- anvio/kegg.py | 13 +++++++++++-- bin/anvi-run-kegg-kofams | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index ffb095d230..27aa989258 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -645,6 +645,14 @@ def set_hash_in_contigs_db(self): self.contigs_db_path = A('contigs_db') contigs_db = ContigsDatabase(self.contigs_db_path) + current_module_hash_in_contigs_db = contigs_db.db.get_meta_value('modules_db_hash', return_none_if_not_in_table=True) + + if current_module_hash_in_contigs_db and not self.just_do_it: + raise ConfigError("The contigs database (%s) has already been annotated with KOfam hits. If you really want to " + "overwrite these annotations with new ones, please re-run the command with the flag --just-do-it. " + "For those who need this information, the Modules DB used to annotate this contigs database previously " + "had the following hash: %s" % (self.contigs_db_path, current_module_hash_in_contigs_db)) + contigs_db.db.set_meta_value('modules_db_hash', self.kegg_modules_db.db.get_meta_value('hash')) contigs_db.disconnect() @@ -685,6 +693,9 @@ def process_kofam_hmms(self): tmp_directory_path = filesnpaths.get_temp_directory_path() contigs_db = ContigsSuperclass(self.args) # initialize contigs db + # mark contigs db with hash of modules.db content for version tracking + # this function also includes a safety check for previous annotations so that people don't overwrite those if they don't want to + self.set_hash_in_contigs_db() # get AA sequences as FASTA target_files_dict = {'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')} @@ -776,8 +787,6 @@ def process_kofam_hmms(self): "a functional source.") gene_function_calls_table.add_empty_sources_to_functional_sources({'KOfam'}) - # mark contigs db with hash of modules.db content for version tracking - self.set_hash_in_contigs_db() if anvio.DEBUG: run.warning("The temp directories, '%s' and '%s' are kept. 
Please don't forget to clean those up " diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams index ef94d7f9fe..7035a98b43 100755 --- a/bin/anvi-run-kegg-kofams +++ b/bin/anvi-run-kegg-kofams @@ -37,6 +37,7 @@ if __name__ == '__main__': groupO.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) groupO.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads')) groupO.add_argument(*anvio.A('hmmer-program'), **anvio.K('hmmer-program')) + groupO.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) args = anvio.get_args(parser) From a80b3e3a895e1a6e7bc740718308618720dce866 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 16:08:53 -0500 Subject: [PATCH 398/400] update provides/requires statements --- bin/anvi-estimate-kegg-metabolism | 4 ++-- bin/anvi-run-kegg-kofams | 2 +- bin/anvi-setup-kegg-kofams | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/anvi-estimate-kegg-metabolism b/bin/anvi-estimate-kegg-metabolism index 99fe0c2842..2593ba75cb 100755 --- a/bin/anvi-estimate-kegg-metabolism +++ b/bin/anvi-estimate-kegg-metabolism @@ -15,8 +15,8 @@ __license__ = "GPL 3.0" __version__ = anvio.__version__ __maintainer__ = "Iva Veseli" __email__ = "iveseli@uchicago.edu" -__requires__ = ["contigs-db", "kofam-data", "kegg-modules-db", "kegg-functions",] -__provides__ = ["kegg-metabolism",] +__requires__ = ["contigs-db", "kegg-db", "kegg-functions",] +__provides__ = ["kegg-metabolism",] __description__ = "Reconstructs metabolic pathways and estimates pathway completeness for a given set of contigs." diff --git a/bin/anvi-run-kegg-kofams b/bin/anvi-run-kegg-kofams index 7035a98b43..c7a3fb4f98 100755 --- a/bin/anvi-run-kegg-kofams +++ b/bin/anvi-run-kegg-kofams @@ -16,7 +16,7 @@ __license__ = "GPL 3.0" __version__ = anvio.__version__ __maintainer__ = "Iva Veseli" __email__ = "iveseli@uchicago.edu" -__requires__ = ["contigs-db", "kofam-data", "kegg-modules-db",] +__requires__ = ["contigs-db", "kegg-db",] __provides__ = ["kegg-functions",] __description__ = "Run KOfam HMMs on an anvi'o contigs database." diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams index 0ab10aceec..673abc187a 100755 --- a/bin/anvi-setup-kegg-kofams +++ b/bin/anvi-setup-kegg-kofams @@ -15,7 +15,7 @@ __license__ = "GPL 3.0" __version__ = anvio.__version__ __maintainer__ = "Iva Veseli" __email__ = "iveseli@uchicago.edu" -__provides__ = ["kofam-data", "kegg-modules-db",] +__provides__ = ["kegg-db",] __description__ = "Download and setup KEGG KOfam HMM profiles." 
@time_program From 4d6cc479692f10187e5cc1f2845a0f9e2d1a7a06 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 16:18:37 -0500 Subject: [PATCH 399/400] add kegg-related items to anvio_items --- anvio/programs.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anvio/programs.py b/anvio/programs.py index 03671393ca..98506fa50c 100644 --- a/anvio/programs.py +++ b/anvio/programs.py @@ -34,7 +34,7 @@ J = lambda x: '\n'.join(x) if x else '' # this dictionary describes all anvi'o items that are referred from 'requires' and -# 'provudes' statements written in anvi'o programs +# 'provides' statements written in anvi'o programs ANVIO_ITEMS = {'pan-db': {'name': 'PAN', 'type': 'DB', 'internal': True}, 'contigs-db': {'name': 'CONTIGS', 'type': 'DB', 'internal': True}, 'contigs-fasta': {'name': 'CONTIGS', 'type': 'FASTA', 'internal': False}, @@ -47,6 +47,7 @@ 'locus-fasta': {'name': 'LOCUS', 'type': 'FASTA', 'internal': False}, 'structure-db': {'name': 'STRUCTURE', 'type': 'DB', 'internal': True}, 'pdb-db': {'name': 'PDB DB', 'type': 'CONCEPT', 'internal': True}, + 'kegg-db': {'name': 'KEGG DB', type': 'DB', 'internal': True}, 'single-profile-db': {'name': 'SINGLE PROFILE', 'type': 'DB', 'internal': True}, 'profile-db': {'name': 'PROFILE', 'type': 'DB', 'internal': True}, 'genes-db': {'name': 'GENES', 'type': 'DB', 'internal': True}, @@ -80,6 +81,7 @@ 'functions': {'name': 'GENE FUNCTIONS', 'type': 'CONCEPT', 'internal': True}, 'functions-txt': {'name': 'GENE FUNCTIONS', 'type': 'TXT', 'internal': False}, 'functional-enrichment-txt': {'name': 'ENRICHED FUNCTIONS', 'type': 'TXT', 'internal': False}, + 'kegg-functions': {'name': 'KOFAM FUNCTIONS', 'type': 'CONCEPT', 'internal': True}, 'interactive': {'name': 'INTERACTIVE DISPLAY', 'type': 'DISPLAY', 'internal': True}, 'view-data': {'name': 'VIEW DATA', 'type': 'TXT', 'internal': False}, 'layer-taxonomy': {'name': 'LAYER TAXONOMY', 'type': 'CONCEPT', 'internal': True}, @@ -103,7 +105,8 @@ 'split-bins': {'name': 'SPLIT BINS', 'type': 'CONCEPT', 'internal': False}, 'state': {'name': 'INTERACTIVE STATE', 'type': 'CONCEPT', 'internal': True}, 'ngrams': {'name': 'NGRAM', 'type': 'CONCEPT', 'internal': True}, - 'state-json': {'name': 'INTERACTIVE STATE', 'type': 'JSON', 'internal': False}} + 'state-json': {'name': 'INTERACTIVE STATE', 'type': 'JSON', 'internal': False}, + 'kegg-metabolism': {'name': 'KEGG METABOLISM ESTIMATES', 'type': 'TXT', 'internal': False}} ANVIO_CONCEPTS = {'functions': {'goes_in': ['contigs_db', 'genomes-storage-db'], 'used_by': ['anvi-search-functions']} From b4cc0e12675ea4020b6a2e61cd994d8454d9cf72 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 24 Apr 2020 16:19:38 -0500 Subject: [PATCH 400/400] missing quote --- anvio/programs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/programs.py b/anvio/programs.py index 98506fa50c..a7b9a1fff6 100644 --- a/anvio/programs.py +++ b/anvio/programs.py @@ -47,7 +47,7 @@ 'locus-fasta': {'name': 'LOCUS', 'type': 'FASTA', 'internal': False}, 'structure-db': {'name': 'STRUCTURE', 'type': 'DB', 'internal': True}, 'pdb-db': {'name': 'PDB DB', 'type': 'CONCEPT', 'internal': True}, - 'kegg-db': {'name': 'KEGG DB', type': 'DB', 'internal': True}, + 'kegg-db': {'name': 'KEGG DB', 'type': 'DB', 'internal': True}, 'single-profile-db': {'name': 'SINGLE PROFILE', 'type': 'DB', 'internal': True}, 'profile-db': {'name': 'PROFILE', 'type': 'DB', 'internal': True}, 'genes-db': {'name': 'GENES', 'type': 'DB', 'internal': True},