From 1071ca9e6468f7d6191ce607dff6cbea6e6c67d8 Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Mon, 14 Oct 2024 15:40:51 +0200 Subject: [PATCH 1/8] ENH: code for gff output added --- q2_moshpit/eggnog/orthologs/common.py | 24 ++++++++++++++++++------ q2_moshpit/eggnog/orthologs/diamond.py | 12 +++++++----- q2_moshpit/plugin_setup.py | 7 +++++-- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/q2_moshpit/eggnog/orthologs/common.py b/q2_moshpit/eggnog/orthologs/common.py index e9ea59d7..7d1808e8 100644 --- a/q2_moshpit/eggnog/orthologs/common.py +++ b/q2_moshpit/eggnog/orthologs/common.py @@ -7,6 +7,7 @@ # ---------------------------------------------------------------------------- import glob import os +import shutil import subprocess from typing import List @@ -16,7 +17,7 @@ from q2_types.feature_data import FeatureData from q2_types.feature_data_mag import MAG, MAGSequencesDirFmt -from q2_types.genome_data import SeedOrthologDirFmt, OrthologFileFmt +from q2_types.genome_data import SeedOrthologDirFmt, OrthologFileFmt, LociDirectoryFormat from q2_types.per_sample_sequences import ( Contigs, MAGs, ContigSequencesDirFmt, MultiMAGSequencesDirFmt ) @@ -88,14 +89,18 @@ def _run_eggnog_search_pipeline( _eggnog_feature_table = ctx.get_action("moshpit", "_eggnog_feature_table") (partitioned_sequences,) = partition_method(sequences, num_partitions) + gff_dir = LociDirectoryFormat() + # empty file in the gff_dir to avoid an error when moving the gff files + open(os.path.join(str(gff_dir), 'empty.gff'), 'w').close() + gff_artifact = ctx.make_artifact("GenomeData[Loci]", gff_dir) hits = [] for seq in partitioned_sequences.values(): - (hit, _) = _eggnog_search(seq, *db, num_cpus, db_in_memory) + (hit, _) = _eggnog_search(seq, *db, gff_artifact, num_cpus, db_in_memory) hits.append(hit) (collated_hits,) = collate_hits(hits) (collated_tables,) = _eggnog_feature_table(collated_hits) - return collated_hits, collated_tables + return collated_hits, collated_tables, gff_artifact def _search_runner( @@ -121,8 +126,8 @@ def _search_runner( """ cmd = [ 'emapper.py', '-i', str(input_path), '-o', sample_label, - '-m', *runner_args, - '--itype', 'metagenome', '--output_dir', output_loc, + '-m', *runner_args, '--genepred', 'prodigal', + '--itype', 'metagenome', '--decorate_gff', 'yes', '--output_dir', output_loc, '--cpu', str(num_cpus), '--no_annot' ] if db_in_memory: @@ -138,7 +143,7 @@ def _search_runner( def _eggnog_search( - sequences, search_runner, output_loc + sequences, search_runner, output_loc, gff_loc ) -> (SeedOrthologDirFmt, pd.DataFrame): # run analysis if isinstance(sequences, ContigSequencesDirFmt): @@ -152,6 +157,13 @@ def _eggnog_search( for mag_id, mag_fp in mags.items(): search_runner(input_path=mag_fp, sample_label=mag_id) + # iterate over the gff files and move them to the correct location + for fn in os.listdir(output_loc): + if fn.endswith('.emapper.decorated.gff'): + new_fn = fn.replace('.emapper.decorated.gff', '.gff') + shutil.move(os.path.join(str(output_loc), fn), + os.path.join(str(gff_loc), new_fn)) + result = SeedOrthologDirFmt() ortholog_fps = [ os.path.basename(x) for x diff --git a/q2_moshpit/eggnog/orthologs/diamond.py b/q2_moshpit/eggnog/orthologs/diamond.py index c496b094..e52edd03 100644 --- a/q2_moshpit/eggnog/orthologs/diamond.py +++ b/q2_moshpit/eggnog/orthologs/diamond.py @@ -19,7 +19,7 @@ MAGSequencesDirFmt ) from q2_types.genome_data import ( - SeedOrthologDirFmt + SeedOrthologDirFmt, LociDirectoryFormat ) from q2_types.per_sample_sequences import ( ContigSequencesDirFmt, MultiMAGSequencesDirFmt @@ -34,8 +34,9 @@ def _eggnog_diamond_search( MAGSequencesDirFmt ], diamond_db: DiamondDatabaseDirFmt, + gff_dir:LociDirectoryFormat, num_cpus: int = 1, - db_in_memory: bool = False + db_in_memory: bool = False, ) -> (SeedOrthologDirFmt, pd.DataFrame): with tempfile.TemporaryDirectory() as output_loc: db_fp = os.path.join(str(diamond_db), 'ref_db.dmnd') @@ -44,7 +45,8 @@ def _eggnog_diamond_search( num_cpus=num_cpus, db_in_memory=db_in_memory, runner_args=['diamond', '--dmnd_db', str(db_fp)] ) - result, ft = _eggnog_search(sequences, search_runner, str(output_loc)) + result, ft = _eggnog_search(sequences, search_runner, + str(output_loc), str(gff_dir)) return result, ft @@ -52,8 +54,8 @@ def eggnog_diamond_search( ctx, sequences, diamond_db, num_cpus=1, db_in_memory=False, num_partitions=None ): - collated_hits, collated_tables = _run_eggnog_search_pipeline( + collated_hits, collated_tables, gff_artifact = _run_eggnog_search_pipeline( ctx, sequences, [diamond_db], num_cpus, db_in_memory, num_partitions, "_eggnog_diamond_search" ) - return collated_hits, collated_tables + return collated_hits, collated_tables, gff_artifact diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index bca10433..4498b632 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -691,7 +691,8 @@ }, outputs=[ ('eggnog_hits', SampleData[Orthologs]), - ('table', FeatureTable[Frequency]) + ('table', FeatureTable[Frequency]), + ('gff_files', GenomeData[Loci]) ], name='Run eggNOG search using diamond aligner', description="This method performs the steps by which we find our " @@ -751,7 +752,8 @@ inputs={ 'sequences': SampleData[Contigs] | SampleData[MAGs] | FeatureData[MAG], - 'diamond_db': ReferenceDB[Diamond] + 'diamond_db': ReferenceDB[Diamond], + 'gff_dir': GenomeData[Loci], }, parameters={ 'num_cpus': Int, @@ -760,6 +762,7 @@ input_descriptions={ 'sequences': 'Sequences to be searched for ortholog hits.', 'diamond_db': 'Diamond database.', + 'gff_dir': 'Decorated hits files' }, parameter_descriptions={ 'num_cpus': 'Number of CPUs to utilize. \'0\' will ' From d1ce3fcee9c66070e71a462ae90f73559eecb3da Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Tue, 15 Oct 2024 13:55:29 +0200 Subject: [PATCH 2/8] ENH: code refined after adding prodigal --- q2_moshpit/eggnog/orthologs/common.py | 14 ++++++-------- q2_moshpit/eggnog/orthologs/diamond.py | 8 ++++---- q2_moshpit/plugin_setup.py | 10 +++++----- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/q2_moshpit/eggnog/orthologs/common.py b/q2_moshpit/eggnog/orthologs/common.py index 7d1808e8..e651adf4 100644 --- a/q2_moshpit/eggnog/orthologs/common.py +++ b/q2_moshpit/eggnog/orthologs/common.py @@ -89,18 +89,16 @@ def _run_eggnog_search_pipeline( _eggnog_feature_table = ctx.get_action("moshpit", "_eggnog_feature_table") (partitioned_sequences,) = partition_method(sequences, num_partitions) - gff_dir = LociDirectoryFormat() - # empty file in the gff_dir to avoid an error when moving the gff files - open(os.path.join(str(gff_dir), 'empty.gff'), 'w').close() - gff_artifact = ctx.make_artifact("GenomeData[Loci]", gff_dir) + loci_dir = LociDirectoryFormat() hits = [] for seq in partitioned_sequences.values(): - (hit, _) = _eggnog_search(seq, *db, gff_artifact, num_cpus, db_in_memory) + (hit, _) = _eggnog_search(seq, *db, str(loci_dir), num_cpus, db_in_memory) hits.append(hit) (collated_hits,) = collate_hits(hits) (collated_tables,) = _eggnog_feature_table(collated_hits) - return collated_hits, collated_tables, gff_artifact + loci = ctx.make_artifact("GenomeData[Loci]", loci_dir) + return collated_hits, collated_tables, loci def _search_runner( @@ -143,7 +141,7 @@ def _search_runner( def _eggnog_search( - sequences, search_runner, output_loc, gff_loc + sequences, search_runner, output_loc, loci_dir ) -> (SeedOrthologDirFmt, pd.DataFrame): # run analysis if isinstance(sequences, ContigSequencesDirFmt): @@ -162,7 +160,7 @@ def _eggnog_search( if fn.endswith('.emapper.decorated.gff'): new_fn = fn.replace('.emapper.decorated.gff', '.gff') shutil.move(os.path.join(str(output_loc), fn), - os.path.join(str(gff_loc), new_fn)) + os.path.join(loci_dir, new_fn)) result = SeedOrthologDirFmt() ortholog_fps = [ diff --git a/q2_moshpit/eggnog/orthologs/diamond.py b/q2_moshpit/eggnog/orthologs/diamond.py index e52edd03..30eb991b 100644 --- a/q2_moshpit/eggnog/orthologs/diamond.py +++ b/q2_moshpit/eggnog/orthologs/diamond.py @@ -34,7 +34,7 @@ def _eggnog_diamond_search( MAGSequencesDirFmt ], diamond_db: DiamondDatabaseDirFmt, - gff_dir:LociDirectoryFormat, + loci_dir:str, num_cpus: int = 1, db_in_memory: bool = False, ) -> (SeedOrthologDirFmt, pd.DataFrame): @@ -46,7 +46,7 @@ def _eggnog_diamond_search( runner_args=['diamond', '--dmnd_db', str(db_fp)] ) result, ft = _eggnog_search(sequences, search_runner, - str(output_loc), str(gff_dir)) + str(output_loc), loci_dir) return result, ft @@ -54,8 +54,8 @@ def eggnog_diamond_search( ctx, sequences, diamond_db, num_cpus=1, db_in_memory=False, num_partitions=None ): - collated_hits, collated_tables, gff_artifact = _run_eggnog_search_pipeline( + collated_hits, collated_tables, loci = _run_eggnog_search_pipeline( ctx, sequences, [diamond_db], num_cpus, db_in_memory, num_partitions, "_eggnog_diamond_search" ) - return collated_hits, collated_tables, gff_artifact + return collated_hits, collated_tables, loci diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 4498b632..3d942810 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -692,7 +692,7 @@ outputs=[ ('eggnog_hits', SampleData[Orthologs]), ('table', FeatureTable[Frequency]), - ('gff_files', GenomeData[Loci]) + ('loci', GenomeData[Loci]) ], name='Run eggNOG search using diamond aligner', description="This method performs the steps by which we find our " @@ -752,17 +752,16 @@ inputs={ 'sequences': SampleData[Contigs] | SampleData[MAGs] | FeatureData[MAG], - 'diamond_db': ReferenceDB[Diamond], - 'gff_dir': GenomeData[Loci], + 'diamond_db': ReferenceDB[Diamond] }, parameters={ 'num_cpus': Int, 'db_in_memory': Bool, + 'loci_dir': Str }, input_descriptions={ 'sequences': 'Sequences to be searched for ortholog hits.', - 'diamond_db': 'Diamond database.', - 'gff_dir': 'Decorated hits files' + 'diamond_db': 'Diamond database.' }, parameter_descriptions={ 'num_cpus': 'Number of CPUs to utilize. \'0\' will ' @@ -771,6 +770,7 @@ 'database can be very large, so this ' 'option should only be used on clusters or other ' 'machines with enough memory.', + 'loci_dir': 'Decorated hits files directory path.' }, outputs=[ ('eggnog_hits', SampleData[Orthologs]), From 596c11e56339fbe09bcae3f6f318f3447e9a1cd3 Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Mon, 9 Dec 2024 15:49:56 +0100 Subject: [PATCH 3/8] ENH: removing decorate-gff flag --- q2_moshpit/eggnog/orthologs/common.py | 34 ++++++++++++++--------- q2_moshpit/eggnog/orthologs/diamond.py | 11 ++++---- q2_moshpit/eggnog/tests/test_orthologs.py | 18 ++++++------ q2_moshpit/plugin_setup.py | 10 +++---- 4 files changed, 40 insertions(+), 33 deletions(-) diff --git a/q2_moshpit/eggnog/orthologs/common.py b/q2_moshpit/eggnog/orthologs/common.py index 6dc84a98..daf83e10 100644 --- a/q2_moshpit/eggnog/orthologs/common.py +++ b/q2_moshpit/eggnog/orthologs/common.py @@ -41,7 +41,6 @@ def _create_symlinks( os.path.join(target_dir, filename) ) - def _run_eggnog_search_pipeline( ctx: Context, sequences: qiime2.Artifact, @@ -86,19 +85,21 @@ def _run_eggnog_search_pipeline( partition_method = ctx.get_action(plugin, action_name) _eggnog_search = ctx.get_action("moshpit", search_action) collate_hits = ctx.get_action("types", "collate_orthologs") + collate_loci = ctx.get_action("types", "collate_loci") _eggnog_feature_table = ctx.get_action("moshpit", "_eggnog_feature_table") (partitioned_sequences,) = partition_method(sequences, num_partitions) - loci_dir = LociDirectoryFormat() hits = [] + loci = [] for seq in partitioned_sequences.values(): - (hit, _) = _eggnog_search(seq, *db, str(loci_dir), num_cpus, db_in_memory) + (hit, _, loci_dir) = _eggnog_search(seq, *db, num_cpus, db_in_memory) hits.append(hit) + loci.append(loci_dir) (collated_hits,) = collate_hits(hits) (collated_tables,) = _eggnog_feature_table(collated_hits) - loci = ctx.make_artifact("GenomeData[Loci]", loci_dir) - return collated_hits, collated_tables, loci + (collated_loci,) = collate_loci(loci) + return collated_hits, collated_tables, collated_loci def _search_runner( @@ -122,10 +123,11 @@ def _search_runner( into memory. - runner_args: Additional arguments to pass to the eggNOG-mapper command. """ + #output_loc = os.path.join("/home","dgrabocka", "q2_moshpit_enh", "trial") cmd = [ 'emapper.py', '-i', str(input_path), '-o', sample_label, '-m', *runner_args, '--genepred', 'prodigal', - '--itype', 'metagenome', '--decorate_gff', 'yes', '--output_dir', output_loc, + '--itype', 'metagenome', '--output_dir', output_loc, '--cpu', str(num_cpus), '--no_annot' ] if db_in_memory: @@ -141,7 +143,7 @@ def _search_runner( def _eggnog_search( - sequences, search_runner, output_loc, loci_dir + sequences, search_runner, output_loc ) -> (SeedOrthologDirFmt, pd.DataFrame): # run analysis if isinstance(sequences, ContigSequencesDirFmt): @@ -156,11 +158,17 @@ def _eggnog_search( search_runner(input_path=mag_fp, sample_label=mag_id) # iterate over the gff files and move them to the correct location - for fn in os.listdir(output_loc): - if fn.endswith('.emapper.decorated.gff'): - new_fn = fn.replace('.emapper.decorated.gff', '.gff') - shutil.move(os.path.join(str(output_loc), fn), - os.path.join(loci_dir, new_fn)) + loci_dir = LociDirectoryFormat() + gff_fp = [ + os.path.basename(x) for x + in glob.glob(f'{output_loc}/*.emapper.genepred.gff') + ] + for fn in gff_fp: + new_fn = fn.replace('.emapper.genepred.gff', '.gff') + qiime2.util.duplicate( + os.path.join(output_loc, fn), + os.path.join(loci_dir.path, new_fn) + ) result = SeedOrthologDirFmt() ortholog_fps = [ @@ -174,7 +182,7 @@ def _eggnog_search( ) ft = _eggnog_feature_table(result) - return result, ft + return result, ft, loci_dir def _eggnog_feature_table(seed_orthologs: SeedOrthologDirFmt) -> pd.DataFrame: diff --git a/q2_moshpit/eggnog/orthologs/diamond.py b/q2_moshpit/eggnog/orthologs/diamond.py index 30eb991b..f12f8e60 100644 --- a/q2_moshpit/eggnog/orthologs/diamond.py +++ b/q2_moshpit/eggnog/orthologs/diamond.py @@ -34,10 +34,9 @@ def _eggnog_diamond_search( MAGSequencesDirFmt ], diamond_db: DiamondDatabaseDirFmt, - loci_dir:str, num_cpus: int = 1, db_in_memory: bool = False, -) -> (SeedOrthologDirFmt, pd.DataFrame): +) -> (SeedOrthologDirFmt, pd.DataFrame, LociDirectoryFormat): with tempfile.TemporaryDirectory() as output_loc: db_fp = os.path.join(str(diamond_db), 'ref_db.dmnd') search_runner = partial( @@ -45,9 +44,9 @@ def _eggnog_diamond_search( num_cpus=num_cpus, db_in_memory=db_in_memory, runner_args=['diamond', '--dmnd_db', str(db_fp)] ) - result, ft = _eggnog_search(sequences, search_runner, - str(output_loc), loci_dir) - return result, ft + result, ft, loci = _eggnog_search(sequences, search_runner, + str(output_loc)) + return result, ft, loci def eggnog_diamond_search( @@ -58,4 +57,4 @@ def eggnog_diamond_search( ctx, sequences, [diamond_db], num_cpus, db_in_memory, num_partitions, "_eggnog_diamond_search" ) - return collated_hits, collated_tables, loci + return collated_hits, collated_tables, loci \ No newline at end of file diff --git a/q2_moshpit/eggnog/tests/test_orthologs.py b/q2_moshpit/eggnog/tests/test_orthologs.py index 6baee696..90b9c975 100644 --- a/q2_moshpit/eggnog/tests/test_orthologs.py +++ b/q2_moshpit/eggnog/tests/test_orthologs.py @@ -218,7 +218,7 @@ def test_good_small_search_contigs(self): self.get_data_path('contig-sequences-1') ).view(ContigSequencesDirFmt) - _, obs = _eggnog_diamond_search( + _, obs, _ = _eggnog_diamond_search( sequences=contigs, diamond_db=self.diamond_db ) @@ -234,7 +234,7 @@ def test_good_small_search_mags_derep(self): self.get_data_path('mag-sequences') ).view(MAGSequencesDirFmt) - _, obs = _eggnog_diamond_search( + _, obs, _ = _eggnog_diamond_search( sequences=mags, diamond_db=self.diamond_db ) @@ -253,7 +253,7 @@ def test_good_small_search_mags(self): self.get_data_path('mag-sequences-per-sample') ).view(MultiMAGSequencesDirFmt) - _, obs = _eggnog_diamond_search( + _, obs, _ = _eggnog_diamond_search( sequences=mags, diamond_db=self.diamond_db ) @@ -279,12 +279,12 @@ def test_eggnog_search_parallel_contigs(self): ) with self.test_config: - _, parallel = self.eggnog_diamond_search.parallel( + _, parallel, _ = self.eggnog_diamond_search.parallel( contigs, self.diamond_db_artifact )._result() - _, single = self._eggnog_diamond_search( + _, single, _ = self._eggnog_diamond_search( sequences=contigs, diamond_db=self.diamond_db_artifact ) @@ -301,12 +301,12 @@ def test_eggnog_search_parallel_mags_derep(self): ) with self.test_config: - _, parallel = self.eggnog_diamond_search.parallel( + _, parallel, _ = self.eggnog_diamond_search.parallel( mags, self.diamond_db_artifact )._result() - _, single = self._eggnog_diamond_search( + _, single, _ = self._eggnog_diamond_search( sequences=mags, diamond_db=self.diamond_db_artifact ) @@ -323,12 +323,12 @@ def test_eggnog_search_parallel_mags(self): ) with self.test_config: - _, parallel = self.eggnog_diamond_search.parallel( + _, parallel, _ = self.eggnog_diamond_search.parallel( mags, self.diamond_db_artifact )._result() - _, single = self._eggnog_diamond_search( + _, single, _ = self._eggnog_diamond_search( sequences=mags, diamond_db=self.diamond_db_artifact ) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index ab0a45dc..7a0cdb37 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -756,8 +756,7 @@ }, parameters={ 'num_cpus': Int, - 'db_in_memory': Bool, - 'loci_dir': Str + 'db_in_memory': Bool }, input_descriptions={ 'sequences': 'Sequences to be searched for ortholog hits.', @@ -770,16 +769,17 @@ 'database can be very large, so this ' 'option should only be used on clusters or other ' 'machines with enough memory.', - 'loci_dir': 'Decorated hits files directory path.' }, outputs=[ ('eggnog_hits', SampleData[Orthologs]), - ('table', FeatureTable[Frequency]) + ('table', FeatureTable[Frequency]), + ('loci', GenomeData[Loci]) ], output_descriptions={ 'eggnog_hits': 'BLAST6-like table(s) describing the identified ' 'orthologs. One table per sample or MAG in the input.', - 'table': 'Feature table with counts of orthologs per sample/MAG.' + 'table': 'Feature table with counts of orthologs per sample/MAG.', + 'loci': 'Loci of the identified orthologs.' }, name='Run eggNOG search using Diamond aligner', description="This method performs the steps by which we find our " From d12734f18f9fa53dc4b574e4db10decd789e9b56 Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Wed, 11 Dec 2024 13:49:39 +0100 Subject: [PATCH 4/8] ENH: removing decorate-gff flag --- q2_moshpit/eggnog/tests/test_orthologs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/q2_moshpit/eggnog/tests/test_orthologs.py b/q2_moshpit/eggnog/tests/test_orthologs.py index 90b9c975..ace64e39 100644 --- a/q2_moshpit/eggnog/tests/test_orthologs.py +++ b/q2_moshpit/eggnog/tests/test_orthologs.py @@ -35,6 +35,7 @@ ) + class TestHMMER(TestPluginBase): package = 'q2_moshpit.eggnog.tests' From 9ea433d4f984bb092115a734aca4ba018f739fed Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Wed, 11 Dec 2024 14:03:38 +0100 Subject: [PATCH 5/8] ENH: removing decorate-gff flag --- q2_moshpit/eggnog/tests/test_orthologs.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/q2_moshpit/eggnog/tests/test_orthologs.py b/q2_moshpit/eggnog/tests/test_orthologs.py index ace64e39..a1a31e5d 100644 --- a/q2_moshpit/eggnog/tests/test_orthologs.py +++ b/q2_moshpit/eggnog/tests/test_orthologs.py @@ -36,9 +36,15 @@ + + + class TestHMMER(TestPluginBase): package = 'q2_moshpit.eggnog.tests' + + + def setUp(self): super().setUp() self.idmap_artifact = qiime2.Artifact.import_data( From de7458500149f2b8856176863270f25122bc5583 Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Wed, 11 Dec 2024 14:10:26 +0100 Subject: [PATCH 6/8] ENH: removing decorate-gff flag --- q2_moshpit/eggnog/tests/test_orthologs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/q2_moshpit/eggnog/tests/test_orthologs.py b/q2_moshpit/eggnog/tests/test_orthologs.py index a1a31e5d..8741a216 100644 --- a/q2_moshpit/eggnog/tests/test_orthologs.py +++ b/q2_moshpit/eggnog/tests/test_orthologs.py @@ -45,6 +45,9 @@ class TestHMMER(TestPluginBase): + + + def setUp(self): super().setUp() self.idmap_artifact = qiime2.Artifact.import_data( From 910c597ca8bf0521c177304ccaf94b8dd26b09ef Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Thu, 12 Dec 2024 11:34:54 +0100 Subject: [PATCH 7/8] ENH: code sanity checked --- q2_moshpit/eggnog/orthologs/common.py | 6 +++--- q2_moshpit/eggnog/orthologs/diamond.py | 4 ++-- q2_moshpit/eggnog/tests/test_orthologs.py | 10 ---------- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/q2_moshpit/eggnog/orthologs/common.py b/q2_moshpit/eggnog/orthologs/common.py index daf83e10..901d461e 100644 --- a/q2_moshpit/eggnog/orthologs/common.py +++ b/q2_moshpit/eggnog/orthologs/common.py @@ -7,7 +7,6 @@ # ---------------------------------------------------------------------------- import glob import os -import shutil import subprocess from typing import List @@ -17,7 +16,8 @@ from q2_types.feature_data import FeatureData from q2_types.feature_data_mag import MAG, MAGSequencesDirFmt -from q2_types.genome_data import SeedOrthologDirFmt, OrthologFileFmt, LociDirectoryFormat +from q2_types.genome_data import (SeedOrthologDirFmt, OrthologFileFmt, + LociDirectoryFormat) from q2_types.per_sample_sequences import ( Contigs, MAGs, ContigSequencesDirFmt, MultiMAGSequencesDirFmt ) @@ -41,6 +41,7 @@ def _create_symlinks( os.path.join(target_dir, filename) ) + def _run_eggnog_search_pipeline( ctx: Context, sequences: qiime2.Artifact, @@ -123,7 +124,6 @@ def _search_runner( into memory. - runner_args: Additional arguments to pass to the eggNOG-mapper command. """ - #output_loc = os.path.join("/home","dgrabocka", "q2_moshpit_enh", "trial") cmd = [ 'emapper.py', '-i', str(input_path), '-o', sample_label, '-m', *runner_args, '--genepred', 'prodigal', diff --git a/q2_moshpit/eggnog/orthologs/diamond.py b/q2_moshpit/eggnog/orthologs/diamond.py index f12f8e60..71309317 100644 --- a/q2_moshpit/eggnog/orthologs/diamond.py +++ b/q2_moshpit/eggnog/orthologs/diamond.py @@ -45,7 +45,7 @@ def _eggnog_diamond_search( runner_args=['diamond', '--dmnd_db', str(db_fp)] ) result, ft, loci = _eggnog_search(sequences, search_runner, - str(output_loc)) + str(output_loc)) return result, ft, loci @@ -57,4 +57,4 @@ def eggnog_diamond_search( ctx, sequences, [diamond_db], num_cpus, db_in_memory, num_partitions, "_eggnog_diamond_search" ) - return collated_hits, collated_tables, loci \ No newline at end of file + return collated_hits, collated_tables, loci diff --git a/q2_moshpit/eggnog/tests/test_orthologs.py b/q2_moshpit/eggnog/tests/test_orthologs.py index 8741a216..90b9c975 100644 --- a/q2_moshpit/eggnog/tests/test_orthologs.py +++ b/q2_moshpit/eggnog/tests/test_orthologs.py @@ -35,19 +35,9 @@ ) - - - - class TestHMMER(TestPluginBase): package = 'q2_moshpit.eggnog.tests' - - - - - - def setUp(self): super().setUp() self.idmap_artifact = qiime2.Artifact.import_data( From 0c7c6ddbf1b199360e50f9f47c7a10c68a65686f Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Fri, 13 Dec 2024 12:39:34 +0100 Subject: [PATCH 8/8] ENH: eggnog_hmmer_search outputs gff files now --- q2_moshpit/eggnog/orthologs/common.py | 2 +- q2_moshpit/eggnog/orthologs/hmmer.py | 25 ++++++++++++----------- q2_moshpit/eggnog/tests/test_orthologs.py | 18 ++++++++-------- q2_moshpit/plugin_setup.py | 9 +++++--- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/q2_moshpit/eggnog/orthologs/common.py b/q2_moshpit/eggnog/orthologs/common.py index 901d461e..e51ea455 100644 --- a/q2_moshpit/eggnog/orthologs/common.py +++ b/q2_moshpit/eggnog/orthologs/common.py @@ -86,8 +86,8 @@ def _run_eggnog_search_pipeline( partition_method = ctx.get_action(plugin, action_name) _eggnog_search = ctx.get_action("moshpit", search_action) collate_hits = ctx.get_action("types", "collate_orthologs") - collate_loci = ctx.get_action("types", "collate_loci") _eggnog_feature_table = ctx.get_action("moshpit", "_eggnog_feature_table") + collate_loci = ctx.get_action("types", "collate_loci") (partitioned_sequences,) = partition_method(sequences, num_partitions) hits = [] diff --git a/q2_moshpit/eggnog/orthologs/hmmer.py b/q2_moshpit/eggnog/orthologs/hmmer.py index a178d362..c030c0ae 100644 --- a/q2_moshpit/eggnog/orthologs/hmmer.py +++ b/q2_moshpit/eggnog/orthologs/hmmer.py @@ -19,7 +19,7 @@ from q2_moshpit.eggnog.types import EggnogHmmerIdmapDirectoryFmt from q2_types.feature_data_mag import MAGSequencesDirFmt from q2_types.genome_data import ( - ProteinsDirectoryFormat, SeedOrthologDirFmt + ProteinsDirectoryFormat, SeedOrthologDirFmt, LociDirectoryFormat ) from q2_types.per_sample_sequences import ( ContigSequencesDirFmt, MultiMAGSequencesDirFmt @@ -38,7 +38,7 @@ def _eggnog_hmmer_search( seed_alignments: ProteinsDirectoryFormat, num_cpus: int = 1, db_in_memory: bool = False -) -> (SeedOrthologDirFmt, pd.DataFrame): +) -> (SeedOrthologDirFmt, pd.DataFrame, LociDirectoryFormat): with tempfile.TemporaryDirectory() as output_loc: taxon_id = os.listdir(idmap.path)[0].split(".")[0] tmp_subdir = f"{output_loc}/hmmer/{taxon_id}" @@ -54,17 +54,18 @@ def _eggnog_hmmer_search( '--genepred', 'prodigal' # default incompatible with HMMER ] ) - result, ft = _eggnog_search(sequences, search_runner, output_loc) - return result, ft + result, ft, loci = _eggnog_search(sequences, search_runner, output_loc) + return result, ft, loci def eggnog_hmmer_search( - ctx, sequences, pressed_hmm_db, idmap, seed_alignments, - num_cpus=1, db_in_memory=False, num_partitions=None + ctx, sequences, pressed_hmm_db, idmap, seed_alignments, + num_cpus=1, db_in_memory=False, num_partitions=None ): - collated_hits, collated_tables = _run_eggnog_search_pipeline( - ctx, sequences, [idmap, pressed_hmm_db, seed_alignments], - num_cpus, db_in_memory, num_partitions, - "_eggnog_hmmer_search" - ) - return collated_hits, collated_tables + collated_hits, collated_tables, collated_loci = ( + _run_eggnog_search_pipeline( + ctx, sequences, [idmap, pressed_hmm_db, seed_alignments], + num_cpus, db_in_memory, num_partitions, + "_eggnog_hmmer_search" + )) + return collated_hits, collated_tables, collated_loci diff --git a/q2_moshpit/eggnog/tests/test_orthologs.py b/q2_moshpit/eggnog/tests/test_orthologs.py index 90b9c975..f417ce5d 100644 --- a/q2_moshpit/eggnog/tests/test_orthologs.py +++ b/q2_moshpit/eggnog/tests/test_orthologs.py @@ -73,9 +73,11 @@ def setUp(self): def test_eggnog_hmmer_search_pipeline(self): mock_action = MagicMock(side_effect=[ lambda sequences, num_partitions: ({"mag1": {}, "mag2": {}},), - lambda seq, pressed, idmap, fastas, num_cpus, db_in_memory: (0, 0), + lambda seq, pressed, idmap, fastas, num_cpus, db_in_memory: + (0, 0, 0), lambda hits: ("collated_hits",), lambda collated_hits: ("collated_tables",), + lambda collated_loci: ("collated_loci",), ]) mock_ctx = MagicMock(get_action=mock_action) obs = eggnog_hmmer_search( @@ -85,7 +87,7 @@ def test_eggnog_hmmer_search_pipeline(self): idmap=self.idmap_artifact, seed_alignments=self.fastas_artifact ) - exp = ("collated_hits", "collated_tables") + exp = ("collated_hits", "collated_tables", "collated_loci") self.assertTupleEqual(obs, exp) def test_symlink_files_to_target_dir(self): @@ -114,8 +116,8 @@ def test_eggnog_hmmer_search( self, mock_eggnog_search, mock_symlink, mock_tmpdir, mock_makedirs ): mock_tmpdir.return_value.__enter__.return_value = "tmp" - mock_eggnog_search.return_value = (0, 1) - result, ft = _eggnog_hmmer_search( + mock_eggnog_search.return_value = (0, 1, 2) + result, ft, loci = _eggnog_hmmer_search( sequences=self.mags, idmap=self.idmap, pressed_hmm_db=self.pressed_hmm, @@ -129,7 +131,7 @@ def test_eggnog_hmmer_search( ANY, # partial() method not patchable or comparable "tmp" ) - self.assertTupleEqual((result, ft), (0, 1)) + self.assertTupleEqual((result, ft, loci), (0, 1, 2)) def test_eggnog_search_mags(self): sequences = MultiMAGSequencesDirFmt( @@ -138,7 +140,7 @@ def test_eggnog_search_mags(self): output_loc = self.get_data_path('hits') search_runner = MagicMock() - result, ft = _eggnog_search(sequences, search_runner, output_loc) + result, ft, _ = _eggnog_search(sequences, search_runner, output_loc) result.validate() self.assertIsInstance(ft, pd.DataFrame) @@ -155,7 +157,7 @@ def test_eggnog_search_contigs(self): output_loc = self.get_data_path('hits') search_runner = MagicMock() - result, ft = _eggnog_search(sequences, search_runner, output_loc) + result, ft, _ = _eggnog_search(sequences, search_runner, output_loc) result.validate() self.assertIsInstance(ft, pd.DataFrame) @@ -171,7 +173,7 @@ def test_eggnog_search_mags_derep(self): output_loc = self.get_data_path('hits') search_runner = MagicMock() - result, ft = _eggnog_search(sequences, search_runner, output_loc) + result, ft, _ = _eggnog_search(sequences, search_runner, output_loc) result.validate() self.assertIsInstance(ft, pd.DataFrame) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index b55af017..dd1b3a90 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -736,7 +736,8 @@ }, outputs=[ ('eggnog_hits', SampleData[Orthologs]), - ('table', FeatureTable[Frequency]) + ('table', FeatureTable[Frequency]), + ('loci', GenomeData[Loci]), ], name='Run eggNOG search using HMMER aligner', description="This method uses HMMER to find possible target sequences " @@ -824,12 +825,14 @@ }, outputs=[ ('eggnog_hits', SampleData[Orthologs]), - ('table', FeatureTable[Frequency]) + ('table', FeatureTable[Frequency]), + ('loci', GenomeData[Loci]) ], output_descriptions={ 'eggnog_hits': 'BLAST6-like table(s) describing the identified ' 'orthologs. One table per sample or MAG in the input.', - 'table': 'Feature table with counts of orthologs per sample/MAG.' + 'table': 'Feature table with counts of orthologs per sample/MAG.', + 'loci': 'Loci of the identified orthologs.' }, name='Run eggNOG search using HMMER aligner', description='This method performs the steps by which we find our '