diff --git a/q2_moshpit/eggnog/orthologs/common.py b/q2_moshpit/eggnog/orthologs/common.py index e230f899..e51ea455 100644 --- a/q2_moshpit/eggnog/orthologs/common.py +++ b/q2_moshpit/eggnog/orthologs/common.py @@ -16,7 +16,8 @@ from q2_types.feature_data import FeatureData from q2_types.feature_data_mag import MAG, MAGSequencesDirFmt -from q2_types.genome_data import SeedOrthologDirFmt, OrthologFileFmt +from q2_types.genome_data import (SeedOrthologDirFmt, OrthologFileFmt, + LociDirectoryFormat) from q2_types.per_sample_sequences import ( Contigs, MAGs, ContigSequencesDirFmt, MultiMAGSequencesDirFmt ) @@ -86,16 +87,20 @@ def _run_eggnog_search_pipeline( _eggnog_search = ctx.get_action("moshpit", search_action) collate_hits = ctx.get_action("types", "collate_orthologs") _eggnog_feature_table = ctx.get_action("moshpit", "_eggnog_feature_table") + collate_loci = ctx.get_action("types", "collate_loci") (partitioned_sequences,) = partition_method(sequences, num_partitions) hits = [] + loci = [] for seq in partitioned_sequences.values(): - (hit, _) = _eggnog_search(seq, *db, num_cpus, db_in_memory) + (hit, _, loci_dir) = _eggnog_search(seq, *db, num_cpus, db_in_memory) hits.append(hit) + loci.append(loci_dir) (collated_hits,) = collate_hits(hits) (collated_tables,) = _eggnog_feature_table(collated_hits) - return collated_hits, collated_tables + (collated_loci,) = collate_loci(loci) + return collated_hits, collated_tables, collated_loci def _search_runner( @@ -152,6 +157,19 @@ def _eggnog_search( for mag_id, mag_fp in mags.items(): search_runner(input_path=mag_fp, sample_label=mag_id) + # iterate over the gff files and move them to the correct location + loci_dir = LociDirectoryFormat() + gff_fp = [ + os.path.basename(x) for x + in glob.glob(f'{output_loc}/*.emapper.genepred.gff') + ] + for fn in gff_fp: + new_fn = fn.replace('.emapper.genepred.gff', '.gff') + qiime2.util.duplicate( + os.path.join(output_loc, fn), + os.path.join(loci_dir.path, new_fn) + ) + result = SeedOrthologDirFmt() ortholog_fps = [ os.path.basename(x) for x @@ -164,7 +182,7 @@ def _eggnog_search( ) ft = _eggnog_feature_table(result) - return result, ft + return result, ft, loci_dir def _eggnog_feature_table(seed_orthologs: SeedOrthologDirFmt) -> pd.DataFrame: diff --git a/q2_moshpit/eggnog/orthologs/diamond.py b/q2_moshpit/eggnog/orthologs/diamond.py index c496b094..71309317 100644 --- a/q2_moshpit/eggnog/orthologs/diamond.py +++ b/q2_moshpit/eggnog/orthologs/diamond.py @@ -19,7 +19,7 @@ MAGSequencesDirFmt ) from q2_types.genome_data import ( - SeedOrthologDirFmt + SeedOrthologDirFmt, LociDirectoryFormat ) from q2_types.per_sample_sequences import ( ContigSequencesDirFmt, MultiMAGSequencesDirFmt @@ -35,8 +35,8 @@ def _eggnog_diamond_search( ], diamond_db: DiamondDatabaseDirFmt, num_cpus: int = 1, - db_in_memory: bool = False -) -> (SeedOrthologDirFmt, pd.DataFrame): + db_in_memory: bool = False, +) -> (SeedOrthologDirFmt, pd.DataFrame, LociDirectoryFormat): with tempfile.TemporaryDirectory() as output_loc: db_fp = os.path.join(str(diamond_db), 'ref_db.dmnd') search_runner = partial( @@ -44,16 +44,17 @@ def _eggnog_diamond_search( num_cpus=num_cpus, db_in_memory=db_in_memory, runner_args=['diamond', '--dmnd_db', str(db_fp)] ) - result, ft = _eggnog_search(sequences, search_runner, str(output_loc)) - return result, ft + result, ft, loci = _eggnog_search(sequences, search_runner, + str(output_loc)) + return result, ft, loci def eggnog_diamond_search( ctx, sequences, diamond_db, num_cpus=1, db_in_memory=False, num_partitions=None ): - collated_hits, collated_tables = _run_eggnog_search_pipeline( + collated_hits, collated_tables, loci = _run_eggnog_search_pipeline( ctx, sequences, [diamond_db], num_cpus, db_in_memory, num_partitions, "_eggnog_diamond_search" ) - return collated_hits, collated_tables + return collated_hits, collated_tables, loci diff --git a/q2_moshpit/eggnog/orthologs/hmmer.py b/q2_moshpit/eggnog/orthologs/hmmer.py index a178d362..c030c0ae 100644 --- a/q2_moshpit/eggnog/orthologs/hmmer.py +++ b/q2_moshpit/eggnog/orthologs/hmmer.py @@ -19,7 +19,7 @@ from q2_moshpit.eggnog.types import EggnogHmmerIdmapDirectoryFmt from q2_types.feature_data_mag import MAGSequencesDirFmt from q2_types.genome_data import ( - ProteinsDirectoryFormat, SeedOrthologDirFmt + ProteinsDirectoryFormat, SeedOrthologDirFmt, LociDirectoryFormat ) from q2_types.per_sample_sequences import ( ContigSequencesDirFmt, MultiMAGSequencesDirFmt @@ -38,7 +38,7 @@ def _eggnog_hmmer_search( seed_alignments: ProteinsDirectoryFormat, num_cpus: int = 1, db_in_memory: bool = False -) -> (SeedOrthologDirFmt, pd.DataFrame): +) -> (SeedOrthologDirFmt, pd.DataFrame, LociDirectoryFormat): with tempfile.TemporaryDirectory() as output_loc: taxon_id = os.listdir(idmap.path)[0].split(".")[0] tmp_subdir = f"{output_loc}/hmmer/{taxon_id}" @@ -54,17 +54,18 @@ def _eggnog_hmmer_search( '--genepred', 'prodigal' # default incompatible with HMMER ] ) - result, ft = _eggnog_search(sequences, search_runner, output_loc) - return result, ft + result, ft, loci = _eggnog_search(sequences, search_runner, output_loc) + return result, ft, loci def eggnog_hmmer_search( - ctx, sequences, pressed_hmm_db, idmap, seed_alignments, - num_cpus=1, db_in_memory=False, num_partitions=None + ctx, sequences, pressed_hmm_db, idmap, seed_alignments, + num_cpus=1, db_in_memory=False, num_partitions=None ): - collated_hits, collated_tables = _run_eggnog_search_pipeline( - ctx, sequences, [idmap, pressed_hmm_db, seed_alignments], - num_cpus, db_in_memory, num_partitions, - "_eggnog_hmmer_search" - ) - return collated_hits, collated_tables + collated_hits, collated_tables, collated_loci = ( + _run_eggnog_search_pipeline( + ctx, sequences, [idmap, pressed_hmm_db, seed_alignments], + num_cpus, db_in_memory, num_partitions, + "_eggnog_hmmer_search" + )) + return collated_hits, collated_tables, collated_loci diff --git a/q2_moshpit/eggnog/tests/test_orthologs.py b/q2_moshpit/eggnog/tests/test_orthologs.py index 6baee696..f417ce5d 100644 --- a/q2_moshpit/eggnog/tests/test_orthologs.py +++ b/q2_moshpit/eggnog/tests/test_orthologs.py @@ -73,9 +73,11 @@ def setUp(self): def test_eggnog_hmmer_search_pipeline(self): mock_action = MagicMock(side_effect=[ lambda sequences, num_partitions: ({"mag1": {}, "mag2": {}},), - lambda seq, pressed, idmap, fastas, num_cpus, db_in_memory: (0, 0), + lambda seq, pressed, idmap, fastas, num_cpus, db_in_memory: + (0, 0, 0), lambda hits: ("collated_hits",), lambda collated_hits: ("collated_tables",), + lambda collated_loci: ("collated_loci",), ]) mock_ctx = MagicMock(get_action=mock_action) obs = eggnog_hmmer_search( @@ -85,7 +87,7 @@ def test_eggnog_hmmer_search_pipeline(self): idmap=self.idmap_artifact, seed_alignments=self.fastas_artifact ) - exp = ("collated_hits", "collated_tables") + exp = ("collated_hits", "collated_tables", "collated_loci") self.assertTupleEqual(obs, exp) def test_symlink_files_to_target_dir(self): @@ -114,8 +116,8 @@ def test_eggnog_hmmer_search( self, mock_eggnog_search, mock_symlink, mock_tmpdir, mock_makedirs ): mock_tmpdir.return_value.__enter__.return_value = "tmp" - mock_eggnog_search.return_value = (0, 1) - result, ft = _eggnog_hmmer_search( + mock_eggnog_search.return_value = (0, 1, 2) + result, ft, loci = _eggnog_hmmer_search( sequences=self.mags, idmap=self.idmap, pressed_hmm_db=self.pressed_hmm, @@ -129,7 +131,7 @@ def test_eggnog_hmmer_search( ANY, # partial() method not patchable or comparable "tmp" ) - self.assertTupleEqual((result, ft), (0, 1)) + self.assertTupleEqual((result, ft, loci), (0, 1, 2)) def test_eggnog_search_mags(self): sequences = MultiMAGSequencesDirFmt( @@ -138,7 +140,7 @@ def test_eggnog_search_mags(self): output_loc = self.get_data_path('hits') search_runner = MagicMock() - result, ft = _eggnog_search(sequences, search_runner, output_loc) + result, ft, _ = _eggnog_search(sequences, search_runner, output_loc) result.validate() self.assertIsInstance(ft, pd.DataFrame) @@ -155,7 +157,7 @@ def test_eggnog_search_contigs(self): output_loc = self.get_data_path('hits') search_runner = MagicMock() - result, ft = _eggnog_search(sequences, search_runner, output_loc) + result, ft, _ = _eggnog_search(sequences, search_runner, output_loc) result.validate() self.assertIsInstance(ft, pd.DataFrame) @@ -171,7 +173,7 @@ def test_eggnog_search_mags_derep(self): output_loc = self.get_data_path('hits') search_runner = MagicMock() - result, ft = _eggnog_search(sequences, search_runner, output_loc) + result, ft, _ = _eggnog_search(sequences, search_runner, output_loc) result.validate() self.assertIsInstance(ft, pd.DataFrame) @@ -218,7 +220,7 @@ def test_good_small_search_contigs(self): self.get_data_path('contig-sequences-1') ).view(ContigSequencesDirFmt) - _, obs = _eggnog_diamond_search( + _, obs, _ = _eggnog_diamond_search( sequences=contigs, diamond_db=self.diamond_db ) @@ -234,7 +236,7 @@ def test_good_small_search_mags_derep(self): self.get_data_path('mag-sequences') ).view(MAGSequencesDirFmt) - _, obs = _eggnog_diamond_search( + _, obs, _ = _eggnog_diamond_search( sequences=mags, diamond_db=self.diamond_db ) @@ -253,7 +255,7 @@ def test_good_small_search_mags(self): self.get_data_path('mag-sequences-per-sample') ).view(MultiMAGSequencesDirFmt) - _, obs = _eggnog_diamond_search( + _, obs, _ = _eggnog_diamond_search( sequences=mags, diamond_db=self.diamond_db ) @@ -279,12 +281,12 @@ def test_eggnog_search_parallel_contigs(self): ) with self.test_config: - _, parallel = self.eggnog_diamond_search.parallel( + _, parallel, _ = self.eggnog_diamond_search.parallel( contigs, self.diamond_db_artifact )._result() - _, single = self._eggnog_diamond_search( + _, single, _ = self._eggnog_diamond_search( sequences=contigs, diamond_db=self.diamond_db_artifact ) @@ -301,12 +303,12 @@ def test_eggnog_search_parallel_mags_derep(self): ) with self.test_config: - _, parallel = self.eggnog_diamond_search.parallel( + _, parallel, _ = self.eggnog_diamond_search.parallel( mags, self.diamond_db_artifact )._result() - _, single = self._eggnog_diamond_search( + _, single, _ = self._eggnog_diamond_search( sequences=mags, diamond_db=self.diamond_db_artifact ) @@ -323,12 +325,12 @@ def test_eggnog_search_parallel_mags(self): ) with self.test_config: - _, parallel = self.eggnog_diamond_search.parallel( + _, parallel, _ = self.eggnog_diamond_search.parallel( mags, self.diamond_db_artifact )._result() - _, single = self._eggnog_diamond_search( + _, single, _ = self._eggnog_diamond_search( sequences=mags, diamond_db=self.diamond_db_artifact ) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index a2c290c2..30b35204 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -708,7 +708,8 @@ }, outputs=[ ('eggnog_hits', SampleData[Orthologs]), - ('table', FeatureTable[Frequency]) + ('table', FeatureTable[Frequency]), + ('loci', GenomeData[Loci]) ], name='Run eggNOG search using diamond aligner', description="This method performs the steps by which we find our " @@ -752,7 +753,8 @@ }, outputs=[ ('eggnog_hits', SampleData[Orthologs]), - ('table', FeatureTable[Frequency]) + ('table', FeatureTable[Frequency]), + ('loci', GenomeData[Loci]), ], name='Run eggNOG search using HMMER aligner', description="This method uses HMMER to find possible target sequences " @@ -772,11 +774,11 @@ }, parameters={ 'num_cpus': Int, - 'db_in_memory': Bool, + 'db_in_memory': Bool }, input_descriptions={ 'sequences': 'Sequences to be searched for ortholog hits.', - 'diamond_db': 'Diamond database.', + 'diamond_db': 'Diamond database.' }, parameter_descriptions={ 'num_cpus': 'Number of CPUs to utilize. \'0\' will ' @@ -788,12 +790,14 @@ }, outputs=[ ('eggnog_hits', SampleData[Orthologs]), - ('table', FeatureTable[Frequency]) + ('table', FeatureTable[Frequency]), + ('loci', GenomeData[Loci]) ], output_descriptions={ 'eggnog_hits': 'BLAST6-like table(s) describing the identified ' 'orthologs. One table per sample or MAG in the input.', - 'table': 'Feature table with counts of orthologs per sample/MAG.' + 'table': 'Feature table with counts of orthologs per sample/MAG.', + 'loci': 'Loci of the identified orthologs.' }, name='Run eggNOG search using Diamond aligner', description="This method performs the steps by which we find our " @@ -838,12 +842,14 @@ }, outputs=[ ('eggnog_hits', SampleData[Orthologs]), - ('table', FeatureTable[Frequency]) + ('table', FeatureTable[Frequency]), + ('loci', GenomeData[Loci]) ], output_descriptions={ 'eggnog_hits': 'BLAST6-like table(s) describing the identified ' 'orthologs. One table per sample or MAG in the input.', - 'table': 'Feature table with counts of orthologs per sample/MAG.' + 'table': 'Feature table with counts of orthologs per sample/MAG.', + 'loci': 'Loci of the identified orthologs.' }, name='Run eggNOG search using HMMER aligner', description='This method performs the steps by which we find our '