Skip to content

Commit 323293f

Browse files
committed
feat(itol): Polyphyletic phyla will be the same colour.
1 parent 663fae6 commit 323293f

File tree

6 files changed

+56
-46
lines changed

6 files changed

+56
-46
lines changed

gtdb_itol_decorate/__main__.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
import sys
12
from pathlib import Path
23

34
import typer
45

56
from gtdb_itol_decorate.gtdb import load_taxonomy_file, get_taxon_to_phylum
6-
from gtdb_itol_decorate.itol import get_phylum_to_lca, get_phylum_colours, write_color_datastrip, \
7+
from gtdb_itol_decorate.itol import get_phylum_colours, write_color_datastrip, \
78
get_internal_nodes_with_labels, write_internal_node_labels, write_tree_colours, write_collapse_file, \
89
write_popup_file
910
from gtdb_itol_decorate.newick import load_newick_to_tree, validate_dendropy_namespace, \
@@ -28,14 +29,18 @@ def main(tree_path: Path, tax_path: Path, out_dir: Path):
2829

2930
log('Reverse mapping taxon to phylum')
3031
d_taxon_to_phylum = get_taxon_to_phylum(d_tax)
32+
phyla = frozenset(d_taxon_to_phylum.values())
33+
if len(phyla) == 0:
34+
log('No phyla could be found, please send me your tree!')
35+
sys.exit(1)
3136

3237
log('Annotating internal nodes with descendant taxa')
3338
set_node_desc_taxa(tree)
34-
set_taxon_label_for_internal_nodes(tree, d_tax)
39+
d_taxon_to_lca = set_taxon_label_for_internal_nodes(tree, d_tax)
3540

3641
log('Getting the last common ancestor of each phylum.')
37-
d_phylum_to_lca = get_phylum_to_lca(tree)
38-
d_phylum_palette = get_phylum_colours(d_phylum_to_lca)
42+
d_phylum_to_lca = {k: v for k, v in d_taxon_to_lca.items() if k.startswith('p__')}
43+
d_phylum_palette = get_phylum_colours(phyla)
3944
write_color_datastrip(d_phylum_to_lca, d_phylum_palette, out_dir / 'itol_dataset_strip_phylum.txt')
4045

4146
log('Making tree compatible with iTOL (stripping labels)')

gtdb_itol_decorate/decorate.py

-3
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,13 @@
11
from collections import defaultdict
22

33

4-
54
def decorate_tree(tree, d_gid_to_tax):
6-
75
d_node_to_leaf_taxa = defaultdict(set)
86
for node in tree.postorder_node_iter():
97
# desc_taxa = get_node_desc_taxa(node)
108

119
print(node)
1210

13-
1411
# Create a mapping from the taxon to each gid
1512
d_taxon_to_gids = defaultdict(set)
1613
for gid, taxon in d_gid_to_tax.items():

gtdb_itol_decorate/gtdb.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from collections import defaultdict
21
from typing import Dict
32

43
from gtdblib.taxon.rank import TaxonRank
@@ -10,8 +9,8 @@
109

1110
def load_taxonomy_file(path: str, limit_to_gids: set):
1211
gtdb_ranks = [TaxonRank.DOMAIN, TaxonRank.PHYLUM, TaxonRank.CLASS,
13-
TaxonRank.ORDER, TaxonRank.FAMILY, TaxonRank.GENUS,
14-
TaxonRank.SPECIES]
12+
TaxonRank.ORDER, TaxonRank.FAMILY, TaxonRank.GENUS,
13+
TaxonRank.SPECIES]
1514
out = dict()
1615
with open(path) as f:
1716
for line in f.readlines():
@@ -34,7 +33,6 @@ def load_taxonomy_file(path: str, limit_to_gids: set):
3433
return out
3534

3635

37-
3836
def get_taxon_to_phylum(d_tax: Dict[str, Taxonomy]):
3937
out = dict()
4038
for gid, taxonomy in d_tax.items():
@@ -43,3 +41,9 @@ def get_taxon_to_phylum(d_tax: Dict[str, Taxonomy]):
4341
out[taxon] = taxonomy.p.value
4442
return out
4543

44+
45+
def remove_polyphyletic_suffix(taxon):
    """Strip a trailing single-character suffix (e.g. ``_A``) from a taxon.

    Genus (``g__``) and species (``s__``) names are returned unchanged.
    For any other taxon, a final ``_X`` (underscore plus exactly one
    character) is removed, so ``p__Firmicutes_A`` becomes ``p__Firmicutes``.

    Args:
        taxon: Taxon string, normally carrying a rank prefix such as ``p__``.

    Returns:
        The taxon without its suffix, or the input unchanged when there is
        no suffix to remove.
    """
    if taxon.startswith(('g__', 's__')):
        return taxon

    # Only look for the suffix in the name part. Testing the whole string
    # would let the rank prefix itself match, turning 'p__' into 'p' and
    # 'p__X' into 'p_', and would raise IndexError on very short strings.
    name_start = 3 if taxon[1:3] == '__' else 0
    has_suffix = len(taxon) - name_start > 2 and taxon[-2] == '_'
    return taxon[:-2] if has_suffix else taxon

gtdb_itol_decorate/itol.py

+15-28
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
from collections import deque, defaultdict
1+
from collections import defaultdict
22
from pathlib import Path
3-
from typing import Dict, Set
3+
from typing import Dict, FrozenSet
44

55
import dendropy
6+
import seaborn as sns
67
from gtdblib.file.itol.collapse import iTolCollapseFile
78
from gtdblib.file.itol.dataset_color_strip import iTolDatasetColorStripFile
89
from gtdblib.file.itol.label import iTolLabelFile
@@ -11,37 +12,24 @@
1112
from gtdblib.taxonomy.taxonomy import Taxonomy
1213
from gtdblib.util.color import TABLEAU_20, rgb_to_hex
1314

14-
from gtdb_itol_decorate.newick import parse_label
15-
16-
import seaborn as sns
17-
18-
from gtdb_itol_decorate.util import canonical_gid
19-
20-
21-
def get_lca_str(node: dendropy.Node):
22-
if node.is_leaf():
23-
return node.taxon.label
24-
if len(node.child_nodes()) < 2:
25-
return node.child_nodes()[0].leaf_nodes()[0].taxon.label
26-
left = node.child_nodes()[0].leaf_nodes()[0]
27-
right = node.child_nodes()[1].leaf_nodes()[0]
28-
return f'{left.taxon.label}|{right.taxon.label}'
29-
15+
from gtdb_itol_decorate.gtdb import remove_polyphyletic_suffix
16+
from gtdb_itol_decorate.newick import get_lca_str
3017

3118

32-
def get_phylum_colours(d_phylum_to_lca):
33-
"""Generate a list of colours for each phylum. Order is inferred through a
34-
depth first search, to ensure no clade colours are side by side.
19+
def get_phylum_colours(phyla: FrozenSet[str]):
    """Build a colour palette for every phylum.

    Phyla that become identical once ``remove_polyphyletic_suffix`` has been
    applied share one base colour, and therefore the same palette.

    Args:
        phyla: Phylum names, with any polyphyletic suffix still attached.

    Returns:
        A dict mapping each phylum in ``phyla`` to a list of six colours:
        the ``sns.light_palette`` shades of its base colour, each converted
        with ``rgb_to_hex``.
    """
    colours = TABLEAU_20

    # Assign base colours in sorted order. Iterating the set directly follows
    # string-hash order, which can differ between interpreter runs and would
    # give the same tree different colours each time.
    d_phylum_to_colour = dict()
    for phylum in sorted({remove_polyphyletic_suffix(x) for x in phyla}):
        # Wrap around when there are more phyla than base colours
        d_phylum_to_colour[phylum] = colours[len(d_phylum_to_colour) % len(colours)]

    # Generate a colour palette for each phylum (increasing brightness)
    out = dict()
    for phylum in sorted(phyla):
        colour = d_phylum_to_colour[remove_polyphyletic_suffix(phylum)]
        cur_pal = sns.light_palette(colour, 6, reverse=True)
        out[phylum] = [rgb_to_hex(*[round(y * 255) for y in x]) for x in cur_pal]
    return out
@@ -62,7 +50,6 @@ def get_phylum_to_lca(tree: dendropy.Tree):
6250
return out
6351

6452

65-
6653
def write_color_datastrip(d_phylum_to_lca, d_phylum_palette, path):
6754
file = iTolDatasetColorStripFile(path, 'Phylum Labels', '#000000', strip_width=100, show_internal=True)
6855

@@ -83,7 +70,8 @@ def get_internal_nodes_with_labels(tree: dendropy.Tree):
8370
out[';'.join(node.tax_label)].append(get_lca_str(node))
8471
return out
8572

86-
def write_internal_node_labels( d_label_to_lca,path: Path):
73+
74+
def write_internal_node_labels(d_label_to_lca, path: Path):
8775
file = iTolLabelFile(path)
8876

8977
# 1. Add internal node labels (trivial)
@@ -93,7 +81,6 @@ def write_internal_node_labels( d_label_to_lca,path: Path):
9381
file.write()
9482

9583

96-
9784
def write_tree_colours(tree, d_taxon_to_phylum, path, d_color_palette):
9885
file = iTolTreeColorsFile(path)
9986

@@ -122,7 +109,6 @@ def write_tree_colours(tree, d_taxon_to_phylum, path, d_color_palette):
122109
file.write()
123110

124111

125-
126112
def write_collapse_file(d_int_label_to_lca, path, taxon_prefix):
127113
file = iTolCollapseFile(path)
128114
for taxon, lst_lr_nodes in d_int_label_to_lca.items():
@@ -132,6 +118,7 @@ def write_collapse_file(d_int_label_to_lca, path, taxon_prefix):
132118
file.insert(lr_node)
133119
file.write()
134120

121+
135122
def write_popup_file(tree: dendropy.Tree, d_tax: Dict[str, Taxonomy], path: Path):
136123
file = iTolPopupFile(path)
137124

@@ -153,4 +140,4 @@ def write_popup_file(tree: dendropy.Tree, d_tax: Dict[str, Taxonomy], path: Path
153140
file.insert(gid, row.s.value, ''.join(lines))
154141

155142
file.write()
156-
return
143+
return

gtdb_itol_decorate/newick.py

+22-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
import dendropy
21
from collections import Counter, deque, defaultdict
32

3+
import dendropy
4+
45
from gtdb_itol_decorate.util import canonical_gid, log, is_float
56

67

@@ -25,6 +26,7 @@ def load_newick_to_tree(path: str) -> dendropy.Tree:
2526
rooting='force-rooted',
2627
preserve_underscores=True)
2728

29+
2830
def validate_dendropy_namespace(taxa):
2931
taxa_count = Counter(taxa)
3032
duplicates = {k: v for k, v in taxa_count.items() if v > 1}
@@ -33,12 +35,23 @@ def validate_dendropy_namespace(taxa):
3335
return
3436

3537

38+
def get_lca_str(node: 'dendropy.Node'):
    """Return a string that identifies ``node`` by leaves beneath it.

    Args:
        node: A tree node. Every leaf reached must have a ``taxon`` with a
            ``label``.

    Returns:
        For a leaf, its own taxon label. For a node with a single child, the
        label of the first leaf under that child. Otherwise ``left|right``,
        where ``left`` and ``right`` are the labels of the first leaf under
        the first and second child respectively.
    """
    if node.is_leaf():
        return node.taxon.label

    children = node.child_nodes()

    # Only the first leaf of each subtree is needed, so take it from the
    # iterator rather than building the full leaf list with leaf_nodes().
    left = next(children[0].leaf_iter())
    if len(children) < 2:
        return left.taxon.label
    right = next(children[1].leaf_iter())
    return f'{left.taxon.label}|{right.taxon.label}'
46+
47+
3648
def get_canonical_mapping(gids):
    """Map the canonical form of each genome id back to the id as given.

    Args:
        gids: Iterable of genome ids.

    Returns:
        A dict keyed by ``canonical_gid(gid)`` whose values are the original
        ids. If several ids share a canonical form, the last one iterated
        is kept.
    """
    return {canonical_gid(original): original for original in gids}
4153

54+
4255
def validate_sets(newick_gids, tax_gids):
4356
if newick_gids != tax_gids:
4457
log('The following genomes are in the newick file but not in the taxonomy file:')
@@ -85,6 +98,7 @@ def parse_label(label):
8598

8699
return support, taxon, auxiliary_info
87100

101+
88102
def strip_tree_labels(tree: dendropy.Tree):
89103
for node in tree.preorder_node_iter():
90104
if node.is_leaf():
@@ -94,6 +108,7 @@ def strip_tree_labels(tree: dendropy.Tree):
94108
node.label = str(support)
95109
return
96110

111+
97112
def get_node_depth(tree: dendropy.Tree):
98113
"""Return the depth of every node in a tree."""
99114
depth_to_nodes = defaultdict(list)
@@ -126,7 +141,7 @@ def set_node_desc_taxa(tree: dendropy.Tree):
126141

127142
def set_taxon_label_for_internal_nodes(tree: dendropy.Tree, d_tax):
128143
# Find the highest internal node that forms a monophyletic group
129-
144+
taxon_to_lca = defaultdict(list)
130145
ranks = ('d', 'p', 'c', 'o', 'f', 'g', 's')
131146

132147
queue = deque([(tree.seed_node, 'd')])
@@ -139,15 +154,17 @@ def set_taxon_label_for_internal_nodes(tree: dendropy.Tree, d_tax):
139154
# Check if this forms a monophyletic group at the target taxon
140155
desc_taxa = {getattr(d_tax[x], target_rank).value for x in node.desc_taxa}
141156
if len(desc_taxa) == 1:
142-
node.tax_label.append(list(desc_taxa)[0])
157+
cur_taxon = desc_taxa.pop()
158+
node.tax_label.append(cur_taxon)
159+
taxon_to_lca[cur_taxon].append(get_lca_str(node))
143160

144161
# Re-queue this node at the front, in case it can be extended
145162
if target_rank != 's':
146-
next_rank = ranks[ranks.index(target_rank)+1]
163+
next_rank = ranks[ranks.index(target_rank) + 1]
147164
queue.appendleft((node, next_rank))
148165

149166
# Otherwise, keep going down
150167
else:
151168
for child in node.child_nodes():
152169
queue.append((child, target_rank))
153-
return
170+
return taxon_to_lca

gtdb_itol_decorate/util.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ def log(msg: str, title=False):
88
if title:
99
print('-' * 80)
1010

11+
1112
def canonical_gid(gid):
1213
"""Get canonical form of NCBI genome accession.
1314
@@ -30,7 +31,6 @@ def canonical_gid(gid):
3031
return gid
3132

3233

33-
3434
def is_float(s):
3535
"""Check if a string can be converted to a float.
3636
@@ -50,4 +50,4 @@ def is_float(s):
5050
except ValueError:
5151
return False
5252

53-
return True
53+
return True

0 commit comments

Comments
 (0)