Skip to content

Commit 635eacc

Browse files
committed
Fixed issue when reference nucleotide is in lower case
1 parent 3a2650a commit 635eacc

File tree

2 files changed

+15
-11
lines changed

2 files changed

+15
-11
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
21
.DS_Store
2+
vcf2phylip_v2.4.py

vcf2phylip.py

+14-10
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414

1515
__author__ = "Edgardo M. Ortiz"
1616
__credits__ = "Juan D. Palacio-Mejía"
17-
__version__ = "2.4"
17+
__version__ = "2.5"
1818
__email__ = "e.ortiz.v@gmail.com"
19-
__date__ = "2020-10-04"
19+
__date__ = "2021-03-16"
2020

2121

2222
import argparse
@@ -111,24 +111,28 @@ def get_matrix_column(record, num_samples, resolve_IUPAC):
111111
"""
112112
Transform a VCF record into a phylogenetic matrix column with nucleotides instead of numbers
113113
"""
114-
nt_dict = {str(0): record[3].replace("-","*"), ".": "N"}
114+
nt_dict = {str(0): record[3].replace("-","*").upper(), ".": "N"}
115115
alt = record[4].replace("-", "*")
116116
alt = alt.split(",")
117117
for n in range(len(alt)):
118118
nt_dict[str(n+1)] = alt[n]
119119
column = ""
120120
for i in range(9, num_samples + 9):
121-
genotype = record[i].split(":")[0].replace("/", "").replace("|", "")
122-
if resolve_IUPAC:
123-
column += nt_dict[random.choice(genotype)]
121+
geno_num = record[i].split(":")[0].replace("/", "").replace("|", "")
122+
geno_nuc = "".join(sorted(set([nt_dict[j] for j in geno_num])))
123+
if len(geno_nuc) == 1:
124+
column += geno_nuc
125+
elif resolve_IUPAC is False:
126+
column += ambiguities[geno_nuc]
124127
else:
125-
column += ambiguities["".join(sorted(set([nt_dict[j] for j in genotype])))]
128+
column += nt_dict[random.choice(geno_num)]
126129
return column
127130

128131

129132
def get_matrix_column_bin(record, num_samples):
130133
"""
131-
If VCF is diploid, return an alignment column in NEXUS binary from a VCF record
134+
Return an alignment column in NEXUS binary from a VCF record, if genotype is not diploid with at
135+
most two alleles it will return '?' as state
132136
"""
133137
column = ""
134138
for i in range(9, num_samples + 9):
@@ -200,11 +204,11 @@ def main():
200204
# Get samples names and number of samples in VCF
201205
sample_names = extract_sample_names(filename)
202206
num_samples = len(sample_names)
203-
if len(sample_names) == 0:
207+
if num_samples == 0:
204208
print("\nSample names not found in VCF, your file may be corrupt or missing the header.\n")
205209
sys.exit()
206210
print("\nConverting file '{}':\n".format(filename))
207-
print("Number of samples in VCF: {:d}".format(len(sample_names)))
211+
print("Number of samples in VCF: {:d}".format(num_samples))
208212

209213
# If the 'min_samples_locus' is larger than the actual number of samples in VCF readjust it
210214
min_samples_locus = min(num_samples, min_samples_locus)

0 commit comments

Comments
 (0)