|
14 | 14 |
|
15 | 15 | __author__ = "Edgardo M. Ortiz"
|
16 | 16 | __credits__ = "Juan D. Palacio-Mejía"
|
17 |
| -__version__ = "2.4" |
| 17 | +__version__ = "2.5" |
18 | 18 | __email__ = "e.ortiz.v@gmail.com"
|
19 |
| -__date__ = "2020-10-04" |
| 19 | +__date__ = "2021-03-16" |
20 | 20 |
|
21 | 21 |
|
22 | 22 | import argparse
|
@@ -111,24 +111,28 @@ def get_matrix_column(record, num_samples, resolve_IUPAC):
|
111 | 111 | """
|
112 | 112 | Transform a VCF record into a phylogenetic matrix column with nucleotides instead of numbers
|
113 | 113 | """
|
114 |
| - nt_dict = {str(0): record[3].replace("-","*"), ".": "N"} |
| 114 | + nt_dict = {str(0): record[3].replace("-","*").upper(), ".": "N"} |
115 | 115 | alt = record[4].replace("-", "*")
|
116 | 116 | alt = alt.split(",")
|
117 | 117 | for n in range(len(alt)):
|
118 | 118 | nt_dict[str(n+1)] = alt[n]
|
119 | 119 | column = ""
|
120 | 120 | for i in range(9, num_samples + 9):
|
121 |
| - genotype = record[i].split(":")[0].replace("/", "").replace("|", "") |
122 |
| - if resolve_IUPAC: |
123 |
| - column += nt_dict[random.choice(genotype)] |
| 121 | + geno_num = record[i].split(":")[0].replace("/", "").replace("|", "") |
| 122 | + geno_nuc = "".join(sorted(set([nt_dict[j] for j in geno_num]))) |
| 123 | + if len(geno_nuc) == 1: |
| 124 | + column += geno_nuc |
| 125 | + elif resolve_IUPAC is False: |
| 126 | + column += ambiguities[geno_nuc] |
124 | 127 | else:
|
125 |
| - column += ambiguities["".join(sorted(set([nt_dict[j] for j in genotype])))] |
| 128 | + column += nt_dict[random.choice(geno_num)] |
126 | 129 | return column
|
127 | 130 |
|
128 | 131 |
|
129 | 132 | def get_matrix_column_bin(record, num_samples):
|
130 | 133 | """
|
131 |
| - If VCF is diploid, return an alignment column in NEXUS binary from a VCF record |
| 134 | + Return an alignment column in NEXUS binary from a VCF record. If the genotype is not diploid with |
| 135 | + at most two alleles, '?' is returned as the state |
132 | 136 | """
|
133 | 137 | column = ""
|
134 | 138 | for i in range(9, num_samples + 9):
|
@@ -200,11 +204,11 @@ def main():
|
200 | 204 | # Get samples names and number of samples in VCF
|
201 | 205 | sample_names = extract_sample_names(filename)
|
202 | 206 | num_samples = len(sample_names)
|
203 |
| - if len(sample_names) == 0: |
| 207 | + if num_samples == 0: |
204 | 208 | print("\nSample names not found in VCF, your file may be corrupt or missing the header.\n")
|
205 | 209 | sys.exit()
|
206 | 210 | print("\nConverting file '{}':\n".format(filename))
|
207 |
| - print("Number of samples in VCF: {:d}".format(len(sample_names))) |
| 211 | + print("Number of samples in VCF: {:d}".format(num_samples)) |
208 | 212 |
|
209 | 213 | # If the 'min_samples_locus' is larger than the actual number of samples in VCF readjust it
|
210 | 214 | min_samples_locus = min(num_samples, min_samples_locus)
|
|
0 commit comments