Merge pull request #106 from briney/development

Development
brineylab · Jul 8, 2022 · 939df26 · 939df26
2 parents 21ca273 + fc247eb
commit 939df26
Show file tree

Hide file tree

Showing 9 changed files with 161 additions and 32 deletions.
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 ![](https://img.shields.io/pypi/v/abstar.svg?colorB=blue)
-[![Build Status](https://travis-ci.com/briney/abstar.svg?branch=master)](https://travis-ci.com/briney/abstar)
+[![Build Status](https://travis-ci.com/briney/abstar.svg?branch=master)](https://app.travis-ci.com/github/briney/abstar)
 [![Documentation Status](https://readthedocs.org/projects/abstar/badge/?version=latest)](https://abstar.readthedocs.io/en/latest/?badge=latest)
 ![](https://img.shields.io/pypi/pyversions/abstar.svg)
 ![](https://img.shields.io/badge/license-MIT-blue.svg)

diff --git a/abstar/core/abstar.py b/abstar/core/abstar.py
@@ -48,7 +48,7 @@
 
 # import dask.dataframe as dd
 
-from abutils.core.sequence import Sequence
+from abutils.core.sequence import Sequence, read_json, read_csv
 from abutils.utils import log
 # from abutils.utils.pipeline import list_files
 
@@ -201,7 +201,7 @@ def __init__(self, project_dir=None, input=None, output=None, log=None, temp=Non
                  sequences=None, chunksize=500, output_type=['json', ], assigner='blastn',
                  merge=False, pandaseq_algo='simple_bayesian', use_test_data=False,
                  parquet=False, nextseq=False, uid=0, isotype=False, pretty=False, num_cores=0,
-                 basespace=False, cluster=False, padding=True, raw=False, json_keys=None,
+                 basespace=False, cluster=False, padding=False, raw=False, json_keys=None,
                  debug=False, germ_db='human', receptor='bcr', gzip=False):
         super(Args, self).__init__()
         self.sequences = sequences
@@ -1098,17 +1098,20 @@ def run(*args, **kwargs):
         ofmt = args.output_type[0]
         ofile = output_files[0]
         if ofmt == 'tabular':
-            with open(ofile) as f:
-                reader = csv.DictReader(f, delimiter=',')
-                output = [Sequence(r) for r in reader]
+            output = read_csv(ofile, delimiter=',', id_key='seq_id', sequence_key='vdj_nt')
+            # with open(ofile) as f:
+                # reader = csv.DictReader(f, delimiter=',')
+                # output = [Sequence(r) for r in reader]
         if ofmt == 'airr':
-            with open(ofile) as f:
-                reader = csv.DictReader(f, delimiter='\t')
-                output = [Sequence(r, id_key='sequence_id', seq_key='sequence')  for r in reader]
+            output = read_csv(ofile, delimiter='\t', id_key='sequence_id', sequence_key='sequence')
+            # with open(ofile) as f:
+            #     reader = csv.DictReader(f, delimiter='\t')
+            #     output = [Sequence(r, id_key='sequence_id', seq_key='sequence')  for r in reader]
         if ofmt == 'json':
-            with open(ofile) as f:
-                output = [Sequence(json.loads(line)) for line in f]
-        output = [Sequence(o) for o in output]
+            output = read_json(ofile, id_key='seq_id', sequence_key='vdj_nt')
+            # with open(ofile) as f:
+            #     output = [Sequence(json.loads(line)) for line in f]
+        # output = [Sequence(o) for o in output]
         if len(output) == 1:
             return output[0]
         else:

diff --git a/abstar/core/germline.py b/abstar/core/germline.py
@@ -270,6 +270,18 @@ def realign_germline(self, antibody, query_start=None, query_end=None):
         else:
             query = oriented_input.sequence[query_start:query_end]
             alignment = local_alignment(query, germline_seq, **aln_params)
+            # fix for a fairly rare edge case where coincidental matching to 2-3 residues at the extreme
+            # 3' end of K/L germline V genes can result in incorrect identification of the
+            # end of the V gene region (making the called V region far too long and, in some cases, 
+            # extending beyond the junction and into FR4). What we do here is drop the last 2 nucleotides
+            # of the aligned germline and re-align to see whether that substantialy truncates the resulting
+            # alignment (by at least 2 additional nucleotides). If so, we use the new alignment instead.
+            if self.gene_type == 'V' and self.chain in ['kappa', 'lambda']:
+                germline_trunc = germline_seq[:alignment.target_end - 2]
+                alignment2 = local_alignment(query, germline_trunc, **aln_params)
+                if alignment.query_end - alignment2.query_end >= 4:
+                    alignment2.raw_target = alignment.raw_target  # swap the truncated germline with the full one
+                    alignment = alignment2
         if alignment:
             self._process_realignment(antibody, alignment, query_start)
         else:

diff --git a/abstar/utils/junction.py b/abstar/utils/junction.py
@@ -26,10 +26,12 @@
 
 import math
 import os
+from termios import NL1
 import traceback
 
 from Bio.Seq import Seq
 
+from abutils.core.sequence import translate
 from abutils.utils import log
 from abutils.utils.alignment import global_alignment, local_alignment
 from abutils.utils.codons import codon_lookup as codons
@@ -105,6 +107,56 @@ def __init__(self, antibody):
             antibody.log('J_NT:', self.j_nt)
             antibody.log('D-GENE DISTANCE FROM CDR3 START:', self.d_dist_from_cdr3_start_nt)
             antibody.log('D-GENE DISTANCE FROM CDR3 END:', self.d_dist_from_cdr3_end_nt)
+            # parse germline junction regions (just V, D and J)
+            self.v_germ_nt = antibody.v.germline_alignment[-len(self.v_nt):]
+            self.d_germ_nt = antibody.d.germline_alignment
+            self.j_germ_nt = antibody.j.germline_alignment[:len(self.j_nt)]
+            # v_germ_nt = antibody.v.raw_germline[:antibody.v.germline_end + 1]
+            # self.v_germ_nt = v_germ_nt[-len(self.v_nt):]
+            # d_germ_nt = antibody.d.raw_germline[:antibody.d.germline_end + 1]
+            # self.d_germ_nt = d_germ_nt[-len(self.d_nt):]
+            # j_germ_nt = antibody.j.raw_germline[antibody.j.germline_start:antibody.j.germline_end]
+            # self.j_germ_nt = j_germ_nt[:len(self.j_nt)]
+            antibody.log('V_GERM_NT:', self.v_germ_nt)
+            antibody.log('D_GERM_NT:', self.d_germ_nt)
+            antibody.log('J_GERM_NT:', self.j_germ_nt)
+
+            # translate the junction regions
+            self.v_aa = translate(self.v_nt)
+            self.v_germ_aa = translate(self.v_germ_nt)
+            # if the final codon of v_nt or the first codon of d_nt contain any n-addition, the whole codon
+            # is considered n-addition for translation purposes, since they're not entirely encoded by the 
+            # germline gene segment
+            v_really_n1 = len(self.v_nt) % 3
+            d_really_n1 = 3 - (len(self.v_nt + self.n1_nt) % 3)
+            n1 = self.v_nt[-v_really_n1:] if v_really_n1 > 0 else ''
+            d_really_n1 = d_really_n1 % 3 # this is to account for case where d_really_n1 == 3, which needs to be converted to 0
+            n1 += self.n1_nt
+            n1 += self.d_nt[:d_really_n1] if d_really_n1 > 0 else ''
+            self.n1_aa = translate(n1)
+            self.d_aa = translate(self.d_nt[d_really_n1:])
+            self.d_germ_aa = translate(self.d_germ_nt[d_really_n1:])
+            # if the final codon of d_nt or the first codon of j_nt contain any n-addition, the whole codon
+            # is considered n-addition for translation purposes, since they're not entirely encoded by the 
+            # germline gene segment
+            d_really_n2 = len(self.v_nt + self.n1_nt + self.d_nt) % 3
+            j_really_n2 = 3 - (len(self.v_nt + self.n1_nt + self.d_nt + self.n2_nt) % 3)
+            j_really_n2 = j_really_n2 % 3 # this is to account for case where j_really_n2 == 3, which needs to be converted to 0
+            n2 = self.d_nt[-d_really_n2:] if d_really_n2 > 0 else ''
+            n2 += self.n2_nt
+            n2 += self.j_nt[:j_really_n2] if j_really_n2 > 0 else ''
+            self.n2_aa = translate(n2)
+            self.j_aa = translate(self.j_nt[j_really_n2:])
+            self.j_germ_aa = translate(self.j_germ_nt[j_really_n2:])
+            self.n_aa = None
+            antibody.log('V_AA:', self.v_aa)
+            antibody.log('N1_AA:', self.n1_aa)
+            antibody.log('D_AA:', self.d_aa)
+            antibody.log('N2_AA:', self.n2_aa)
+            antibody.log('J_AA:', self.j_aa)
+            antibody.log('V_GERM_AA:', self.v_germ_aa)
+            antibody.log('D_GERM_AA:', self.d_germ_aa)
+            antibody.log('J_GERM_AA:', self.j_germ_aa)
         else:
             n_start = antibody.v.query_end + 1
             n_end = antibody.j.query_start
@@ -113,12 +165,44 @@ def __init__(self, antibody):
             self.j_nt = antibody.oriented_input[n_end:self.junction_nt_end]
             self.n1_nt = None
             self.d_nt = None
-            self.n1_nt = None
+            self.n2_nt = None
             self.d_dist_from_cdr3_start_nt = None
             self.d_dist_from_cdr3_end_nt = None
             antibody.log('V_NT:', self.v_nt)
             antibody.log('N_NT:', self.n_nt)
             antibody.log('J_NT:', self.j_nt)
+            v_germ_nt = antibody.v.raw_germline[:antibody.v.germline_end + 1]
+            self.v_germ_nt = v_germ_nt[-len(self.v_nt):]
+            j_germ_nt = antibody.j.raw_germline[antibody.j.germline_start:antibody.j.germline_end + 1]
+            self.j_germ_nt = j_germ_nt[:len(self.j_nt)]
+            self.d_germ_nt = None
+            antibody.log('V_GERM_NT:', self.v_germ_nt)
+            antibody.log('J_GERM_NT:', self.j_germ_nt)
+
+            # translate the junction regions
+            self.v_aa = translate(self.v_nt)
+            self.v_germ_aa = translate(self.v_germ_nt)
+            # if the final codon of v_nt or the first codon of j_nt contain any n-addition, the whole codon
+            # is considered n-addition for translation purposes, since they're not entirely encoded by the 
+            # germline gene segment
+            v_really_n = len(self.v_nt) % 3
+            j_really_n = 3 - (len(self.v_nt + self.n_nt) % 3)
+            j_really_n = j_really_n % 3 # this is to account for case where j_really_n2 == 3, which needs to be converted to 0
+            n = self.v_nt[-v_really_n:] if v_really_n > 0 else ''
+            n += self.n_nt
+            n += self.j_nt[:j_really_n] if j_really_n > 0 else ''
+            self.n_aa = translate(n)
+            self.j_aa = translate(self.j_nt[j_really_n:])
+            self.j_germ_aa = translate(self.j_germ_nt[j_really_n:])
+            self.n1_aa = None
+            self.d_aa = None
+            self.n2_aa = None
+            self.d_germ_aa = None
+            antibody.log('V_AA:', self.v_aa)
+            antibody.log('N_AA:', self.n_aa)
+            antibody.log('J_AA:', self.j_aa)
+            antibody.log('V_GERM_AA:', self.v_germ_aa)
+            antibody.log('J_GERM_AA:', self.j_germ_aa)
 
         # calculate IMGT numbering for the junction and CDR3
         self.cdr3_imgt_nt_numbering = self._calculate_cdr3_imgt_nt_numbering()

diff --git a/abstar/utils/output.py b/abstar/utils/output.py
@@ -39,7 +39,7 @@
 OUTPUT_SEPARATORS = {'airr': '\t',
                      'imgt': ',',
                      'tabular': ','}
-OUTPUT_EXTENSIONS = {'airr': '.txt',
+OUTPUT_EXTENSIONS = {'airr': '.tsv',
                      'json': '.json',
                      'imgt': '.csv',
                      'tabular': '.csv'}
@@ -200,7 +200,7 @@ def _build_json_output(self, raw=False):
                     'n1_nt': self.antibody.junction.n1_nt,
                     'd_nt': self.antibody.junction.d_nt,
                     'n2_nt': self.antibody.junction.n2_nt,
-                    'j_nt': self.antibody.junction.j_nt, #Change from 'self.antibody.junction.v_nt' to 'self.antibody.junction.j_nt'
+                    'j_nt': self.antibody.junction.j_nt,
 #                     'd_cdr3_pos': {'start': self.antibody.junction.d_start_position_nt,
 #                                    'end': self.antibody.junction.d_end_position_nt},
                     'd_dist_from_cdr3_start': self.antibody.junction.d_dist_from_cdr3_start_nt,
@@ -218,7 +218,7 @@ def _build_json_output(self, raw=False):
         else:
             junc = {'v_nt': self.antibody.junction.v_nt,
                     'n_nt': self.antibody.junction.n_nt,
-                    'j_nt': self.antibody.junction.j_nt} #Change from 'self.antibody.junction.v_nt' to 'self.antibody.junction.j_nt'
+                    'j_nt': self.antibody.junction.j_nt}
 
         output = collections.OrderedDict([
             ('seq_id', self.antibody.id),
@@ -465,6 +465,7 @@ def _build_airr_output(self):
             ('umi', self.antibody.uid),
             ('sequence', self.antibody.vdj_nt),
             ('sequence_aa', self.antibody.vdj_aa),
+            ('sequence_input', self.antibody.raw_input.sequence),
             ('rev_comp', 'True' if self.antibody.v.strand == '-' else 'False'),
             ('productive', 'True' if self.antibody.productivity.is_productive else 'False'),
             ('productivity_issues', '|'.join(self.antibody.productivity.productivity_issues)),
@@ -503,16 +504,33 @@ def _build_airr_output(self):
             ('junction', self.antibody.junction.junction_nt),
             ('junction_aa', self.antibody.junction.junction_aa),
             ('junction_in_frame', 'True' if self.antibody.junction.in_frame else 'False'),
+            ('junction_v', self.antibody.junction.v_nt),
+            ('junction_d', self.antibody.junction.d_nt),
+            ('junction_j', self.antibody.junction.j_nt),
+            ('junction_n', self.antibody.junction.n_nt),
+            ('junction_n1', self.antibody.junction.n1_nt),
+            ('junction_n2', self.antibody.junction.n2_nt),
+            ('junction_v_aa', self.antibody.junction.v_aa),
+            ('junction_d_aa', self.antibody.junction.d_aa),
+            ('junction_j_aa', self.antibody.junction.j_aa),
+            ('junction_n_aa', self.antibody.junction.n_aa),
+            ('junction_n1_aa', self.antibody.junction.n1_aa),
+            ('junction_n2_aa', self.antibody.junction.n2_aa),
+            ('junction_germ_v', self.antibody.junction.v_germ_nt),
+            ('junction_germ_d', self.antibody.junction.d_germ_nt),
+            ('junction_germ_j', self.antibody.junction.j_germ_nt),
+            ('junction_germ_v_aa', self.antibody.junction.v_germ_aa),
+            ('junction_germ_d_aa', self.antibody.junction.d_germ_aa),
+            ('junction_germ_j_aa', self.antibody.junction.j_germ_aa),
             ('isotype', c_call),
             ('locus', self.antibody.v.gene[:3].upper()),
             ('v_cigar', make_cigar(self.antibody.v)),
             ('d_cigar', d_cigar),
             ('j_cigar', make_cigar(self.antibody.j)),
             ('species', self.antibody.species),
             ('germline_database', self.antibody.germ_db),
-            ('raw_input', self.antibody.raw_input.sequence),
         ])
-        return '\t'.join(output.values())
+        return '\t'.join([v if v is not None else '' for v in output.values()])
 
 
 def get_output_suffix(output_format):
@@ -1067,6 +1085,7 @@ def _hadoop_tabular_output(vdj):
                'umi',
                'sequence',
                'sequence_aa',
+               'sequence_input',
                'rev_comp',
                'productive',
                'productivity_issues',
@@ -1104,14 +1123,31 @@ def _hadoop_tabular_output(vdj):
                'junction',
                'junction_aa',
                'junction_in_frame',
+               'junction_v',
+               'junction_d',
+               'junction_j',
+               'junction_n',
+               'junction_n1',
+               'junction_n2',
+               'junction_v_aa',
+               'junction_d_aa',
+               'junction_j_aa',
+               'junction_n_aa',
+               'junction_n1_aa',
+               'junction_n2_aa',
+               'junction_germ_v',
+               'junction_germ_d',
+               'junction_germ_j',
+               'junction_germ_v_aa',
+               'junction_germ_d_aa',
+               'junction_germ_j_aa',
                'isotype',
                'locus',
                'v_cigar',
                'd_cigar',
                'j_cigar',
                'species',
-               'germline_database',
-               'raw_input']
+               'germline_database',]
 
 
 

diff --git a/abstar/utils/regions.py b/abstar/utils/regions.py
@@ -36,8 +36,10 @@
 
 
 # Both IMGT_REGION_END_POSITIONS_AA and IMGT_REGION_END_POSITIONS_NT use the actual
-# IMGT end positions, thus they're not suitable for slicing (because Python's slicing
-# end point is exclusive). To use these end points in a slice, you need to add 1.
+# IMGT end positions, thus they're not suitable for slicing (because Python's uses
+# zero-based numbering). To use these end points in a slice, you need to subtract 1 
+# from the start position (since Python's slicing is exclusive, the actual IMGT
+# numbering works fine for the end position of the slice).
 IMGT_REGION_END_POSITIONS_AA = {'FR1': 26, 'CDR1': 38, 'FR2': 55, 'CDR2': 65,
                                 'FR3': 104, 'CDR3': 117, 'FR4': 129}
 

diff --git a/abstar/version.py b/abstar/version.py
@@ -2,4 +2,4 @@
 # 1) we don't load dependencies by storing it in __init__.py
 # 2) we can import it in setup.py for the same reason
 # 3) we can import it into your module module
-__version__ = '0.5.5'
+__version__ = '0.5.6'
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-abutils>=0.2.1
+abutils>=0.2.8
 biopython==1.78
 celery
 dask[complete]

diff --git a/requirements2.txt b/requirements2.txt