Skip to content

Commit

Permalink
Merge pull request #106 from briney/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
briney authored Jul 8, 2022
2 parents 21ca273 + fc247eb commit 939df26
Show file tree
Hide file tree
Showing 9 changed files with 161 additions and 32 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
![](https://img.shields.io/pypi/v/abstar.svg?colorB=blue)
[![Build Status](https://travis-ci.com/briney/abstar.svg?branch=master)](https://travis-ci.com/briney/abstar)
[![Build Status](https://travis-ci.com/briney/abstar.svg?branch=master)](https://app.travis-ci.com/github/briney/abstar)
[![Documentation Status](https://readthedocs.org/projects/abstar/badge/?version=latest)](https://abstar.readthedocs.io/en/latest/?badge=latest)
![](https://img.shields.io/pypi/pyversions/abstar.svg)
![](https://img.shields.io/badge/license-MIT-blue.svg)
Expand Down
25 changes: 14 additions & 11 deletions abstar/core/abstar.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@

# import dask.dataframe as dd

from abutils.core.sequence import Sequence
from abutils.core.sequence import Sequence, read_json, read_csv
from abutils.utils import log
# from abutils.utils.pipeline import list_files

Expand Down Expand Up @@ -201,7 +201,7 @@ def __init__(self, project_dir=None, input=None, output=None, log=None, temp=Non
sequences=None, chunksize=500, output_type=['json', ], assigner='blastn',
merge=False, pandaseq_algo='simple_bayesian', use_test_data=False,
parquet=False, nextseq=False, uid=0, isotype=False, pretty=False, num_cores=0,
basespace=False, cluster=False, padding=True, raw=False, json_keys=None,
basespace=False, cluster=False, padding=False, raw=False, json_keys=None,
debug=False, germ_db='human', receptor='bcr', gzip=False):
super(Args, self).__init__()
self.sequences = sequences
Expand Down Expand Up @@ -1098,17 +1098,20 @@ def run(*args, **kwargs):
ofmt = args.output_type[0]
ofile = output_files[0]
if ofmt == 'tabular':
with open(ofile) as f:
reader = csv.DictReader(f, delimiter=',')
output = [Sequence(r) for r in reader]
output = read_csv(ofile, delimiter=',', id_key='seq_id', sequence_key='vdj_nt')
# with open(ofile) as f:
# reader = csv.DictReader(f, delimiter=',')
# output = [Sequence(r) for r in reader]
if ofmt == 'airr':
with open(ofile) as f:
reader = csv.DictReader(f, delimiter='\t')
output = [Sequence(r, id_key='sequence_id', seq_key='sequence') for r in reader]
output = read_csv(ofile, delimiter='\t', id_key='sequence_id', sequence_key='sequence')
# with open(ofile) as f:
# reader = csv.DictReader(f, delimiter='\t')
# output = [Sequence(r, id_key='sequence_id', seq_key='sequence') for r in reader]
if ofmt == 'json':
with open(ofile) as f:
output = [Sequence(json.loads(line)) for line in f]
output = [Sequence(o) for o in output]
output = read_json(ofile, id_key='seq_id', sequence_key='vdj_nt')
# with open(ofile) as f:
# output = [Sequence(json.loads(line)) for line in f]
# output = [Sequence(o) for o in output]
if len(output) == 1:
return output[0]
else:
Expand Down
12 changes: 12 additions & 0 deletions abstar/core/germline.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,18 @@ def realign_germline(self, antibody, query_start=None, query_end=None):
else:
query = oriented_input.sequence[query_start:query_end]
alignment = local_alignment(query, germline_seq, **aln_params)
# fix for a fairly rare edge case where coincidental matching to 2-3 residues at the extreme
# 3' end of K/L germline V genes can result in incorrect identification of the
# end of the V gene region (making the called V region far too long and, in some cases,
# extending beyond the junction and into FR4). What we do here is drop the last 2 nucleotides
# of the aligned germline and re-align to see whether that substantialy truncates the resulting
# alignment (by at least 2 additional nucleotides). If so, we use the new alignment instead.
if self.gene_type == 'V' and self.chain in ['kappa', 'lambda']:
germline_trunc = germline_seq[:alignment.target_end - 2]
alignment2 = local_alignment(query, germline_trunc, **aln_params)
if alignment.query_end - alignment2.query_end >= 4:
alignment2.raw_target = alignment.raw_target # swap the truncated germline with the full one
alignment = alignment2
if alignment:
self._process_realignment(antibody, alignment, query_start)
else:
Expand Down
86 changes: 85 additions & 1 deletion abstar/utils/junction.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,12 @@

import math
import os
from termios import NL1
import traceback

from Bio.Seq import Seq

from abutils.core.sequence import translate
from abutils.utils import log
from abutils.utils.alignment import global_alignment, local_alignment
from abutils.utils.codons import codon_lookup as codons
Expand Down Expand Up @@ -105,6 +107,56 @@ def __init__(self, antibody):
antibody.log('J_NT:', self.j_nt)
antibody.log('D-GENE DISTANCE FROM CDR3 START:', self.d_dist_from_cdr3_start_nt)
antibody.log('D-GENE DISTANCE FROM CDR3 END:', self.d_dist_from_cdr3_end_nt)
# parse germline junction regions (just V, D and J)
self.v_germ_nt = antibody.v.germline_alignment[-len(self.v_nt):]
self.d_germ_nt = antibody.d.germline_alignment
self.j_germ_nt = antibody.j.germline_alignment[:len(self.j_nt)]
# v_germ_nt = antibody.v.raw_germline[:antibody.v.germline_end + 1]
# self.v_germ_nt = v_germ_nt[-len(self.v_nt):]
# d_germ_nt = antibody.d.raw_germline[:antibody.d.germline_end + 1]
# self.d_germ_nt = d_germ_nt[-len(self.d_nt):]
# j_germ_nt = antibody.j.raw_germline[antibody.j.germline_start:antibody.j.germline_end]
# self.j_germ_nt = j_germ_nt[:len(self.j_nt)]
antibody.log('V_GERM_NT:', self.v_germ_nt)
antibody.log('D_GERM_NT:', self.d_germ_nt)
antibody.log('J_GERM_NT:', self.j_germ_nt)

# translate the junction regions
self.v_aa = translate(self.v_nt)
self.v_germ_aa = translate(self.v_germ_nt)
# if the final codon of v_nt or the first codon of d_nt contain any n-addition, the whole codon
# is considered n-addition for translation purposes, since they're not entirely encoded by the
# germline gene segment
v_really_n1 = len(self.v_nt) % 3
d_really_n1 = 3 - (len(self.v_nt + self.n1_nt) % 3)
n1 = self.v_nt[-v_really_n1:] if v_really_n1 > 0 else ''
d_really_n1 = d_really_n1 % 3 # this is to account for case where d_really_n1 == 3, which needs to be converted to 0
n1 += self.n1_nt
n1 += self.d_nt[:d_really_n1] if d_really_n1 > 0 else ''
self.n1_aa = translate(n1)
self.d_aa = translate(self.d_nt[d_really_n1:])
self.d_germ_aa = translate(self.d_germ_nt[d_really_n1:])
# if the final codon of d_nt or the first codon of j_nt contain any n-addition, the whole codon
# is considered n-addition for translation purposes, since they're not entirely encoded by the
# germline gene segment
d_really_n2 = len(self.v_nt + self.n1_nt + self.d_nt) % 3
j_really_n2 = 3 - (len(self.v_nt + self.n1_nt + self.d_nt + self.n2_nt) % 3)
j_really_n2 = j_really_n2 % 3 # this is to account for case where j_really_n2 == 3, which needs to be converted to 0
n2 = self.d_nt[-d_really_n2:] if d_really_n2 > 0 else ''
n2 += self.n2_nt
n2 += self.j_nt[:j_really_n2] if j_really_n2 > 0 else ''
self.n2_aa = translate(n2)
self.j_aa = translate(self.j_nt[j_really_n2:])
self.j_germ_aa = translate(self.j_germ_nt[j_really_n2:])
self.n_aa = None
antibody.log('V_AA:', self.v_aa)
antibody.log('N1_AA:', self.n1_aa)
antibody.log('D_AA:', self.d_aa)
antibody.log('N2_AA:', self.n2_aa)
antibody.log('J_AA:', self.j_aa)
antibody.log('V_GERM_AA:', self.v_germ_aa)
antibody.log('D_GERM_AA:', self.d_germ_aa)
antibody.log('J_GERM_AA:', self.j_germ_aa)
else:
n_start = antibody.v.query_end + 1
n_end = antibody.j.query_start
Expand All @@ -113,12 +165,44 @@ def __init__(self, antibody):
self.j_nt = antibody.oriented_input[n_end:self.junction_nt_end]
self.n1_nt = None
self.d_nt = None
self.n1_nt = None
self.n2_nt = None
self.d_dist_from_cdr3_start_nt = None
self.d_dist_from_cdr3_end_nt = None
antibody.log('V_NT:', self.v_nt)
antibody.log('N_NT:', self.n_nt)
antibody.log('J_NT:', self.j_nt)
v_germ_nt = antibody.v.raw_germline[:antibody.v.germline_end + 1]
self.v_germ_nt = v_germ_nt[-len(self.v_nt):]
j_germ_nt = antibody.j.raw_germline[antibody.j.germline_start:antibody.j.germline_end + 1]
self.j_germ_nt = j_germ_nt[:len(self.j_nt)]
self.d_germ_nt = None
antibody.log('V_GERM_NT:', self.v_germ_nt)
antibody.log('J_GERM_NT:', self.j_germ_nt)

# translate the junction regions
self.v_aa = translate(self.v_nt)
self.v_germ_aa = translate(self.v_germ_nt)
# if the final codon of v_nt or the first codon of j_nt contain any n-addition, the whole codon
# is considered n-addition for translation purposes, since they're not entirely encoded by the
# germline gene segment
v_really_n = len(self.v_nt) % 3
j_really_n = 3 - (len(self.v_nt + self.n_nt) % 3)
j_really_n = j_really_n % 3 # this is to account for case where j_really_n2 == 3, which needs to be converted to 0
n = self.v_nt[-v_really_n:] if v_really_n > 0 else ''
n += self.n_nt
n += self.j_nt[:j_really_n] if j_really_n > 0 else ''
self.n_aa = translate(n)
self.j_aa = translate(self.j_nt[j_really_n:])
self.j_germ_aa = translate(self.j_germ_nt[j_really_n:])
self.n1_aa = None
self.d_aa = None
self.n2_aa = None
self.d_germ_aa = None
antibody.log('V_AA:', self.v_aa)
antibody.log('N_AA:', self.n_aa)
antibody.log('J_AA:', self.j_aa)
antibody.log('V_GERM_AA:', self.v_germ_aa)
antibody.log('J_GERM_AA:', self.j_germ_aa)

# calculate IMGT numbering for the junction and CDR3
self.cdr3_imgt_nt_numbering = self._calculate_cdr3_imgt_nt_numbering()
Expand Down
50 changes: 43 additions & 7 deletions abstar/utils/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
OUTPUT_SEPARATORS = {'airr': '\t',
'imgt': ',',
'tabular': ','}
OUTPUT_EXTENSIONS = {'airr': '.txt',
OUTPUT_EXTENSIONS = {'airr': '.tsv',
'json': '.json',
'imgt': '.csv',
'tabular': '.csv'}
Expand Down Expand Up @@ -200,7 +200,7 @@ def _build_json_output(self, raw=False):
'n1_nt': self.antibody.junction.n1_nt,
'd_nt': self.antibody.junction.d_nt,
'n2_nt': self.antibody.junction.n2_nt,
'j_nt': self.antibody.junction.j_nt, #Change from 'self.antibody.junction.v_nt' to 'self.antibody.junction.j_nt'
'j_nt': self.antibody.junction.j_nt,
# 'd_cdr3_pos': {'start': self.antibody.junction.d_start_position_nt,
# 'end': self.antibody.junction.d_end_position_nt},
'd_dist_from_cdr3_start': self.antibody.junction.d_dist_from_cdr3_start_nt,
Expand All @@ -218,7 +218,7 @@ def _build_json_output(self, raw=False):
else:
junc = {'v_nt': self.antibody.junction.v_nt,
'n_nt': self.antibody.junction.n_nt,
'j_nt': self.antibody.junction.j_nt} #Change from 'self.antibody.junction.v_nt' to 'self.antibody.junction.j_nt'
'j_nt': self.antibody.junction.j_nt}

output = collections.OrderedDict([
('seq_id', self.antibody.id),
Expand Down Expand Up @@ -465,6 +465,7 @@ def _build_airr_output(self):
('umi', self.antibody.uid),
('sequence', self.antibody.vdj_nt),
('sequence_aa', self.antibody.vdj_aa),
('sequence_input', self.antibody.raw_input.sequence),
('rev_comp', 'True' if self.antibody.v.strand == '-' else 'False'),
('productive', 'True' if self.antibody.productivity.is_productive else 'False'),
('productivity_issues', '|'.join(self.antibody.productivity.productivity_issues)),
Expand Down Expand Up @@ -503,16 +504,33 @@ def _build_airr_output(self):
('junction', self.antibody.junction.junction_nt),
('junction_aa', self.antibody.junction.junction_aa),
('junction_in_frame', 'True' if self.antibody.junction.in_frame else 'False'),
('junction_v', self.antibody.junction.v_nt),
('junction_d', self.antibody.junction.d_nt),
('junction_j', self.antibody.junction.j_nt),
('junction_n', self.antibody.junction.n_nt),
('junction_n1', self.antibody.junction.n1_nt),
('junction_n2', self.antibody.junction.n2_nt),
('junction_v_aa', self.antibody.junction.v_aa),
('junction_d_aa', self.antibody.junction.d_aa),
('junction_j_aa', self.antibody.junction.j_aa),
('junction_n_aa', self.antibody.junction.n_aa),
('junction_n1_aa', self.antibody.junction.n1_aa),
('junction_n2_aa', self.antibody.junction.n2_aa),
('junction_germ_v', self.antibody.junction.v_germ_nt),
('junction_germ_d', self.antibody.junction.d_germ_nt),
('junction_germ_j', self.antibody.junction.j_germ_nt),
('junction_germ_v_aa', self.antibody.junction.v_germ_aa),
('junction_germ_d_aa', self.antibody.junction.d_germ_aa),
('junction_germ_j_aa', self.antibody.junction.j_germ_aa),
('isotype', c_call),
('locus', self.antibody.v.gene[:3].upper()),
('v_cigar', make_cigar(self.antibody.v)),
('d_cigar', d_cigar),
('j_cigar', make_cigar(self.antibody.j)),
('species', self.antibody.species),
('germline_database', self.antibody.germ_db),
('raw_input', self.antibody.raw_input.sequence),
])
return '\t'.join(output.values())
return '\t'.join([v if v is not None else '' for v in output.values()])


def get_output_suffix(output_format):
Expand Down Expand Up @@ -1067,6 +1085,7 @@ def _hadoop_tabular_output(vdj):
'umi',
'sequence',
'sequence_aa',
'sequence_input',
'rev_comp',
'productive',
'productivity_issues',
Expand Down Expand Up @@ -1104,14 +1123,31 @@ def _hadoop_tabular_output(vdj):
'junction',
'junction_aa',
'junction_in_frame',
'junction_v',
'junction_d',
'junction_j',
'junction_n',
'junction_n1',
'junction_n2',
'junction_v_aa',
'junction_d_aa',
'junction_j_aa',
'junction_n_aa',
'junction_n1_aa',
'junction_n2_aa',
'junction_germ_v',
'junction_germ_d',
'junction_germ_j',
'junction_germ_v_aa',
'junction_germ_d_aa',
'junction_germ_j_aa',
'isotype',
'locus',
'v_cigar',
'd_cigar',
'j_cigar',
'species',
'germline_database',
'raw_input']
'germline_database',]



Expand Down
6 changes: 4 additions & 2 deletions abstar/utils/regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@


# Both IMGT_REGION_END_POSITIONS_AA and IMGT_REGION_END_POSITIONS_NT use the actual
# IMGT end positions, thus they're not suitable for slicing (because Python's slicing
# end point is exclusive). To use these end points in a slice, you need to add 1.
# IMGT end positions, thus they're not suitable for slicing (because Python's uses
# zero-based numbering). To use these end points in a slice, you need to subtract 1
# from the start position (since Python's slicing is exclusive, the actual IMGT
# numbering works fine for the end position of the slice).
IMGT_REGION_END_POSITIONS_AA = {'FR1': 26, 'CDR1': 38, 'FR2': 55, 'CDR2': 65,
'FR3': 104, 'CDR3': 117, 'FR4': 129}

Expand Down
2 changes: 1 addition & 1 deletion abstar/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
# 1) we don't load dependencies by storing it in __init__.py
# 2) we can import it in setup.py for the same reason
# 3) we can import it into your module module
__version__ = '0.5.5'
__version__ = '0.5.6'
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
abutils>=0.2.1
abutils>=0.2.8
biopython==1.78
celery
dask[complete]
Expand Down
8 changes: 0 additions & 8 deletions requirements2.txt

This file was deleted.

0 comments on commit 939df26

Please sign in to comment.