Skip to content

Commit

Permalink
Add a function for determining read pair orientation (#201)
Browse files Browse the repository at this point in the history
I have a project where I am re-building a complex alignment situation
for a template and it involves an assessment of read pair orientation
and whether the new pair is "proper" or not.

The functions in this PR allow for an optional R2 such as when you are
iterating through a BAM in coordinate order and don't have easy access
to the R2. When this happens, R1 is required to have enough mate
information to derive the pair status and whether or not the read pair
is "proper".

The implementation honors what is implemented in HTSJDK/fgbio but with:

1. Fewer exceptions by using `None` instead of an exception, when
possible
2. Proper pair status also considers template length / insert size

---------

Co-authored-by: Tim Fennell <tfenne@tfenne.com>
  • Loading branch information
clintval and tfenne authored Dec 27, 2024
1 parent ae54c9e commit f9aa849
Show file tree
Hide file tree
Showing 2 changed files with 340 additions and 4 deletions.
121 changes: 117 additions & 4 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@
import enum
import io
import sys
from collections.abc import Collection
from itertools import chain
from pathlib import Path
from typing import IO
Expand Down Expand Up @@ -607,6 +608,118 @@ def __getitem__(
return self.elements[index]


@enum.unique
class PairOrientation(enum.Enum):
"""Enumerations of read pair orientations."""

FR = "FR"
"""A pair orientation for forward-reverse reads ("innie")."""

RF = "RF"
"""A pair orientation for reverse-forward reads ("outie")."""

TANDEM = "TANDEM"
"""A pair orientation for tandem (forward-forward or reverse-reverse) reads."""

@classmethod
def from_recs( # noqa: C901 # `from_recs` is too complex (11 > 10)
cls, rec1: AlignedSegment, rec2: Optional[AlignedSegment] = None
) -> Optional["PairOrientation"]:
"""Returns the pair orientation if both reads are mapped to the same reference sequence.
Args:
rec1: The first record in the pair.
rec2: The second record in the pair. If None, then mate info on `rec1` will be used.
See:
[`htsjdk.samtools.SamPairUtil.getPairOrientation()`](https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L71-L102)
"""

if rec2 is None:
rec2_is_unmapped = rec1.mate_is_unmapped
rec2_reference_id = rec1.next_reference_id
else:
rec2_is_unmapped = rec2.is_unmapped
rec2_reference_id = rec2.reference_id

if rec1.is_unmapped or rec2_is_unmapped or rec1.reference_id != rec2_reference_id:
return None

if rec2 is None:
rec2_is_forward = rec1.mate_is_forward
rec2_reference_start = rec1.next_reference_start
else:
rec2_is_forward = rec2.is_forward
rec2_reference_start = rec2.reference_start

if rec1.is_forward is rec2_is_forward:
return PairOrientation.TANDEM
if rec1.is_forward and rec1.reference_start <= rec2_reference_start:
return PairOrientation.FR
if rec1.is_reverse and rec2_reference_start < rec1.reference_end:
return PairOrientation.FR
if rec1.is_reverse and rec2_reference_start >= rec1.reference_end:
return PairOrientation.RF

if rec2 is None:
if not rec1.has_tag("MC"):
raise ValueError('Cannot determine pair orientation without a mate cigar ("MC")!')
rec2_cigar = Cigar.from_cigarstring(str(rec1.get_tag("MC")))
rec2_reference_end = rec1.next_reference_start + rec2_cigar.length_on_target()
else:
rec2_reference_end = rec2.reference_end

if rec1.reference_start < rec2_reference_end:
return PairOrientation.FR
else:
return PairOrientation.RF


DefaultProperlyPairedOrientations: set[PairOrientation] = {PairOrientation.FR}
"""The default orientations for properly paired reads."""


def is_proper_pair(
rec1: AlignedSegment,
rec2: Optional[AlignedSegment] = None,
max_insert_size: int = 1000,
orientations: Collection[PairOrientation] = DefaultProperlyPairedOrientations,
) -> bool:
"""Determines if a pair of records are properly paired or not.
Criteria for records in a proper pair are:
- Both records are aligned
- Both records are aligned to the same reference sequence
- The pair orientation of the records is one of the valid pair orientations (default "FR")
- The inferred insert size is not more than a maximum length (default 1000)
Args:
rec1: The first record in the pair.
rec2: The second record in the pair. If None, then mate info on `rec1` will be used.
max_insert_size: The maximum insert size to consider a pair "proper".
orientations: The valid set of orientations to consider a pair "proper".
See:
[`htsjdk.samtools.SamPairUtil.isProperPair()`](https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L106-L125)
"""
if rec2 is None:
rec2_is_mapped = rec1.mate_is_mapped
rec2_reference_id = rec1.next_reference_id
else:
rec2_is_mapped = rec2.is_mapped
rec2_reference_id = rec2.reference_id

return (
rec1.is_mapped
and rec2_is_mapped
and rec1.reference_id == rec2_reference_id
and PairOrientation.from_recs(rec1=rec1, rec2=rec2) in orientations
# TODO: consider replacing with `abs(isize(r1, r2)) <= max_insert_size`
# which can only be done if isize() is modified to allow for an optional R2.
and 0 < abs(rec1.template_length) <= max_insert_size
)


@attr.s(frozen=True, auto_attribs=True)
class SupplementaryAlignment:
"""Stores a supplementary alignment record produced by BWA and stored in the SA SAM tag.
Expand Down Expand Up @@ -707,9 +820,8 @@ def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = Tr
r1: read 1
r2: read 2 with the same queryname as r1
"""
assert not r1.is_unmapped, f"Cannot process unmapped mate {r1.query_name}/1"
assert not r2.is_unmapped, f"Cannot process unmapped mate {r2.query_name}/2"
assert r1.query_name == r2.query_name, "Attempting to pair reads with different qnames."
if r1.query_name != r2.query_name:
raise ValueError("Cannot set pair info on reads with different query names!")

for r in [r1, r2]:
r.is_paired = True
Expand All @@ -724,8 +836,9 @@ def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = Tr
dest.next_reference_id = src.reference_id
dest.next_reference_start = src.reference_start
dest.mate_is_reverse = src.is_reverse
dest.mate_is_unmapped = False
dest.mate_is_unmapped = src.mate_is_unmapped
dest.set_tag("MC", src.cigarstring)
dest.set_tag("MQ", src.mapping_quality)

insert_size = isize(r1, r2)
r1.template_length = insert_size
Expand Down
223 changes: 223 additions & 0 deletions tests/fgpyo/sam/test_sam.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
from fgpyo.sam import CigarElement
from fgpyo.sam import CigarOp
from fgpyo.sam import CigarParsingException
from fgpyo.sam import PairOrientation
from fgpyo.sam import SamFileType
from fgpyo.sam import is_proper_pair
from fgpyo.sam.builder import SamBuilder


Expand Down Expand Up @@ -300,6 +302,227 @@ def test_is_clipping() -> None:
assert clips == [CigarOp.S, CigarOp.H]


def test_pair_orientation_build_with_r2() -> None:
"""Test that we can build all pair orientations with R1 and R2."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert PairOrientation.from_recs(r1, r2) is PairOrientation.FR
assert PairOrientation.from_recs(r1) is PairOrientation.FR
assert PairOrientation.from_recs(r2) is PairOrientation.FR

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1, r2) is PairOrientation.RF
assert PairOrientation.from_recs(r1) is PairOrientation.RF
assert PairOrientation.from_recs(r2) is PairOrientation.RF

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1, r2) is PairOrientation.TANDEM
assert PairOrientation.from_recs(r1) is PairOrientation.TANDEM
assert PairOrientation.from_recs(r2) is PairOrientation.TANDEM

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1, r2) is PairOrientation.TANDEM
assert PairOrientation.from_recs(r1) is PairOrientation.TANDEM
assert PairOrientation.from_recs(r2) is PairOrientation.TANDEM


def test_pair_orientation_is_fr_if_opposite_directions_and_overlapping() -> None:
"""Test the pair orientation is always FR if the reads overlap and are oriented opposite."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
assert PairOrientation.from_recs(r1, r2) is PairOrientation.FR
assert PairOrientation.from_recs(r1) is PairOrientation.FR
assert PairOrientation.from_recs(r2) is PairOrientation.FR

builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
r1.is_reverse = True
r2.is_reverse = False
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1, r2) is PairOrientation.FR
assert PairOrientation.from_recs(r1) is PairOrientation.FR
assert PairOrientation.from_recs(r2) is PairOrientation.FR


def test_a_single_bp_alignment_at_end_of_rec_one_is_still_fr_orientations() -> None:
"""Test a single bp alignment at the end of a mate's alignment is still FR based on rec1."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=5, cigar1="5M", start2=5, cigar2="1M")
assert PairOrientation.from_recs(r1, r2) is PairOrientation.FR
assert PairOrientation.from_recs(r1) is PairOrientation.FR
assert PairOrientation.from_recs(r2) is PairOrientation.FR


def test_pair_orientation_build_with_either_unmapped() -> None:
"""Test that we can return None with either R1 and R2 unmapped (or both)."""
builder = SamBuilder()
r1, r2 = builder.add_pair()
assert r1.is_unmapped
assert r2.is_unmapped
assert PairOrientation.from_recs(r1, r2) is None
assert PairOrientation.from_recs(r1) is None
assert PairOrientation.from_recs(r2) is None

r1, r2 = builder.add_pair(chrom="chr1", start1=100)
assert r1.is_mapped
assert r2.is_unmapped
assert PairOrientation.from_recs(r1, r2) is None
assert PairOrientation.from_recs(r1) is None
assert PairOrientation.from_recs(r2) is None

r1, r2 = builder.add_pair(chrom="chr1", start2=100)
assert r1.is_unmapped
assert r2.is_mapped
assert PairOrientation.from_recs(r1, r2) is None
assert PairOrientation.from_recs(r1) is None
assert PairOrientation.from_recs(r2) is None


def test_pair_orientation_build_with_no_r2_but_r2_mapped() -> None:
"""Test that we can build all pair orientations with R1 and no R2, but R2 is mapped."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert PairOrientation.from_recs(r1) is PairOrientation.FR

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1) is PairOrientation.RF

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1) is PairOrientation.TANDEM

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1) is PairOrientation.TANDEM


def test_pair_orientation_build_raises_if_it_cant_find_mate_cigar_tag_positive_fr() -> None:
"""Test that an exception is raised if we cannot find the mate cigar tag."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=16, cigar1="10M", start2=15, cigar2="10M")
sam.set_pair_info(r1, r2)
r1.set_tag("MC", None) # Clear out the MC tag.
r2.set_tag("MC", None) # Clear out the MC tag.

assert PairOrientation.from_recs(r1, r2) is PairOrientation.FR

with pytest.raises(ValueError):
PairOrientation.from_recs(r1)

assert PairOrientation.from_recs(r2) is PairOrientation.FR


def test_pair_orientation_build_raises_if_it_cant_find_mate_cigar_tag_positive_rf() -> None:
"""Test that an exception is raised if we cannot find the mate cigar tag."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=16, cigar1="1M", start2=15, cigar2="1M")
sam.set_pair_info(r1, r2)

assert PairOrientation.from_recs(r1, r2) is PairOrientation.RF

r1.set_tag("MC", None) # Clear out the MC tag.
r2.set_tag("MC", None) # Clear out the MC tag.

with pytest.raises(ValueError):
PairOrientation.from_recs(r1)

assert PairOrientation.from_recs(r2) is PairOrientation.RF


def test_is_proper_pair_when_actually_proper() -> None:
"""Test that is_proper_pair returns True when reads are properly paired."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert is_proper_pair(r1, r2)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
r1.is_reverse = True
r2.is_reverse = False
sam.set_pair_info(r1, r2)
assert is_proper_pair(r1, r2)


def test_is_proper_pair_when_actually_proper_and_no_r2() -> None:
"""Test that is_proper_pair returns True when reads are properly paired, but no R2."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert is_proper_pair(r1)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
r1.is_reverse = True
r2.is_reverse = False
sam.set_pair_info(r1, r2)
assert is_proper_pair(r1)


def test_not_is_proper_pair_if_wrong_orientation() -> None:
"""Test that reads are not properly paired if they are not in the right orientation."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1, r2)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1, r2)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1, r2)


def test_not_is_proper_pair_if_wrong_orientation_and_no_r2() -> None:
"""Test reads are not properly paired if they are not in the right orientation, but no R2."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1)


def test_not_is_proper_pair_if_too_far_apart() -> None:
"""Test that reads are not properly paired if they are too far apart."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, start2=100 + 1000)
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1, r2)


def test_isize() -> None:
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
Expand Down

0 comments on commit f9aa849

Please sign in to comment.