Skip to content

Commit

Permalink
Lots of cleanup of the initial bolus of code. (#3)
Browse files Browse the repository at this point in the history
A bunch of refactoring, most notably of how we accrue counts and assign IDs to breakpoints, so that we are guaranteed to count all evidence for breakpoint the same way and only assign it a single ID.
  • Loading branch information
tfenne committed Feb 14, 2022
1 parent 0a01a05 commit 974b92e
Show file tree
Hide file tree
Showing 15 changed files with 305 additions and 258 deletions.
7 changes: 2 additions & 5 deletions build.sc
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ trait ReleaseModule extends JavaModule {
object tools extends CommonModule with PublishModule with ReleaseModule {
def scalaVersion = "2.13.3"
def millSourcePath = super.millSourcePath / ammonite.ops.up
def mainClass = Some("com.fulcrumgenomics.sv.cmdline.SVMain")
def mainClass = Some("com.fulcrumgenomics.sv.cmdline.SvMain")
def artifactName = "fgsv"
def gitHash = Process("git rev-parse --short HEAD").lineStream.head
def publishVersion = s"0.0.1-${gitHash}-SNAPSHOT"
Expand Down Expand Up @@ -89,10 +89,7 @@ object tools extends CommonModule with PublishModule with ReleaseModule {

def ivyDeps = Agg(
ivy"org.scala-lang:scala-compiler:${scalaVersion()}",
ivy"mysql:mysql-connector-java:5.1.24",
ivy"com.fulcrumgenomics:commons_2.13:1.1.0",
ivy"com.fulcrumgenomics:sopt_2.13:1.1.0",
ivy"com.fulcrumgenomics:fgbio_2.13:1.4.0-61dde53-SNAPSHOT".excludeOrg(orgsToExclude:_*),
ivy"com.fulcrumgenomics:fgbio_2.13:1.5.0".excludeOrg(orgsToExclude:_*),
)

object test extends Tests {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package com.fulcrumgenomics.sv.util
package com.fulcrumgenomics.sv

import com.fulcrumgenomics.FgBioDef.FgBioEnum
import com.fulcrumgenomics.FgBioDef.{FgBioEnum, SumBy}
import com.fulcrumgenomics.alignment.Cigar
import com.fulcrumgenomics.bam.Template
import com.fulcrumgenomics.bam.api.SamRecord
Expand Down Expand Up @@ -63,21 +63,18 @@ object AlignedSegment extends LazyLogging {
require(rec.mapped)
val range = GenomicRange(refIndex = rec.refIndex, start = rec.start, end = rec.end)
val cigar = rec.cigar
val leadingClipping = cigar.iterator
.takeWhile(_.operator.isClipping) // take leading clipping
.map(_.length).sum

val leadingClipping = cigar.iterator.takeWhile(_.operator.isClipping).sumBy(_.length)
val trailingClipping = cigar.reverseIterator.takeWhile(_.operator.isClipping).sumBy(_.length)
val middle = cigar.iterator
.dropWhile(_.operator.isClipping) // skip leading clipping
.takeWhile(!_.operator.isClipping) // take until we hit any clipping at the end
.filter(_.operator.consumesReadBases()) // only operators that consume read bases
.map(_.length).sum
val trailingClipping = cigar.iterator
.dropWhile(_.operator.isClipping) // skip leading clipping
.dropWhile(!_.operator.isClipping) // skip until we hit any clipping at the end
.map(_.length).sum
.sumBy(_.length)

val (start, end) = {
if (rec.positiveStrand) (leadingClipping + 1, leadingClipping + middle)
else (trailingClipping + 1, trailingClipping + middle)
if (rec.positiveStrand) (leadingClipping + 1, leadingClipping + middle)
else (trailingClipping + 1, trailingClipping + middle)
}

AlignedSegment(origin = SegmentOrigin(rec), readStart = start, readEnd = end, positiveStrand = rec.positiveStrand, cigar = rec.cigar, range = range)
Expand All @@ -99,7 +96,7 @@ object AlignedSegment extends LazyLogging {
require(supplementals.nonEmpty)
val primarySegment = AlignedSegment(primary)
val supplSegments = supplementals.map(AlignedSegment(_))
this.segmentsFrom(primary=primarySegment, supplementals=supplSegments, readLength=primary.length, minUniqueBasesToAdd=minUniqueBasesToAdd)
segmentsFrom(primary=primarySegment, supplementals=supplSegments, readLength=primary.length, minUniqueBasesToAdd=minUniqueBasesToAdd)
}


Expand Down Expand Up @@ -185,7 +182,7 @@ object AlignedSegment extends LazyLogging {
}
// reverse the R2 segments as the first segments in R2 are last segments in the template when starting from the start
// of R1, also need to switch strand as we expect FR pair
mergeAlignedSegments(r1Segments=r1Segments, r2Segments=r2Segments, numOverlappingSegments=1)
mergeAlignedSegments(r1Segments=r1Segments, r2Segments=r2Segments)
}
}

Expand All @@ -209,7 +206,7 @@ object AlignedSegment extends LazyLogging {
@tailrec
def mergeAlignedSegments(r1Segments: IndexedSeq[AlignedSegment],
r2Segments: IndexedSeq[AlignedSegment],
numOverlappingSegments: Int): IndexedSeq[AlignedSegment] = {
numOverlappingSegments: Int = 1): IndexedSeq[AlignedSegment] = {
if (numOverlappingSegments > r1Segments.length || numOverlappingSegments > r2Segments.length) {
r1Segments ++ r2Segments // no overlapping segments
} else {
Expand All @@ -231,9 +228,16 @@ object AlignedSegment extends LazyLogging {
}

sealed trait SegmentOrigin extends EnumEntry {
import SegmentOrigin._

/** Returns whether this origin and the other origin are R1 and R2, or R2 and R1, respectively.
* This treats Both as being from either R1 and R2. */
def isPairedWith(other: SegmentOrigin): Boolean = this != other || (this == SegmentOrigin.Both && other == SegmentOrigin.Both)

/** Returns true if both `this` and `that` represent a single read and the two reads are different (i.e. R1 and R2). */
def isInterRead(that: SegmentOrigin): Boolean = {
this != Both && that != Both && this != that
}
}

/** Enumeration for where a given [[AlignedSegment]] origintes. */
Expand Down
Original file line number Diff line number Diff line change
@@ -1,50 +1,78 @@
package com.fulcrumgenomics.sv.util
package com.fulcrumgenomics.sv

/** Stores the genomic break ends of a putative breakpoint
object Breakpoint {
/**
* Builds a breakpoint from two aligned segments.
*
* The two segments must be given in such a way that a read traversing the breakpoint would
* read through `from` in the genomic orientation given by `from` and then into `into` in the
* orientation given by `into`. E.g. if the two segments originate from a single (split-read)
* alignment then `from` should come from earlier in the read in sequencing order. Since the
* segments could be from different reads in a mate pair, this cannot be validated/required()
* in this method, but violating this assumption will lead to invalid breakpoints.
*
* The returned breakpoint will be canonicalized such that the `left` side of the breakpoint will have the
* lower (or equal to) position on the genome vs. the `right` side.
*/
def apply(from: AlignedSegment, into: AlignedSegment): Breakpoint = {
val bp = Breakpoint(
leftRefIndex = from.range.refIndex,
leftPos = if (from.positiveStrand) from.range.end else from.range.start,
leftPositive = from.positiveStrand,
rightRefIndex = into.range.refIndex,
rightPos = if (into.positiveStrand) into.range.start else into.range.end,
rightPositive = into.positiveStrand
)

if (bp.isCanonical) bp else bp.reversed
}

/**
* An alternative ordering for breakpoints that orders by left/right chrom, then left/right pos etc.
* so that breakpoints between approximately the same positions on the same chromosomes group together.
*/
val PairedOrdering: Ordering[Breakpoint] = new Ordering[Breakpoint] {
override def compare(x: Breakpoint, y: Breakpoint): Int = {
var result = x.leftRefIndex.compare(y.leftRefIndex)
if (result == 0) result = x.rightRefIndex.compare(y.rightRefIndex)
if (result == 0) result = x.leftPos.compare(y.rightPos)
if (result == 0) result = x.rightPos.compare(y.rightPos)
if (result == 0) result = x.leftPositive.compare(y.rightPositive)
if (result == 0) result = x.rightPositive.compare(y.rightPositive)
result
}
}
}


/** Stores the genomic break ends of a possible breakpoint
*
* @param id the identifier for this breakpoint
* @param leftRefIndex the reference index (0-based) of the left side of the breakpoint
* @param leftPos the genomic position (1-based) of the left side of the breakpoint
* @param leftPositive true if the sequence on the left hand side of the breakpoint is on the positive genomic strand
* @param rightRefIndex the reference index (0-based) of the right side of the breakpoint
* @param rightPos the genomic position (1-based) of the right side of the breakpoint
* @param rightPositive true if the sequence on the right hand side of the breakpoint is on the positive genomic strand
* @param evidence the type of evidence for this breakpoint
*/
case class PutativeBreakpoint(id: Long = 0,
leftRefIndex: Int,
leftPos: Int,
leftPositive: Boolean,
rightRefIndex: Int,
rightPos: Int,
rightPositive: Boolean,
evidence: EvidenceType
) extends Ordered[PutativeBreakpoint] {
case class Breakpoint(leftRefIndex: Int,
leftPos: Int,
leftPositive: Boolean,
rightRefIndex: Int,
rightPos: Int,
rightPositive: Boolean,
) extends Ordered[Breakpoint] {

@inline final def leftNegative: Boolean = !leftPositive
@inline final def rightNegative: Boolean = !rightPositive


/** Returns true the other breakpoint has the same break ends. This compares all fields except the evidence type. */
def sameBreakEnds(that: PutativeBreakpoint): Boolean = {
this.leftRefIndex == that.leftRefIndex &&
this.leftPos == that.leftPos &&
this.leftPositive == that.leftPositive &&
this.rightRefIndex == that.rightRefIndex &&
this.rightPos == that.rightPos &&
this.rightPositive == that.rightPositive
}

/** Defines an ordering over breakpoints, first by genomic coordinates of the left then right end, then evidence type. */
def compare(that: PutativeBreakpoint): Int = {
var result: Int = 0
if (result == 0) result = this.leftRefIndex - that.leftRefIndex
/** Orders breakpoints by their left genomic position then their right genomic position. */
override def compare(that: Breakpoint): Int = {
var result = this.leftRefIndex - that.leftRefIndex
if (result == 0) result = this.leftPos - that.leftPos
if (result == 0) result = this.leftPositive.compare(that.leftPositive)
if (result == 0) result = this.rightRefIndex - that.rightRefIndex
if (result == 0) result = this.rightPos - that.rightPos
if (result == 0) result = this.rightPositive.compare(that.rightPositive)
if (result == 0) result = EvidenceType.indexOf(this.evidence) - EvidenceType.indexOf(that.evidence)
result
}

Expand All @@ -53,53 +81,18 @@ case class PutativeBreakpoint(id: Long = 0,
* swapped and the strand information has been flipped. E.g. if the breakpoint were to specify
* `chr1:500:F > chr1:100:F`, inverting it would yield `chr1:100:R > chr1:500R` to maintain consistency.
*/
def reversed: PutativeBreakpoint = copy(
leftRefIndex = rightRefIndex,
leftPos = rightPos,
leftPositive = !rightPositive,
rightRefIndex = leftRefIndex,
rightPos = leftPos,
rightPositive = !leftPositive
)
def reversed: Breakpoint = copy(
leftRefIndex = rightRefIndex,
leftPos = rightPos,
leftPositive = !rightPositive,
rightRefIndex = leftRefIndex,
rightPos = leftPos,
rightPositive = !leftPositive
)

/** Returns true if the representation of the breakend is canonical, with the left hand side of the break
* earlier on the genome than the right hand side. */
def isCanonical: Boolean = (leftRefIndex < rightRefIndex) ||
(leftRefIndex == rightRefIndex && leftPos < rightPos) ||
(leftRefIndex == rightRefIndex && leftPos == rightPos && leftPositive)

@Deprecated
/** TODO: remove this one other code is fixed. */
def sameStrand: Boolean = this.leftPositive == this.rightPositive
}

object PutativeBreakpoint {
/**
* Builds a breakpoint from two aligned segments.
*
* The two segments must be given in such a way that a read traversing the breakpoint would
* read through `from` in the genomic orientation given by `from` and then into `into` in the
* orientation given by `into`. E.g. if the two segments originate from a single (split-read)
* alignment then `from` should come from earlier in the read in sequencing order. Since the
* segments could be from different reads in a mate pair, this cannot be validated/required()
* in this method, but violating this assumption will lead to invalid breakpoints.
*
* The returned breakpoint will be canonicalized such that the `left` side of the breakpoint will have the
* lower (or equal to) position on the genome vs. the `right` side.
*/
def apply(from: AlignedSegment,
into: AlignedSegment,
evidence: EvidenceType): PutativeBreakpoint = {
val bp = new PutativeBreakpoint(
leftRefIndex = from.range.refIndex,
leftPos = if (from.positiveStrand) from.range.end else from.range.start,
leftPositive = from.positiveStrand,
rightRefIndex = into.range.refIndex,
rightPos = if (into.positiveStrand) into.range.start else into.range.end,
rightPositive = into.positiveStrand,
evidence = evidence
)

if (bp.isCanonical) bp else bp.reversed
}
}
38 changes: 38 additions & 0 deletions src/main/scala/com/fulcrumgenomics/sv/BreakpointEvidence.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package com.fulcrumgenomics.sv

object BreakpointEvidence {
/**
* Builds a breakpoint evidence from two aligned segments and an evidence type.
*
* The two segments must be given in such a way that a read traversing the breakpoint would
* read through `from` in the genomic orientation given by `from` and then into `into` in the
* orientation given by `into`. E.g. if the two segments originate from a single (split-read)
* alignment then `from` should come from earlier in the read in sequencing order. Since the
* segments could be from different reads in a mate pair, this cannot be validated/required()
* in this method, but violating this assumption will lead to invalid breakpoints.
*
* The returned breakpoint will be canonicalized such that the `left` side of the breakpoint will have the
* lower (or equal to) position on the genome vs. the `right` side.
*/
def apply(from: AlignedSegment,
into: AlignedSegment,
evidence: EvidenceType): BreakpointEvidence = {
new BreakpointEvidence(breakpoint = Breakpoint(from, into), evidence = evidence)
}
}


/** Stores the genomic break ends of a putative breakpoint
*
* @param breakpoint the breakpoint for which we have evidence
* @param evidence the type of evidence for this breakpoint
*/
case class BreakpointEvidence(breakpoint: Breakpoint, evidence: EvidenceType) extends Ordered[BreakpointEvidence] {

/** Defines an ordering over breakpoints, first by genomic coordinates of the left then right end, then evidence type. */
def compare(that: BreakpointEvidence): Int = {
var result = this.breakpoint.compare(that.breakpoint)
if (result == 0) result = this.evidence.compare(that.evidence)
result
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.fulcrumgenomics.sv.util
package com.fulcrumgenomics.sv

import com.fulcrumgenomics.FgBioDef.FgBioEnum
import com.fulcrumgenomics.commons.util.StringUtil
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.fulcrumgenomics.sv.util
package com.fulcrumgenomics.sv

import htsjdk.samtools.util.CoordMath

Expand All @@ -13,7 +13,7 @@ case class GenomicRange(refIndex: Int, start: Int, end: Int) extends Ordered[Gen

/** Returns the union of the two genomic intervals. The intervals must overlap. */
def union(other: GenomicRange): GenomicRange = {
require(this.overlaps(other))
require(this.overlaps(other), s"Can't union non-overlapping ranges: ${this} and ${other}")
this.copy(start = Math.min(this.start, other.start), end = Math.max(this.end, other.end))
}

Expand All @@ -27,4 +27,6 @@ case class GenomicRange(refIndex: Int, start: Int, end: Int) extends Ordered[Gen

/** The length of the genomic range in bp. */
def length: Int = this.end - this.start + 1

override def toString: String = s"${refIndex}:${start}-${end}"
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ import com.fulcrumgenomics.commons.collection.BetterBufferedIterator
import com.fulcrumgenomics.commons.util.{DelimitedDataParser, NumericCounter, Row, SimpleCounter}
import com.fulcrumgenomics.fasta.SequenceDictionary
import com.fulcrumgenomics.sv.cmdline.{ClpGroups, SvTool}
import com.fulcrumgenomics.sv.util.EvidenceType
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.sv.EvidenceType
import com.fulcrumgenomics.util.{Io, Metric}

import scala.annotation.tailrec
Expand Down
Loading

0 comments on commit 974b92e

Please sign in to comment.