Lots of cleanup of the initial bolus of code. (#3)

A bunch of refactoring, most notably of how we accrue counts and assign IDs to breakpoints, so that we are guaranteed to count all evidence for breakpoint the same way and only assign it a single ID.
fulcrumgenomics · Feb 14, 2022 · 974b92e · 974b92e
1 parent 0a01a05
commit 974b92e
Show file tree

Hide file tree

Showing 15 changed files with 305 additions and 258 deletions.
diff --git a/build.sc b/build.sc
@@ -61,7 +61,7 @@ trait ReleaseModule extends JavaModule {
 object tools extends CommonModule with PublishModule with ReleaseModule {
   def scalaVersion = "2.13.3"
   def millSourcePath = super.millSourcePath / ammonite.ops.up
-  def mainClass = Some("com.fulcrumgenomics.sv.cmdline.SVMain")
+  def mainClass = Some("com.fulcrumgenomics.sv.cmdline.SvMain")
   def artifactName = "fgsv"
   def gitHash = Process("git rev-parse --short HEAD").lineStream.head
   def publishVersion = s"0.0.1-${gitHash}-SNAPSHOT"
@@ -89,10 +89,7 @@ object tools extends CommonModule with PublishModule with ReleaseModule {
 
   def ivyDeps = Agg(
     ivy"org.scala-lang:scala-compiler:${scalaVersion()}",
-    ivy"mysql:mysql-connector-java:5.1.24",
-    ivy"com.fulcrumgenomics:commons_2.13:1.1.0",
-    ivy"com.fulcrumgenomics:sopt_2.13:1.1.0",
-    ivy"com.fulcrumgenomics:fgbio_2.13:1.4.0-61dde53-SNAPSHOT".excludeOrg(orgsToExclude:_*),
+    ivy"com.fulcrumgenomics:fgbio_2.13:1.5.0".excludeOrg(orgsToExclude:_*),
     )
 
   object test extends Tests {

diff --git a/...crumgenomics/sv/util/AlignedSegment.scala → ...m/fulcrumgenomics/sv/AlignedSegment.scala b/...crumgenomics/sv/util/AlignedSegment.scala → ...m/fulcrumgenomics/sv/AlignedSegment.scala
@@ -1,6 +1,6 @@
-package com.fulcrumgenomics.sv.util
+package com.fulcrumgenomics.sv
 
-import com.fulcrumgenomics.FgBioDef.FgBioEnum
+import com.fulcrumgenomics.FgBioDef.{FgBioEnum, SumBy}
 import com.fulcrumgenomics.alignment.Cigar
 import com.fulcrumgenomics.bam.Template
 import com.fulcrumgenomics.bam.api.SamRecord
@@ -63,21 +63,18 @@ object AlignedSegment extends LazyLogging {
     require(rec.mapped)
     val range = GenomicRange(refIndex = rec.refIndex, start = rec.start, end = rec.end)
     val cigar = rec.cigar
-    val leadingClipping = cigar.iterator
-      .takeWhile(_.operator.isClipping) // take leading clipping
-      .map(_.length).sum
+
+    val leadingClipping  = cigar.iterator.takeWhile(_.operator.isClipping).sumBy(_.length)
+    val trailingClipping = cigar.reverseIterator.takeWhile(_.operator.isClipping).sumBy(_.length)
     val middle = cigar.iterator
       .dropWhile(_.operator.isClipping) // skip leading clipping
       .takeWhile(!_.operator.isClipping) // take until we hit any clipping at the end
       .filter(_.operator.consumesReadBases()) // only operators that consume read bases
-      .map(_.length).sum
-    val trailingClipping = cigar.iterator
-      .dropWhile(_.operator.isClipping) // skip leading clipping
-      .dropWhile(!_.operator.isClipping) // skip until we hit any clipping at the end
-      .map(_.length).sum
+      .sumBy(_.length)
+
     val (start, end) = {
-      if (rec.positiveStrand) (leadingClipping + 1, leadingClipping + middle)
-      else (trailingClipping + 1, trailingClipping + middle)
+      if (rec.positiveStrand) (leadingClipping + 1,  leadingClipping + middle)
+      else                    (trailingClipping + 1, trailingClipping + middle)
     }
 
     AlignedSegment(origin = SegmentOrigin(rec), readStart = start, readEnd = end, positiveStrand = rec.positiveStrand, cigar = rec.cigar, range = range)
@@ -99,7 +96,7 @@ object AlignedSegment extends LazyLogging {
     require(supplementals.nonEmpty)
     val primarySegment = AlignedSegment(primary)
     val supplSegments  = supplementals.map(AlignedSegment(_))
-    this.segmentsFrom(primary=primarySegment, supplementals=supplSegments, readLength=primary.length, minUniqueBasesToAdd=minUniqueBasesToAdd)
+    segmentsFrom(primary=primarySegment, supplementals=supplSegments, readLength=primary.length, minUniqueBasesToAdd=minUniqueBasesToAdd)
   }
 
 
@@ -185,7 +182,7 @@ object AlignedSegment extends LazyLogging {
       }
       // reverse the R2 segments as the first segments in R2 are last segments in the template when starting from the start
       // of R1, also need to switch strand as we expect FR pair
-      mergeAlignedSegments(r1Segments=r1Segments, r2Segments=r2Segments, numOverlappingSegments=1)
+      mergeAlignedSegments(r1Segments=r1Segments, r2Segments=r2Segments)
     }
   }
 
@@ -209,7 +206,7 @@ object AlignedSegment extends LazyLogging {
   @tailrec
   def mergeAlignedSegments(r1Segments: IndexedSeq[AlignedSegment],
                            r2Segments: IndexedSeq[AlignedSegment],
-                           numOverlappingSegments: Int): IndexedSeq[AlignedSegment] = {
+                           numOverlappingSegments: Int = 1): IndexedSeq[AlignedSegment] = {
     if (numOverlappingSegments > r1Segments.length || numOverlappingSegments > r2Segments.length) {
       r1Segments ++ r2Segments // no overlapping segments
     } else {
@@ -231,9 +228,16 @@ object AlignedSegment extends LazyLogging {
 }
 
 sealed trait SegmentOrigin extends EnumEntry {
+  import SegmentOrigin._
+
   /** Returns whether this origin and the other origin are R1 and R2, or R2 and R1, respectively.
    * This treats Both as being from either R1 and R2. */
   def isPairedWith(other: SegmentOrigin): Boolean = this != other || (this == SegmentOrigin.Both && other == SegmentOrigin.Both)
+
+  /** Returns true if both `this` and `that` represent a single read and the two reads are different (i.e. R1 and R2). */
+  def isInterRead(that: SegmentOrigin): Boolean = {
+    this != Both && that != Both && this != that
+  }
 }
 
 /** Enumeration for where a given [[AlignedSegment]] origintes. */

diff --git a/...genomics/sv/util/PutativeBreakpoint.scala → ...a/com/fulcrumgenomics/sv/Breakpoint.scala b/...genomics/sv/util/PutativeBreakpoint.scala → ...a/com/fulcrumgenomics/sv/Breakpoint.scala
@@ -1,50 +1,78 @@
-package com.fulcrumgenomics.sv.util
+package com.fulcrumgenomics.sv
 
-/** Stores the genomic break ends of a putative breakpoint
+object Breakpoint {
+  /**
+   * Builds a breakpoint from two aligned segments.
+   *
+   * The two segments must be given in such a way that a read traversing the breakpoint would
+   * read through `from` in the genomic orientation given by `from` and then into `into` in the
+   * orientation given by `into`.  E.g. if the two segments originate from a single (split-read)
+   * alignment then `from` should come from earlier in the read in sequencing order.  Since the
+   * segments could be from different reads in a mate pair, this cannot be validated/required()
+   * in this method, but violating this assumption will lead to invalid breakpoints.
+   *
+   * The returned breakpoint will be canonicalized such that the `left` side of the breakpoint will have the
+   * lower (or equal to) position on the genome vs. the `right` side.
+   */
+  def apply(from: AlignedSegment, into: AlignedSegment): Breakpoint = {
+    val bp = Breakpoint(
+      leftRefIndex  = from.range.refIndex,
+      leftPos       = if (from.positiveStrand) from.range.end else from.range.start,
+      leftPositive  = from.positiveStrand,
+      rightRefIndex = into.range.refIndex,
+      rightPos      = if (into.positiveStrand) into.range.start else into.range.end,
+      rightPositive = into.positiveStrand
+    )
+
+    if (bp.isCanonical) bp else bp.reversed
+  }
+
+  /**
+   * An alternative ordering for breakpoints that orders by left/right chrom, then left/right pos etc.
+   * so that breakpoints between approximately the same positions on the same chromosomes group together.
+   */
+  val PairedOrdering: Ordering[Breakpoint] = new Ordering[Breakpoint] {
+    override def compare(x: Breakpoint, y: Breakpoint): Int = {
+      var result              = x.leftRefIndex.compare(y.leftRefIndex)
+      if (result == 0) result = x.rightRefIndex.compare(y.rightRefIndex)
+      if (result == 0) result = x.leftPos.compare(y.rightPos)
+      if (result == 0) result = x.rightPos.compare(y.rightPos)
+      if (result == 0) result = x.leftPositive.compare(y.rightPositive)
+      if (result == 0) result = x.rightPositive.compare(y.rightPositive)
+      result
+    }
+  }
+}
+
+
+/** Stores the genomic break ends of a possible breakpoint
  *
- * @param id the identifier for this breakpoint
  * @param leftRefIndex the reference index (0-based) of the left side of the breakpoint
  * @param leftPos the genomic position (1-based) of the left side of the breakpoint
  * @param leftPositive true if the sequence on the left hand side of the breakpoint is on the positive genomic strand
  * @param rightRefIndex  the reference index (0-based) of the right side of the breakpoint
  * @param rightPos the genomic position (1-based) of the right side of the breakpoint
  * @param rightPositive true if the sequence on the right hand side of the breakpoint is on the positive genomic strand
- * @param evidence the type of evidence for this breakpoint
  */
-case class PutativeBreakpoint(id: Long = 0,
-                              leftRefIndex: Int,
-                              leftPos: Int,
-                              leftPositive: Boolean,
-                              rightRefIndex: Int,
-                              rightPos: Int,
-                              rightPositive: Boolean,
-                              evidence: EvidenceType
-                             ) extends Ordered[PutativeBreakpoint] {
+case class Breakpoint(leftRefIndex: Int,
+                      leftPos: Int,
+                      leftPositive: Boolean,
+                      rightRefIndex: Int,
+                      rightPos: Int,
+                      rightPositive: Boolean,
+                     ) extends Ordered[Breakpoint] {
 
   @inline final def leftNegative: Boolean = !leftPositive
   @inline final def rightNegative: Boolean = !rightPositive
 
-
-  /** Returns true the other breakpoint has the same break ends.  This compares all fields except the evidence type. */
-  def sameBreakEnds(that: PutativeBreakpoint): Boolean = {
-    this.leftRefIndex == that.leftRefIndex &&
-      this.leftPos == that.leftPos &&
-      this.leftPositive == that.leftPositive &&
-      this.rightRefIndex == that.rightRefIndex &&
-      this.rightPos == that.rightPos &&
-      this.rightPositive == that.rightPositive
-  }
-
-  /** Defines an ordering over breakpoints, first by genomic coordinates of the left then right end, then evidence type. */
-  def compare(that: PutativeBreakpoint): Int = {
-    var result: Int = 0
-    if (result == 0) result = this.leftRefIndex - that.leftRefIndex
+  /** Orders breakpoints by their left genomic position then their right genomic position. */
+  override def compare(that: Breakpoint): Int = {
+    var result              = this.leftRefIndex - that.leftRefIndex
     if (result == 0) result = this.leftPos - that.leftPos
     if (result == 0) result = this.leftPositive.compare(that.leftPositive)
     if (result == 0) result = this.rightRefIndex - that.rightRefIndex
     if (result == 0) result = this.rightPos - that.rightPos
     if (result == 0) result = this.rightPositive.compare(that.rightPositive)
-    if (result == 0) result = EvidenceType.indexOf(this.evidence) - EvidenceType.indexOf(that.evidence)
     result
   }
 
@@ -53,53 +81,18 @@ case class PutativeBreakpoint(id: Long = 0,
    * swapped and the strand information has been flipped.  E.g. if the breakpoint were to specify
    * `chr1:500:F > chr1:100:F`, inverting it would yield `chr1:100:R > chr1:500R` to maintain consistency.
    */
-  def reversed: PutativeBreakpoint = copy(
-      leftRefIndex  = rightRefIndex,
-      leftPos       = rightPos,
-      leftPositive  = !rightPositive,
-      rightRefIndex = leftRefIndex,
-      rightPos      = leftPos,
-      rightPositive = !leftPositive
-    )
+  def reversed: Breakpoint = copy(
+    leftRefIndex  = rightRefIndex,
+    leftPos       = rightPos,
+    leftPositive  = !rightPositive,
+    rightRefIndex = leftRefIndex,
+    rightPos      = leftPos,
+    rightPositive = !leftPositive
+  )
 
   /** Returns true if the representation of the breakend is canonical, with the left hand side of the break
    * earlier on the genome than the right hand side. */
   def isCanonical: Boolean = (leftRefIndex < rightRefIndex) ||
     (leftRefIndex == rightRefIndex && leftPos < rightPos) ||
     (leftRefIndex == rightRefIndex && leftPos == rightPos && leftPositive)
-
-  @Deprecated
-  /** TODO: remove this one other code is fixed. */
-  def sameStrand: Boolean = this.leftPositive == this.rightPositive
 }
-
-object PutativeBreakpoint {
-  /**
-   * Builds a breakpoint from two aligned segments.
-   *
-   * The two segments must be given in such a way that a read traversing the breakpoint would
-   * read through `from` in the genomic orientation given by `from` and then into `into` in the
-   * orientation given by `into`.  E.g. if the two segments originate from a single (split-read)
-   * alignment then `from` should come from earlier in the read in sequencing order.  Since the
-   * segments could be from different reads in a mate pair, this cannot be validated/required()
-   * in this method, but violating this assumption will lead to invalid breakpoints.
-   *
-   * The returned breakpoint will be canonicalized such that the `left` side of the breakpoint will have the
-   * lower (or equal to) position on the genome vs. the `right` side.
-   */
-  def apply(from: AlignedSegment,
-            into: AlignedSegment,
-            evidence: EvidenceType): PutativeBreakpoint = {
-    val bp = new PutativeBreakpoint(
-      leftRefIndex  = from.range.refIndex,
-      leftPos       = if (from.positiveStrand) from.range.end else from.range.start,
-      leftPositive  = from.positiveStrand,
-      rightRefIndex = into.range.refIndex,
-      rightPos      = if (into.positiveStrand) into.range.start else into.range.end,
-      rightPositive = into.positiveStrand,
-      evidence      = evidence
-    )
-
-    if (bp.isCanonical) bp else bp.reversed
-  }
-}
diff --git a/src/main/scala/com/fulcrumgenomics/sv/BreakpointEvidence.scala b/src/main/scala/com/fulcrumgenomics/sv/BreakpointEvidence.scala
@@ -0,0 +1,38 @@
+package com.fulcrumgenomics.sv
+
+object BreakpointEvidence {
+  /**
+   * Builds a breakpoint evidence from two aligned segments and an evidence type.
+   *
+   * The two segments must be given in such a way that a read traversing the breakpoint would
+   * read through `from` in the genomic orientation given by `from` and then into `into` in the
+   * orientation given by `into`.  E.g. if the two segments originate from a single (split-read)
+   * alignment then `from` should come from earlier in the read in sequencing order.  Since the
+   * segments could be from different reads in a mate pair, this cannot be validated/required()
+   * in this method, but violating this assumption will lead to invalid breakpoints.
+   *
+   * The returned breakpoint will be canonicalized such that the `left` side of the breakpoint will have the
+   * lower (or equal to) position on the genome vs. the `right` side.
+   */
+  def apply(from: AlignedSegment,
+            into: AlignedSegment,
+            evidence: EvidenceType): BreakpointEvidence = {
+    new BreakpointEvidence(breakpoint = Breakpoint(from, into), evidence = evidence)
+  }
+}
+
+
+/** Stores the genomic break ends of a putative breakpoint
+ *
+ * @param breakpoint the breakpoint for which we have evidence
+ * @param evidence the type of evidence for this breakpoint
+ */
+case class BreakpointEvidence(breakpoint: Breakpoint, evidence: EvidenceType) extends Ordered[BreakpointEvidence] {
+
+  /** Defines an ordering over breakpoints, first by genomic coordinates of the left then right end, then evidence type. */
+  def compare(that: BreakpointEvidence): Int = {
+    var result              = this.breakpoint.compare(that.breakpoint)
+    if (result == 0) result = this.evidence.compare(that.evidence)
+    result
+  }
+}
diff --git a/...ulcrumgenomics/sv/util/EvidenceType.scala → ...com/fulcrumgenomics/sv/EvidenceType.scala b/...ulcrumgenomics/sv/util/EvidenceType.scala → ...com/fulcrumgenomics/sv/EvidenceType.scala
@@ -1,4 +1,4 @@
-package com.fulcrumgenomics.sv.util
+package com.fulcrumgenomics.sv
 
 import com.fulcrumgenomics.FgBioDef.FgBioEnum
 import com.fulcrumgenomics.commons.util.StringUtil

diff --git a/...ulcrumgenomics/sv/util/GenomicRange.scala → ...com/fulcrumgenomics/sv/GenomicRange.scala b/...ulcrumgenomics/sv/util/GenomicRange.scala → ...com/fulcrumgenomics/sv/GenomicRange.scala
@@ -1,4 +1,4 @@
-package com.fulcrumgenomics.sv.util
+package com.fulcrumgenomics.sv
 
 import htsjdk.samtools.util.CoordMath
 
@@ -13,7 +13,7 @@ case class GenomicRange(refIndex: Int, start: Int, end: Int) extends Ordered[Gen
 
   /** Returns the union of the two genomic intervals.  The intervals must overlap. */
   def union(other: GenomicRange): GenomicRange = {
-    require(this.overlaps(other))
+    require(this.overlaps(other), s"Can't union non-overlapping ranges: ${this} and ${other}")
     this.copy(start = Math.min(this.start, other.start), end = Math.max(this.end, other.end))
   }
 
@@ -27,4 +27,6 @@ case class GenomicRange(refIndex: Int, start: Int, end: Int) extends Ordered[Gen
 
   /** The length of the genomic range in bp. */
   def length: Int = this.end - this.start + 1
+
+  override def toString: String =  s"${refIndex}:${start}-${end}"
 }
diff --git a/...m/fulcrumgenomics/sv/cmdline/SVMain.scala → ...m/fulcrumgenomics/sv/cmdline/SvMain.scala b/...m/fulcrumgenomics/sv/cmdline/SVMain.scala → ...m/fulcrumgenomics/sv/cmdline/SvMain.scala
diff --git a/...m/fulcrumgenomics/sv/cmdline/SVTool.scala → ...m/fulcrumgenomics/sv/cmdline/SvTool.scala b/...m/fulcrumgenomics/sv/cmdline/SVTool.scala → ...m/fulcrumgenomics/sv/cmdline/SvTool.scala
diff --git a/src/main/scala/com/fulcrumgenomics/sv/tools/FilterAndMerge.scala b/src/main/scala/com/fulcrumgenomics/sv/tools/FilterAndMerge.scala
@@ -5,8 +5,8 @@ import com.fulcrumgenomics.commons.collection.BetterBufferedIterator
 import com.fulcrumgenomics.commons.util.{DelimitedDataParser, NumericCounter, Row, SimpleCounter}
 import com.fulcrumgenomics.fasta.SequenceDictionary
 import com.fulcrumgenomics.sv.cmdline.{ClpGroups, SvTool}
-import com.fulcrumgenomics.sv.util.EvidenceType
 import com.fulcrumgenomics.sopt.{arg, clp}
+import com.fulcrumgenomics.sv.EvidenceType
 import com.fulcrumgenomics.util.{Io, Metric}
 
 import scala.annotation.tailrec