From 5213029a04c78659b1450dfe8a6addb481faed70 Mon Sep 17 00:00:00 2001 From: walaj Date: Mon, 19 Dec 2016 00:24:20 -0500 Subject: [PATCH] dbsnp missing from indel header. cleaned some code --- src/Snowman/AlignedContig.cpp | 3 - src/Snowman/AlignmentFragment.cpp | 2 - src/Snowman/BreakPoint.cpp | 313 ++++++++---------------------- src/Snowman/BreakPoint.h | 10 +- src/Snowman/DiscordantCluster.cpp | 4 +- src/Snowman/LearnBamParams.cpp | 81 -------- src/Snowman/run_snowman.cpp | 6 +- src/Snowman/vcf.cpp | 11 +- web/batch.functions.R | 45 ++++- 9 files changed, 131 insertions(+), 344 deletions(-) diff --git a/src/Snowman/AlignedContig.cpp b/src/Snowman/AlignedContig.cpp index a4489b3..19a8a4d 100644 --- a/src/Snowman/AlignedContig.cpp +++ b/src/Snowman/AlignedContig.cpp @@ -363,8 +363,6 @@ void AlignedContig::blacklist(SeqLib::GRC &grv) { bp.secondary = a.m_align.SecondaryFlag() || b.m_align.SecondaryFlag(); - assert(bp.valid()); - // add the the vector of breakpoints if (!bp.secondary) { m_local_breaks.push_back(bp); @@ -434,7 +432,6 @@ void AlignedContig::blacklist(SeqLib::GRC &grv) { // order the breakpoint m_global_bp.order(); - assert(m_global_bp.valid()); } std::string AlignedContig::printDiscordantClusters() const { diff --git a/src/Snowman/AlignmentFragment.cpp b/src/Snowman/AlignmentFragment.cpp index f40faaa..fe504c8 100644 --- a/src/Snowman/AlignmentFragment.cpp +++ b/src/Snowman/AlignmentFragment.cpp @@ -97,7 +97,6 @@ void AlignmentFragment::SetIndels(const AlignedContig * c) { while (parseIndelBreak(bp) && fail_safe_count++ < MAX_INDELS && !m_align.SecondaryFlag()) { bp.aligned_covered = c->aligned_covered; assert(bp.aligned_covered); - assert(bp.valid()); for (auto& i : c->prefixes) bp.allele[i].indel = true; m_indel_breaks.push_back(bp); @@ -290,7 +289,6 @@ void AlignmentFragment::indelCigarMatches(const std::unordered_map 0 ? (double)t.alt / (double)n.alt : 100; - //double taf = t.cov > 0 ? (double)t.alt / (double)t.cov : 0; - bool immediate_reject = (ratio <= 12 && n.cov > 10) || n.alt >= 2; // can't call somatic with 3+ normal reads or <5x more tum than norm ALT + void BreakPoint::score_somatic(double NODBCUTOFF, double DBCUTOFF) { - double somatic_ratio = 0; - size_t ncount = 0; - - if (evidence == "INDEL") { - // this is LOD of normal being REF vs AF = 0.5+ // We want this to be high for a somatic call somatic_lod = n.LO_n; - - if (rs.empty()) - somatic_score = somatic_lod > NODBCUTOFF && !immediate_reject; - else - somatic_score = somatic_lod > DBCUTOFF && !immediate_reject; - - // reject if reall low AF and at DBSNP - //if (taf < 0.2 && !rs.empty()) - // somatic_score = 0; - - } else { - somatic_lod = n.LO_n; - - // old model - ncount = std::max(n.split, dc.ncount); - somatic_ratio = ncount > 0 ? std::max(t.split,dc.tcount)/ncount : 100; + // can't call somatic with 3+ normal reads or <5x more tum than norm ALT + double ratio = n.alt > 0 ? (double)t.alt / (double)n.alt : 100; + if (( ratio <= MIN_SOMATIC_RATIO && n.cov > 10) || n.alt > 2) { + somatic_score = 0; + return; + } + + if (evidence == "INDEL") { + // somatic score is just true or false for now + // use the specified cutoff for indels, taking into account whether at dbsnp site + somatic_score = somatic_lod > (rs.empty() ? NODBCUTOFF : DBCUTOFF); + + // for SVs, just use a hard cutoff for gauging somatic vs germline + } else { + double ncount = std::max(n.split, dc.ncount); + double somatic_ratio = ncount > 0 ? 
(double)std::max(t.split,dc.tcount)/ncount : 100; somatic_score = somatic_ratio >= MIN_SOMATIC_RATIO && n.split < 2; } - - // kill all if too many normal support - if (n.alt > 2) - somatic_score = 0; - - // kill if single normal read in discordant clsuter + + // set germline if single normal read in discordant clsuter if (evidence == "DSCRD" && n.alt > 0) somatic_score = 0; - - // kill if bad ratio - double r = n.alt > 0 ? (double)t.alt / (double)n.alt : 100; - if (r < 10) - somatic_score = 0; - -} - - /* void BreakPoint::__set_total_reads() { - - // total unique read support - - t_reads = 0; - n_reads = 0; - - for (auto& a : allele) { - if (a.first.at(0) == 't') - t_reads += a.alt; - else - n_reads += a.alt; - } - - //if (evidence == "ASDIS") { - std::unordered_set this_reads_t; - std::unordered_set this_reads_n; - for (auto& i : split_reads) - if (!this_reads_t.count(i) && i.at(0) == 't') - this_reads_t.insert(i); - for (auto& i : dc.reads) - if (!this_reads_t.count(i.first) && i.first.at(0) == 't') - this_reads_t.insert(i.first); - for (auto& i : split_reads) - if (!this_reads_n.count(i) && i.at(0) == 'n') - this_reads_n.insert(i); - for (auto& i : dc.reads) - if (!this_reads_n.count(i.first) && i.first.at(0) == 'n') - this_reads_n.insert(i.first); - - t_reads = this_reads_t.size(); - n_reads = this_reads_n.size(); - + } - */ - void BreakPoint::__score_assembly_dscrd() { + void BreakPoint::score_assembly_dscrd() { int this_mapq1 = b1.mapq; int this_mapq2 = b2.mapq; @@ -910,9 +856,6 @@ BreakEnd::BreakEnd(const SeqLib::BamRecord& b) { int max_a_mapq = std::max(this_mapq1, dc.mapq1); int max_b_mapq = std::max(this_mapq2, dc.mapq2); - //int min_assm_mapq = std::min(this_mapq1, this_mapq2); - //int max_assm_mapq = std::max(this_mapq1, this_mapq2); - // how much of contig is covered by split reads int cov_span = split_cov_bounds.second - split_cov_bounds.first ; @@ -957,7 +900,7 @@ BreakEnd::BreakEnd(const SeqLib::BamRecord& b) { confidence = "PASS"; } - void BreakPoint::__score_dscrd(int min_dscrd_size) { + void BreakPoint::score_dscrd(int min_dscrd_size) { t.alt = dc.tcount; n.alt = dc.ncount; @@ -973,7 +916,7 @@ BreakEnd::BreakEnd(const SeqLib::BamRecord& b) { confidence = "LOWMAPQDISC"; else if (!dc.m_id_competing.empty()) confidence = "COMPETEDISC"; - else if ( disc_count < disc_cutoff) // || (dc.ncount > 0 && disc_count < 15) ) // be stricter about germline disc only + else if ( disc_count < disc_cutoff) confidence = "WEAKDISC"; else confidence = "PASS"; @@ -981,42 +924,42 @@ BreakEnd::BreakEnd(const SeqLib::BamRecord& b) { assert(confidence.length()); } -void BreakPoint::__score_indel(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP) { +void BreakPoint::score_indel(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP) { assert(b1.mapq == b2.mapq); - bool is_refilter = !confidence.empty(); // act differently if this is refilter runx + bool is_refilter = !confidence.empty(); // act differently if this is refilter run // for refilter, only consider ones that were low lod or PASS // ie ones that with a different lod threshold may be changed // if confidence is empty, this is original run so keep going if (confidence != "LOWLOD" && confidence != "PASS" && is_refilter) return; - - //double ratio = n.alt > 0 ? 
t.alt / n.alt : 100; - + double max_lod = 0; for (auto& s : allele) max_lod = std::max(max_lod, s.second.LO); - bool homozygous_ref = false; + // check if homozygous reference is most likely GT + bool homozygous_ref = true; for (auto& s : allele) { - if (s.second.genotype_likelihoods[0] < s.second.genotype_likelihoods[1] && - s.second.genotype_likelihoods[0] < s.second.genotype_likelihoods[2]) - homozygous_ref = true; + if (s.second.genotype_likelihoods[0] > s.second.genotype_likelihoods[1] || + s.second.genotype_likelihoods[0] > s.second.genotype_likelihoods[2]) + homozygous_ref = false; } + // get the allelic fractions, just for VLOWAF filter double af_t = t.cov > 0 ? (double)t.alt / (double)t.cov : 0; double af_n = n.cov > 0 ? (double)n.alt / (double)n.cov : 0; double af = std::max(af_t, af_n); - if (b1.mapq < 10) // || std::max(b1.nm, b2.nm) > 6) + if (b1.mapq < 10) confidence="LOWMAPQ"; else if (!is_refilter && (double)aligned_covered / (double)seq.length() < 0.80) // less than 80% of read is covered by some alignment confidence = "LOWAS"; else if ((b1.sub_n && b1.mapq < 50) || (b2.sub_n && b2.mapq < 50)) confidence = "MULTIMATCH"; - else if (max_lod < LOD_CUTOFF && rs.empty()) // non db snp site + else if (max_lod < LOD_CUTOFF && rs.empty()) // non db snp site confidence = "LOWLOD"; else if (max_lod < LOD_CUTOFF_DBSNP && !rs.empty()) // be more permissive for dbsnp site confidence = "LOWLOD"; @@ -1057,15 +1000,15 @@ void BreakPoint::scoreBreakpoint(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP, dou n.disc = dc.ncount; } - // scale the LOD for MAPQ + // provide a scaled LOD that accounts for MAPQ. Heuristic, not really used int mapqr1 = b1.local ? std::max(30, b1.mapq) : b1.mapq; // if local, don't drop below 30 int mapqr2 = b2.local ? std::max(30, b2.mapq) : b2.mapq; // if local (aligns to within window), don't drop below 30 double scale = (double)( std::min(mapqr1, mapqr2) - 2 * b1.nm) / (double)60; for (auto& i : allele) i.second.SLO = i.second.LO * scale; - t.LO = t.SLO * scale; - n.LO = n.SLO * scale; - a.LO = a.SLO * scale; + t.SLO = t.LO * scale; + n.SLO = n.LO * scale; + a.SLO = a.LO * scale; // sanity check int split =0; @@ -1075,36 +1018,26 @@ void BreakPoint::scoreBreakpoint(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP, dou // do the scoring if (evidence == "ASSMB" || (evidence == "COMPL" && (dc.ncount + dc.tcount)==0)) - __score_assembly_only(); + score_assembly_only(); if (evidence == "ASDIS" || (evidence == "COMPL" && (dc.ncount + dc.tcount))) - __score_assembly_dscrd(); + score_assembly_dscrd(); if (evidence == "DSCRD") - __score_dscrd(min_dscrd_size); - if (evidence == "ASDIS" && confidence != "PASS") { + score_dscrd(min_dscrd_size); + // it failed assembly filters, but might pass discordant filters + if (evidence == "ASDIS" && confidence != "PASS") { evidence = "DSCRD"; - __score_dscrd(min_dscrd_size); + score_dscrd(min_dscrd_size); } else if (evidence == "INDEL") - __score_indel(LOD_CUTOFF, LOD_CUTOFF_DBSNP); - - // - __score_somatic(LOD_CUTOFF_SOMATIC, LOD_CUTOFF_SOMATIC_DBSNP); - - //if (confidence == "PASS") - // quality = 99; - //else - // quality = 0; - quality = t.NH_GQ; // a.SLO; + score_indel(LOD_CUTOFF, LOD_CUTOFF_DBSNP); - double LR = -1000000; - for (auto& i : allele) { - LR = std::max(-i.second.LO_n, LR); - // neg because want to evaluate likelihood that is ALT in normal - // the original use was to get LL that is REF in normal (in order to call somatic) - // but for next filter, we want to see if we are confident in the germline call - } - - assert(getSpan() 
> -2); + // set the somatic_score field to true or false + score_somatic(LOD_CUTOFF_SOMATIC, LOD_CUTOFF_SOMATIC_DBSNP); + // quality score is odds that read is non-homozygous reference (max 99) + quality = 0; + for (auto& a : allele) + quality = std::max(a.second.NH_GQ, (double)quality); + } void BreakPoint::order() { @@ -1187,24 +1120,6 @@ void BreakPoint::__format_bx_string() { } - bool BreakPoint::valid() const { - - // debug - return true; - - if (!(b1.gr.strand == '+' || b1.gr.strand == '-') || !(b2.gr.strand == '+' || b2.gr.strand == '-')) { - std::cerr << "b1.strand " << b1.gr.strand << " b2.strand " << b2.gr.strand << std::endl; - return false; - } - - // b1 is less than b2, or the same but signifies inverted connection - if ((b1.gr < b2.gr) || (b1.gr.chr == b2.gr.chr && b1.gr.pos1 == b2.gr.pos1)) - return true; - - std::cerr << b1.gr << " " << b2.gr << std::endl; - return false; - } - void BreakPoint::setRefAlt(const RefGenome * main_rg, const RefGenome * viral) { assert(!main_rg->IsEmpty()); @@ -1246,44 +1161,6 @@ void BreakPoint::__format_bx_string() { alt = "N"; std::cerr << "Caught exception in BreakPoint:setRefAlt for SV Alt: " << ia.what() << std::endl; } - - /*char * ref1 = faidx_fetch_seq(main_findex, const_cast(b1.chr_name.c_str()), b1.gr.pos1-1, b1.gr.pos1-1, &len); - if (!ref1) { - if (viral_findex) - ref1 = faidx_fetch_seq(viral_findex, const_cast(b1.chr_name.c_str()), b1.gr.pos1-1, b1.gr.pos1-1, &len); - } - if (!ref1) { - std::cerr << "couldn't find reference on BP1 for ref " << b1.chr_name << " in either viral or human" << std::endl; - } - - char * ref2 = faidx_fetch_seq(main_findex, const_cast(b2.chr_name.c_str()), b2.gr.pos1-1, b2.gr.pos1-1, &len); - if (!ref2) { - if (viral_findex) - ref2 = faidx_fetch_seq(viral_findex, const_cast(b2.chr_name.c_str()), b2.gr.pos1-1, b2.gr.pos1-1, &len); - } - if (!ref2) { - std::cerr << "couldn't find reference on BP2 for ref " << b2.chr_name << " in either viral or human" << std::endl; - } - - // couldn't find something, so bail - if (!ref1 || !ref2) - return; - - // by convention, set ref to 1 and alt to 2. 
Gets sorted in VCF creation - ref = std::string(ref1); - alt = std::string(ref2); - - */ - - //if (!ref.length()) - // ref = "N"; - //if (!alt.length()) - // alt = "N"; - - //if (ref1) - //free(ref1); - //if (ref2) - // free(ref2); } else { @@ -1294,29 +1171,11 @@ void BreakPoint::__format_bx_string() { } catch (const std::invalid_argument& ia) { ref = "N"; std::cerr << "Caught exception in BreakPoint:setRefAlt for indel ref: " << ia.what() << std::endl; - //std::cerr << "couldn't find reference sequence for ref " << b1.chr_name << " for indel, on human reference " << std::endl; - //std::cerr << (*this) << std::endl; - //exit(EXIT_FAILURE); //debug - //return; } - - // reference - //char * refi = faidx_fetch_seq(main_findex, const_cast(b1.chr_name.c_str()), b1.gr.pos1-1, b1.gr.pos1-1, &len); - //if (!refi) { - // std::cerr << "couldn't find reference sequence for ref " << b1.chr_name << " for indel, on human reference " << std::endl; - // return; - //} - //ref = std::string(refi); - //if (!ref.length()) { - // ref = "N"; - //} // alt alt = ref + insertion; - //if (refi) - // free(refi); - // deletion } else { @@ -1327,36 +1186,18 @@ void BreakPoint::__format_bx_string() { } catch (const std::invalid_argument& ia) { ref = std::string(std::abs(b1.gr.pos1 - b2.gr.pos1), 'N'); std::cerr << "Caught exception in BreakPoint:setRefAlt for indel ref: " << ia.what() << std::endl; - //std::cerr << "PIECE 2 couldn't find reference sequence for ref " << b1.chr_name << " for indel, on human reference " << std::endl; - //std::cerr << (*this) << std::endl; - //exit(EXIT_FAILURE); //debug } - - //char * refi = faidx_fetch_seq(main_findex, const_cast(b1.chr_name.c_str()), b1.gr.pos1-1, b2.gr.pos1-2, &len); - //if (!refi) { - // std::cerr << "couldn't find reference sequence for ref " << b1.chr_name << " for indel, on human reference " << std::endl; - // return; - //} - //ref = std::string(refi); - //if (!ref.length()) - // ref = "N"; alt = ref.substr(0,1); - //if (refi) - // free(refi); } } if (ref.empty()) { - //std::cerr << " REF empty " << (*this) << " " << b1.chr_name << " b1.gr " << b1.gr << std::endl; ref = "N"; } if (alt.empty()) { - //std::cerr << " ALT empty " << (*this) << " " << b1.chr_name << " b1.gr " << b1.gr << std::endl; alt = "N"; } - //assert(!ref.empty()); - //assert(!alt.empty()); } int BreakPoint::getSpan() const { @@ -1427,7 +1268,6 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH homology_s = val; break; case 19: - //insertion_size = (val == "x") ? 0 : val.length(); insertion_s = val; break; case 20: cname_s = val; break; @@ -1512,11 +1352,9 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH void SampleInfo::modelSelection(double er) { - //alt = std::max(alt, cigar); - - if (alt >= cov) { + // can't have more alt reads than total reads + if (alt >= cov) cov = alt; - } // adjust the coverage to be more in line with restrictions on ALT. // namely that ALT reads must overlap the variant site by more than T_SPLIT_BUFF @@ -1528,14 +1366,11 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH } else { a_cov = cov; } - - // make sure we get correct alt - //__adjust_alt_counts(); - //alt = std::max(alt, std::max((int)supporting_reads.size(), cigar)); - af = a_cov > 0 ? (double)alt / (double)a_cov : 1; af = af > 1 ? 
1 : af; + int scaled_alt = std::min(alt, (int)a_cov); + // mutect log liklihood against error // how likely to see these ALT counts if true AF is af // vs how likely to see these ALT counts if true AF is 0 @@ -1548,12 +1383,12 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH // the indiviual calcs will have more confidence. Ultimately, // LO will represent the log likeihood that the variant is AF = af // vs AF = 0 - double ll_alt = __log_likelihood(a_cov - alt, alt, af, er); - double ll_err = __log_likelihood(a_cov - alt, alt, 0 , er); + double ll_alt = __log_likelihood(a_cov - scaled_alt, scaled_alt, af, er); + double ll_err = __log_likelihood(a_cov - scaled_alt, scaled_alt, 0 , er); LO = ll_alt - ll_err; //mutetct log likelihood normal - //er = 0.0005; // make this low, so that ALT in REF is rare and NORM in TUM gives low somatic prob + // er = 0.0005; // make this low, so that ALT in REF is rare and NORM in TUM gives low somatic prob // actually, dont' worry about it too much. 3+ alt in ref excludes somatic anyways. // so a high LO_n for the normal means that we are very confident that site is REF only in // normal sample. This is why LO_n from the normal can be used as a discriminant for @@ -1561,18 +1396,23 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH // or above a larger threshold XX if at DBSNP site, then we accept as somatic // LO_n should not be used for setting the confidence that something is real, just the // confidence that it is somatic - double ll_alt_norm = __log_likelihood(cov - alt, alt, std::max(0.5, af), er); // likelihood that variant is 0.5 - double ll_ref_norm = __log_likelihood(cov - alt, alt, 0 , er); // likelihood that varaint is actually true REF + double ll_ref_norm = __log_likelihood(cov - scaled_alt, scaled_alt, 0 , er); // likelihood that varaint is actually true REF + double ll_alt_norm = __log_likelihood(cov - scaled_alt, scaled_alt, std::max(0.5, af), er); // likelihood that variant is 0.5 LO_n = ll_ref_norm - ll_alt_norm; // higher number means more likely to be AF = 0 (ref) than AF = 0.5 (alt). 
// genotype calculation as provided in // http://bioinformatics.oxfordjournals.org/content/early/2011/09/08/bioinformatics.btr509.full.pdf+html - int scaled_cov = std::floor((double)cov * 0.90); - int this_alt = std::min(alt, scaled_cov); - - genotype_likelihoods[0] = __genotype_likelihoods(2, er, this_alt, scaled_cov); // ref/ref - genotype_likelihoods[1] = __genotype_likelihoods(1, er, this_alt, scaled_cov); - genotype_likelihoods[2] = __genotype_likelihoods(0, er, this_alt, scaled_cov); + //int scaled_cov = std::floor((double)cov * 0.90); + //int this_alt = std::min(alt, scaled_cov); + genotype_likelihoods[0] = __genotype_likelihoods(2, er, scaled_alt, a_cov); // 0/0 + genotype_likelihoods[1] = __genotype_likelihoods(1, er, scaled_alt, a_cov); // 0/1 + genotype_likelihoods[2] = __genotype_likelihoods(0, er, scaled_alt, a_cov); // 1/1 + + //debug + //std::cerr << " ALT " << alt << " scaled alt " << scaled_alt << " ER " << er << " A_COV " << a_cov << + // " 0/0 " << genotype_likelihoods[0] << " 0/1 " << genotype_likelihoods[1] << + // " 1/1 " << genotype_likelihoods[2] << " LOD " << LO << " LO_n " << LO_n << + // " af " << af << std::endl; double max_likelihood = *std::max_element(genotype_likelihoods.begin(), genotype_likelihoods.end()); if (max_likelihood == genotype_likelihoods[0]) @@ -1582,7 +1422,7 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH else genotype = "1/1"; - // scale to max + // scale GT likelihoods to max genotype_likelihoods[0] = max_likelihood - genotype_likelihoods[0]; genotype_likelihoods[1] = max_likelihood - genotype_likelihoods[1]; genotype_likelihoods[2] = max_likelihood - genotype_likelihoods[2]; @@ -1807,6 +1647,7 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH // g is the number of reference alleles (e.g. 
g = 2 is homozygous reference) // assumes biallelic model + // http://bioinformatics.oxfordjournals.org/content/early/2011/09/08/bioinformatics.btr509.full.pdf+html double SampleInfo::__genotype_likelihoods(int g, double er, int alt, int cov) { double val = - cov * log10(2) + (cov - alt) * log10( (2 - g) * er + g * (1 - er) ) + alt * log10( (2 - g) * (1 - er) + g * er); return val; diff --git a/src/Snowman/BreakPoint.h b/src/Snowman/BreakPoint.h index e7a2e19..97c913e 100644 --- a/src/Snowman/BreakPoint.h +++ b/src/Snowman/BreakPoint.h @@ -267,7 +267,7 @@ struct ReducedBreakEnd { void __set_total_reads(); - void __score_somatic(double NODBCUTOFF, double DBCUTOFF); + void score_somatic(double NODBCUTOFF, double DBCUTOFF); void addCovs(const std::unordered_map& covs); @@ -387,10 +387,10 @@ struct ReducedBreakEnd { friend std::ostream& operator<<(std::ostream& out, const BreakPoint& bp); - void __score_dscrd(int min_dscrd_size); - void __score_assembly_only(); - void __score_assembly_dscrd(); - void __score_indel(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP); + void score_dscrd(int min_dscrd_size); + void score_assembly_only(); + void score_assembly_dscrd(); + void score_indel(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP); void __format_readname_string(); void __set_homologies_insertions(); void __set_evidence(); diff --git a/src/Snowman/DiscordantCluster.cpp b/src/Snowman/DiscordantCluster.cpp index fd6d03a..8e50b48 100644 --- a/src/Snowman/DiscordantCluster.cpp +++ b/src/Snowman/DiscordantCluster.cpp @@ -610,8 +610,10 @@ using namespace SeqLib; for (auto& i : brv) { // only cluster FR reads together, RF reads together, FF together and RR together - if (i.PairOrientation() != orientation) + std::cout << i.Qname() << "\t" << i.PairOrientation() << std::endl; + if (i.PairOrientation() != orientation) { continue; + } std::string qq = i.Qname(); diff --git a/src/Snowman/LearnBamParams.cpp b/src/Snowman/LearnBamParams.cpp index 3dac572..744211b 100644 --- a/src/Snowman/LearnBamParams.cpp +++ b/src/Snowman/LearnBamParams.cpp @@ -148,84 +148,3 @@ void LearnBamParams::process_read(const SeqLib::BamRecord& r, size_t count, BamP ff->second.visited++; } - -/*void LearnBamParams::learnParams(BamParams& p, int max_count) { - - SeqLib::GenomicRegionVector grv = {SeqLib::GenomicRegion(0, 1000000,2000000), SeqLib::GenomicRegion(1,1000000,2000000)}; - SeqLib::BamWalker bwalker; - bwalker.OpenReadBam(bam); - bwalker.setBamWalkerRegions(grv); - - SeqLib::BamRecord r; - - double frac_clip = 0; - double frac_disc = 0; - double frac_bad =0; - int mapq = 0; - int readlen = 0; - double cov_count = 0; - int count = 0; - - double pos1 = 0, pos2 = 0; - int chr1 = 0; - bool rule; - - std::vector isizer; - - while (bwalker.GetNextRead(r, rule) && ++count < max_count) { - - bool qcpass = !r.DuplicateFlag() && !r.QCFailFlag() && !r.SecondaryFlag(); - if (!qcpass) - continue; - - readlen = std::max(r.Length(), readlen); - mapq = std::max(r.MapQuality(), mapq); - - // get the first read position - if (count == 1) { - pos1 = r.Position(); - chr1 = r.ChrID(); - } - - if (r.FullInsertSize() > 1000) - ++frac_disc; - if (r.NumClip() >= 5) - ++frac_clip; - if ( r.CigarSize() >= 6 || r.MapQuality() <= 5 || (r.QualitySequence().length() < 0.6 * readlen) ) { - ++frac_bad; - } - - if (r.InsertSize() > 0 && r.ProperOrientation() && r.FullInsertSize() < 10000) - isizer.push_back(r.FullInsertSize()); - - if (count % 500000 == 0) - std::cerr << "...learning from read " << r.Brief(bwalker.header()) << " at count " << SeqLib::AddCommas(count) 
<< " of " << SeqLib::AddCommas(max_count) << std::endl; - - // last read position - if (chr1 == r.ChrID() && r.MapQuality() > 0 && std::abs(r.InsertSize()) > 10 && std::abs(r.InsertSize()) < 2000) { - ++cov_count; - pos2 = r.Position(); - } - } - - double sum = std::accumulate(isizer.begin(), isizer.end(), 0.0); - p.mean_isize = isizer.size() > 0 ? sum / isizer.size() : 0; - double mm = p.mean_isize; - - p.median_isize = isizer.size() ? CalcMHWScore(isizer) : 0; - - // get isize stdev - std::vector diff(isizer.size()); - std::transform(isizer.begin(), isizer.end(), diff.begin(), [mm](double x) { return x - mm; }); - double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); - p.sd_isize = std::sqrt(sq_sum / isizer.size()); - - p.mean_cov = (pos2 - pos1 > 0) ? cov_count * readlen / (pos2 - pos1) : 0; - p.frac_disc = frac_disc / (double)count; - p.frac_bad = frac_bad / (double)count; - p.frac_clip = frac_clip / (double)count; - p.readlen = readlen; - p.max_mapq = mapq; - -} -*/ diff --git a/src/Snowman/run_snowman.cpp b/src/Snowman/run_snowman.cpp index a0fd2d8..6783235 100644 --- a/src/Snowman/run_snowman.cpp +++ b/src/Snowman/run_snowman.cpp @@ -111,7 +111,7 @@ namespace opt { static bool write_corrected_reads = false; static bool zip = false; static bool read_tracking = false; // turn on output of qnames - static bool all_contigs = false; // output all contigs + static bool all_contigs = false; // output all contigs static bool no_unfiltered = false; // don't output unfiltered variants // discordant clustering params @@ -303,7 +303,7 @@ static const char *RUN_USAGE_MESSAGE = " -m, --min-overlap Minimum read overlap, an SGA parameter. Default: 0.4* readlength\n" " -e, --error-rate Fractional difference two reads can have to overlap. See SGA. 0 is fast, but requires error correcting. [0]\n" " -K, --ec-correct-type (f) Fermi-kit BFC correction, (s) Kmer-correction from SGA, (0) no correction (then suggest non-zero -e) [f]\n" -" -E, --ec-subsample Learn from fraction of non-weird reads during error-correction. Lower number = faster compute [0.25]\n" +" -E, --ec-subsample Learn from fraction of non-weird reads during error-correction. Lower number = faster compute [0.5]\n" " --write-asqg Output an ASQG graph file for each assembly window.\n" " BWA-MEM alignment params\n" " --bwa-match-score Set the BWA-MEM match score. 
BWA-MEM -A [2]\n" @@ -783,7 +783,7 @@ void parseRunOptions(int argc, char** argv) { case OPT_SCALE_ERRORS: arg >> opt::scale_error; break; case 'C': arg >> opt::max_cov; break; case OPT_NUM_TO_SAMPLE: arg >> opt::num_to_sample; break; - case OPT_READ_TRACK: opt::read_tracking = false; break; + case OPT_READ_TRACK: opt::read_tracking = true; break; case 't': tmp = SnowmanUtils::__bamOptParse(opt::bam, arg, sample_number++, "t"); if (opt::main_bam == "-") diff --git a/src/Snowman/vcf.cpp b/src/Snowman/vcf.cpp index 53fe903..032a560 100644 --- a/src/Snowman/vcf.cpp +++ b/src/Snowman/vcf.cpp @@ -263,7 +263,7 @@ VCFFile::VCFFile(std::string file, std::string id, const SeqLib::BamHeader& h, c sv_header.addInfoField("IMPRECISE","0","Flag", "Imprecise structural variation"); sv_header.addInfoField("SECONDARY","0","Flag", "SV calls comes from a secondary alignment"); sv_header.addInfoField("HOMLEN","1","Integer","Length of base pair identical micro-homology at event breakpoints"); - sv_header.addInfoField("DBSNP","1","String","TRUE if variant overlaps a dbSNP site"); + //sv_header.addInfoField("DBSNP","1","String","TRUE if variant overlaps a dbSNP site"); //sv_header.addInfoField("BKDIST","1","Integer","Distance between breakpoints (-1 if difference chromosomes"); sv_header.addInfoField("MAPQ","1","Integer","Mapping quality (BWA-MEM) of this fragement of the contig (-1 if discordant only)"); sv_header.addInfoField("MATEMAPQ","1","Integer","Mapping quality of the partner fragment of the contig"); @@ -314,12 +314,13 @@ VCFFile::VCFFile(std::string file, std::string id, const SeqLib::BamHeader& h, c sv_header.addFormatField("SL","1","Float","Alignment-quality Scaled log-odds, where LO is LO * (MAPQ - 2*NM)/60"); indel_header.addInfoField("REPSEQ","1","String","Repeat sequence near the event"); - indel_header.addInfoField("GRAYLIST","0","Flag","Indel is low quality and cross a difficult region of genome"); + //indel_header.addInfoField("GRAYLIST","0","Flag","Indel is low quality and cross a difficult region of genome"); //indel_header.addInfoField("SOMATIC","0","Flag","Variant is somatic"); indel_header.addInfoField("PON","1","Integer","Number of normal samples that have this indel present"); indel_header.addInfoField("NM","1","Integer","Number of mismatches of this alignment fragment to reference"); indel_header.addInfoField("READNAMES",".","String","IDs of ALT reads"); indel_header.addInfoField("BX",".","String","Table of BX tag counts for supporting reads"); + indel_header.addInfoField("DBSNP","0","Flag","Variant found in dbSNP"); indel_header.addInfoField("LOD","1","Float","Log of the odds that variant is real vs artifact"); // keep track of exact positions to keep from duplicating @@ -841,10 +842,10 @@ std::unordered_map VCFEntry::fillInfoFields() const { lod_ss << std::setprecision(4) << bp->true_lod; info_fields["LOD"] = lod_ss.str(); lod_ss.str(std::string()); - if (bp->blacklist) - info_fields["GRAYLIST"] = ""; + //if (bp->blacklist) + // info_fields["GRAYLIST"] = std::string(); if (bp->dbsnp) - info_fields["DBSNP"] = "TRUE"; //bp->rs; + info_fields["DBSNP"] = std::string(); } return info_fields; diff --git a/web/batch.functions.R b/web/batch.functions.R index 6655fb9..ec5e885 100644 --- a/web/batch.functions.R +++ b/web/batch.functions.R @@ -288,20 +288,22 @@ load_indel <- function(files, mc.cores=1) { print(paste("File does not exist",x)) return (GRanges()) } - if (exists('rowRanges')) + if (exists('rowRanges')) { fff <- rowRanges(rv <- VariantAnnotation::readVcf(x, "hg19")) - else + } else 
{ fff <- rowData(rv <- VariantAnnotation::readVcf(x, "hg19")) + } + - if (length(fff) == 0) return (GRanges()) ### make sure we don't take germilne variants from strelka - if ("NT" %in% colnames(info(rv))) + if ("NT" %in% colnames(info(rv))) { non_norm_het = grepl("ref",info(rv)$NT) - else + } else { non_norm_het <- rep(TRUE, length(fff)) + } fff <- fff[ix <- (fff$FILTER == "PASS" | fff$FILTER == "alleleBias" | fff$FILTER == "QSI_ref" | fff$FILTER == "LOWLOD") & non_norm_het] if (length(fff) == 0) @@ -369,10 +371,10 @@ load_indel <- function(files, mc.cores=1) { if (sum(c("DP","AD") %in% colnames(mcols(fff)))==2) fff$AF <- ifelse(fff$DP > 0, fff$AD/fff$DP, NA) - #mcols(fff)$SPAN <- pmax(width(fff), width(unlist(fff$ALT))) - mcols(fff)$CALC_SPAN <- pmax(width(fff), fff$ALTWIDTH - 1) + mcols(fff)$SPAN <- pmax(width(fff), width(unlist(fff$ALT))) + ###mcols(fff)$CALC_SPAN <- pmax(width(fff), fff$ALTWIDTH - 1) mcols(fff)$ETYPE <- ifelse(fff$REFWIDTH > 1, 'del', 'ins') - + mcols(fff) <- cbind(VariantAnnotation::info(rv)[ix, ], mcols(fff)) return (fff) @@ -437,6 +439,7 @@ load_delly <- function(files) { mcols(del)$SPAN <- dt[mcols(del)$id]$span mcols(del)$SVTYPE <- dt[mcols(del)$id]$SVTYPE mcols(del)$CT <- dt[mcols(del)$id]$CT + mcols(del)$MAPQ <- dt[mcols(del)$id]$MAPQ if (ncol(geno(dvcf)$DV)==2) { mcols(del)$TDISC <- dt[mcols(del)$id]$TDISC mcols(del)$NDISC <- dt[mcols(del)$id]$NDISC @@ -1150,3 +1153,29 @@ span.histogram <- function(span) { ppdf(print(g), width=5, height=3) } + +load_svlib_sv <- function(x, black, mappability) { + + ss2 <- .ann_delins(load_snowman2(x)) + ss2 <- ss2[!grepl("N", mcols(ss2)$INSERTION)] + ss2 <- ss2[mcols(ss2)$MAPQ > 20] + fo <- grl.unlist(ss2) %**% black + ss2 <- ss2[setdiff(seq_along(ss2), ceiling(fo$query.id/2))] + mcols(ss2)$SPAN <- pmax(nchar(as.character(mcols(ss2)$INSERTION)), mcols(ss2)$SPAN) + ## adjust based on insertions + mcols(ss2)$deltype[nchar(mcols(ss2)$INSERTION) > mcols(ss2)$SPAN * 0.2] <- FALSE + return(ss2) + +} + +load_svlib_indel <- function(x, black, mappability) { + + ssi2 <- load_indel(x)[[1]] + fo <- ssi2 %**% black + ssi2 <- ssi2[setdiff(seq_along(ssi2), fo$query.id)] + fo <- (ssi2 %**% mappability) + ssi2$score <- NA + ssi2$score[fo$query.id] <- mappability$score[fo$subject.id] + return(ssi2) +} +
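
A note on the genotype model this patch touches: modelSelection now feeds __genotype_likelihoods the capped counts (scaled_alt, a_cov) instead of the old 0.90-scaled coverage, and the GL triplet is still rescaled against the maximum. The standalone sketch below reproduces that calculation for reference. The formula and the scale-to-max step are copied from the diff; the includes, the helper name genotype_log10_likelihood, and the example counts in main() are illustrative additions, not code from the repository.

    #include <algorithm>
    #include <array>
    #include <cmath>
    #include <cstdio>
    #include <string>

    // g = number of REF alleles (2 -> 0/0, 1 -> 0/1, 0 -> 1/1); formula as in
    // SampleInfo::__genotype_likelihoods (biallelic model, btr509 reference)
    double genotype_log10_likelihood(int g, double er, int alt, int cov) {
      return -cov * std::log10(2.0)
             + (cov - alt) * std::log10((2 - g) * er + g * (1 - er))
             + alt * std::log10((2 - g) * (1 - er) + g * er);
    }

    int main() {
      const double er = 0.005;        // illustrative per-read error rate
      const int alt = 12, cov = 40;   // illustrative ALT / usable coverage counts

      std::array<double, 3> gl = {
        genotype_log10_likelihood(2, er, alt, cov),   // 0/0
        genotype_log10_likelihood(1, er, alt, cov),   // 0/1
        genotype_log10_likelihood(0, er, alt, cov)    // 1/1
      };

      // call the most likely genotype, then rescale so the called GT sits at 0
      // (the "scale GT likelihoods to max" step in the patch)
      const double best = *std::max_element(gl.begin(), gl.end());
      const std::string gt = (best == gl[0]) ? "0/0" : (best == gl[1]) ? "0/1" : "1/1";
      for (auto& v : gl) v = best - v;

      std::printf("GT=%s  GL=%.2f,%.2f,%.2f\n", gt.c_str(), gl[0], gl[1], gl[2]);
      return 0;
    }

With the example counts (12 ALT of 40), 0/1 is called and the rescaled triplet is 0 for the called genotype with positive penalties for 0/0 and 1/1, mirroring the max_likelihood - genotype_likelihoods[i] step in the diff.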
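
Similarly, the somatic decision is now concentrated in score_somatic (renamed from __score_somatic). A condensed, compilable restatement is below for reference. It is a sketch, not the repository code: the struct, function name, and the cutoff values in main() are made up for illustration, and MIN_SOMATIC_RATIO is given the placeholder value 12 taken from the inline check this patch removes, since its real definition is outside this diff.

    #include <algorithm>

    // Placeholder: the real MIN_SOMATIC_RATIO is defined elsewhere in BreakPoint.cpp;
    // 12 comes from the removed inline check, not necessarily the current constant.
    constexpr double MIN_SOMATIC_RATIO = 12;

    struct AlleleCounts { int alt; int cov; int split; double LO_n; };

    bool somatic_call(bool is_indel, bool is_dscrd_only, bool at_dbsnp,
                      const AlleleCounts& t, const AlleleCounts& n,
                      int dc_tcount, int dc_ncount,
                      double NODBCUTOFF, double DBCUTOFF) {

      // hard reject: 3+ normal ALT reads, or tumor ALT not sufficiently enriched
      // over normal ALT at a well-covered normal site
      const double ratio = n.alt > 0 ? (double)t.alt / (double)n.alt : 100;
      if ((ratio <= MIN_SOMATIC_RATIO && n.cov > 10) || n.alt > 2)
        return false;

      bool somatic;
      if (is_indel) {
        // LOD that the normal is REF-only, against a dbSNP-aware cutoff
        somatic = n.LO_n > (at_dbsnp ? DBCUTOFF : NODBCUTOFF);
      } else {
        // SVs: hard tumor/normal read-support ratio
        const double ncount = std::max(n.split, dc_ncount);
        const double sratio = ncount > 0 ? (double)std::max(t.split, dc_tcount) / ncount : 100;
        somatic = sratio >= MIN_SOMATIC_RATIO && n.split < 2;
      }

      // discordant-only events with any normal support are treated as germline
      if (is_dscrd_only && n.alt > 0)
        somatic = false;

      return somatic;
    }

    int main() {
      AlleleCounts t = {8, 60, 8, 0.0};   // illustrative tumor counts
      AlleleCounts n = {0, 55, 0, 9.5};   // illustrative normal counts
      // cutoffs here are illustrative, not the program defaults
      return somatic_call(true, false, false, t, n, 0, 0, 2.5, 5.5) ? 0 : 1;
    }

The early return on the normal-support check mirrors the patch's behavior of never letting the dbSNP/LOD test rescue an event with 3+ normal ALT reads.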