From 5213029a04c78659b1450dfe8a6addb481faed70 Mon Sep 17 00:00:00 2001 From: walaj Date: Mon, 19 Dec 2016 00:24:20 -0500 Subject: [PATCH] dbsnp missing from indel header. cleaned some code --- src/Snowman/AlignedContig.cpp | 3 - src/Snowman/AlignmentFragment.cpp | 2 - src/Snowman/BreakPoint.cpp | 313 ++++++++---------------------- src/Snowman/BreakPoint.h | 10 +- src/Snowman/DiscordantCluster.cpp | 4 +- src/Snowman/LearnBamParams.cpp | 81 -------- src/Snowman/run_snowman.cpp | 6 +- src/Snowman/vcf.cpp | 11 +- web/batch.functions.R | 45 ++++- 9 files changed, 131 insertions(+), 344 deletions(-) diff --git a/src/Snowman/AlignedContig.cpp b/src/Snowman/AlignedContig.cpp index a4489b3..19a8a4d 100644 --- a/src/Snowman/AlignedContig.cpp +++ b/src/Snowman/AlignedContig.cpp @@ -363,8 +363,6 @@ void AlignedContig::blacklist(SeqLib::GRC &grv) { bp.secondary = a.m_align.SecondaryFlag() || b.m_align.SecondaryFlag(); - assert(bp.valid()); - // add the the vector of breakpoints if (!bp.secondary) { m_local_breaks.push_back(bp); @@ -434,7 +432,6 @@ void AlignedContig::blacklist(SeqLib::GRC &grv) { // order the breakpoint m_global_bp.order(); - assert(m_global_bp.valid()); } std::string AlignedContig::printDiscordantClusters() const { diff --git a/src/Snowman/AlignmentFragment.cpp b/src/Snowman/AlignmentFragment.cpp index f40faaa..fe504c8 100644 --- a/src/Snowman/AlignmentFragment.cpp +++ b/src/Snowman/AlignmentFragment.cpp @@ -97,7 +97,6 @@ void AlignmentFragment::SetIndels(const AlignedContig * c) { while (parseIndelBreak(bp) && fail_safe_count++ < MAX_INDELS && !m_align.SecondaryFlag()) { bp.aligned_covered = c->aligned_covered; assert(bp.aligned_covered); - assert(bp.valid()); for (auto& i : c->prefixes) bp.allele[i].indel = true; m_indel_breaks.push_back(bp); @@ -290,7 +289,6 @@ void AlignmentFragment::indelCigarMatches(const std::unordered_map 0 ? (double)t.alt / (double)n.alt : 100; - //double taf = t.cov > 0 ? (double)t.alt / (double)t.cov : 0; - bool immediate_reject = (ratio <= 12 && n.cov > 10) || n.alt >= 2; // can't call somatic with 3+ normal reads or <5x more tum than norm ALT + void BreakPoint::score_somatic(double NODBCUTOFF, double DBCUTOFF) { - double somatic_ratio = 0; - size_t ncount = 0; - - if (evidence == "INDEL") { - // this is LOD of normal being REF vs AF = 0.5+ // We want this to be high for a somatic call somatic_lod = n.LO_n; - - if (rs.empty()) - somatic_score = somatic_lod > NODBCUTOFF && !immediate_reject; - else - somatic_score = somatic_lod > DBCUTOFF && !immediate_reject; - - // reject if reall low AF and at DBSNP - //if (taf < 0.2 && !rs.empty()) - // somatic_score = 0; - - } else { - somatic_lod = n.LO_n; - - // old model - ncount = std::max(n.split, dc.ncount); - somatic_ratio = ncount > 0 ? std::max(t.split,dc.tcount)/ncount : 100; + // can't call somatic with 3+ normal reads or <5x more tum than norm ALT + double ratio = n.alt > 0 ? (double)t.alt / (double)n.alt : 100; + if (( ratio <= MIN_SOMATIC_RATIO && n.cov > 10) || n.alt > 2) { + somatic_score = 0; + return; + } + + if (evidence == "INDEL") { + // somatic score is just true or false for now + // use the specified cutoff for indels, taking into account whether at dbsnp site + somatic_score = somatic_lod > (rs.empty() ? NODBCUTOFF : DBCUTOFF); + + // for SVs, just use a hard cutoff for gauging somatic vs germline + } else { + double ncount = std::max(n.split, dc.ncount); + double somatic_ratio = ncount > 0 ? 
(double)std::max(t.split,dc.tcount)/ncount : 100; somatic_score = somatic_ratio >= MIN_SOMATIC_RATIO && n.split < 2; } - - // kill all if too many normal support - if (n.alt > 2) - somatic_score = 0; - - // kill if single normal read in discordant clsuter + + // set germline if single normal read in discordant clsuter if (evidence == "DSCRD" && n.alt > 0) somatic_score = 0; - - // kill if bad ratio - double r = n.alt > 0 ? (double)t.alt / (double)n.alt : 100; - if (r < 10) - somatic_score = 0; - -} - - /* void BreakPoint::__set_total_reads() { - - // total unique read support - - t_reads = 0; - n_reads = 0; - - for (auto& a : allele) { - if (a.first.at(0) == 't') - t_reads += a.alt; - else - n_reads += a.alt; - } - - //if (evidence == "ASDIS") { - std::unordered_set this_reads_t; - std::unordered_set this_reads_n; - for (auto& i : split_reads) - if (!this_reads_t.count(i) && i.at(0) == 't') - this_reads_t.insert(i); - for (auto& i : dc.reads) - if (!this_reads_t.count(i.first) && i.first.at(0) == 't') - this_reads_t.insert(i.first); - for (auto& i : split_reads) - if (!this_reads_n.count(i) && i.at(0) == 'n') - this_reads_n.insert(i); - for (auto& i : dc.reads) - if (!this_reads_n.count(i.first) && i.first.at(0) == 'n') - this_reads_n.insert(i.first); - - t_reads = this_reads_t.size(); - n_reads = this_reads_n.size(); - + } - */ - void BreakPoint::__score_assembly_dscrd() { + void BreakPoint::score_assembly_dscrd() { int this_mapq1 = b1.mapq; int this_mapq2 = b2.mapq; @@ -910,9 +856,6 @@ BreakEnd::BreakEnd(const SeqLib::BamRecord& b) { int max_a_mapq = std::max(this_mapq1, dc.mapq1); int max_b_mapq = std::max(this_mapq2, dc.mapq2); - //int min_assm_mapq = std::min(this_mapq1, this_mapq2); - //int max_assm_mapq = std::max(this_mapq1, this_mapq2); - // how much of contig is covered by split reads int cov_span = split_cov_bounds.second - split_cov_bounds.first ; @@ -957,7 +900,7 @@ BreakEnd::BreakEnd(const SeqLib::BamRecord& b) { confidence = "PASS"; } - void BreakPoint::__score_dscrd(int min_dscrd_size) { + void BreakPoint::score_dscrd(int min_dscrd_size) { t.alt = dc.tcount; n.alt = dc.ncount; @@ -973,7 +916,7 @@ BreakEnd::BreakEnd(const SeqLib::BamRecord& b) { confidence = "LOWMAPQDISC"; else if (!dc.m_id_competing.empty()) confidence = "COMPETEDISC"; - else if ( disc_count < disc_cutoff) // || (dc.ncount > 0 && disc_count < 15) ) // be stricter about germline disc only + else if ( disc_count < disc_cutoff) confidence = "WEAKDISC"; else confidence = "PASS"; @@ -981,42 +924,42 @@ BreakEnd::BreakEnd(const SeqLib::BamRecord& b) { assert(confidence.length()); } -void BreakPoint::__score_indel(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP) { +void BreakPoint::score_indel(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP) { assert(b1.mapq == b2.mapq); - bool is_refilter = !confidence.empty(); // act differently if this is refilter runx + bool is_refilter = !confidence.empty(); // act differently if this is refilter run // for refilter, only consider ones that were low lod or PASS // ie ones that with a different lod threshold may be changed // if confidence is empty, this is original run so keep going if (confidence != "LOWLOD" && confidence != "PASS" && is_refilter) return; - - //double ratio = n.alt > 0 ? 
t.alt / n.alt : 100; - + double max_lod = 0; for (auto& s : allele) max_lod = std::max(max_lod, s.second.LO); - bool homozygous_ref = false; + // check if homozygous reference is most likely GT + bool homozygous_ref = true; for (auto& s : allele) { - if (s.second.genotype_likelihoods[0] < s.second.genotype_likelihoods[1] && - s.second.genotype_likelihoods[0] < s.second.genotype_likelihoods[2]) - homozygous_ref = true; + if (s.second.genotype_likelihoods[0] > s.second.genotype_likelihoods[1] || + s.second.genotype_likelihoods[0] > s.second.genotype_likelihoods[2]) + homozygous_ref = false; } + // get the allelic fractions, just for VLOWAF filter double af_t = t.cov > 0 ? (double)t.alt / (double)t.cov : 0; double af_n = n.cov > 0 ? (double)n.alt / (double)n.cov : 0; double af = std::max(af_t, af_n); - if (b1.mapq < 10) // || std::max(b1.nm, b2.nm) > 6) + if (b1.mapq < 10) confidence="LOWMAPQ"; else if (!is_refilter && (double)aligned_covered / (double)seq.length() < 0.80) // less than 80% of read is covered by some alignment confidence = "LOWAS"; else if ((b1.sub_n && b1.mapq < 50) || (b2.sub_n && b2.mapq < 50)) confidence = "MULTIMATCH"; - else if (max_lod < LOD_CUTOFF && rs.empty()) // non db snp site + else if (max_lod < LOD_CUTOFF && rs.empty()) // non db snp site confidence = "LOWLOD"; else if (max_lod < LOD_CUTOFF_DBSNP && !rs.empty()) // be more permissive for dbsnp site confidence = "LOWLOD"; @@ -1057,15 +1000,15 @@ void BreakPoint::scoreBreakpoint(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP, dou n.disc = dc.ncount; } - // scale the LOD for MAPQ + // provide a scaled LOD that accounts for MAPQ. Heuristic, not really used int mapqr1 = b1.local ? std::max(30, b1.mapq) : b1.mapq; // if local, don't drop below 30 int mapqr2 = b2.local ? std::max(30, b2.mapq) : b2.mapq; // if local (aligns to within window), don't drop below 30 double scale = (double)( std::min(mapqr1, mapqr2) - 2 * b1.nm) / (double)60; for (auto& i : allele) i.second.SLO = i.second.LO * scale; - t.LO = t.SLO * scale; - n.LO = n.SLO * scale; - a.LO = a.SLO * scale; + t.SLO = t.LO * scale; + n.SLO = n.LO * scale; + a.SLO = a.LO * scale; // sanity check int split =0; @@ -1075,36 +1018,26 @@ void BreakPoint::scoreBreakpoint(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP, dou // do the scoring if (evidence == "ASSMB" || (evidence == "COMPL" && (dc.ncount + dc.tcount)==0)) - __score_assembly_only(); + score_assembly_only(); if (evidence == "ASDIS" || (evidence == "COMPL" && (dc.ncount + dc.tcount))) - __score_assembly_dscrd(); + score_assembly_dscrd(); if (evidence == "DSCRD") - __score_dscrd(min_dscrd_size); - if (evidence == "ASDIS" && confidence != "PASS") { + score_dscrd(min_dscrd_size); + // it failed assembly filters, but might pass discordant filters + if (evidence == "ASDIS" && confidence != "PASS") { evidence = "DSCRD"; - __score_dscrd(min_dscrd_size); + score_dscrd(min_dscrd_size); } else if (evidence == "INDEL") - __score_indel(LOD_CUTOFF, LOD_CUTOFF_DBSNP); - - // - __score_somatic(LOD_CUTOFF_SOMATIC, LOD_CUTOFF_SOMATIC_DBSNP); - - //if (confidence == "PASS") - // quality = 99; - //else - // quality = 0; - quality = t.NH_GQ; // a.SLO; + score_indel(LOD_CUTOFF, LOD_CUTOFF_DBSNP); - double LR = -1000000; - for (auto& i : allele) { - LR = std::max(-i.second.LO_n, LR); - // neg because want to evaluate likelihood that is ALT in normal - // the original use was to get LL that is REF in normal (in order to call somatic) - // but for next filter, we want to see if we are confident in the germline call - } - - assert(getSpan() 
> -2); + // set the somatic_score field to true or false + score_somatic(LOD_CUTOFF_SOMATIC, LOD_CUTOFF_SOMATIC_DBSNP); + // quality score is odds that read is non-homozygous reference (max 99) + quality = 0; + for (auto& a : allele) + quality = std::max(a.second.NH_GQ, (double)quality); + } void BreakPoint::order() { @@ -1187,24 +1120,6 @@ void BreakPoint::__format_bx_string() { } - bool BreakPoint::valid() const { - - // debug - return true; - - if (!(b1.gr.strand == '+' || b1.gr.strand == '-') || !(b2.gr.strand == '+' || b2.gr.strand == '-')) { - std::cerr << "b1.strand " << b1.gr.strand << " b2.strand " << b2.gr.strand << std::endl; - return false; - } - - // b1 is less than b2, or the same but signifies inverted connection - if ((b1.gr < b2.gr) || (b1.gr.chr == b2.gr.chr && b1.gr.pos1 == b2.gr.pos1)) - return true; - - std::cerr << b1.gr << " " << b2.gr << std::endl; - return false; - } - void BreakPoint::setRefAlt(const RefGenome * main_rg, const RefGenome * viral) { assert(!main_rg->IsEmpty()); @@ -1246,44 +1161,6 @@ void BreakPoint::__format_bx_string() { alt = "N"; std::cerr << "Caught exception in BreakPoint:setRefAlt for SV Alt: " << ia.what() << std::endl; } - - /*char * ref1 = faidx_fetch_seq(main_findex, const_cast(b1.chr_name.c_str()), b1.gr.pos1-1, b1.gr.pos1-1, &len); - if (!ref1) { - if (viral_findex) - ref1 = faidx_fetch_seq(viral_findex, const_cast(b1.chr_name.c_str()), b1.gr.pos1-1, b1.gr.pos1-1, &len); - } - if (!ref1) { - std::cerr << "couldn't find reference on BP1 for ref " << b1.chr_name << " in either viral or human" << std::endl; - } - - char * ref2 = faidx_fetch_seq(main_findex, const_cast(b2.chr_name.c_str()), b2.gr.pos1-1, b2.gr.pos1-1, &len); - if (!ref2) { - if (viral_findex) - ref2 = faidx_fetch_seq(viral_findex, const_cast(b2.chr_name.c_str()), b2.gr.pos1-1, b2.gr.pos1-1, &len); - } - if (!ref2) { - std::cerr << "couldn't find reference on BP2 for ref " << b2.chr_name << " in either viral or human" << std::endl; - } - - // couldn't find something, so bail - if (!ref1 || !ref2) - return; - - // by convention, set ref to 1 and alt to 2. 
Gets sorted in VCF creation - ref = std::string(ref1); - alt = std::string(ref2); - - */ - - //if (!ref.length()) - // ref = "N"; - //if (!alt.length()) - // alt = "N"; - - //if (ref1) - //free(ref1); - //if (ref2) - // free(ref2); } else { @@ -1294,29 +1171,11 @@ void BreakPoint::__format_bx_string() { } catch (const std::invalid_argument& ia) { ref = "N"; std::cerr << "Caught exception in BreakPoint:setRefAlt for indel ref: " << ia.what() << std::endl; - //std::cerr << "couldn't find reference sequence for ref " << b1.chr_name << " for indel, on human reference " << std::endl; - //std::cerr << (*this) << std::endl; - //exit(EXIT_FAILURE); //debug - //return; } - - // reference - //char * refi = faidx_fetch_seq(main_findex, const_cast(b1.chr_name.c_str()), b1.gr.pos1-1, b1.gr.pos1-1, &len); - //if (!refi) { - // std::cerr << "couldn't find reference sequence for ref " << b1.chr_name << " for indel, on human reference " << std::endl; - // return; - //} - //ref = std::string(refi); - //if (!ref.length()) { - // ref = "N"; - //} // alt alt = ref + insertion; - //if (refi) - // free(refi); - // deletion } else { @@ -1327,36 +1186,18 @@ void BreakPoint::__format_bx_string() { } catch (const std::invalid_argument& ia) { ref = std::string(std::abs(b1.gr.pos1 - b2.gr.pos1), 'N'); std::cerr << "Caught exception in BreakPoint:setRefAlt for indel ref: " << ia.what() << std::endl; - //std::cerr << "PIECE 2 couldn't find reference sequence for ref " << b1.chr_name << " for indel, on human reference " << std::endl; - //std::cerr << (*this) << std::endl; - //exit(EXIT_FAILURE); //debug } - - //char * refi = faidx_fetch_seq(main_findex, const_cast(b1.chr_name.c_str()), b1.gr.pos1-1, b2.gr.pos1-2, &len); - //if (!refi) { - // std::cerr << "couldn't find reference sequence for ref " << b1.chr_name << " for indel, on human reference " << std::endl; - // return; - //} - //ref = std::string(refi); - //if (!ref.length()) - // ref = "N"; alt = ref.substr(0,1); - //if (refi) - // free(refi); } } if (ref.empty()) { - //std::cerr << " REF empty " << (*this) << " " << b1.chr_name << " b1.gr " << b1.gr << std::endl; ref = "N"; } if (alt.empty()) { - //std::cerr << " ALT empty " << (*this) << " " << b1.chr_name << " b1.gr " << b1.gr << std::endl; alt = "N"; } - //assert(!ref.empty()); - //assert(!alt.empty()); } int BreakPoint::getSpan() const { @@ -1427,7 +1268,6 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH homology_s = val; break; case 19: - //insertion_size = (val == "x") ? 0 : val.length(); insertion_s = val; break; case 20: cname_s = val; break; @@ -1512,11 +1352,9 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH void SampleInfo::modelSelection(double er) { - //alt = std::max(alt, cigar); - - if (alt >= cov) { + // can't have more alt reads than total reads + if (alt >= cov) cov = alt; - } // adjust the coverage to be more in line with restrictions on ALT. // namely that ALT reads must overlap the variant site by more than T_SPLIT_BUFF @@ -1528,14 +1366,11 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH } else { a_cov = cov; } - - // make sure we get correct alt - //__adjust_alt_counts(); - //alt = std::max(alt, std::max((int)supporting_reads.size(), cigar)); - af = a_cov > 0 ? (double)alt / (double)a_cov : 1; af = af > 1 ? 
1 : af; + int scaled_alt = std::min(alt, (int)a_cov); + // mutect log liklihood against error // how likely to see these ALT counts if true AF is af // vs how likely to see these ALT counts if true AF is 0 @@ -1548,12 +1383,12 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH // the indiviual calcs will have more confidence. Ultimately, // LO will represent the log likeihood that the variant is AF = af // vs AF = 0 - double ll_alt = __log_likelihood(a_cov - alt, alt, af, er); - double ll_err = __log_likelihood(a_cov - alt, alt, 0 , er); + double ll_alt = __log_likelihood(a_cov - scaled_alt, scaled_alt, af, er); + double ll_err = __log_likelihood(a_cov - scaled_alt, scaled_alt, 0 , er); LO = ll_alt - ll_err; //mutetct log likelihood normal - //er = 0.0005; // make this low, so that ALT in REF is rare and NORM in TUM gives low somatic prob + // er = 0.0005; // make this low, so that ALT in REF is rare and NORM in TUM gives low somatic prob // actually, dont' worry about it too much. 3+ alt in ref excludes somatic anyways. // so a high LO_n for the normal means that we are very confident that site is REF only in // normal sample. This is why LO_n from the normal can be used as a discriminant for @@ -1561,18 +1396,23 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH // or above a larger threshold XX if at DBSNP site, then we accept as somatic // LO_n should not be used for setting the confidence that something is real, just the // confidence that it is somatic - double ll_alt_norm = __log_likelihood(cov - alt, alt, std::max(0.5, af), er); // likelihood that variant is 0.5 - double ll_ref_norm = __log_likelihood(cov - alt, alt, 0 , er); // likelihood that varaint is actually true REF + double ll_ref_norm = __log_likelihood(cov - scaled_alt, scaled_alt, 0 , er); // likelihood that varaint is actually true REF + double ll_alt_norm = __log_likelihood(cov - scaled_alt, scaled_alt, std::max(0.5, af), er); // likelihood that variant is 0.5 LO_n = ll_ref_norm - ll_alt_norm; // higher number means more likely to be AF = 0 (ref) than AF = 0.5 (alt). 
// genotype calculation as provided in // http://bioinformatics.oxfordjournals.org/content/early/2011/09/08/bioinformatics.btr509.full.pdf+html - int scaled_cov = std::floor((double)cov * 0.90); - int this_alt = std::min(alt, scaled_cov); - - genotype_likelihoods[0] = __genotype_likelihoods(2, er, this_alt, scaled_cov); // ref/ref - genotype_likelihoods[1] = __genotype_likelihoods(1, er, this_alt, scaled_cov); - genotype_likelihoods[2] = __genotype_likelihoods(0, er, this_alt, scaled_cov); + //int scaled_cov = std::floor((double)cov * 0.90); + //int this_alt = std::min(alt, scaled_cov); + genotype_likelihoods[0] = __genotype_likelihoods(2, er, scaled_alt, a_cov); // 0/0 + genotype_likelihoods[1] = __genotype_likelihoods(1, er, scaled_alt, a_cov); // 0/1 + genotype_likelihoods[2] = __genotype_likelihoods(0, er, scaled_alt, a_cov); // 1/1 + + //debug + //std::cerr << " ALT " << alt << " scaled alt " << scaled_alt << " ER " << er << " A_COV " << a_cov << + // " 0/0 " << genotype_likelihoods[0] << " 0/1 " << genotype_likelihoods[1] << + // " 1/1 " << genotype_likelihoods[2] << " LOD " << LO << " LO_n " << LO_n << + // " af " << af << std::endl; double max_likelihood = *std::max_element(genotype_likelihoods.begin(), genotype_likelihoods.end()); if (max_likelihood == genotype_likelihoods[0]) @@ -1582,7 +1422,7 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH else genotype = "1/1"; - // scale to max + // scale GT likelihoods to max genotype_likelihoods[0] = max_likelihood - genotype_likelihoods[0]; genotype_likelihoods[1] = max_likelihood - genotype_likelihoods[1]; genotype_likelihoods[2] = max_likelihood - genotype_likelihoods[2]; @@ -1807,6 +1647,7 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH // g is the number of reference alleles (e.g. 
g = 2 is homozygous reference) // assumes biallelic model + // http://bioinformatics.oxfordjournals.org/content/early/2011/09/08/bioinformatics.btr509.full.pdf+html double SampleInfo::__genotype_likelihoods(int g, double er, int alt, int cov) { double val = - cov * log10(2) + (cov - alt) * log10( (2 - g) * er + g * (1 - er) ) + alt * log10( (2 - g) * (1 - er) + g * er); return val; diff --git a/src/Snowman/BreakPoint.h b/src/Snowman/BreakPoint.h index e7a2e19..97c913e 100644 --- a/src/Snowman/BreakPoint.h +++ b/src/Snowman/BreakPoint.h @@ -267,7 +267,7 @@ struct ReducedBreakEnd { void __set_total_reads(); - void __score_somatic(double NODBCUTOFF, double DBCUTOFF); + void score_somatic(double NODBCUTOFF, double DBCUTOFF); void addCovs(const std::unordered_map& covs); @@ -387,10 +387,10 @@ struct ReducedBreakEnd { friend std::ostream& operator<<(std::ostream& out, const BreakPoint& bp); - void __score_dscrd(int min_dscrd_size); - void __score_assembly_only(); - void __score_assembly_dscrd(); - void __score_indel(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP); + void score_dscrd(int min_dscrd_size); + void score_assembly_only(); + void score_assembly_dscrd(); + void score_indel(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP); void __format_readname_string(); void __set_homologies_insertions(); void __set_evidence(); diff --git a/src/Snowman/DiscordantCluster.cpp b/src/Snowman/DiscordantCluster.cpp index fd6d03a..8e50b48 100644 --- a/src/Snowman/DiscordantCluster.cpp +++ b/src/Snowman/DiscordantCluster.cpp @@ -610,8 +610,10 @@ using namespace SeqLib; for (auto& i : brv) { // only cluster FR reads together, RF reads together, FF together and RR together - if (i.PairOrientation() != orientation) + std::cout << i.Qname() << "\t" << i.PairOrientation() << std::endl; + if (i.PairOrientation() != orientation) { continue; + } std::string qq = i.Qname(); diff --git a/src/Snowman/LearnBamParams.cpp b/src/Snowman/LearnBamParams.cpp index 3dac572..744211b 100644 --- a/src/Snowman/LearnBamParams.cpp +++ b/src/Snowman/LearnBamParams.cpp @@ -148,84 +148,3 @@ void LearnBamParams::process_read(const SeqLib::BamRecord& r, size_t count, BamP ff->second.visited++; } - -/*void LearnBamParams::learnParams(BamParams& p, int max_count) { - - SeqLib::GenomicRegionVector grv = {SeqLib::GenomicRegion(0, 1000000,2000000), SeqLib::GenomicRegion(1,1000000,2000000)}; - SeqLib::BamWalker bwalker; - bwalker.OpenReadBam(bam); - bwalker.setBamWalkerRegions(grv); - - SeqLib::BamRecord r; - - double frac_clip = 0; - double frac_disc = 0; - double frac_bad =0; - int mapq = 0; - int readlen = 0; - double cov_count = 0; - int count = 0; - - double pos1 = 0, pos2 = 0; - int chr1 = 0; - bool rule; - - std::vector isizer; - - while (bwalker.GetNextRead(r, rule) && ++count < max_count) { - - bool qcpass = !r.DuplicateFlag() && !r.QCFailFlag() && !r.SecondaryFlag(); - if (!qcpass) - continue; - - readlen = std::max(r.Length(), readlen); - mapq = std::max(r.MapQuality(), mapq); - - // get the first read position - if (count == 1) { - pos1 = r.Position(); - chr1 = r.ChrID(); - } - - if (r.FullInsertSize() > 1000) - ++frac_disc; - if (r.NumClip() >= 5) - ++frac_clip; - if ( r.CigarSize() >= 6 || r.MapQuality() <= 5 || (r.QualitySequence().length() < 0.6 * readlen) ) { - ++frac_bad; - } - - if (r.InsertSize() > 0 && r.ProperOrientation() && r.FullInsertSize() < 10000) - isizer.push_back(r.FullInsertSize()); - - if (count % 500000 == 0) - std::cerr << "...learning from read " << r.Brief(bwalker.header()) << " at count " << SeqLib::AddCommas(count) 
<< " of " << SeqLib::AddCommas(max_count) << std::endl; - - // last read position - if (chr1 == r.ChrID() && r.MapQuality() > 0 && std::abs(r.InsertSize()) > 10 && std::abs(r.InsertSize()) < 2000) { - ++cov_count; - pos2 = r.Position(); - } - } - - double sum = std::accumulate(isizer.begin(), isizer.end(), 0.0); - p.mean_isize = isizer.size() > 0 ? sum / isizer.size() : 0; - double mm = p.mean_isize; - - p.median_isize = isizer.size() ? CalcMHWScore(isizer) : 0; - - // get isize stdev - std::vector diff(isizer.size()); - std::transform(isizer.begin(), isizer.end(), diff.begin(), [mm](double x) { return x - mm; }); - double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); - p.sd_isize = std::sqrt(sq_sum / isizer.size()); - - p.mean_cov = (pos2 - pos1 > 0) ? cov_count * readlen / (pos2 - pos1) : 0; - p.frac_disc = frac_disc / (double)count; - p.frac_bad = frac_bad / (double)count; - p.frac_clip = frac_clip / (double)count; - p.readlen = readlen; - p.max_mapq = mapq; - -} -*/ diff --git a/src/Snowman/run_snowman.cpp b/src/Snowman/run_snowman.cpp index a0fd2d8..6783235 100644 --- a/src/Snowman/run_snowman.cpp +++ b/src/Snowman/run_snowman.cpp @@ -111,7 +111,7 @@ namespace opt { static bool write_corrected_reads = false; static bool zip = false; static bool read_tracking = false; // turn on output of qnames - static bool all_contigs = false; // output all contigs + static bool all_contigs = false; // output all contigs static bool no_unfiltered = false; // don't output unfiltered variants // discordant clustering params @@ -303,7 +303,7 @@ static const char *RUN_USAGE_MESSAGE = " -m, --min-overlap Minimum read overlap, an SGA parameter. Default: 0.4* readlength\n" " -e, --error-rate Fractional difference two reads can have to overlap. See SGA. 0 is fast, but requires error correcting. [0]\n" " -K, --ec-correct-type (f) Fermi-kit BFC correction, (s) Kmer-correction from SGA, (0) no correction (then suggest non-zero -e) [f]\n" -" -E, --ec-subsample Learn from fraction of non-weird reads during error-correction. Lower number = faster compute [0.25]\n" +" -E, --ec-subsample Learn from fraction of non-weird reads during error-correction. Lower number = faster compute [0.5]\n" " --write-asqg Output an ASQG graph file for each assembly window.\n" " BWA-MEM alignment params\n" " --bwa-match-score Set the BWA-MEM match score. 
BWA-MEM -A [2]\n" @@ -783,7 +783,7 @@ void parseRunOptions(int argc, char** argv) { case OPT_SCALE_ERRORS: arg >> opt::scale_error; break; case 'C': arg >> opt::max_cov; break; case OPT_NUM_TO_SAMPLE: arg >> opt::num_to_sample; break; - case OPT_READ_TRACK: opt::read_tracking = false; break; + case OPT_READ_TRACK: opt::read_tracking = true; break; case 't': tmp = SnowmanUtils::__bamOptParse(opt::bam, arg, sample_number++, "t"); if (opt::main_bam == "-") diff --git a/src/Snowman/vcf.cpp b/src/Snowman/vcf.cpp index 53fe903..032a560 100644 --- a/src/Snowman/vcf.cpp +++ b/src/Snowman/vcf.cpp @@ -263,7 +263,7 @@ VCFFile::VCFFile(std::string file, std::string id, const SeqLib::BamHeader& h, c sv_header.addInfoField("IMPRECISE","0","Flag", "Imprecise structural variation"); sv_header.addInfoField("SECONDARY","0","Flag", "SV calls comes from a secondary alignment"); sv_header.addInfoField("HOMLEN","1","Integer","Length of base pair identical micro-homology at event breakpoints"); - sv_header.addInfoField("DBSNP","1","String","TRUE if variant overlaps a dbSNP site"); + //sv_header.addInfoField("DBSNP","1","String","TRUE if variant overlaps a dbSNP site"); //sv_header.addInfoField("BKDIST","1","Integer","Distance between breakpoints (-1 if difference chromosomes"); sv_header.addInfoField("MAPQ","1","Integer","Mapping quality (BWA-MEM) of this fragement of the contig (-1 if discordant only)"); sv_header.addInfoField("MATEMAPQ","1","Integer","Mapping quality of the partner fragment of the contig"); @@ -314,12 +314,13 @@ VCFFile::VCFFile(std::string file, std::string id, const SeqLib::BamHeader& h, c sv_header.addFormatField("SL","1","Float","Alignment-quality Scaled log-odds, where LO is LO * (MAPQ - 2*NM)/60"); indel_header.addInfoField("REPSEQ","1","String","Repeat sequence near the event"); - indel_header.addInfoField("GRAYLIST","0","Flag","Indel is low quality and cross a difficult region of genome"); + //indel_header.addInfoField("GRAYLIST","0","Flag","Indel is low quality and cross a difficult region of genome"); //indel_header.addInfoField("SOMATIC","0","Flag","Variant is somatic"); indel_header.addInfoField("PON","1","Integer","Number of normal samples that have this indel present"); indel_header.addInfoField("NM","1","Integer","Number of mismatches of this alignment fragment to reference"); indel_header.addInfoField("READNAMES",".","String","IDs of ALT reads"); indel_header.addInfoField("BX",".","String","Table of BX tag counts for supporting reads"); + indel_header.addInfoField("DBSNP","0","Flag","Variant found in dbSNP"); indel_header.addInfoField("LOD","1","Float","Log of the odds that variant is real vs artifact"); // keep track of exact positions to keep from duplicating @@ -841,10 +842,10 @@ std::unordered_map VCFEntry::fillInfoFields() const { lod_ss << std::setprecision(4) << bp->true_lod; info_fields["LOD"] = lod_ss.str(); lod_ss.str(std::string()); - if (bp->blacklist) - info_fields["GRAYLIST"] = ""; + //if (bp->blacklist) + // info_fields["GRAYLIST"] = std::string(); if (bp->dbsnp) - info_fields["DBSNP"] = "TRUE"; //bp->rs; + info_fields["DBSNP"] = std::string(); } return info_fields; diff --git a/web/batch.functions.R b/web/batch.functions.R index 6655fb9..ec5e885 100644 --- a/web/batch.functions.R +++ b/web/batch.functions.R @@ -288,20 +288,22 @@ load_indel <- function(files, mc.cores=1) { print(paste("File does not exist",x)) return (GRanges()) } - if (exists('rowRanges')) + if (exists('rowRanges')) { fff <- rowRanges(rv <- VariantAnnotation::readVcf(x, "hg19")) - else + } else 
{ fff <- rowData(rv <- VariantAnnotation::readVcf(x, "hg19")) + } + - if (length(fff) == 0) return (GRanges()) ### make sure we don't take germilne variants from strelka - if ("NT" %in% colnames(info(rv))) + if ("NT" %in% colnames(info(rv))) { non_norm_het = grepl("ref",info(rv)$NT) - else + } else { non_norm_het <- rep(TRUE, length(fff)) + } fff <- fff[ix <- (fff$FILTER == "PASS" | fff$FILTER == "alleleBias" | fff$FILTER == "QSI_ref" | fff$FILTER == "LOWLOD") & non_norm_het] if (length(fff) == 0) @@ -369,10 +371,10 @@ load_indel <- function(files, mc.cores=1) { if (sum(c("DP","AD") %in% colnames(mcols(fff)))==2) fff$AF <- ifelse(fff$DP > 0, fff$AD/fff$DP, NA) - #mcols(fff)$SPAN <- pmax(width(fff), width(unlist(fff$ALT))) - mcols(fff)$CALC_SPAN <- pmax(width(fff), fff$ALTWIDTH - 1) + mcols(fff)$SPAN <- pmax(width(fff), width(unlist(fff$ALT))) + ###mcols(fff)$CALC_SPAN <- pmax(width(fff), fff$ALTWIDTH - 1) mcols(fff)$ETYPE <- ifelse(fff$REFWIDTH > 1, 'del', 'ins') - + mcols(fff) <- cbind(VariantAnnotation::info(rv)[ix, ], mcols(fff)) return (fff) @@ -437,6 +439,7 @@ load_delly <- function(files) { mcols(del)$SPAN <- dt[mcols(del)$id]$span mcols(del)$SVTYPE <- dt[mcols(del)$id]$SVTYPE mcols(del)$CT <- dt[mcols(del)$id]$CT + mcols(del)$MAPQ <- dt[mcols(del)$id]$MAPQ if (ncol(geno(dvcf)$DV)==2) { mcols(del)$TDISC <- dt[mcols(del)$id]$TDISC mcols(del)$NDISC <- dt[mcols(del)$id]$NDISC @@ -1150,3 +1153,29 @@ span.histogram <- function(span) { ppdf(print(g), width=5, height=3) } + +load_svlib_sv <- function(x, black, mappability) { + + ss2 <- .ann_delins(load_snowman2(x)) + ss2 <- ss2[!grepl("N", mcols(ss2)$INSERTION)] + ss2 <- ss2[mcols(ss2)$MAPQ > 20] + fo <- grl.unlist(ss2) %**% black + ss2 <- ss2[setdiff(seq_along(ss2), ceiling(fo$query.id/2))] + mcols(ss2)$SPAN <- pmax(nchar(as.character(mcols(ss2)$INSERTION)), mcols(ss2)$SPAN) + ## adjust based on insertions + mcols(ss2)$deltype[nchar(mcols(ss2)$INSERTION) > mcols(ss2)$SPAN * 0.2] <- FALSE + return(ss2) + +} + +load_svlib_indel <- function(x, black, mappability) { + + ssi2 <- load_indel(x)[[1]] + fo <- ssi2 %**% black + ssi2 <- ssi2[setdiff(seq_along(ssi2), fo$query.id)] + fo <- (ssi2 %**% mappability) + ssi2$score <- NA + ssi2$score[fo$query.id] <- mappability$score[fo$subject.id] + return(ssi2) +} +
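
A note on the genotype model this patch touches: modelSelection now feeds __genotype_likelihoods the capped counts (scaled_alt, a_cov) instead of the old 0.90-scaled coverage, and the GL triplet is still rescaled against the maximum. The standalone sketch below reproduces that calculation for reference. The formula and the scale-to-max step are copied from the diff; the includes, the helper name genotype_log10_likelihood, and the example counts in main() are illustrative additions, not code from the repository.

    #include <algorithm>
    #include <array>
    #include <cmath>
    #include <cstdio>
    #include <string>

    // g = number of REF alleles (2 -> 0/0, 1 -> 0/1, 0 -> 1/1); formula as in
    // SampleInfo::__genotype_likelihoods (biallelic model, btr509 reference)
    double genotype_log10_likelihood(int g, double er, int alt, int cov) {
      return -cov * std::log10(2.0)
             + (cov - alt) * std::log10((2 - g) * er + g * (1 - er))
             + alt * std::log10((2 - g) * (1 - er) + g * er);
    }

    int main() {
      const double er = 0.005;        // illustrative per-read error rate
      const int alt = 12, cov = 40;   // illustrative ALT / usable coverage counts

      std::array<double, 3> gl = {
        genotype_log10_likelihood(2, er, alt, cov),   // 0/0
        genotype_log10_likelihood(1, er, alt, cov),   // 0/1
        genotype_log10_likelihood(0, er, alt, cov)    // 1/1
      };

      // call the most likely genotype, then rescale so the called GT sits at 0
      // (the "scale GT likelihoods to max" step in the patch)
      const double best = *std::max_element(gl.begin(), gl.end());
      const std::string gt = (best == gl[0]) ? "0/0" : (best == gl[1]) ? "0/1" : "1/1";
      for (auto& v : gl) v = best - v;

      std::printf("GT=%s  GL=%.2f,%.2f,%.2f\n", gt.c_str(), gl[0], gl[1], gl[2]);
      return 0;
    }

With the example counts (12 ALT of 40), 0/1 is called and the rescaled triplet is 0 for the called genotype with positive penalties for 0/0 and 1/1, mirroring the max_likelihood - genotype_likelihoods[i] step in the diff.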
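
Similarly, the somatic decision is now concentrated in score_somatic (renamed from __score_somatic). A condensed, compilable restatement is below for reference. It is a sketch, not the repository code: the struct, function name, and the cutoff values in main() are made up for illustration, and MIN_SOMATIC_RATIO is given the placeholder value 12 taken from the inline check this patch removes, since its real definition is outside this diff.

    #include <algorithm>

    // Placeholder: the real MIN_SOMATIC_RATIO is defined elsewhere in BreakPoint.cpp;
    // 12 comes from the removed inline check, not necessarily the current constant.
    constexpr double MIN_SOMATIC_RATIO = 12;

    struct AlleleCounts { int alt; int cov; int split; double LO_n; };

    bool somatic_call(bool is_indel, bool is_dscrd_only, bool at_dbsnp,
                      const AlleleCounts& t, const AlleleCounts& n,
                      int dc_tcount, int dc_ncount,
                      double NODBCUTOFF, double DBCUTOFF) {

      // hard reject: 3+ normal ALT reads, or tumor ALT not sufficiently enriched
      // over normal ALT at a well-covered normal site
      const double ratio = n.alt > 0 ? (double)t.alt / (double)n.alt : 100;
      if ((ratio <= MIN_SOMATIC_RATIO && n.cov > 10) || n.alt > 2)
        return false;

      bool somatic;
      if (is_indel) {
        // LOD that the normal is REF-only, against a dbSNP-aware cutoff
        somatic = n.LO_n > (at_dbsnp ? DBCUTOFF : NODBCUTOFF);
      } else {
        // SVs: hard tumor/normal read-support ratio
        const double ncount = std::max(n.split, dc_ncount);
        const double sratio = ncount > 0 ? (double)std::max(t.split, dc_tcount) / ncount : 100;
        somatic = sratio >= MIN_SOMATIC_RATIO && n.split < 2;
      }

      // discordant-only events with any normal support are treated as germline
      if (is_dscrd_only && n.alt > 0)
        somatic = false;

      return somatic;
    }

    int main() {
      AlleleCounts t = {8, 60, 8, 0.0};   // illustrative tumor counts
      AlleleCounts n = {0, 55, 0, 9.5};   // illustrative normal counts
      // cutoffs here are illustrative, not the program defaults
      return somatic_call(true, false, false, t, n, 0, 0, 2.5, 5.5) ? 0 : 1;
    }

The early return on the normal-support check mirrors the patch's behavior of never letting the dbSNP/LOD test rescue an event with 3+ normal ALT reads.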