diff --git a/DESCRIPTION b/DESCRIPTION index 2f06b17..f36d02c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: fetchNOMe Type: Package Title: Package for fast and convenient retrieval of NOMe-seq data from BAM files -Version: 0.1 -Date: 2023-10-02 +Version: 0.2 +Date: 2024-05-09 Authors@R: person(given = "Evgeniy A.", family="Ozonov", email = "evgeniy.ozonov@fmi.ch", role = c("aut", "cre")) Description: This R package is tailored for fast and convenient retrieval of NOMe-seq data from BAM files aligned with QuasR, Bismark and BISCUIT. License: MIT + file LICENSE diff --git a/R/RcppExports.R b/R/RcppExports.R index 4b7485d..d3b9c92 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,7 +1,7 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax) { - .Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax) +fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) { + .Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) } diff --git a/R/get_data_matrix_from_bams.R b/R/get_data_matrix_from_bams.R index 3901f8a..5953c21 100644 --- a/R/get_data_matrix_from_bams.R +++ b/R/get_data_matrix_from_bams.R @@ -16,7 +16,7 @@ #' \item{"otherC"}{methylation states of all other cytosines which do not fall into above categories.} #' \item{"allC"}{methylation states of all cytosines combined.} #' } -#' +#' @param alignerUsed \code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISQUIT. #' @param collapseBySample \code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}. #' @param remove_nonunique \code{logical} if \code{TRUE} only unique fragments are analyzed. Uniqueness is defined by fragments start, end and methylation states of all cytosines. #' @param clip_until_nbg \code{integer} controlling clipping partial footprints at both ends of fragments. Namely, protected states are erased from each end until \code{clip_until_nbg} unprotected states are met. @@ -53,6 +53,7 @@ get_data_matrix_from_bams <- function(bamfiles, regions, genome, whichContext = c("GCH","WCG","bisC","otherC", "allC"), + alignerUsed = c("QuasR","Bismark","BISQUIT"), collapseBySample = TRUE, remove_nonunique = TRUE, clip_until_nbg = 1L, @@ -85,6 +86,11 @@ get_data_matrix_from_bams <- function(bamfiles, whichContext <- match.arg(whichContext) + alignerUsed <- match.arg(alignerUsed) + + message(paste0("Fetching data from BAM files generated by ",alignerUsed,".")) + + ## remove metadata from regions GenomicRanges::mcols(regions) <- NULL @@ -132,7 +138,8 @@ get_data_matrix_from_bams <- function(bamfiles, max_bisC_meth = max_bisC_meth, min_bisC_size = min_bisC_size, mapqMin = mapqMin, - mapqMax = mapqMax) + mapqMax = mapqMax, + alignerUsed = alignerUsed) data_mat <- outlist[["DataMatrix"]] # invert rows if strand is - diff --git a/man/get_data_matrix_from_bams.Rd b/man/get_data_matrix_from_bams.Rd index 580e187..08c6d32 100644 --- a/man/get_data_matrix_from_bams.Rd +++ b/man/get_data_matrix_from_bams.Rd @@ -10,6 +10,7 @@ get_data_matrix_from_bams( regions, genome, whichContext = c("GCH", "WCG", "bisC", "otherC", "allC"), + alignerUsed = c("QuasR", "Bismark", "BISQUIT"), collapseBySample = TRUE, remove_nonunique = TRUE, clip_until_nbg = 1L, @@ -40,6 +41,8 @@ get_data_matrix_from_bams( \item{"allC"}{methylation states of all cytosines combined.} }} +\item{alignerUsed}{\code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISQUIT.} + \item{collapseBySample}{\code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}.} \item{remove_nonunique}{\code{logical} if \code{TRUE} only unique fragments are analyzed. Uniqueness is defined by fragments start, end and methylation states of all cytosines.} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index d0644a3..7e43d5d 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -12,8 +12,8 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // fetch_data_matrix_from_bams_cpp -Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichContext, const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax); -RcppExport SEXP _fetchNOMe_fetch_data_matrix_from_bams_cpp(SEXP whichContextSEXP, SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP) { +Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichContext, const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::CharacterVector& alignerUsed); +RcppExport SEXP _fetchNOMe_fetch_data_matrix_from_bams_cpp(SEXP whichContextSEXP, SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP alignerUsedSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -32,13 +32,14 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_bisC_size(min_bisC_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMin(mapqMinSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMax(mapqMaxSEXP); - rcpp_result_gen = Rcpp::wrap(fetch_data_matrix_from_bams_cpp(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax)); + Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type alignerUsed(alignerUsedSEXP); + rcpp_result_gen = Rcpp::wrap(fetch_data_matrix_from_bams_cpp(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_fetchNOMe_fetch_data_matrix_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_data_matrix_from_bams_cpp, 15}, + {"_fetchNOMe_fetch_data_matrix_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_data_matrix_from_bams_cpp, 16}, {NULL, NULL, 0} }; diff --git a/src/fetch_data_from_bam.cpp b/src/fetch_data_from_bam.cpp index 796468f..ed3b869 100644 --- a/src/fetch_data_from_bam.cpp +++ b/src/fetch_data_from_bam.cpp @@ -5,6 +5,7 @@ bool fetch_data_from_bam(const string& bamfile, const string& regionChr, const int& regionStart, const int& regionEnd, + const alignerType& aligner, obj_pnts* pnts){ @@ -26,8 +27,27 @@ bool fetch_data_from_bam(const string& bamfile, if(regionChr.compare(fin->header->target_name[tid]) != 0) Rcpp::stop("could not find target '" + regionChr + "' in bam header of " + bamfile + ".\n"); - // call collectRegionData on store alignments in region - bam_fetch(fin->x.bam, idx, tid, regionStart, regionEnd, pnts, &collectRegionData); + + // define a pointer to a callback retrival function + int (*collectRegionData)(const bam1_t *hit, void *data); + + // choose callback function for data parsing based on aligner used + switch(aligner) { + case QuasR: + collectRegionData = &collectRegionData_QsBism; + break; + case Bismark: + collectRegionData = &collectRegionData_QsBism; + break; + case BISQUIT: + collectRegionData = &collectRegionData_Bisq; + break; + default: + Rcpp::stop("Only alignments generated by QuasR, Bismark or BISQUIT can be analyzed\n"); + } + + // call corresponding retrieval function to store alignments in region + bam_fetch(fin->x.bam, idx, tid, regionStart, regionEnd, pnts, collectRegionData); // clean bam file objects bam_index_destroy(idx); @@ -36,8 +56,8 @@ bool fetch_data_from_bam(const string& bamfile, return(1); } - -static int collectRegionData(const bam1_t *hit, void *data) { // bam_fetch callback function that parses XM tag in bismark bam and adds cooccurrence counts +// bam_fetch callback function for methyl calling and data storing for BAMs produced by QuasR and Bismark +static int collectRegionData_QsBism(const bam1_t *hit, void *data) { static obj_pnts *pdata = NULL; @@ -67,7 +87,9 @@ static int collectRegionData(const bam1_t *hit, void *data) { // bam_fetch callb } else{ // parsing first mate or SE data, strand determined by read itself msmType = is_read_rev ? GtoA : CtoT; } - // Rcpp::Rcout<<"msmType="<reg_data->addFragContextData(pdata->prefix, + pdata->prefix + qname, + ref_start, + ref_end, + rpos_vec, + dat_vec, + isplusstrand_vec, + is_second, + context_vec[i]); + + // for testing only + // frg.addData(ref_start, + // ref_end, + // rpos_vec, + // dat_vec, + // isplusstrand_vec, + // is_second, + // context_vec[i]); + + } + + + + + + + return(0); + +} + + + + + // checks validity of an alignment bool isValidAln(const bam1_t *hit,const obj_pnts* pdata){ diff --git a/src/fetch_data_from_bam.h b/src/fetch_data_from_bam.h index c8e1f3a..28b2ba2 100644 --- a/src/fetch_data_from_bam.h +++ b/src/fetch_data_from_bam.h @@ -40,10 +40,16 @@ bool fetch_data_from_bam(const string& bamfile, const string& regionChr, const int& regionStart, const int& regionEnd, + const alignerType& aligner, obj_pnts* pnts); -// bam_fetch callback function for bam_fetch -static int collectRegionData(const bam1_t *hit, void *data); +// bam_fetch callback function for bam_fetch for data retrieval from BAM files produced by QuasR and Bismark +static int collectRegionData_QsBism(const bam1_t *hit, void *data); + +// bam_fetch callback function for bam_fetch for data retrieval from BAM files produced by BISQUIT +static int collectRegionData_Bisq(const bam1_t *hit, void *data); + + // checks validity of an alignment bool isValidAln(const bam1_t *hit,const obj_pnts* pdata); diff --git a/src/functions_export_rcpp.cpp b/src/functions_export_rcpp.cpp index 8f6c5c5..4b6348c 100644 --- a/src/functions_export_rcpp.cpp +++ b/src/functions_export_rcpp.cpp @@ -16,7 +16,8 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, - const Rcpp::IntegerVector& mapqMax){ + const Rcpp::IntegerVector& mapqMax, + const Rcpp::CharacterVector& alignerUsed){ // convert input parameters string whichContext_ = Rcpp::as(whichContext); @@ -34,6 +35,18 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon else Rcpp::stop("whichContext must be GCH, WCG, bisC, otherC or allC"); + string alignerUsed_ = Rcpp::as(alignerUsed); + alignerType aligner; + if(alignerUsed_ == "QuasR") + aligner = QuasR; + else if(alignerUsed_ == "Bismark") + aligner = Bismark; + else if(alignerUsed_ == "BISQUIT") + aligner = BISQUIT; + else + Rcpp::stop("Only alignments generated by QuasR, Bismark or BISQUIT can be analyzed\n"); + + vector infiles_ = Rcpp::as >(infiles); string regionChr_ = Rcpp::as(regionChr); int regionStart_ = Rcpp::as(regionStart); @@ -74,6 +87,7 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon regionChr_, regionStart_, regionEnd_, + aligner, &pnts); } diff --git a/src/functions_export_rcpp.h b/src/functions_export_rcpp.h index 33de338..93418e9 100644 --- a/src/functions_export_rcpp.h +++ b/src/functions_export_rcpp.h @@ -41,7 +41,8 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, - const Rcpp::IntegerVector& mapqMax); + const Rcpp::IntegerVector& mapqMax, + const Rcpp::CharacterVector& alignerUsed); diff --git a/src/utils_globvars.h b/src/utils_globvars.h index bd6135b..98ea096 100644 --- a/src/utils_globvars.h +++ b/src/utils_globvars.h @@ -19,6 +19,13 @@ enum fragMapType{GCH, WCG, bisC, otherC, allC // this is used only to get data matrix with all Cs }; +// enumerator for aligners for which we have function implemented +enum alignerType{QuasR, + Bismark, + BISQUIT +}; + + // enumerator for type of mismatches enum seekMismType{CtoT,GtoA};