diff --git a/DESCRIPTION b/DESCRIPTION index a8273a6..88e7f9f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: fetchNOMe Type: Package Title: Package for fast and convenient retrieval of NOMe-seq data from BAM files -Version: 0.4 +Version: 0.5 Date: 2024-05-09 Authors@R: person(given = "Evgeniy A.", family="Ozonov", email = "evgeniy.ozonov@fmi.ch", role = c("aut", "cre")) Description: This R package is tailored for fast and convenient retrieval of NOMe-seq data from BAM files aligned with QuasR, Bismark and BISCUIT. diff --git a/NAMESPACE b/NAMESPACE index 64803a7..e35c380 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,4 +2,5 @@ useDynLib(fetchNOMe, .registration=TRUE) importFrom(Rcpp, evalCpp) export(get_data_matrix_from_bams) export(get_ctable_from_bams) -export(get_protect_stats_from_bams) \ No newline at end of file +export(get_protect_stats_from_bams) +export(get_molecule_data_list_from_bams) \ No newline at end of file diff --git a/R/RcppExports.R b/R/RcppExports.R index 0b285ec..c6b5c7d 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,15 +1,19 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -fetch_cooc_ctable_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) { - .Call(`_fetchNOMe_fetch_cooc_ctable_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) +fetch_molec_data_list_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens, data_as_rle) { + .Call(`_fetchNOMe_fetch_molec_data_list_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens, data_as_rle) } -fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) { - .Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) +fetch_cooc_ctable_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens) { + .Call(`_fetchNOMe_fetch_cooc_ctable_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens) } -fetch_protect_stats_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) { - .Call(`_fetchNOMe_fetch_protect_stats_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) +fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens) { + .Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens) +} + +fetch_protect_stats_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens) { + .Call(`_fetchNOMe_fetch_protect_stats_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens) } diff --git a/R/get_ctable_from_bams.R b/R/get_ctable_from_bams.R index 63c5d38..88be213 100644 --- a/R/get_ctable_from_bams.R +++ b/R/get_ctable_from_bams.R @@ -11,6 +11,8 @@ #' @param regions \linkS4class{GRanges} object for regions of interest. #' @param genome \code{character} path to fasta file which was used as reference for alignment. #' @param alignerUsed \code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISCUIT. +#' @param min_frag_data_len \code{integer} ignore fragments that have genomic lengths from most-left to most-right data points less than \code{min_frag_data_len}. +#' @param min_frag_data_dens \code{numeric} ignore fragments that have density of data-containing positions lower than \code{min_frag_data_dens}. #' @param collapseBySample \code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}. #' @param max_spacing maximum spacing length for which to collect co-occurrence counts. #' @param remove_nonunique \code{logical} if \code{TRUE} only unique fragments are analyzed. Uniqueness is defined by fragments start, end and methylation states of all cytosines. @@ -22,12 +24,8 @@ #' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency. #' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis. #' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis. -#' @param absIsizeMin \code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert -#' size (TLEN field in SAM Spec v1.4) of alignments to be included. -#' @param absIsizeMax \code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert -#' size (TLEN field in SAM Spec v1.4) of alignments to be included. -#' @param min_read_size \code{integer}. minimum read length to be included. -#' @param max_read_size \code{integer}. maximum read length to be included. + +#' @param max_read_size \code{integer}. maximum read length which is expected in the data. This parameter controls only extension of regions for extracting reference sequence. #' @param ncores number of cores to use. #' #' @return \code{tibble} where each row corresponds to sample - region combination. @@ -55,6 +53,8 @@ get_ctable_from_bams <- function(bamfiles, regions, genome, alignerUsed = c("QuasR","Bismark","BISCUIT"), + min_frag_data_len = 50L, + min_frag_data_dens = 0.05, collapseBySample = TRUE, max_spacing = 300, remove_nonunique = TRUE, @@ -64,9 +64,6 @@ get_ctable_from_bams <- function(bamfiles, min_bisC_size = 10, mapqMin = 0, mapqMax = 255, - absIsizeMin = NULL, - absIsizeMax = NULL, - min_read_size = 25L, max_read_size = 1000L, ncores = 1L){ @@ -93,28 +90,17 @@ get_ctable_from_bams <- function(bamfiles, alignerUsed <- match.arg(alignerUsed) - if (is.null(absIsizeMin)) # no tlen filtering - absIsizeMin <- -1L - if (is.null(absIsizeMax)) - absIsizeMax <- -1L - - - message(paste0("Collecting co-occurrence frequencies from BAM files generated by ",alignerUsed,".")) ## load refsequences for regions ## extend regions from left and right - if(absIsizeMax > 0) - ext_len <- absIsizeMax - else - ext_len <- max_read_size fa_index <- Rsamtools::scanFaIndex(genome) GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index) GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)), seqlengths = GenomicRanges::width(fa_index))) - refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*ext_len,fix="center")) + refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center")) ## trim to remove parts out of chromosomes refseq_gr <- GenomicRanges::trim(refseq_gr) @@ -150,11 +136,10 @@ get_ctable_from_bams <- function(bamfiles, min_bisC_size = min_bisC_size, mapqMin = mapqMin, mapqMax = mapqMax, - absIsizeMin = absIsizeMin, - absIsizeMax = absIsizeMax, - min_read_size = min_read_size, - max_read_size = max_read_size, - alignerUsed = alignerUsed) + alignerUsed = alignerUsed, + min_frag_data_len = min_frag_data_len, + min_frag_data_dens = min_frag_data_dens + ) #coocmatr <- outlist[["CoocCountTable"]] diff --git a/R/get_data_matrix_from_bams.R b/R/get_data_matrix_from_bams.R index 134a5b2..d0b4685 100644 --- a/R/get_data_matrix_from_bams.R +++ b/R/get_data_matrix_from_bams.R @@ -17,6 +17,8 @@ #' \item{"allC"}{methylation states of all cytosines combined.} #' } #' @param alignerUsed \code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISCUIT. +#' @param min_frag_data_len \code{integer} ignore fragments that have genomic lengths from most-left to most-right data points less than \code{min_frag_data_len}. +#' @param min_frag_data_dens \code{numeric} ignore fragments that have density of data-containing positions lower than \code{min_frag_data_dens}. #' @param collapseBySample \code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}. #' @param remove_nonunique \code{logical} if \code{TRUE} only unique fragments are analyzed. Uniqueness is defined by fragments start, end and methylation states of all cytosines. #' @param clip_until_nbg \code{integer} controlling clipping partial footprints at both ends of fragments. Namely, protected states are omitted from each end until \code{clip_until_nbg} unprotected states are met. @@ -27,12 +29,7 @@ #' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency. #' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis. #' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis. -#' @param absIsizeMin \code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert -#' size (TLEN field in SAM Spec v1.4) of alignments to be included. -#' @param absIsizeMax \code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert -#' size (TLEN field in SAM Spec v1.4) of alignments to be included. -#' @param min_read_size \code{integer}. minimum read length to be included. -#' @param max_read_size \code{integer}. maximum read length to be included. +#' @param max_read_size \code{integer}. maximum read length which is expected in the data. This parameter controls only extension of regions for extracting reference sequence. #' @param ncores number of cores to use. #' @return \code{tibble} where each row corresponds to sample - region combination. #' Columns of the returned \code{tibble} represent: @@ -59,6 +56,8 @@ get_data_matrix_from_bams <- function(bamfiles, genome, whichContext = c("GCH","WCG","bisC","otherC", "allC"), alignerUsed = c("QuasR","Bismark","BISCUIT"), + min_frag_data_len = 50L, + min_frag_data_dens = 0.05, collapseBySample = TRUE, remove_nonunique = TRUE, clip_until_nbg = 1L, @@ -67,9 +66,6 @@ get_data_matrix_from_bams <- function(bamfiles, min_bisC_size = 10, mapqMin = 0, mapqMax = 255, - absIsizeMin = NULL, - absIsizeMax = NULL, - min_read_size = 25L, max_read_size = 1000L, ncores = 1L){ if(!inherits(bamfiles,"character")) @@ -96,11 +92,6 @@ get_data_matrix_from_bams <- function(bamfiles, alignerUsed <- match.arg(alignerUsed) - if (is.null(absIsizeMin)) # no tlen filtering - absIsizeMin <- -1L - if (is.null(absIsizeMax)) - absIsizeMax <- -1L - message(paste0("Fetching data from BAM files generated by ",alignerUsed,".")) @@ -110,15 +101,11 @@ get_data_matrix_from_bams <- function(bamfiles, ## load refsequences for regions ## extend regions from left and right - if(absIsizeMax > 0) - ext_len <- absIsizeMax - else - ext_len <- max_read_size fa_index <- Rsamtools::scanFaIndex(genome) GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index) GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)), seqlengths = GenomicRanges::width(fa_index))) - refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*ext_len,fix="center")) + refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center")) ## trim to remove parts out of chromosomes refseq_gr <- GenomicRanges::trim(refseq_gr) @@ -156,11 +143,9 @@ get_data_matrix_from_bams <- function(bamfiles, min_bisC_size = min_bisC_size, mapqMin = mapqMin, mapqMax = mapqMax, - absIsizeMin = absIsizeMin, - absIsizeMax = absIsizeMax, - min_read_size = min_read_size, - max_read_size = max_read_size, - alignerUsed = alignerUsed) + alignerUsed = alignerUsed, + min_frag_data_len = min_frag_data_len, + min_frag_data_dens = min_frag_data_dens) data_mat <- outlist[["DataMatrix"]] # invert rows if strand is - diff --git a/R/get_molecule_data_list_from_bams.R b/R/get_molecule_data_list_from_bams.R new file mode 100644 index 0000000..885a315 --- /dev/null +++ b/R/get_molecule_data_list_from_bams.R @@ -0,0 +1,172 @@ + + + +#' Fetch data for each molecule from NOMe-seq BAM files for a set of regions +#' +#' +#' +#' @param bamfiles \code{character} paths to bam files +#' @param samplenames \code{character} names of samples. If \code{collapseBySample} is \code{TRUE} counts are aggregated across bam files with same sample name. +#' @param regions \linkS4class{GRanges} object for regions of interest. +#' @param genome \code{character} path to fasta file which was used as reference for alignment. +#' @param whichContext \code{character} which can be one of "GCH","WCG","bisC", "otherC" or "allC" and sets which data to retrieve. +#' \describe{ +#' \item{"GCH}{protection data;} +#' \item{"WCG"}{endogenous CpG methylation data;} +#' \item{"bisC}{methylation states of non-GCH and non-WCG cytosins used for estimating efficiency of bisulfite conversion;} +#' \item{"otherC"}{methylation states of all other cytosines which do not fall into above categories.} +#' \item{"allC"}{methylation states of all cytosines combined.} +#' } +#' @param alignerUsed \code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISCUIT. +#' @param min_frag_data_len \code{integer} ignore fragments that have genomic lengths from most-left to most-right data points less than \code{min_frag_data_len}. +#' @param min_frag_data_dens \code{numeric} ignore fragments that have density of data-containing positions lower than \code{min_frag_data_dens}. +#' @param collapseBySample \code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}. +#' @param remove_nonunique \code{logical} if \code{TRUE} only unique fragments are analyzed. Uniqueness is defined by fragments start, end and methylation states of all cytosines. +#' @param clip_until_nbg \code{integer} controlling clipping partial footprints at both ends of fragments. Namely, protected states are omitted from each end until \code{clip_until_nbg} unprotected states are met. +#' Set \code{clip_until_nbg=0L} to skip clipping pre-processing. +#' @param noclip_protect_frac_above \code{numeric} controlling fragment clipping, i.e. fragments that have fraction of protected states higher than \code{max_protect_frac} escape clipping. This parameter allows to avoid removal of fully protected fragments during clipping process. +#' @param max_bisC_meth \code{numeric} controlling removal of fragments with failed bisulfite conversion, i.e. fragments with fraction of methylated non-GCH, and non-WCG cytosines higher then \code{max_bisC_meth} are removed from the analysis to get rid of fragments for which bisulfite conversion failed. +#' @param min_bisC_size \code{integer} setting minimum number non-GCH, and non-WCG cytosines for filtering according to bisulfite conversion efficiency, +#' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency. +#' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis. +#' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis. +#' @param max_read_size \code{integer}. maximum read length which is expected in the data. This parameter controls only extension of regions for extracting reference sequence. +#' @param ncores number of cores to use. +#' @param data_as_Rle \code{logical} if \code{TRUE} data vectors are \code{Rle} compressed. +#' @return \code{tibble} where each row corresponds to sample - molecule combination. +#' Please note that if \code{regions} contain overlapping ranges it is possible that same molecule to appear several times. +#' +#' Columns of the returned \code{tibble} represent: +#' \describe{ +#' \item{SampleName}{Name of a sample as defined by \code{samplenames}.} +#' \item{seqnames, start, end}{seqnames (or chromosomes) and coordinates of molecules. Please note, that start and end are positions of the first and last data points and not positions of alignments.} +#' \item{R1strand}{strands of alignments for R1 reads} +#' \item{qname}{names of a molecules} +#' \item{nDataPoints}{numbers of informative data points within molecules} +#' \item{startDataPoint}{most-left data points} +#' \item{endDataPoint}{most-right data points} +#' \item{fragData_list}{vectors with molecule data +#' } +#' } +#' @export +#' + +get_molecule_data_list_from_bams <- function(bamfiles, + samplenames, + regions, + genome, + whichContext = c("GCH","WCG","bisC","otherC", "allC"), + alignerUsed = c("QuasR","Bismark","BISCUIT"), + min_frag_data_len = 50L, + min_frag_data_dens = 0.05, + collapseBySample = TRUE, + remove_nonunique = TRUE, + clip_until_nbg = 1L, + noclip_protect_frac_above = 0.90, + max_bisC_meth = 0.1, + min_bisC_size = 10, + mapqMin = 0, + mapqMax = 255, + max_read_size = 1000L, + ncores = 1L, + data_as_Rle = FALSE){ + if(!inherits(bamfiles,"character")) + stop("bamfiles must be vector of character") + if(!inherits(samplenames,"character")) + stop("samplenames must be vector of character") + if(any(!file.exists(bamfiles))){ + nonex <- which(!file.exists(bamfiles)) + nonex <- paste0(bamfiles[nonex],collapse = "\n") + stop(paste0("Could not find following BAM files:\n",nonex)) + } + + if(!inherits(regions,"GRanges")) + stop("regions must be Granges") + + + if(!inherits(genome,"character") & length(genome) !=1) + stop("genome must be character and contain only one element") + + if(!file.exists(genome)) + stop("Could not find genome:", genome) + + whichContext <- match.arg(whichContext) + + alignerUsed <- match.arg(alignerUsed) + + message(paste0("Fetching data for every molecule from BAM files generated by ",alignerUsed,".")) + + + ## remove metadata from regions + GenomicRanges::mcols(regions) <- NULL + + ## load refsequences for regions + + ## extend regions from left and right + fa_index <- Rsamtools::scanFaIndex(genome) + GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index) + GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)), + seqlengths = GenomicRanges::width(fa_index))) + refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center")) + + ## trim to remove parts out of chromosomes + refseq_gr <- GenomicRanges::trim(refseq_gr) + refseqs <- Rsamtools::scanFa(genome,refseq_gr) + + # 2. split vector by sample name. If collapseBySample is FALSE add to samplenames some suffixes + if(!collapseBySample){ + samplenames <- paste0("s",seq_along(samplenames),"_",samplenames) + } + + bamfiles <- split(bamfiles, f=samplenames) + + + molec_data_list <- do.call(rbind,lapply(names(bamfiles), + function(bamgroupname){ + ctbl <- do.call(rbind,parallel::mclapply(seq_along(regions), + function(regi){ + + regdf <- GenomicRanges::as.data.frame(regions[regi]) + regdf$seqnames <- as.character(regdf[,"seqnames"]) + seqgr <- as.data.frame(refseq_gr[regi]) + + outlist <- fetch_molec_data_list_from_bams_cpp(whichContext = whichContext, + infiles = bamfiles[[bamgroupname]], + regionChr = regdf[1,"seqnames"], + regionStart = regdf[1,"start"] - 1, # convert to 0-based + regionEnd = regdf[1,"end"], + remove_nonunique = remove_nonunique, + seqstring = as.character(refseqs[regi]), + seqStart = seqgr[1,"start"] - 1, + seqEnd = seqgr[1,"end"], + clip_until_nbg = clip_until_nbg, + max_protect_frac = noclip_protect_frac_above, + max_bisC_meth = max_bisC_meth, + min_bisC_size = min_bisC_size, + mapqMin = mapqMin, + mapqMax = mapqMax, + alignerUsed = alignerUsed, + min_frag_data_len = min_frag_data_len, + min_frag_data_dens = min_frag_data_dens, + data_as_rle = data_as_Rle) + + + outtbl <- tibble::tibble("SampleName" = bamgroupname, + "seqnames" = regdf[1,"seqnames"], + "start" = outlist[["start"]], + "end" = outlist[["end"]], + "R1strand" = outlist[["R1strand"]], + "fragConfig" = outlist[["fragConfigs"]], + "qname" = outlist[["qnames"]], + "nDataPoints" = outlist[["nDataPoints"]], + "startDataPoint" = outlist[["startDataPoint"]], + "endDataPoint" = outlist[["endDataPoint"]], + "fragData_list" = outlist[["fragData_list"]]) + colnames(outtbl)[which(colnames(outtbl) == "fragData_list")] <- paste0(whichContext,"_fragData_list") + + return(outtbl) + },mc.cores = ncores)) + + return(ctbl) + })) +} \ No newline at end of file diff --git a/R/get_protect_stats_from_bams.R b/R/get_protect_stats_from_bams.R index e22b1fa..99cc3f6 100644 --- a/R/get_protect_stats_from_bams.R +++ b/R/get_protect_stats_from_bams.R @@ -9,6 +9,8 @@ #' @param regions \linkS4class{GRanges} object for regions of interest. #' @param genome \code{character} path to fasta file which was used as reference for alignment. #' @param alignerUsed \code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISCUIT. +#' @param min_frag_data_len \code{integer} ignore fragments that have genomic lengths from most-left to most-right data points less than \code{min_frag_data_len}. +#' @param min_frag_data_dens \code{numeric} ignore fragments that have density of data-containing positions lower than \code{min_frag_data_dens}. #' @param collapseBySample \code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}. #' @param remove_nonunique \code{logical} if \code{TRUE} only unique fragments are analyzed. Uniqueness is defined by fragments start, end and methylation states of all cytosines. #' @param clip_until_nbg \code{integer} controlling clipping partial footprints at both ends of fragments. Namely, protected states are erased from each end until \code{clip_until_nbg} unprotected states are met. @@ -19,12 +21,7 @@ #' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency. #' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis. #' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis. -#' @param absIsizeMin \code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert -#' size (TLEN field in SAM Spec v1.4) of alignments to be included. -#' @param absIsizeMax \code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert -#' size (TLEN field in SAM Spec v1.4) of alignments to be included. -#' @param min_read_size \code{integer}. minimum read length to be included. -#' @param max_read_size \code{integer}. maximum read length to be included. +#' @param max_read_size \code{integer}. maximum read length which is expected in the data. This parameter controls only extension of regions for extracting reference sequence. #' @param ncores number of cores to use. #' #' @return \code{tibble} where each row corresponds to sample - region combination. @@ -56,6 +53,8 @@ get_protect_stats_from_bams <- function(bamfiles, regions, genome, alignerUsed = c("QuasR","Bismark","BISCUIT"), + min_frag_data_len = 50L, + min_frag_data_dens = 0.05, collapseBySample = TRUE, remove_nonunique = TRUE, clip_until_nbg = 1L, @@ -64,9 +63,6 @@ get_protect_stats_from_bams <- function(bamfiles, min_bisC_size = 10, mapqMin = 0, mapqMax = 255, - absIsizeMin = NULL, - absIsizeMax = NULL, - min_read_size = 25L, max_read_size = 1000L, ncores = 1L){ if(!inherits(bamfiles,"character")) @@ -89,24 +85,17 @@ get_protect_stats_from_bams <- function(bamfiles, stop("Could not find genome:", genome) alignerUsed <- match.arg(alignerUsed) - if (is.null(absIsizeMin)) # no tlen filtering - absIsizeMin <- -1L - if (is.null(absIsizeMax)) - absIsizeMax <- -1L + message(paste0("Collecting protection statistics from BAM files generated by ",alignerUsed,".")) ## load refsequences for regions ## extend regions from left and right - if(absIsizeMax > 0) - ext_len <- absIsizeMax - else - ext_len <- max_read_size - fa_index <- Rsamtools::scanFaIndex(genome) + fa_index <- Rsamtools::scanFaIndex(genome) GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index) GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)), seqlengths = GenomicRanges::width(fa_index))) - refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*ext_len,fix="center")) + refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center")) ## trim to remove parts out of chromosomes refseq_gr <- GenomicRanges::trim(refseq_gr) @@ -142,11 +131,9 @@ get_protect_stats_from_bams <- function(bamfiles, min_bisC_size = min_bisC_size, mapqMin = mapqMin, mapqMax = mapqMax, - absIsizeMin = absIsizeMin, - absIsizeMax = absIsizeMax, - min_read_size = min_read_size, - max_read_size = max_read_size, - alignerUsed = alignerUsed) + alignerUsed = alignerUsed, + min_frag_data_len = min_frag_data_len, + min_frag_data_dens = min_frag_data_dens) data_mat <- outlist[["ProtectStats"]] diff --git a/man/get_ctable_from_bams.Rd b/man/get_ctable_from_bams.Rd index 114627e..ba59c00 100644 --- a/man/get_ctable_from_bams.Rd +++ b/man/get_ctable_from_bams.Rd @@ -10,6 +10,8 @@ get_ctable_from_bams( regions, genome, alignerUsed = c("QuasR", "Bismark", "BISCUIT"), + min_frag_data_len = 50L, + min_frag_data_dens = 0.05, collapseBySample = TRUE, max_spacing = 300, remove_nonunique = TRUE, @@ -19,9 +21,6 @@ get_ctable_from_bams( min_bisC_size = 10, mapqMin = 0, mapqMax = 255, - absIsizeMin = NULL, - absIsizeMax = NULL, - min_read_size = 25L, max_read_size = 1000L, ncores = 1L ) @@ -37,6 +36,10 @@ get_ctable_from_bams( \item{alignerUsed}{\code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISCUIT.} +\item{min_frag_data_len}{\code{integer} ignore fragments that have genomic lengths from most-left to most-right data points less than \code{min_frag_data_len}.} + +\item{min_frag_data_dens}{\code{numeric} ignore fragments that have density of data-containing positions lower than \code{min_frag_data_dens}.} + \item{collapseBySample}{\code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}.} \item{max_spacing}{maximum spacing length for which to collect co-occurrence counts.} @@ -57,15 +60,7 @@ i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \c \item{mapqMax}{\code{integer} setting maximum MAPQ of reads to be included into analysis.} -\item{absIsizeMin}{\code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert -size (TLEN field in SAM Spec v1.4) of alignments to be included.} - -\item{absIsizeMax}{\code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert -size (TLEN field in SAM Spec v1.4) of alignments to be included.} - -\item{min_read_size}{\code{integer}. minimum read length to be included.} - -\item{max_read_size}{\code{integer}. maximum read length to be included.} +\item{max_read_size}{\code{integer}. maximum read length which is expected in the data. This parameter controls only extension of regions for extracting reference sequence.} \item{ncores}{number of cores to use.} } diff --git a/man/get_data_matrix_from_bams.Rd b/man/get_data_matrix_from_bams.Rd index aa37a89..a06ec22 100644 --- a/man/get_data_matrix_from_bams.Rd +++ b/man/get_data_matrix_from_bams.Rd @@ -11,6 +11,8 @@ get_data_matrix_from_bams( genome, whichContext = c("GCH", "WCG", "bisC", "otherC", "allC"), alignerUsed = c("QuasR", "Bismark", "BISCUIT"), + min_frag_data_len = 50L, + min_frag_data_dens = 0.05, collapseBySample = TRUE, remove_nonunique = TRUE, clip_until_nbg = 1L, @@ -19,9 +21,6 @@ get_data_matrix_from_bams( min_bisC_size = 10, mapqMin = 0, mapqMax = 255, - absIsizeMin = NULL, - absIsizeMax = NULL, - min_read_size = 25L, max_read_size = 1000L, ncores = 1L ) @@ -46,6 +45,10 @@ get_data_matrix_from_bams( \item{alignerUsed}{\code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISCUIT.} +\item{min_frag_data_len}{\code{integer} ignore fragments that have genomic lengths from most-left to most-right data points less than \code{min_frag_data_len}.} + +\item{min_frag_data_dens}{\code{numeric} ignore fragments that have density of data-containing positions lower than \code{min_frag_data_dens}.} + \item{collapseBySample}{\code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}.} \item{remove_nonunique}{\code{logical} if \code{TRUE} only unique fragments are analyzed. Uniqueness is defined by fragments start, end and methylation states of all cytosines.} @@ -64,15 +67,7 @@ i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \c \item{mapqMax}{\code{integer} setting maximum MAPQ of reads to be included into analysis.} -\item{absIsizeMin}{\code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert -size (TLEN field in SAM Spec v1.4) of alignments to be included.} - -\item{absIsizeMax}{\code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert -size (TLEN field in SAM Spec v1.4) of alignments to be included.} - -\item{min_read_size}{\code{integer}. minimum read length to be included.} - -\item{max_read_size}{\code{integer}. maximum read length to be included.} +\item{max_read_size}{\code{integer}. maximum read length which is expected in the data. This parameter controls only extension of regions for extracting reference sequence.} \item{ncores}{number of cores to use.} } diff --git a/man/get_molecule_data_list_from_bams.Rd b/man/get_molecule_data_list_from_bams.Rd new file mode 100644 index 0000000..76d9793 --- /dev/null +++ b/man/get_molecule_data_list_from_bams.Rd @@ -0,0 +1,93 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_molecule_data_list_from_bams.R +\name{get_molecule_data_list_from_bams} +\alias{get_molecule_data_list_from_bams} +\title{Fetch data for each molecule from NOMe-seq BAM files for a set of regions} +\usage{ +get_molecule_data_list_from_bams( + bamfiles, + samplenames, + regions, + genome, + whichContext = c("GCH", "WCG", "bisC", "otherC", "allC"), + alignerUsed = c("QuasR", "Bismark", "BISCUIT"), + min_frag_data_len = 50L, + min_frag_data_dens = 0.05, + collapseBySample = TRUE, + remove_nonunique = TRUE, + clip_until_nbg = 1L, + noclip_protect_frac_above = 0.9, + max_bisC_meth = 0.1, + min_bisC_size = 10, + mapqMin = 0, + mapqMax = 255, + max_read_size = 1000L, + ncores = 1L +) +} +\arguments{ +\item{bamfiles}{\code{character} paths to bam files} + +\item{samplenames}{\code{character} names of samples. If \code{collapseBySample} is \code{TRUE} counts are aggregated across bam files with same sample name.} + +\item{regions}{\linkS4class{GRanges} object for regions of interest.} + +\item{genome}{\code{character} path to fasta file which was used as reference for alignment.} + +\item{whichContext}{\code{character} which can be one of "GCH","WCG","bisC", "otherC" or "allC" and sets which data to retrieve. +\describe{ +\item{"GCH}{protection data;} +\item{"WCG"}{endogenous CpG methylation data;} +\item{"bisC}{methylation states of non-GCH and non-WCG cytosins used for estimating efficiency of bisulfite conversion;} +\item{"otherC"}{methylation states of all other cytosines which do not fall into above categories.} +\item{"allC"}{methylation states of all cytosines combined.} +}} + +\item{alignerUsed}{\code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISCUIT.} + +\item{min_frag_data_len}{\code{integer} ignore fragments that have genomic lengths from most-left to most-right data points less than \code{min_frag_data_len}.} + +\item{min_frag_data_dens}{\code{numeric} ignore fragments that have density of data-containing positions lower than \code{min_frag_data_dens}.} + +\item{collapseBySample}{\code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}.} + +\item{remove_nonunique}{\code{logical} if \code{TRUE} only unique fragments are analyzed. Uniqueness is defined by fragments start, end and methylation states of all cytosines.} + +\item{clip_until_nbg}{\code{integer} controlling clipping partial footprints at both ends of fragments. Namely, protected states are omitted from each end until \code{clip_until_nbg} unprotected states are met. +Set \code{clip_until_nbg=0L} to skip clipping pre-processing.} + +\item{noclip_protect_frac_above}{\code{numeric} controlling fragment clipping, i.e. fragments that have fraction of protected states higher than \code{max_protect_frac} escape clipping. This parameter allows to avoid removal of fully protected fragments during clipping process.} + +\item{max_bisC_meth}{\code{numeric} controlling removal of fragments with failed bisulfite conversion, i.e. fragments with fraction of methylated non-GCH, and non-WCG cytosines higher then \code{max_bisC_meth} are removed from the analysis to get rid of fragments for which bisulfite conversion failed.} + +\item{min_bisC_size}{\code{integer} setting minimum number non-GCH, and non-WCG cytosines for filtering according to bisulfite conversion efficiency, +i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency.} + +\item{mapqMin}{\code{integer} setting minimum MAPQ of reads to be included into analysis.} + +\item{mapqMax}{\code{integer} setting maximum MAPQ of reads to be included into analysis.} + +\item{max_read_size}{\code{integer}. maximum read length which is expected in the data. This parameter controls only extension of regions for extracting reference sequence.} + +\item{ncores}{number of cores to use.} +} +\value{ +\code{tibble} where each row corresponds to sample - molecule combination. +Please note that if \code{regions} contain overlapping ranges it is possible that same molecule to appear several times. + +Columns of the returned \code{tibble} represent: +\describe{ + \item{SampleName}{Name of a sample as defined by \code{samplenames}.} + \item{seqnames, start, end}{seqnames (or chromosomes) and coordinates of molecules. Please note, that start and end are positions of the first and last data points and not positions of alignments.} + \item{R1strand}{strands of alignments for R1 reads} + \item{qname}{names of a molecules} + \item{nDataPoints}{numbers of informative data points within molecules} + \item{startDataPoint}{most-left data points} + \item{endDataPoint}{most-right data points} + \item{fragData_list}{vectors with molecule data + } +} +} +\description{ +Fetch data for each molecule from NOMe-seq BAM files for a set of regions +} diff --git a/man/get_protect_stats_from_bams.Rd b/man/get_protect_stats_from_bams.Rd index a201c5e..4b556bd 100644 --- a/man/get_protect_stats_from_bams.Rd +++ b/man/get_protect_stats_from_bams.Rd @@ -18,9 +18,6 @@ get_protect_stats_from_bams( min_bisC_size = 10, mapqMin = 0, mapqMax = 255, - absIsizeMin = NULL, - absIsizeMax = NULL, - min_read_size = 25L, max_read_size = 1000L, ncores = 1L ) @@ -54,17 +51,13 @@ i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \c \item{mapqMax}{\code{integer} setting maximum MAPQ of reads to be included into analysis.} -\item{absIsizeMin}{\code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert -size (TLEN field in SAM Spec v1.4) of alignments to be included.} +\item{max_read_size}{\code{integer}. maximum read length which is expected in the data. This parameter controls only extension of regions for extracting reference sequence.} -\item{absIsizeMax}{\code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert -size (TLEN field in SAM Spec v1.4) of alignments to be included.} - -\item{min_read_size}{\code{integer}. minimum read length to be included.} +\item{ncores}{number of cores to use.} -\item{max_read_size}{\code{integer}. maximum read length to be included.} +\item{min_frag_data_len}{\code{integer} ignore fragments that have genomic lengths from most-left to most-right data points less than \code{min_frag_data_len}.} -\item{ncores}{number of cores to use.} +\item{min_frag_data_dens}{\code{numeric} ignore fragments that have density of data-containing positions lower than \code{min_frag_data_dens}.} } \value{ \code{tibble} where each row corresponds to sample - region combination. diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index b1a19d8..6029094 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,9 +11,38 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif +// fetch_molec_data_list_from_bams_cpp +Rcpp::List fetch_molec_data_list_from_bams_cpp(const Rcpp::CharacterVector& whichContext, const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::CharacterVector& alignerUsed, const Rcpp::IntegerVector& min_frag_data_len, const Rcpp::NumericVector& min_frag_data_dens, const Rcpp::LogicalVector& data_as_rle); +RcppExport SEXP _fetchNOMe_fetch_molec_data_list_from_bams_cpp(SEXP whichContextSEXP, SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP alignerUsedSEXP, SEXP min_frag_data_lenSEXP, SEXP min_frag_data_densSEXP, SEXP data_as_rleSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type whichContext(whichContextSEXP); + Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type infiles(infilesSEXP); + Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type regionChr(regionChrSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type regionStart(regionStartSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type regionEnd(regionEndSEXP); + Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type seqstring(seqstringSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type seqStart(seqStartSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type seqEnd(seqEndSEXP); + Rcpp::traits::input_parameter< const Rcpp::LogicalVector& >::type remove_nonunique(remove_nonuniqueSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type clip_until_nbg(clip_until_nbgSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type max_protect_frac(max_protect_fracSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type max_bisC_meth(max_bisC_methSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_bisC_size(min_bisC_sizeSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMin(mapqMinSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMax(mapqMaxSEXP); + Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type alignerUsed(alignerUsedSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_frag_data_len(min_frag_data_lenSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type min_frag_data_dens(min_frag_data_densSEXP); + Rcpp::traits::input_parameter< const Rcpp::LogicalVector& >::type data_as_rle(data_as_rleSEXP); + rcpp_result_gen = Rcpp::wrap(fetch_molec_data_list_from_bams_cpp(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens, data_as_rle)); + return rcpp_result_gen; +END_RCPP +} // fetch_cooc_ctable_from_bams_cpp -Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::IntegerVector& max_spac, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::IntegerVector& absIsizeMin, const Rcpp::IntegerVector& absIsizeMax, const Rcpp::IntegerVector& min_read_size, const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed); -RcppExport SEXP _fetchNOMe_fetch_cooc_ctable_from_bams_cpp(SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP max_spacSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP absIsizeMinSEXP, SEXP absIsizeMaxSEXP, SEXP min_read_sizeSEXP, SEXP max_read_sizeSEXP, SEXP alignerUsedSEXP) { +Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::IntegerVector& max_spac, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::CharacterVector& alignerUsed, const Rcpp::IntegerVector& min_frag_data_len, const Rcpp::NumericVector& min_frag_data_dens); +RcppExport SEXP _fetchNOMe_fetch_cooc_ctable_from_bams_cpp(SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP max_spacSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP alignerUsedSEXP, SEXP min_frag_data_lenSEXP, SEXP min_frag_data_densSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -32,18 +61,16 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_bisC_size(min_bisC_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMin(mapqMinSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMax(mapqMaxSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMin(absIsizeMinSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMax(absIsizeMaxSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_read_size(min_read_sizeSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type max_read_size(max_read_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type alignerUsed(alignerUsedSEXP); - rcpp_result_gen = Rcpp::wrap(fetch_cooc_ctable_from_bams_cpp(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_frag_data_len(min_frag_data_lenSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type min_frag_data_dens(min_frag_data_densSEXP); + rcpp_result_gen = Rcpp::wrap(fetch_cooc_ctable_from_bams_cpp(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens)); return rcpp_result_gen; END_RCPP } // fetch_data_matrix_from_bams_cpp -Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichContext, const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::IntegerVector& absIsizeMin, const Rcpp::IntegerVector& absIsizeMax, const Rcpp::IntegerVector& min_read_size, const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed); -RcppExport SEXP _fetchNOMe_fetch_data_matrix_from_bams_cpp(SEXP whichContextSEXP, SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP absIsizeMinSEXP, SEXP absIsizeMaxSEXP, SEXP min_read_sizeSEXP, SEXP max_read_sizeSEXP, SEXP alignerUsedSEXP) { +Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichContext, const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::CharacterVector& alignerUsed, const Rcpp::IntegerVector& min_frag_data_len, const Rcpp::NumericVector& min_frag_data_dens); +RcppExport SEXP _fetchNOMe_fetch_data_matrix_from_bams_cpp(SEXP whichContextSEXP, SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP alignerUsedSEXP, SEXP min_frag_data_lenSEXP, SEXP min_frag_data_densSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -62,18 +89,16 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_bisC_size(min_bisC_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMin(mapqMinSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMax(mapqMaxSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMin(absIsizeMinSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMax(absIsizeMaxSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_read_size(min_read_sizeSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type max_read_size(max_read_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type alignerUsed(alignerUsedSEXP); - rcpp_result_gen = Rcpp::wrap(fetch_data_matrix_from_bams_cpp(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_frag_data_len(min_frag_data_lenSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type min_frag_data_dens(min_frag_data_densSEXP); + rcpp_result_gen = Rcpp::wrap(fetch_data_matrix_from_bams_cpp(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens)); return rcpp_result_gen; END_RCPP } // fetch_protect_stats_from_bams_cpp -Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::IntegerVector& absIsizeMin, const Rcpp::IntegerVector& absIsizeMax, const Rcpp::IntegerVector& min_read_size, const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed); -RcppExport SEXP _fetchNOMe_fetch_protect_stats_from_bams_cpp(SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP absIsizeMinSEXP, SEXP absIsizeMaxSEXP, SEXP min_read_sizeSEXP, SEXP max_read_sizeSEXP, SEXP alignerUsedSEXP) { +Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::CharacterVector& alignerUsed, const Rcpp::IntegerVector& min_frag_data_len, const Rcpp::NumericVector& min_frag_data_dens); +RcppExport SEXP _fetchNOMe_fetch_protect_stats_from_bams_cpp(SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP alignerUsedSEXP, SEXP min_frag_data_lenSEXP, SEXP min_frag_data_densSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -91,20 +116,19 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_bisC_size(min_bisC_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMin(mapqMinSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMax(mapqMaxSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMin(absIsizeMinSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMax(absIsizeMaxSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_read_size(min_read_sizeSEXP); - Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type max_read_size(max_read_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type alignerUsed(alignerUsedSEXP); - rcpp_result_gen = Rcpp::wrap(fetch_protect_stats_from_bams_cpp(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_frag_data_len(min_frag_data_lenSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type min_frag_data_dens(min_frag_data_densSEXP); + rcpp_result_gen = Rcpp::wrap(fetch_protect_stats_from_bams_cpp(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_fetchNOMe_fetch_cooc_ctable_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_cooc_ctable_from_bams_cpp, 20}, - {"_fetchNOMe_fetch_data_matrix_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_data_matrix_from_bams_cpp, 20}, - {"_fetchNOMe_fetch_protect_stats_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_protect_stats_from_bams_cpp, 19}, + {"_fetchNOMe_fetch_molec_data_list_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_molec_data_list_from_bams_cpp, 19}, + {"_fetchNOMe_fetch_cooc_ctable_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_cooc_ctable_from_bams_cpp, 18}, + {"_fetchNOMe_fetch_data_matrix_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_data_matrix_from_bams_cpp, 18}, + {"_fetchNOMe_fetch_protect_stats_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_protect_stats_from_bams_cpp, 17}, {NULL, NULL, 0} }; diff --git a/src/fetch_data_from_bam.cpp b/src/fetch_data_from_bam.cpp index 99d2b21..183bd94 100644 --- a/src/fetch_data_from_bam.cpp +++ b/src/fetch_data_from_bam.cpp @@ -7,193 +7,218 @@ bool fetch_data_from_bam(const string& bamfile, const int& regionEnd, const alignerType& aligner, obj_pnts* pnts){ - - - bam1_t *hit = bam_init1(); - samfile_t *fin; - bam_index_t *idx; - - - fin = _bam_tryopen(bamfile.c_str(), "rb", NULL); - idx = bam_index_load(bamfile.c_str()); // load BAM index - if (idx == 0) - Rcpp::stop("BAM index for '" + bamfile + "' unavailable\n"); - - // get target id - int tid = 0; - while(regionChr.compare(fin->header->target_name[tid]) != 0 && tid+1header->n_targets) - tid++; - - if(regionChr.compare(fin->header->target_name[tid]) != 0) - Rcpp::stop("could not find target '" + regionChr + "' in bam header of " + bamfile + ".\n"); - - - // define a pointer to a callback retrival function - int (*collectRegionData)(const bam1_t *hit, void *data); - - // choose callback function for data parsing based on aligner used - switch(aligner) { - case QuasR: - collectRegionData = &collectRegionData_QsBism; - break; - case Bismark: - collectRegionData = &collectRegionData_QsBism; - break; - case BISCUIT: - collectRegionData = &collectRegionData_Bisq; - break; - default: - Rcpp::stop("Only alignments generated by QuasR, Bismark or BISCUIT can be analyzed\n"); - } - - // call corresponding retrieval function to store alignments in region - bam_fetch(fin->x.bam, idx, tid, regionStart, regionEnd, pnts, collectRegionData); - - // clean bam file objects - bam_index_destroy(idx); - samclose(fin); - - return(1); + + + bam1_t *hit = bam_init1(); + samfile_t *fin; + bam_index_t *idx; + + + fin = _bam_tryopen(bamfile.c_str(), "rb", NULL); + idx = bam_index_load(bamfile.c_str()); // load BAM index + if (idx == 0) + Rcpp::stop("BAM index for '" + bamfile + "' unavailable\n"); + + // get target id + int tid = 0; + while(regionChr.compare(fin->header->target_name[tid]) != 0 && tid+1header->n_targets) + tid++; + + if(regionChr.compare(fin->header->target_name[tid]) != 0) + Rcpp::stop("could not find target '" + regionChr + "' in bam header of " + bamfile + ".\n"); + + + // define a pointer to a callback retrival function + int (*collectRegionData)(const bam1_t *hit, void *data); + + // choose callback function for data parsing based on aligner used + switch(aligner) { + case QuasR: + collectRegionData = &collectRegionData_QsBism; + break; + case Bismark: + collectRegionData = &collectRegionData_QsBism; + break; + case BISCUIT: + collectRegionData = &collectRegionData_Bisq; + break; + default: + Rcpp::stop("Only alignments generated by QuasR, Bismark or BISCUIT can be analyzed\n"); + } + + // call corresponding retrieval function to store alignments in region + bam_fetch(fin->x.bam, idx, tid, regionStart, regionEnd, pnts, collectRegionData); + + // clean bam file objects + bam_index_destroy(idx); + samclose(fin); + + return(1); } + +string getFragConfig(const bam1_t *hit){ + // get configuration of the fragment + string frag_config; + if(bam_is_first(hit)){ + if(bam_is_rev(hit)) + frag_config = "R1-"; + else + frag_config = "R1+"; + if(bam_is_proper_pair(hit)){ + if(bam_is_mrev(hit)) + frag_config += "R2-"; + else + frag_config += "R2+"; + } + } else if(bam_is_second(hit)){ + if(bam_is_proper_pair(hit)){ + if(bam_is_rev(hit)) + frag_config = "R1-"; + else + frag_config = "R1+"; + + if(bam_is_rev(hit)) + frag_config += "R2-"; + else + frag_config += "R2+"; + } else{ + std::cout<<"WARNING!!! Unexpected fragment configuration!"< core.pos; // leftmost position in reference - int ref_end = bam_calend(&(hit->core),cigar); // rightmost position in reference - bool is_read_rev = bam_is_rev(hit); // is read on "+" strand - bool is_mate_rev = bam_is_mrev(hit); - bool is_second = bam_is_second(hit); // is it a second mate - - // choose strand to get contexts for - seekMismType msmType; - if(is_second){ // parsing second mate in PE data, strand determined by first mate - msmType = is_mate_rev ? GtoA : CtoT; - } else{ // parsing first mate or SE data, strand determined by read itself - msmType = is_read_rev ? GtoA : CtoT; - } - - // for testing only - //Rcpp::Rcout<<"qname="< context_vec = {GCH, WCG, bisC, otherC}; - for(int i = 0; i < context_vec.size(); ++i){ - // get positions for the context in reference - vector cntx_rpos_vec = pdata->refseq_info->getPositionsBetween(ref_start,ref_end,context_vec[i], msmType == CtoT); - - vector rpos_vec; - vector dat_vec; - vector isplusstrand_vec; - - for(int cind = 0; cind < cntx_rpos_vec.size(); ++cind){ - - // get query position - int qpos = _fromRefToQueryPos(ref_start, - cntx_rpos_vec[cind], - hit->core.n_cigar, - cigar); - // get nucleotide in query - string qseq = nt16_table[bam1_seqi(hitseq, qpos)]; - - // check if C->T mismatch if msmType == CtoT or GtoA otherwise. query on - in BAM are reverse complement - switch(msmType) { - case CtoT: - { - if(qseq == "C"){ // methylated or unprotected - // add position on reference - rpos_vec.push_back(cntx_rpos_vec[cind]); - // add data - dat_vec.push_back(context_vec[i] == GCH ? 0 : 1); - // add strand - isplusstrand_vec.push_back(!is_read_rev); - - } else if(qseq == "T"){ // unmethylated or protected - // add position on reference - rpos_vec.push_back(cntx_rpos_vec[cind]); - // add data - dat_vec.push_back(context_vec[i] == GCH ? 1 : 0); - // add strand - isplusstrand_vec.push_back(!is_read_rev); - } - break; - } - case GtoA: - { - if(qseq == "G"){ // methylated or unprotected - // add position on reference - rpos_vec.push_back(cntx_rpos_vec[cind]); - // add data - dat_vec.push_back(context_vec[i] == GCH ? 0 : 1); - // add strand - isplusstrand_vec.push_back(!is_read_rev); - - } else if(qseq == "A"){ // unmethylated or protected - // add position on reference - rpos_vec.push_back(cntx_rpos_vec[cind]); - // add data - dat_vec.push_back(context_vec[i] == GCH ? 1 : 0); - // add strand - isplusstrand_vec.push_back(!is_read_rev); - } - - break; - } - } - - - } - - - - - // add data for the current context into regionData - pdata->reg_data->addFragContextData(pdata->prefix, - pdata->prefix + qname, - ref_start, - ref_end, - rpos_vec, - dat_vec, - isplusstrand_vec, - is_second, - context_vec[i]); - - // for testing only -// frg.addData(ref_start, -// ref_end, -// rpos_vec, -// dat_vec, -// isplusstrand_vec, -// is_second, -// context_vec[i]); - - } - - - - - - - return(0); - + + static obj_pnts *pdata = NULL; + + pdata = (obj_pnts*) data; + + + // check if alignment is valid + if(!isValidAln(hit,pdata)) + return(0); + // for testing only + //print_bs_alignment(hit,pdata); + + // parse alignment and create vectors for regData + string qname = string(bam1_qname(hit)); // frag name + uint32_t *cigar = bam_get_cigar(hit); // cigar array + uint8_t *hitseq = bam1_seq(hit); // query sequence + int ref_start = hit -> core.pos; // leftmost position in reference + int ref_end = bam_calend(&(hit->core),cigar); // rightmost position in reference + bool is_read_rev = bam_is_rev(hit); // is read on "-" strand + bool is_mate_rev = bam_is_mrev(hit); + bool is_second = bam_is_second(hit); // is it a second mate + + string frag_config = getFragConfig(hit); + + + + // choose strand to get contexts for + seekMismType msmType; + if(is_second){ // parsing second mate in PE data, strand determined by first mate + msmType = is_mate_rev ? GtoA : CtoT; + } else{ // parsing first mate or SE data, strand determined by read itself + msmType = is_read_rev ? GtoA : CtoT; + } + + // go across contexts and add data + //vector context_vec = {GCH, WCG, bisC, otherC}; + //for(int i = 0; i < context_vec.size(); ++i){ + + for(int i = 0; i < pdata->context_vec->size(); ++i){ + fragMapType currContext = pdata->context_vec->at(i); + + // get positions for the context in reference + vector cntx_rpos_vec = pdata->refseq_info->getPositionsBetween(ref_start,ref_end,currContext, msmType == CtoT); + + vector rpos_vec; + vector dat_vec; + vector isplusstrand_vec; + + for(int cind = 0; cind < cntx_rpos_vec.size(); ++cind){ + + // get query position + int qpos = _fromRefToQueryPos(ref_start, + cntx_rpos_vec[cind], + hit->core.n_cigar, + cigar); + // get nucleotide in query + string qseq = nt16_table[bam1_seqi(hitseq, qpos)]; + + // check if C->T mismatch if msmType == CtoT or GtoA otherwise. query on - in BAM are reverse complement + switch(msmType) { + case CtoT: + { + if(qseq == "C"){ // methylated or unprotected + // add position on reference + rpos_vec.push_back(cntx_rpos_vec[cind]); + // add data + //dat_vec.push_back(context_vec[i] == GCH ? 0 : 1); + dat_vec.push_back(currContext == GCH ? 0 : 1); + // add strand + isplusstrand_vec.push_back(!is_read_rev); + + } else if(qseq == "T"){ // unmethylated or protected + // add position on reference + rpos_vec.push_back(cntx_rpos_vec[cind]); + // add data + //dat_vec.push_back(context_vec[i] == GCH ? 1 : 0); + dat_vec.push_back(currContext == GCH ? 1 : 0); + // add strand + isplusstrand_vec.push_back(!is_read_rev); + } + break; + } + case GtoA: + { + if(qseq == "G"){ // methylated or unprotected + // add position on reference + rpos_vec.push_back(cntx_rpos_vec[cind]); + // add data + //dat_vec.push_back(context_vec[i] == GCH ? 0 : 1); + dat_vec.push_back(currContext == GCH ? 0 : 1); + // add strand + isplusstrand_vec.push_back(!is_read_rev); + + } else if(qseq == "A"){ // unmethylated or protected + // add position on reference + rpos_vec.push_back(cntx_rpos_vec[cind]); + // add data + //dat_vec.push_back(context_vec[i] == GCH ? 1 : 0); + dat_vec.push_back(currContext == GCH ? 1 : 0); + // add strand + isplusstrand_vec.push_back(!is_read_rev); + } + + break; + } + } + + + } + + // add data for the current context into regionData + pdata->reg_data->addFragContextData(pdata->prefix, + pdata->prefix + qname, + ref_start, + ref_end, + frag_config, + rpos_vec, + dat_vec, + isplusstrand_vec, + is_second, + currContext); + + } + return(0); } @@ -222,6 +247,8 @@ static int collectRegionData_Bisq(const bam1_t *hit, void *data) { bool is_mate_rev = bam_is_mrev(hit); bool is_second = bam_is_second(hit); // is it a second mate + string frag_config = getFragConfig(hit); + // extract YD tag uint8_t* ydTag = bam_aux_get(hit, "YD"); char ydValue; @@ -261,10 +288,13 @@ static int collectRegionData_Bisq(const bam1_t *hit, void *data) { // fragData frg; // go across contexts and add data - vector context_vec = {GCH, WCG, bisC, otherC}; - for(int i = 0; i < context_vec.size(); ++i){ + // vector context_vec = {GCH, WCG, bisC, otherC}; + // for(int i = 0; i < context_vec.size(); ++i){ + for(int i = 0; i < pdata->context_vec->size(); ++i){ + fragMapType currContext = pdata->context_vec->at(i); + // get positions for the context in reference - vector cntx_rpos_vec = pdata->refseq_info->getPositionsBetween(ref_start,ref_end,context_vec[i], msmType == CtoT); + vector cntx_rpos_vec = pdata->refseq_info->getPositionsBetween(ref_start,ref_end,currContext, msmType == CtoT); vector rpos_vec; vector dat_vec; @@ -288,7 +318,7 @@ static int collectRegionData_Bisq(const bam1_t *hit, void *data) { // add position on reference rpos_vec.push_back(cntx_rpos_vec[cind]); // add data - dat_vec.push_back(context_vec[i] == GCH ? 0 : 1); + dat_vec.push_back(currContext == GCH ? 0 : 1); // add strand isplusstrand_vec.push_back(!is_read_rev); @@ -296,7 +326,7 @@ static int collectRegionData_Bisq(const bam1_t *hit, void *data) { // add position on reference rpos_vec.push_back(cntx_rpos_vec[cind]); // add data - dat_vec.push_back(context_vec[i] == GCH ? 1 : 0); + dat_vec.push_back(currContext == GCH ? 1 : 0); // add strand isplusstrand_vec.push_back(!is_read_rev); } @@ -308,7 +338,7 @@ static int collectRegionData_Bisq(const bam1_t *hit, void *data) { // add position on reference rpos_vec.push_back(cntx_rpos_vec[cind]); // add data - dat_vec.push_back(context_vec[i] == GCH ? 0 : 1); + dat_vec.push_back(currContext == GCH ? 0 : 1); // add strand isplusstrand_vec.push_back(!is_read_rev); @@ -316,7 +346,7 @@ static int collectRegionData_Bisq(const bam1_t *hit, void *data) { // add position on reference rpos_vec.push_back(cntx_rpos_vec[cind]); // add data - dat_vec.push_back(context_vec[i] == GCH ? 1 : 0); + dat_vec.push_back(currContext == GCH ? 1 : 0); // add strand isplusstrand_vec.push_back(!is_read_rev); } @@ -330,34 +360,25 @@ static int collectRegionData_Bisq(const bam1_t *hit, void *data) { + if(dat_vec.size()>0){ + + // add data for the current context into regionData + pdata->reg_data->addFragContextData(pdata->prefix, + pdata->prefix + qname, + ref_start, + ref_end, + frag_config, + rpos_vec, + dat_vec, + isplusstrand_vec, + is_second, + currContext); + } + - // add data for the current context into regionData - pdata->reg_data->addFragContextData(pdata->prefix, - pdata->prefix + qname, - ref_start, - ref_end, - rpos_vec, - dat_vec, - isplusstrand_vec, - is_second, - context_vec[i]); - // for testing only - // frg.addData(ref_start, - // ref_end, - // rpos_vec, - // dat_vec, - // isplusstrand_vec, - // is_second, - // context_vec[i]); } - - - - - - return(0); } @@ -368,25 +389,36 @@ static int collectRegionData_Bisq(const bam1_t *hit, void *data) { // checks validity of an alignment bool isValidAln(const bam1_t *hit,const obj_pnts* pdata){ - - bool valaln = !bam_is_not_primary(hit) && - !bam_is_qc_fail(hit) && - !bam_is_pcr_dupl(hit) && - !bam_is_suppl(hit) && - ((int )(hit->core).qual >= pdata->mapqMin) && - ((int )(hit->core).qual <= pdata->mapqMax) && - ((int )(hit->core).l_qseq >= pdata->min_read_size) && - ((int )(hit->core).l_qseq <= pdata->max_read_size); - - // check insert length - if(bam_is_proper_pair(hit)){ - if(pdata->absIsizeMin > 0){ - valaln = valaln && llabs(((int )(hit->core).isize)) >= pdata->absIsizeMin; - } - if(pdata->absIsizeMax > 0){ - valaln = valaln && llabs(((int )(hit->core).isize)) <= pdata->absIsizeMax; - } - } + + // bool valaln = !bam_is_not_primary(hit) && + // !bam_is_qc_fail(hit) && + // !bam_is_pcr_dupl(hit) && + // !bam_is_suppl(hit) && + // ((int )(hit->core).qual >= pdata->mapqMin) && + // ((int )(hit->core).qual <= pdata->mapqMax) && + // ((int )(hit->core).l_qseq >= pdata->min_read_size) && + // ((int )(hit->core).l_qseq <= pdata->max_read_size); + // + // // check insert length + // if(bam_is_proper_pair(hit)){ + // if(pdata->absIsizeMin > 0){ + // valaln = valaln && llabs(((int )(hit->core).isize)) >= pdata->absIsizeMin; + // } + // if(pdata->absIsizeMax > 0){ + // valaln = valaln && llabs(((int )(hit->core).isize)) <= pdata->absIsizeMax; + // } + // } + + + bool valaln = !bam_is_not_primary(hit) && + !bam_is_qc_fail(hit) && + !bam_is_pcr_dupl(hit) && + !bam_is_suppl(hit) && + ((int )(hit->core).qual >= pdata->mapqMin) && + ((int )(hit->core).qual <= pdata->mapqMax); + + + return(valaln); } @@ -394,162 +426,162 @@ bool isValidAln(const bam1_t *hit,const obj_pnts* pdata){ // print alignment for debugging void print_bs_alignment(const bam1_t *hit,const obj_pnts* pdata){ - - // print data for one alignment - Rcpp::Rcout<<"====== "< core), cigar); int qlen = bam_cigar2qlen(hit->core.n_cigar, cigar); - uint8_t *hitseq = bam1_seq(hit); - - - // ## for Bismark meth and unmeth alphabet find positions in XM tag - // ## Bismark codes - // # z - C in CpG context - unmethylated - // # Z - C in CpG context - methylated - // # x - C in CHG context - unmethylated - // # X - C in CHG context - methylated - // # h - C in CHH context - unmethylated - // # H - C in CHH context - methylated - // # u - C in Unknown context (CN or CHN) - unmethylated - // # U - C in Unknown context (CN or CHN) - methylated - // # . - not a C or irrelevant position - - - // get XM tag if present - uint8_t *pxmtag = bam_aux_get(hit,"XM"); - string xm_tag; - if(pxmtag != NULL){ - int type = *pxmtag++; - if(type == 'Z') - xm_tag = string((char*)pxmtag); - } - - - // choose strand to get contexts for - seekMismType msmType; - if(is_second){ // parsing second mate in PE data, strand determined by first mate - msmType = is_mate_rev ? GtoA : CtoT; - } else{ // parsing first mate or SE data, strand determined by read itself - msmType = is_read_rev ? GtoA : CtoT; - } - - string queryseq, refseq, matchsymb,cntxstr; - for(int qpos = 0; qpos < qlen; ++qpos){ - int rpos = _fromQueryToRefPos(hit -> core.pos,qpos,hit -> core.n_cigar,cigar); - string rseq; - string qseq = nt16_table[bam1_seqi(hitseq, qpos)]; - - if(rpos != -1){ - rseq = pdata->refseq_info->getRefSubseq(rpos,1); - matchsymb += "|"; - } else { - rseq = "-"; - matchsymb += " "; - } - - queryseq += qseq; - refseq += rseq; - - // get context - short int mcntx = pdata->refseq_info->getContextForRefPos(rpos); - - switch (msmType) { - case CtoT: - { - if(mcntx == 0 || (qseq != "C" && qseq != "T")){ - cntxstr += "."; - } else{ - if(rseq != "C"){ - Rcpp::Rcout<<"print_bs_alignment: WARNING: Expect C in reference sequence but got "< core.pos,qpos,hit -> core.n_cigar,cigar); + string rseq; + string qseq = nt16_table[bam1_seqi(hitseq, qpos)]; + + if(rpos != -1){ + rseq = pdata->refseq_info->getRefSubseq(rpos,1); + matchsymb += "|"; + } else { + rseq = "-"; + matchsymb += " "; + } + + queryseq += qseq; + refseq += rseq; + + // get context + short int mcntx = pdata->refseq_info->getContextForRefPos(rpos); + + switch (msmType) { + case CtoT: + { + if(mcntx == 0 || (qseq != "C" && qseq != "T")){ + cntxstr += "."; + } else{ + if(rseq != "C"){ + Rcpp::Rcout<<"print_bs_alignment: WARNING: Expect C in reference sequence but got "< *context_vec; // pointer to a vector of contexts to retrieve data for } obj_pnts; @@ -53,6 +55,8 @@ static int collectRegionData_QsBism(const bam1_t *hit, void *data); // bam_fetch callback function for bam_fetch for data retrieval from BAM files produced by BISCUIT static int collectRegionData_Bisq(const bam1_t *hit, void *data); +string getFragConfig(const bam1_t *hit); + // checks validity of an alignment diff --git a/src/fragData-class.cpp b/src/fragData-class.cpp index 6cbf2ea..70feee9 100644 --- a/src/fragData-class.cpp +++ b/src/fragData-class.cpp @@ -4,43 +4,111 @@ fragData::fragData(){ fragStart = -1; fragEnd = -1; + fragConfig = ""; prefix = ""; }; // constructors/destructors fragData::fragData(const string& bampref, - const int& fragstart, - const int& fragend, - const vector& gch_pos, - const vector& gch_protect, - const vector& gch_onplusstrand, - const vector& wcg_pos, - const vector& wcg_protect, - const vector& wcg_onplusstrand, - const vector& bisc_pos, - const vector& bisc_protect, - const vector& bisc_onplusstrand, - const vector& otherc_pos, - const vector& otherc_protect, - const vector& otherc_onplusstrand + const int& fragstart, + const int& fragend, + const string& fragconfig, + const vector& gch_pos, + const vector& gch_protect, + const vector& gch_onplusstrand, + const vector& wcg_pos, + const vector& wcg_protect, + const vector& wcg_onplusstrand, + const vector& bisc_pos, + const vector& bisc_protect, + const vector& bisc_onplusstrand, + const vector& otherc_pos, + const vector& otherc_protect, + const vector& otherc_onplusstrand ){ // set coordinates - fragStart = fragstart; - fragEnd = fragend; - prefix = bampref; + // add GCH data - addData(bampref,fragstart, fragend, gch_pos, gch_protect, gch_onplusstrand, 0,GCH); + addData(bampref,fragstart, fragend, fragconfig, gch_pos, gch_protect, gch_onplusstrand, 0,GCH); // add WCG data - addData(bampref,fragstart, fragend, wcg_pos, wcg_protect, wcg_onplusstrand, 0,WCG); + addData(bampref,fragstart, fragend, fragconfig, wcg_pos, wcg_protect, wcg_onplusstrand, 0,WCG); // add bisC data - addData(bampref,fragstart, fragend, bisc_pos, bisc_protect, bisc_onplusstrand, 0,bisC); + addData(bampref,fragstart, fragend, fragconfig, bisc_pos, bisc_protect, bisc_onplusstrand, 0,bisC); // add otherC data - addData(bampref,fragstart, fragend, otherc_pos, otherc_protect, otherc_onplusstrand, 0,otherC); + addData(bampref,fragstart, fragend, fragconfig, otherc_pos, otherc_protect, otherc_onplusstrand, 0,otherC); }; fragData::~fragData(){}; + +// add data into the map +bool fragData::addData(const string& bampref, + const int& fragstart, + const int& fragend, + const string& fragconfig, + const vector& rposvec, + const vector& protectvec, + const vector& isonplusstrandvec, + const bool& isSecond,// data for same positions on the second read are ignored + fragMapType whichMap // must be enumerator GCH (protect), WCG (endogenous), bisC or otherC +){ + if(rposvec.size() != protectvec.size()|| rposvec.size() != isonplusstrandvec.size()){ + Rcpp::Rcout<<"fragProtectData::addData: sizes of rposvec, protectvec and isonplusstrandvec do not match."; + return(0); + } + + // change fragment coordinates if neccessary + if(fragStart < 0 || fragstart < fragStart) + fragStart = fragstart; + if(fragEnd < 0 || fragend > fragEnd) + fragEnd = fragend; + + if(fragConfig.size() == 0) + fragConfig = fragconfig; + + prefix = bampref; + + // select which map to add the data to + map *mdat; // pointer to methylation/protection map + map *sdat; // pointer to strand map + switch(whichMap) { + case GCH: + mdat = &gchProtect; + sdat = &gchIsOnPlusStrand; + break; + case WCG: + mdat = &wcgMeth; + sdat = &wcgIsOnPlusStrand; + break; + case bisC: + mdat = &bisCMeth; + sdat = &bisCIsOnPlusStrand; + break; + case otherC: + mdat = &otherCMeth; + sdat = &otherCIsOnPlusStrand; + break; + } + + for(int i = 0; i < rposvec.size(); ++i){ + // check whether position already in the map + if(mdat->find(rposvec[i]) == mdat->end()){ // if not found then insert data + mdat->insert(make_pair(rposvec[i], protectvec[i])); + sdat->insert(make_pair(rposvec[i], isonplusstrandvec[i])); + } + else if(!isSecond){ // if found and data is not for second mate then overwrite. + (*mdat)[rposvec[i]] = protectvec[i]; + (*sdat)[rposvec[i]] = isonplusstrandvec[i]; + } + } + return(1); +}; + + + + + // get vector of positions in a map vector fragData::getPositions(fragMapType whichMap){ // select which map to get the data from @@ -72,7 +140,7 @@ vector fragData::getPositions(fragMapType whichMap){ // get data at position bool fragData::getDataAt(int rpos, - fragMapType whichMap){ + fragMapType whichMap){ // select which map to get the data from map *mdat; // pointer to methylation/protection map @@ -100,7 +168,7 @@ bool fragData::getDataAt(int rpos, // get isOnPlus bool fragData::isDataOnPlusStrand(int rpos, - fragMapType whichMap){ + fragMapType whichMap){ // select which map to get the data from map *sdat; // pointer to strand map @@ -127,63 +195,6 @@ bool fragData::isDataOnPlusStrand(int rpos, }; -// add data into the map -bool fragData::addData(const string& bampref, - const int& fragstart, - const int& fragend, - const vector& rposvec, - const vector& protectvec, - const vector& isonplusstrandvec, - const bool& isSecond,// data for same positions on the second read are ignored - fragMapType whichMap // must be enumerator GCH (protect), WCG (endogenous), bisC or otherC -){ - if(rposvec.size() != protectvec.size()|| rposvec.size() != isonplusstrandvec.size()){ - Rcpp::Rcout<<"fragProtectData::addData: sizes of rposvec, protectvec and isonplusstrandvec do not match."; - return(0); - } - - // change fragment coordinates if neccessary - if(fragStart < 0 || fragstart < fragStart) - fragStart = fragstart; - if(fragEnd < 0 || fragend > fragEnd) - fragEnd = fragend; - prefix = bampref; - - // select which map to add the data to - map *mdat; // pointer to methylation/protection map - map *sdat; // pointer to strand map - switch(whichMap) { - case GCH: - mdat = &gchProtect; - sdat = &gchIsOnPlusStrand; - break; - case WCG: - mdat = &wcgMeth; - sdat = &wcgIsOnPlusStrand; - break; - case bisC: - mdat = &bisCMeth; - sdat = &bisCIsOnPlusStrand; - break; - case otherC: - mdat = &otherCMeth; - sdat = &otherCIsOnPlusStrand; - break; - } - - for(int i = 0; i < rposvec.size(); ++i){ - // check whether position already in the map - if(mdat->find(rposvec[i]) == mdat->end()){ // if not found then insert data - mdat->insert(make_pair(rposvec[i], protectvec[i])); - sdat->insert(make_pair(rposvec[i], isonplusstrandvec[i])); - } - else if(!isSecond){ // if found and data is not for second mate then overwrite. - (*mdat)[rposvec[i]] = protectvec[i]; - (*sdat)[rposvec[i]] = isonplusstrandvec[i]; - } - } - return(1); -}; // convert data to strings vector fragData::getStringData() { @@ -261,7 +272,7 @@ void fragData::addCountsToProtectStatsTable(protectStatsTable& pStatsTable){ // 2. function to clip data. returns 1 if fragment was clipped or 0 otherwise int fragData::clipProtectData(const int& clip_until_nbg, // clips from both ends until this number of 0 is met - const double& max_protect_frac // keeps fragment if percentage of 1s in a fragment is above this number + const double& max_protect_frac // keeps fragment if percentage of 1s in a fragment is above this number ){ if(gchProtect.size() == 0) return(0); @@ -318,7 +329,7 @@ int fragData::clipProtectData(const int& clip_until_nbg, // clips from both ends }; -double fragData::get_bisC_meth(int min_size){ +double fragData::get_bisC_meth(const int& min_size){ if(bisCMeth.size() < min_size) return(0); @@ -349,3 +360,291 @@ string fragData::getStringEncodingForUniqueness(){ }; +string fragData::getFragConfig(){ + return(fragConfig); +}; + +std::vector fragData::getFragDataStartEndNpoints(fragMapType whichMap){ + int fragDataStart = -1, fragDataEnd = -1, n_points = -1; + + if(whichMap != allC){ + // select which map to work on + map *mdat; // pointer to methylation/protection map + + switch(whichMap) { + case GCH: + mdat = &gchProtect; + break; + case WCG: + mdat = &wcgMeth; + break; + case bisC: + mdat = &bisCMeth; + break; + case otherC: + mdat = &otherCMeth; + break; + } + + if(!mdat->empty()){ + fragDataStart = mdat->begin()->first; + fragDataEnd = mdat->rbegin()->first; + n_points = mdat->size(); + } + + } else { + // if allC is requested, get the minimum/maximum of positions and total number of points across all maps + if(!gchProtect.empty()){ + fragDataStart = gchProtect.begin()->first; + fragDataEnd = gchProtect.rbegin()->first; + n_points = gchProtect.size(); + } + + if(!wcgMeth.empty()){ + if(wcgMeth.begin()->first < fragDataStart) + fragDataStart = wcgMeth.begin()->first; + if(wcgMeth.rbegin()->first > fragDataEnd) + fragDataEnd = wcgMeth.rbegin()->first; + n_points = n_points + wcgMeth.size(); + } + + if(!bisCMeth.empty()){ + if(bisCMeth.begin()->first < fragDataStart) + fragDataStart = bisCMeth.begin()->first; + if(bisCMeth.rbegin()->first > fragDataEnd) + fragDataEnd = bisCMeth.rbegin()->first; + n_points = n_points + bisCMeth.size(); + } + + if(!otherCMeth.empty()){ + if(otherCMeth.begin()->first < fragDataStart) + fragDataStart = otherCMeth.begin()->first; + if(otherCMeth.rbegin()->first > fragDataEnd) + fragDataEnd = otherCMeth.rbegin()->first; + n_points = n_points + otherCMeth.size(); + } + + } + + std::vector vec_out={fragDataStart,fragDataEnd,n_points}; + return(vec_out); +}; + + + +// is the required map empty? +bool fragData::isEmpty(fragMapType whichMap){ + if(whichMap != allC){ + // select which map to work on + map *mdat; // pointer to methylation/protection map + switch(whichMap) { + case GCH: + mdat = &gchProtect; + break; + case WCG: + mdat = &wcgMeth; + break; + case bisC: + mdat = &bisCMeth; + break; + case otherC: + mdat = &otherCMeth; + break; + } + + return(mdat->empty()); + } else { + return(gchProtect.empty() && wcgMeth.empty() && bisCMeth.empty() && otherCMeth.empty()); + } +}; + + +// get the data wrapped int Rcpp::IntegerVector; +Rcpp::List fragData::getFragDataVector(fragMapType whichMap){ + + vector dataPosNpoints = getFragDataStartEndNpoints(whichMap); + + int fragDataStart = dataPosNpoints[0]; + int fragDataEnd = dataPosNpoints[1]; + + Rcpp::IntegerVector fragDataVec(fragDataEnd - fragDataStart + 1); + //fragDataVec.fill(fragDataVec.begin(),fragDataVec.end(),NA_INTEGER); + fragDataVec.fill(NA_INTEGER); + if(whichMap != allC){ + // select which map to work on + map *mdat; // pointer to methylation/protection map + switch(whichMap) { + case GCH: + mdat = &gchProtect; + break; + case WCG: + mdat = &wcgMeth; + break; + case bisC: + mdat = &bisCMeth; + break; + case otherC: + mdat = &otherCMeth; + break; + } + + for(map::iterator it = mdat->begin(); it != mdat->end(); ++it){ + fragDataVec(it->first - fragDataStart) = it->second; + } + + + } else { + + for(int rpos = fragDataStart; rpos <= fragDataEnd; ++rpos){ + // hierarchy GCH > WCG > bisC>otherC + if(gchProtect.find(rpos) != gchProtect.end()){ + fragDataVec(rpos - fragDataStart) = 1-gchProtect[rpos]; + } else if(wcgMeth.find(rpos) != wcgMeth.end()){ + fragDataVec(rpos - fragDataStart) = wcgMeth[rpos]; + } else if(bisCMeth.find(rpos) != bisCMeth.end()){ + fragDataVec(rpos - fragDataStart) = bisCMeth[rpos]; + } else if(otherCMeth.find(rpos) != otherCMeth.end()){ + fragDataVec(rpos - fragDataStart) = otherCMeth[rpos]; + } + } + } + + Rcpp::List data_out = Rcpp::List::create(Rcpp::Named("fragDataStart") = fragDataStart, + Rcpp::Named("fragDataEnd") = fragDataEnd, + Rcpp::Named("nDataPoints") = dataPosNpoints[2], + Rcpp::Named("startData") = fragDataVec[0], + Rcpp::Named("endData") = fragDataVec[fragDataVec.size() - 1], + Rcpp::Named("fragDataVec") = fragDataVec); + + return(data_out); + +}; + + +// get the Rle compressed data wrapped int Rcpp::IntegerVector; +Rcpp::List fragData::getFragDataRle(fragMapType whichMap){ + + vector dataPosNpoints = getFragDataStartEndNpoints(whichMap); + + int fragDataStart = dataPosNpoints[0]; + int fragDataEnd = dataPosNpoints[1]; + + // Initialize vectors for values and lengths + Rcpp::IntegerVector values; + Rcpp::IntegerVector lengths; + + if(whichMap != allC){ + // select which map to work on + map *mdat; // pointer to methylation/protection map + switch(whichMap) { + case GCH: + mdat = &gchProtect; + break; + case WCG: + mdat = &wcgMeth; + break; + case bisC: + mdat = &bisCMeth; + break; + case otherC: + mdat = &otherCMeth; + break; + } + + // iterator to traverse the map + map::iterator it = mdat->begin(); + int current_value = it->second; // Initial value + int current_pos = it->first - 1; // initial position. the -1 is made to deal with the first iteration in order to set the correct run length + int run_length = 0; // Initial run length + + // Iterate over the map to create RLE data + for (; it != mdat->end(); ++it) { + if(it->first == current_pos + 1){ + if (it->second == current_value) { + // continue the run if the value is the same and position changed by 1 + run_length++; + current_pos = it->first; + } else{ + // store the current run + values.push_back(current_value); + lengths.push_back(run_length); + + // reset for the new run + current_value = it->second; + run_length = 1; + } + } else{ // positions were interupted by a run of NAs + // store the current run + values.push_back(current_value); + lengths.push_back(run_length); + // store run of NAs + values.push_back(NA_INTEGER); + lengths.push_back(it->first - current_pos - 1); + // reset for the new run + current_value = it->second; + current_pos = it->first; + run_length = 1; + } + } + // store the last run + values.push_back(current_value); + lengths.push_back(run_length); + + } else { + + int prev_value = -1; + int run_length = 0; + for(int rpos = fragDataStart; rpos <= fragDataEnd; ++rpos){ + int current_value = -1; + // hierarchy GCH > WCG > bisC>otherC + if(gchProtect.find(rpos) != gchProtect.end()){ + current_value = 1-gchProtect[rpos]; + } else if(wcgMeth.find(rpos) != wcgMeth.end()){ + current_value = wcgMeth[rpos]; + } else if(bisCMeth.find(rpos) != bisCMeth.end()){ + current_value = bisCMeth[rpos]; + } else if(otherCMeth.find(rpos) != otherCMeth.end()){ + current_value = otherCMeth[rpos]; + } + + if(current_value != prev_value){ + // store the current run + if(run_length > 0){ + if(prev_value == -1) + values.push_back(NA_INTEGER); + else + values.push_back(prev_value); + lengths.push_back(run_length); + } + + // reset for the new run + prev_value = current_value; + run_length = 1; + } else{ + // continute the run + //prev_value = current_value; + run_length++; + } + } + // store the last run + values.push_back(prev_value); + lengths.push_back(run_length); + + + } + + + // Load the Rle constructor function from the IRanges package + Rcpp::Function Rle("Rle", Rcpp::Environment::namespace_env("IRanges")); + SEXP rle_object = Rle(Rcpp::_["values"] = values, Rcpp::_["lengths"] = lengths); + + Rcpp::List data_out = Rcpp::List::create(Rcpp::Named("fragDataStart") = fragDataStart, + Rcpp::Named("fragDataEnd") = fragDataEnd, + Rcpp::Named("nDataPoints") = dataPosNpoints[2], + Rcpp::Named("startData") = values[0], + Rcpp::Named("endData") = values[values.size() - 1], + Rcpp::Named("rle_data") = rle_object); + return(data_out); +}; + + diff --git a/src/fragData-class.h b/src/fragData-class.h index eb7e839..23d45e4 100644 --- a/src/fragData-class.h +++ b/src/fragData-class.h @@ -28,12 +28,15 @@ class fragData{ int fragStart; int fragEnd; + // configuration of a fragment, i.e. R1+R2- etc. + string fragConfig; // file prefix where the fragment came from string prefix; // protection at GCH sites map gchProtect; // map: 1-based position -> protection {0 - unprotected, 1 - protected} - map gchIsOnPlusStrand; // 1 - "+" strand, 0 - "-" strand + map gchIsOnPlusStrand; // 1 - "+" strand, 0 - "-" strand. + // The reason that the strand is encoded as an vector is that fragData is a container for a fragment consisting of both reads R1 and R2 // methylation of endogenous CpG at WCG sites map wcgMeth; @@ -55,6 +58,7 @@ class fragData{ fragData(const string& bampref, const int& fragstart, const int& fragend, + const string& fragconfig, const vector& gch_pos, const vector& gch_protect, const vector& gch_onplusstrand, @@ -84,10 +88,12 @@ class fragData{ fragMapType whichMap); + // add data into the map bool addData(const string& bampref, const int& fragstart, const int& fragend, + const string& fragconfig, const vector& rposvec, const vector& protectvec, const vector& isonplusstrandvec, @@ -98,6 +104,27 @@ class fragData{ // convert data to strings vector getStringData(); + string getFragConfig(); + + // // get most left position of the first point with data + // int getFragDataStart(fragMapType whichMap); + // + // // get most right position of the last point with data + // int getFragDataEnd(fragMapType whichMap); + + // get positions where the data start, end and number of informative positions + std::vector getFragDataStartEndNpoints(fragMapType whichMap); + + + + + // get the data, npoints and left and right datapoints; + Rcpp::List getFragDataVector(fragMapType whichMap); + + + // get the Rle compressed data, npoints and left and right datapoints; + Rcpp::List getFragDataRle(fragMapType whichMap); + // print void print(); @@ -117,8 +144,9 @@ class fragData{ ); - double get_bisC_meth(int min_size); + double get_bisC_meth(const int& min_size); + bool isEmpty(fragMapType whichMap); // get string representation of fragData string getStringEncodingForUniqueness(); diff --git a/src/functions_export_rcpp.cpp b/src/functions_export_rcpp.cpp index 40cd79e..8725535 100644 --- a/src/functions_export_rcpp.cpp +++ b/src/functions_export_rcpp.cpp @@ -2,6 +2,149 @@ +Rcpp::List fetch_molec_data_list_from_bams_cpp(const Rcpp::CharacterVector& whichContext, + const Rcpp::CharacterVector& infiles, + const Rcpp::CharacterVector& regionChr, + const Rcpp::IntegerVector& regionStart, + const Rcpp::IntegerVector& regionEnd, + const Rcpp::CharacterVector& seqstring, + const Rcpp::IntegerVector& seqStart, + const Rcpp::IntegerVector& seqEnd, + const Rcpp::LogicalVector& remove_nonunique, + const Rcpp::IntegerVector& clip_until_nbg, + const Rcpp::NumericVector& max_protect_frac, + const Rcpp::NumericVector& max_bisC_meth, + const Rcpp::IntegerVector& min_bisC_size, + const Rcpp::IntegerVector& mapqMin, + const Rcpp::IntegerVector& mapqMax, + const Rcpp::CharacterVector& alignerUsed, + const Rcpp::IntegerVector& min_frag_data_len, + const Rcpp::NumericVector& min_frag_data_dens, + const Rcpp::LogicalVector& data_as_rle){ + + // convert input parameters + string whichContext_ = Rcpp::as(whichContext); + fragMapType whichMap; + if(whichContext_ == "GCH") + whichMap = GCH; + else if(whichContext_ == "WCG") + whichMap = WCG; + else if(whichContext_ == "bisC") + whichMap = bisC; + else if(whichContext_ == "otherC") + whichMap = otherC; + else if(whichContext_ == "allC") + whichMap = allC; + else + Rcpp::stop("whichContext must be GCH, WCG, bisC, otherC or allC"); + + string alignerUsed_ = Rcpp::as(alignerUsed); + alignerType aligner; + if(alignerUsed_ == "QuasR") + aligner = QuasR; + else if(alignerUsed_ == "Bismark") + aligner = Bismark; + else if(alignerUsed_ == "BISCUIT") + aligner = BISCUIT; + else + Rcpp::stop("Only alignments generated by QuasR, Bismark or BISCUIT can be analyzed\n"); + + + vector infiles_ = Rcpp::as >(infiles); + string regionChr_ = Rcpp::as(regionChr); + int regionStart_ = Rcpp::as(regionStart); + int regionEnd_ = Rcpp::as(regionEnd); + string seqstring_ = Rcpp::as(seqstring); + int seqStart_ = Rcpp::as(seqStart); + int seqEnd_ = Rcpp::as(seqEnd); + + bool remove_nonunique_ = Rcpp::as(remove_nonunique); + int clip_until_nbg_ = Rcpp::as(clip_until_nbg); + double max_protect_frac_ = Rcpp::as(max_protect_frac); + double max_bisC_meth_ = Rcpp::as(max_bisC_meth); + int min_bisC_size_ = Rcpp::as(min_bisC_size); + int mapqMin_ = Rcpp::as(mapqMin); + int mapqMax_ = Rcpp::as(mapqMax); + + int min_frag_data_len_ = Rcpp::as(min_frag_data_len); + double min_frag_data_dens_ = Rcpp::as(min_frag_data_dens); + + bool data_as_rle_ = Rcpp::as(data_as_rle); + + // initialize container for reference sequence + refSeqInfo refseqInfo(seqstring_, seqStart_,seqEnd_); + + // initialize data structure for keeping object pointers + obj_pnts pnts; + // store pointer to reference sequence container + pnts.refseq_info = &refseqInfo; + // store min and amp MAPQ + pnts.mapqMin = mapqMin_; + pnts.mapqMax = mapqMax_; + + // store which contexts to parse + vector cont_vec; + if(whichMap == bisC){ + cont_vec = {bisC}; + } else if(whichMap == allC){ + cont_vec = {GCH, WCG, bisC, otherC}; + } else { + if(max_bisC_meth_ < 1.0) // if filtering bisulfite failed reads needed we add bisC context + cont_vec = {whichMap, bisC}; + else + cont_vec = {whichMap}; + } + pnts.context_vec = &cont_vec; + // create container for fragments and counters + regionData regData(regionChr_,regionStart_,regionEnd_); + pnts.reg_data = ®Data; + + int n_fetched = 0, n_nonUnique = 0, n_bisfailed = 0, n_analysed = 0, n_clipped = 0; + + // fetch data from all bam files + for(int i = 0; i < infiles_.size(); ++i){ + pnts.prefix = "f" + to_string(i + 1)+":"; + // fetch data from bam file + fetch_data_from_bam(infiles_[i], + regionChr_, + regionStart_, + regionEnd_, + aligner, + &pnts); + } + + n_fetched = regData.size(); + + // post-processing of the data, i.e. filtering by uniqueness, bisulfite efficiency, clipping and removal of fragments with insufficient data + // function below returns the following stats + // {n_nonUnique,n_bisfailed,n_clipped,n_insuffFailed,n_analysed} + vector filt_stats = regData.postProcessFragData(remove_nonunique_, + clip_until_nbg_, + max_protect_frac_, + max_bisC_meth_, + min_bisC_size_, + min_frag_data_len_, + min_frag_data_dens_, + whichMap); + + + // create List with fetched reads/fragments for R with the following information + // start (genomic position of the first data point. note, not the alignment start!) + // end last position of the data points + // R1strand strand where the R1 read aligned + // qname - name of the fragment + // GCH_fragData_list list which contains vectors with 0 (accessible), 1(protected) and NA, non-informative + Rcpp::List data_out; + if(data_as_rle_) + data_out = regData.getFragDataRle(whichMap); + else + data_out = regData.getFragDataList(whichMap); + return(data_out); +} + + + + Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, @@ -17,11 +160,9 @@ Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, - const Rcpp::IntegerVector& absIsizeMin, - const Rcpp::IntegerVector& absIsizeMax, - const Rcpp::IntegerVector& min_read_size, - const Rcpp::IntegerVector& max_read_size, - const Rcpp::CharacterVector& alignerUsed){ + const Rcpp::CharacterVector& alignerUsed, + const Rcpp::IntegerVector& min_frag_data_len, + const Rcpp::NumericVector& min_frag_data_dens){ string alignerUsed_ = Rcpp::as(alignerUsed); alignerType aligner; @@ -51,11 +192,9 @@ Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, int min_bisC_size_ = Rcpp::as(min_bisC_size); int mapqMin_ = Rcpp::as(mapqMin); int mapqMax_ = Rcpp::as(mapqMax); - int absIsizeMin_ = Rcpp::as(absIsizeMin); - int absIsizeMax_ = Rcpp::as(absIsizeMax); - int min_read_size_ = Rcpp::as(min_read_size); - int max_read_size_ = Rcpp::as(max_read_size); + int min_frag_data_len_ = Rcpp::as(min_frag_data_len); + double min_frag_data_dens_ = Rcpp::as(min_frag_data_dens); // initialize container for reference sequence refSeqInfo refseqInfo(seqstring_, seqStart_,seqEnd_); @@ -67,18 +206,18 @@ Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, // store min and max MAPQ pnts.mapqMin = mapqMin_; pnts.mapqMax = mapqMax_; - // store min and max tlen - pnts.absIsizeMin = absIsizeMin_; - pnts.absIsizeMax = absIsizeMax_; - // store min and max read lengths - pnts.min_read_size = min_read_size_; - pnts.max_read_size = max_read_size_; + // store which contexts to parse + vector cont_vec; + if(max_bisC_meth_ < 1.0) // if filtering bisulfite failed reads needed we add bisC context + cont_vec = {GCH, bisC}; + else + cont_vec = {GCH}; + pnts.context_vec = &cont_vec; // initialize container for cooccurrence counts and counters coocCntTable cTable(max_spac_); int n_fetched = 0, n_nonUnique = 0, n_bisfailed = 0, n_analysed = 0, n_clipped = 0; - // for each bam file fetch data and add counts to cooccurrence table for(int i = 0; i < infiles_.size(); ++i){ // initialize container for fragments in region @@ -94,26 +233,22 @@ Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, &pnts); n_fetched += regData.size(); - // get qnames on non-unique fragments - set nonUnfrags; - if(remove_nonunique_) - nonUnfrags = regData.getNonUniqueQnames(); - n_nonUnique += nonUnfrags.size(); - // get qnames with failed bisulfite conversion - set failedBisConvfrags; - if(max_bisC_meth_ < 1.0) - failedBisConvfrags = regData.getFailedBisConvQnames(max_bisC_meth_,min_bisC_size_); - n_bisfailed += failedBisConvfrags.size(); - - // erase fragments which nonunique and failed bisConv - std::set remove_qnames(nonUnfrags); - remove_qnames.insert(failedBisConvfrags.begin(), failedBisConvfrags.end()); - n_analysed += regData.eraseQnames(remove_qnames); - - // clip fragments - if(clip_until_nbg_ > 0) - n_clipped += regData.clipProtectData(clip_until_nbg_,max_protect_frac_); + // post-processing of the data, i.e. filtering by uniqueness, bisulfite efficiency, clipping and removal of fragments with insufficient data + // function below returns the following stats + // {n_nonUnique,n_bisfailed,n_clipped,n_insuffFailed,n_analysed} + vector filt_stats = regData.postProcessFragData(remove_nonunique_, + clip_until_nbg_, + max_protect_frac_, + max_bisC_meth_, + min_bisC_size_, + min_frag_data_len_, + min_frag_data_dens_, + GCH); + n_nonUnique += filt_stats[0]; + n_bisfailed += filt_stats[1]; + n_clipped += filt_stats[2]; + n_analysed += filt_stats[4]; // add counts to table regData.addCountsToCoocTable(cTable); @@ -161,11 +296,9 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, - const Rcpp::IntegerVector& absIsizeMin, - const Rcpp::IntegerVector& absIsizeMax, - const Rcpp::IntegerVector& min_read_size, - const Rcpp::IntegerVector& max_read_size, - const Rcpp::CharacterVector& alignerUsed){ + const Rcpp::CharacterVector& alignerUsed, + const Rcpp::IntegerVector& min_frag_data_len, + const Rcpp::NumericVector& min_frag_data_dens){ // convert input parameters string whichContext_ = Rcpp::as(whichContext); @@ -210,10 +343,9 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon int min_bisC_size_ = Rcpp::as(min_bisC_size); int mapqMin_ = Rcpp::as(mapqMin); int mapqMax_ = Rcpp::as(mapqMax); - int absIsizeMin_ = Rcpp::as(absIsizeMin); - int absIsizeMax_ = Rcpp::as(absIsizeMax); - int min_read_size_ = Rcpp::as(min_read_size); - int max_read_size_ = Rcpp::as(max_read_size); + + int min_frag_data_len_ = Rcpp::as(min_frag_data_len); + double min_frag_data_dens_ = Rcpp::as(min_frag_data_dens); // initialize container for reference sequence refSeqInfo refseqInfo(seqstring_, seqStart_,seqEnd_); @@ -224,12 +356,21 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon // store min and amp MAPQ pnts.mapqMin = mapqMin_; pnts.mapqMax = mapqMax_; - // store min and max tlen - pnts.absIsizeMin = absIsizeMin_; - pnts.absIsizeMax = absIsizeMax_; - // store min and max read lengths - pnts.min_read_size = min_read_size_; - pnts.max_read_size = max_read_size_; + + // store which contexts to parse + vector cont_vec; + if(whichMap == bisC){ + cont_vec = {bisC}; + } else if(whichMap == allC){ + cont_vec = {GCH, WCG, bisC, otherC}; + } else { + if(max_bisC_meth_ < 1.0) // if filtering bisulfite failed reads needed we add bisC context + cont_vec = {whichMap, bisC}; + else + cont_vec = {whichMap}; + } + pnts.context_vec = &cont_vec; + // create container for fragments and counters regionData regData(regionChr_,regionStart_,regionEnd_); pnts.reg_data = ®Data; @@ -249,27 +390,21 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon n_fetched = regData.size(); - // get qnames on non-unique fragments - set nonUnfrags; - if(remove_nonunique_) - nonUnfrags = regData.getNonUniqueQnames(); - n_nonUnique = nonUnfrags.size(); - - // get qnames with failed bisulfite conversion - set failedBisConvfrags; - if(max_bisC_meth_ < 1.0) - failedBisConvfrags = regData.getFailedBisConvQnames(max_bisC_meth_,min_bisC_size_); - n_bisfailed = failedBisConvfrags.size(); - - // erase fragments which nonunique and failed bisConv - std::set remove_qnames(nonUnfrags); - remove_qnames.insert(failedBisConvfrags.begin(), failedBisConvfrags.end()); - n_analysed = regData.eraseQnames(remove_qnames); - - // clip fragments - if(clip_until_nbg_ > 0) - n_clipped = regData.clipProtectData(clip_until_nbg_,max_protect_frac_); - + // post-processing of the data, i.e. filtering by uniqueness, bisulfite efficiency, clipping and removal of fragments with insufficient data + // function below returns the following stats + // {n_nonUnique,n_bisfailed,n_clipped,n_insuffFailed,n_analysed} + vector filt_stats = regData.postProcessFragData(remove_nonunique_, + clip_until_nbg_, + max_protect_frac_, + max_bisC_meth_, + min_bisC_size_, + min_frag_data_len_, + min_frag_data_dens_, + whichMap); + n_nonUnique += filt_stats[0]; + n_bisfailed += filt_stats[1]; + n_clipped += filt_stats[2]; + n_analysed += filt_stats[4]; // create List for R @@ -300,11 +435,9 @@ Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infile const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, - const Rcpp::IntegerVector& absIsizeMin, - const Rcpp::IntegerVector& absIsizeMax, - const Rcpp::IntegerVector& min_read_size, - const Rcpp::IntegerVector& max_read_size, - const Rcpp::CharacterVector& alignerUsed){ + const Rcpp::CharacterVector& alignerUsed, + const Rcpp::IntegerVector& min_frag_data_len, + const Rcpp::NumericVector& min_frag_data_dens){ string alignerUsed_ = Rcpp::as(alignerUsed); @@ -336,10 +469,8 @@ Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infile int min_bisC_size_ = Rcpp::as(min_bisC_size); int mapqMin_ = Rcpp::as(mapqMin); int mapqMax_ = Rcpp::as(mapqMax); - int absIsizeMin_ = Rcpp::as(absIsizeMin); - int absIsizeMax_ = Rcpp::as(absIsizeMax); - int min_read_size_ = Rcpp::as(min_read_size); - int max_read_size_ = Rcpp::as(max_read_size); + int min_frag_data_len_ = Rcpp::as(min_frag_data_len); + double min_frag_data_dens_ = Rcpp::as(min_frag_data_dens); // initialize container for reference sequence refSeqInfo refseqInfo(seqstring_, seqStart_,seqEnd_); @@ -351,18 +482,20 @@ Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infile // store min and max MAPQ pnts.mapqMin = mapqMin_; pnts.mapqMax = mapqMax_; - // store min and max tlen - pnts.absIsizeMin = absIsizeMin_; - pnts.absIsizeMax = absIsizeMax_; - // store min and max read lengths - pnts.min_read_size = min_read_size_; - pnts.max_read_size = max_read_size_; + + // store which contexts to parse + vector cont_vec; + if(max_bisC_meth_ < 1.0) // if filtering bisulfite failed reads needed we add bisC context + cont_vec = {GCH, bisC}; + else + cont_vec = {GCH}; + pnts.context_vec = &cont_vec; + + // initialize container for collecting protect statistics and counters protectStatsTable protStats; int n_fetched = 0, n_nonUnique = 0, n_bisfailed = 0, n_analysed = 0, n_clipped = 0; - - - // // for each bam file fetch data and add counts to protect stats table + // for each bam file fetch data and add counts to protect stats table for(int i = 0; i < infiles_.size(); ++i){ // initialize container for fragments in region regionData regData(regionChr_,regionStart_,regionEnd_); @@ -377,26 +510,21 @@ Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infile &pnts); n_fetched += regData.size(); - // get qnames on non-unique fragments - set nonUnfrags; - if(remove_nonunique_) - nonUnfrags = regData.getNonUniqueQnames(); - n_nonUnique += nonUnfrags.size(); - - // get qnames with failed bisulfite conversion - set failedBisConvfrags; - if(max_bisC_meth_ < 1.0) - failedBisConvfrags = regData.getFailedBisConvQnames(max_bisC_meth_,min_bisC_size_); - n_bisfailed += failedBisConvfrags.size(); - - // erase fragments which nonunique and failed bisConv - std::set remove_qnames(nonUnfrags); - remove_qnames.insert(failedBisConvfrags.begin(), failedBisConvfrags.end()); - n_analysed += regData.eraseQnames(remove_qnames); - - // clip fragments - if(clip_until_nbg_ > 0) - n_clipped += regData.clipProtectData(clip_until_nbg_,max_protect_frac_); + // post-processing of the data, i.e. filtering by uniqueness, bisulfite efficiency, clipping and removal of fragments with insufficient data + // function below returns the following stats + // {n_nonUnique,n_bisfailed,n_clipped,n_insuffFailed,n_analysed} + vector filt_stats = regData.postProcessFragData(remove_nonunique_, + clip_until_nbg_, + max_protect_frac_, + max_bisC_meth_, + min_bisC_size_, + min_frag_data_len_, + min_frag_data_dens_, + GCH); + n_nonUnique += filt_stats[0]; + n_bisfailed += filt_stats[1]; + n_clipped += filt_stats[2]; + n_analysed += filt_stats[4]; // add counts to protect stat table regData.addCountsToProtectStatsTable(protStats); diff --git a/src/functions_export_rcpp.h b/src/functions_export_rcpp.h index ff86799..0a197a9 100644 --- a/src/functions_export_rcpp.h +++ b/src/functions_export_rcpp.h @@ -25,7 +25,26 @@ using namespace std; bool _VERBOSE_ = 0; - +// [[Rcpp::export]] +Rcpp::List fetch_molec_data_list_from_bams_cpp(const Rcpp::CharacterVector& whichContext, + const Rcpp::CharacterVector& infiles, + const Rcpp::CharacterVector& regionChr, + const Rcpp::IntegerVector& regionStart, + const Rcpp::IntegerVector& regionEnd, + const Rcpp::CharacterVector& seqstring, + const Rcpp::IntegerVector& seqStart, + const Rcpp::IntegerVector& seqEnd, + const Rcpp::LogicalVector& remove_nonunique, + const Rcpp::IntegerVector& clip_until_nbg, + const Rcpp::NumericVector& max_protect_frac, + const Rcpp::NumericVector& max_bisC_meth, + const Rcpp::IntegerVector& min_bisC_size, + const Rcpp::IntegerVector& mapqMin, + const Rcpp::IntegerVector& mapqMax, + const Rcpp::CharacterVector& alignerUsed, + const Rcpp::IntegerVector& min_frag_data_len, + const Rcpp::NumericVector& min_frag_data_dens, + const Rcpp::LogicalVector& data_as_rle); // [[Rcpp::export]] Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, @@ -43,11 +62,9 @@ Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, - const Rcpp::IntegerVector& absIsizeMin, - const Rcpp::IntegerVector& absIsizeMax, - const Rcpp::IntegerVector& min_read_size, - const Rcpp::IntegerVector& max_read_size, - const Rcpp::CharacterVector& alignerUsed); + const Rcpp::CharacterVector& alignerUsed, + const Rcpp::IntegerVector& min_frag_data_len, + const Rcpp::NumericVector& min_frag_data_dens); // [[Rcpp::export]] @@ -66,11 +83,9 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, - const Rcpp::IntegerVector& absIsizeMin, - const Rcpp::IntegerVector& absIsizeMax, - const Rcpp::IntegerVector& min_read_size, - const Rcpp::IntegerVector& max_read_size, - const Rcpp::CharacterVector& alignerUsed); + const Rcpp::CharacterVector& alignerUsed, + const Rcpp::IntegerVector& min_frag_data_len, + const Rcpp::NumericVector& min_frag_data_dens); // [[Rcpp::export]] Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infiles, @@ -87,10 +102,8 @@ Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infile const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, - const Rcpp::IntegerVector& absIsizeMin, - const Rcpp::IntegerVector& absIsizeMax, - const Rcpp::IntegerVector& min_read_size, - const Rcpp::IntegerVector& max_read_size, - const Rcpp::CharacterVector& alignerUsed); + const Rcpp::CharacterVector& alignerUsed, + const Rcpp::IntegerVector& min_frag_data_len, + const Rcpp::NumericVector& min_frag_data_dens); #endif \ No newline at end of file diff --git a/src/regionData-class.cpp b/src/regionData-class.cpp index a282bae..6e97ba3 100644 --- a/src/regionData-class.cpp +++ b/src/regionData-class.cpp @@ -14,6 +14,7 @@ bool regionData::addFragContextData(const string& bampref, const string& qname, const int& fragstart, const int& fragend, + const string& fragconfig, const vector& rposvec, const vector& protectvec, const vector& isonplusstrandvec, @@ -29,6 +30,7 @@ bool regionData::addFragContextData(const string& bampref, frg.addData(bampref, fragstart, fragend, + fragconfig, rposvec, protectvec, isonplusstrandvec, @@ -41,6 +43,7 @@ bool regionData::addFragContextData(const string& bampref, fDataMap[qname].addData(bampref, fragstart, fragend, + fragconfig, rposvec, protectvec, isonplusstrandvec, @@ -113,7 +116,7 @@ set regionData::getNonUniqueQnames(){ }; // get set of qnames with failed bisulfite conversion -set regionData::getFailedBisConvQnames(double max_bisC_meth,int min_bisC_size){ +set regionData::getFailedBisConvQnames(const double& max_bisC_meth,const int& min_bisC_size){ set failed_qnames; for(unordered_map::iterator it = fDataMap.begin(); it != fDataMap.end();++it){ @@ -132,6 +135,99 @@ int regionData::eraseQnames(const set& qnames){ return(fDataMap.size()); }; + +int regionData::eraseEmptyFrags(fragMapType whichMap){ + for(unordered_map::iterator it = fDataMap.begin(); it != fDataMap.end();){ + if((it->second).isEmpty(whichMap)){ + it = fDataMap.erase(it); + } else{ + ++it; + } + } + return(fDataMap.size()); +}; + +set regionData::getInsuffDataQnames(const int& min_frag_data_len, + const double& min_frag_data_dens, + const fragMapType& whichMap){ + set failed_qnames; + + for(unordered_map::iterator it = fDataMap.begin(); it != fDataMap.end();++it){ + std::vector dataPosNpoints = (it->second).getFragDataStartEndNpoints(whichMap); + + int fragDataStart = dataPosNpoints[0]; + int fragDataEnd = dataPosNpoints[1]; + int npoints = dataPosNpoints[2]; + int fragDataLen = fragDataEnd - fragDataStart + 1; + + if(fragDataLen == 0 || npoints == 0){ + failed_qnames.insert(it->first); + continue; + } + + double fragDataDens = static_cast(npoints) / fragDataLen; + + if(fragDataLen < min_frag_data_len || fragDataDens < min_frag_data_dens) + failed_qnames.insert(it->first); + } + return(failed_qnames); +}; + + +vector regionData::postProcessFragData(const bool& remove_nonunique, // remove non-unique? + const int& clip_until_nbg, // clipping partial footprints at the edges + const double& max_protect_frac, // + const double& max_bisC_meth, + const int& min_bisC_size, + const int& min_frag_data_len, + const double& min_frag_data_dens, + const fragMapType& whichMap){ + ///// 1. filtering non-unique fragments ///// + // get qnames on non-unique fragments + set nonUnfrags; + if(remove_nonunique) + nonUnfrags = getNonUniqueQnames(); + int n_nonUnique = nonUnfrags.size(); + + + ///// 2. filtering fragments with failed bisulfite conversion, i.e. too high bisC methylation ///// + // get qnames with failed bisulfite conversion + set failedBisConvfrags; + if(max_bisC_meth < 1.0) + failedBisConvfrags = getFailedBisConvQnames(max_bisC_meth,min_bisC_size); + int n_bisfailed = failedBisConvfrags.size(); + + // erase fragments which are nonunique or failed bisConv + set remove_qnames(nonUnfrags); + remove_qnames.insert(failedBisConvfrags.begin(), failedBisConvfrags.end()); + eraseQnames(remove_qnames); + // empty the set with qnames + remove_qnames.erase(remove_qnames.begin(),remove_qnames.end()); + + ///// 3. clip fragments ///// + int n_clipped = 0; + if(clip_until_nbg > 0){ + n_clipped = clipProtectData(clip_until_nbg,max_protect_frac); + // // remove fragments which were erased by clipping + // eraseEmptyFrags(fragMapType whichMap); + } + + //// 4. filtering fragments by data length, i.e. genomic distance from most left and most right informative (containing data) positions + set insuffDatafrags = getInsuffDataQnames(min_frag_data_len, + min_frag_data_dens, + whichMap); + int n_insuffFailed = insuffDatafrags.size(); + + // erase fragments which contain insufficient data + int n_analysed = eraseQnames(insuffDatafrags); + + std::vector vec_out={n_nonUnique,n_bisfailed,n_clipped,n_insuffFailed,n_analysed}; + return(vec_out); +}; + + + + int regionData::size(){ return(fDataMap.size()); }; @@ -186,3 +282,115 @@ Rcpp::IntegerMatrix regionData::getDataMatrix(fragMapType whichMap){ return(matr); }; + + +Rcpp::List regionData::getFragDataList(fragMapType whichMap){ + // start (genomic position of the first data point. note, not the alignment start!) + // end last position of the data points + // fragConfigs configurations of fragments + // qname - name of the fragment + // fragData_list list which contains vectors with 0 (accessible), 1(protected) and NA, non-informative + + + Rcpp::IntegerVector fragStarts; // start positions are 1-based on reference + Rcpp::IntegerVector fragEnds; // start positions are 1-based on reference + + Rcpp::IntegerVector nDataPoints; // number of informative points + Rcpp::IntegerVector startDataPoints; // data points at start positions + Rcpp::IntegerVector endDataPoints; // data points at end positions + + Rcpp::CharacterVector fragConfigs; // configurations of configs + + Rcpp::CharacterVector fragQnames; // fragment names + + Rcpp::List fragData_list; + + + for(unordered_map::iterator it=fDataMap.begin(); it != fDataMap.end(); ++it){ + // get number of informative points in a fragment + Rcpp::List fragdat = (it->second).getFragDataVector(whichMap); + int npoints = Rcpp::as(fragdat["nDataPoints"]); + + int fragDataStart = Rcpp::as(fragdat["fragDataStart"]); + int fragDataEnd = Rcpp::as(fragdat["fragDataEnd"]); + + int fragDataLen = fragDataEnd - fragDataStart + 1; + + fragQnames.push_back(it->first); + fragStarts.push_back(fragDataStart + 1); // +1 is to convert to 1-based positions + fragEnds.push_back(fragDataEnd + 1); + fragConfigs.push_back((it->second).getFragConfig()); + nDataPoints.push_back(fragdat["nDataPoints"]); + startDataPoints.push_back(fragdat["startData"]); + endDataPoints.push_back(fragdat["endData"]); + fragData_list.push_back(fragdat["fragDataVec"]); + } + + Rcpp::List data_out = Rcpp::List::create(Rcpp::Named("start") = fragStarts, + Rcpp::Named("end") = fragEnds, + Rcpp::Named("fragConfigs") = fragConfigs, + Rcpp::Named("qnames") = fragQnames, + Rcpp::Named("nDataPoints") = nDataPoints, + Rcpp::Named("startDataPoint") = startDataPoints, + Rcpp::Named("endDataPoint") = endDataPoints, + Rcpp::Named("fragData_list") = fragData_list); + return(data_out); +}; + + +Rcpp::List regionData::getFragDataRle(fragMapType whichMap){ + // start (genomic position of the first data point. note, not the alignment start!) + // end last position of the data points + // fragConfigs configurations of fragments + // qname - name of the fragment + // fragData_list list which contains Rle compressed vectors with 0 (accessible), 1(protected) and NA, non-informative + + + Rcpp::IntegerVector fragStarts; // start positions are 1-based on reference + Rcpp::IntegerVector fragEnds; // start positions are 1-based on reference + + Rcpp::IntegerVector nDataPoints; // number of informative points + Rcpp::IntegerVector startDataPoints; // data points at start positions + Rcpp::IntegerVector endDataPoints; // data points at end positions + + Rcpp::CharacterVector fragConfigs; // configurations of configs + + Rcpp::CharacterVector fragQnames; // fragment names + + Rcpp::List fragData_list; + + + + for(unordered_map::iterator it=fDataMap.begin(); it != fDataMap.end(); ++it){ + + Rcpp::List fragdat = (it->second).getFragDataRle(whichMap); + int npoints = Rcpp::as(fragdat["nDataPoints"]); + + int fragDataStart = Rcpp::as(fragdat["fragDataStart"]); + int fragDataEnd = Rcpp::as(fragdat["fragDataEnd"]); + + int fragDataLen = fragDataEnd - fragDataStart + 1; + + fragQnames.push_back(it->first); + fragStarts.push_back(fragDataStart + 1); // +1 is to convert to 1-based positions + fragEnds.push_back(fragDataEnd + 1); + fragConfigs.push_back((it->second).getFragConfig()); + nDataPoints.push_back(fragdat["nDataPoints"]); + startDataPoints.push_back(fragdat["startData"]); + endDataPoints.push_back(fragdat["endData"]); + + fragData_list.push_back(fragdat["rle_data"]); + } + + Rcpp::List data_out = Rcpp::List::create(Rcpp::Named("start") = fragStarts, + Rcpp::Named("end") = fragEnds, + Rcpp::Named("fragConfigs") = fragConfigs, + Rcpp::Named("qnames") = fragQnames, + Rcpp::Named("nDataPoints") = nDataPoints, + Rcpp::Named("startDataPoint") = startDataPoints, + Rcpp::Named("endDataPoint") = endDataPoints, + Rcpp::Named("fragData_list") = fragData_list); + return(data_out); +}; + + diff --git a/src/regionData-class.h b/src/regionData-class.h index 4d1e944..b576cc3 100644 --- a/src/regionData-class.h +++ b/src/regionData-class.h @@ -20,19 +20,23 @@ class regionData{ int regStart; int regEnd; - unordered_map fDataMap; // map: qname -> fragment + unordered_map fDataMap; // map: qname (string) -> fragment (class fragData) public: + + ///// constructors/destructors ///// regionData(); regionData(const string& regchr, const int& regstart, const int& regend); ~regionData(); + ///// functions for adding new data ///// bool addFragContextData(const string& bampref, const string& qname, const int& fragstart, const int& fragend, + const string& fragconfig, const vector& rposvec, const vector& protectvec, const vector& isonplusstrandvec, @@ -42,13 +46,14 @@ class regionData{ void print(); - + ///// functions for collecting statistics ///// void addCountsToCoocTable(coocCntTable& cTable); void addCountsToProtectStatsTable(protectStatsTable& pStatsTable); + ///// functions for post-processing data ///// // clip fragments to get rid of partial footprints int clipProtectData(const int& clip_until_nbg, // clips from both ends until this number of 0 is met const double& max_protect_frac); // keeps fragment if percentage of 1s in a fragment is above this number) @@ -59,17 +64,44 @@ class regionData{ set getNonUniqueQnames(); // get set of qnames with failed bisulfite conversion - set getFailedBisConvQnames(double max_bisC_meth,int min_bisC_size); + set getFailedBisConvQnames(const double& max_bisC_meth, + const int& min_bisC_size); // remove fragments from the map. returns new size of the map int eraseQnames(const set& qnames); + // remove fragments which contain NO data. used after clipping + int eraseEmptyFrags(fragMapType whichMap); + + // get set of qnames whose data length or data density is too short + set getInsuffDataQnames(const int& min_frag_data_len, + const double& min_frag_data_dens, + const fragMapType& whichMap); + + + vector postProcessFragData(const bool& remove_nonunique, + const int& clip_until_nbg, + const double& max_protect_frac, + const double& max_bisC_meth, + const int& min_bisC_size, + const int& min_frag_data_len, + const double& min_frag_data_dens, + const fragMapType& whichMap); + + + int size(); // convert data in region into a Matrix and return Rcpp::IntegerMatrix getDataMatrix(fragMapType whichMap); + // get List with fetched reads/fragments for R + Rcpp::List getFragDataList(fragMapType whichMap); + + // get List with fetched reads/fragments for R with Rle compressed data vectors + Rcpp::List getFragDataRle(fragMapType whichMap); + }; diff --git a/tests/testthat/Bismark_expected_methylation_data_read_list.rds b/tests/testthat/Bismark_expected_methylation_data_read_list.rds new file mode 100644 index 0000000..d68744e Binary files /dev/null and b/tests/testthat/Bismark_expected_methylation_data_read_list.rds differ diff --git a/tests/testthat/QuasR_expected_methylation_reads_list.rds b/tests/testthat/QuasR_expected_methylation_reads_list.rds new file mode 100644 index 0000000..de4fb43 Binary files /dev/null and b/tests/testthat/QuasR_expected_methylation_reads_list.rds differ diff --git a/tests/testthat/test-get_ctable_from_bams.R b/tests/testthat/test-get_ctable_from_bams.R index e7053ce..9c2936a 100644 --- a/tests/testthat/test-get_ctable_from_bams.R +++ b/tests/testthat/test-get_ctable_from_bams.R @@ -1,6 +1,6 @@ test_that("get_ctable_from_bams works for QuasR bam file", { #library(GenomicRanges) - + #browser() quasR_coocc_matrix <- get_ctable_from_bams(bamfiles = "QuasR_test.bam", samplenames = "quasr_pe", @@ -9,6 +9,8 @@ test_that("get_ctable_from_bams works for QuasR bam file", { IRanges::IRanges(start = 200,end = 500)), genome = "random_genome_700bp.fa", alignerUsed = "QuasR", + min_frag_data_len = 0L, + min_frag_data_dens = 0.0, max_spacing = 300, remove_nonunique = F, clip_until_nbg = 0L, @@ -25,16 +27,18 @@ test_that("get_ctable_from_bams works for Bismark bam file", { #library(GenomicRanges) bismark_coocc_matrix <- get_ctable_from_bams(bamfiles = "Bismark_test.bam", - samplenames = "bismark_pe", - regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", - strand = "+", - IRanges::IRanges(start = 200,end = 500)), - genome = "random_genome_700bp.fa", - alignerUsed = "Bismark", - max_spacing = 300, - remove_nonunique = F, - clip_until_nbg = 0L, - max_bisC_meth = 1) + samplenames = "bismark_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + alignerUsed = "Bismark", + min_frag_data_len = 0L, + min_frag_data_dens = 0.0, + max_spacing = 300, + remove_nonunique = F, + clip_until_nbg = 0L, + max_bisC_meth = 1) expect_coocc_mat <- readRDS("QuasR_expected_coocc_mat.rds") diff --git a/tests/testthat/test-get_data_matrix_from_bams.R b/tests/testthat/test-get_data_matrix_from_bams.R index 68956d4..3711099 100644 --- a/tests/testthat/test-get_data_matrix_from_bams.R +++ b/tests/testthat/test-get_data_matrix_from_bams.R @@ -1,6 +1,6 @@ test_that("get_data_matrix_from_bams works for QuasR bam file", { #library(GenomicRanges) - + #browser() quasR_methmat <- get_data_matrix_from_bams(bamfiles = "QuasR_test.bam", samplenames = "quasr_pe", regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", @@ -24,7 +24,7 @@ test_that("get_data_matrix_from_bams works for QuasR bam file", { expect_true(all(quasR_methmat$allC_DataMatrix[[1]] == quasr_expected_methmat)) - #### check if filtering by tlen works #### + # #### check if filtering by data vector length works #### quasR_methmat <- get_data_matrix_from_bams(bamfiles = "QuasR_test.bam", samplenames = "quasr_pe", regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", @@ -35,9 +35,9 @@ test_that("get_data_matrix_from_bams works for QuasR bam file", { remove_nonunique = F, clip_until_nbg = 0L, max_bisC_meth = 1, - absIsizeMin = 500) + min_frag_data_len = 500L) - ## test whether 0 reads are fetched + # ## test whether 0 reads are fetched expect_true(nrow(quasR_methmat$allC_DataMatrix[[1]]) == 0) quasR_methmat <- get_data_matrix_from_bams(bamfiles = "QuasR_test.bam", samplenames = "quasr_pe", @@ -49,43 +49,19 @@ test_that("get_data_matrix_from_bams works for QuasR bam file", { remove_nonunique = F, clip_until_nbg = 0L, max_bisC_meth = 1, - absIsizeMax = 50) + min_frag_data_dens = 1.0) expect_true(nrow(quasR_methmat$allC_DataMatrix[[1]]) == 0) - #### check if filtering by read length works - quasR_methmat <- get_data_matrix_from_bams(bamfiles = "QuasR_test.bam", - samplenames = "quasr_pe", - regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", - strand = "+", - IRanges::IRanges(start = 200,end = 500)), - genome = "random_genome_700bp.fa", - whichContext ="allC", - remove_nonunique = F, - clip_until_nbg = 0L, - max_bisC_meth = 1, - min_read_size = 400) - expect_true(nrow(quasR_methmat$allC_DataMatrix[[1]]) == 0) - quasR_methmat <- get_data_matrix_from_bams(bamfiles = "QuasR_test.bam", - samplenames = "quasr_pe", - regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", - strand = "+", - IRanges::IRanges(start = 200,end = 500)), - genome = "random_genome_700bp.fa", - whichContext ="allC", - remove_nonunique = F, - clip_until_nbg = 0L, - max_bisC_meth = 1, - max_read_size = 50) - expect_true(nrow(quasR_methmat$allC_DataMatrix[[1]]) == 0) + }) test_that("get_data_matrix_from_bams works for Bismark bam file with indels and soft clipping", { - library(GenomicRanges) - ## coordinates of the test fragment - frag_coord_OT_gr <- GRanges(seqnames = "random_genome_700bp", - strand = "+", - IRanges(start = 200,end = 500)) +# library(GenomicRanges) +# ## coordinates of the test fragment +# frag_coord_OT_gr <- GRanges(seqnames = "random_genome_700bp", +# strand = "+", +# IRanges(start = 200,end = 500)) bismark_methmat <- get_data_matrix_from_bams(bamfiles = "Bismark_test.bam", samplenames = "bismark_pe", @@ -110,7 +86,7 @@ test_that("get_data_matrix_from_bams works for Bismark bam file with indels and ## test that identical expect_true(all(bismark_methmat$allC_DataMatrix[[1]] == bismark_expected_methmat)) - #### test if filtering by tlen works #### + # #### test if filtering by data vector length works #### bismark_methmat <- get_data_matrix_from_bams(bamfiles = "Bismark_test.bam", samplenames = "bismark_pe", regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", @@ -120,7 +96,7 @@ test_that("get_data_matrix_from_bams works for Bismark bam file with indels and whichContext ="allC", remove_nonunique = F, clip_until_nbg = 0L, - absIsizeMin = 500) + min_frag_data_len = 500L) expect_true(nrow(bismark_methmat$allC_DataMatrix[[1]]) == 0) bismark_methmat <- get_data_matrix_from_bams(bamfiles = "Bismark_test.bam", samplenames = "bismark_pe", @@ -131,32 +107,10 @@ test_that("get_data_matrix_from_bams works for Bismark bam file with indels and whichContext ="allC", remove_nonunique = F, clip_until_nbg = 0L, - absIsizeMax = 50) + min_frag_data_dens = 1.0 + ) expect_true(nrow(bismark_methmat$allC_DataMatrix[[1]]) == 0) - #### test if filtering by read length works #### - bismark_methmat <- get_data_matrix_from_bams(bamfiles = "Bismark_test.bam", - samplenames = "bismark_pe", - regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", - strand = "+", - IRanges::IRanges(start = 200,end = 500)), - genome = "random_genome_700bp.fa", - whichContext ="allC", - remove_nonunique = F, - clip_until_nbg = 0L, - min_read_size = 500) - expect_true(nrow(bismark_methmat$allC_DataMatrix[[1]]) == 0) - bismark_methmat <- get_data_matrix_from_bams(bamfiles = "Bismark_test.bam", - samplenames = "bismark_pe", - regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", - strand = "+", - IRanges::IRanges(start = 200,end = 500)), - genome = "random_genome_700bp.fa", - whichContext ="allC", - remove_nonunique = F, - clip_until_nbg = 0L, - max_read_size = 50) - expect_true(nrow(bismark_methmat$allC_DataMatrix[[1]]) == 0) }) \ No newline at end of file diff --git a/tests/testthat/test-get_molecule_data_list_from_bams.R b/tests/testthat/test-get_molecule_data_list_from_bams.R new file mode 100644 index 0000000..1644b25 --- /dev/null +++ b/tests/testthat/test-get_molecule_data_list_from_bams.R @@ -0,0 +1,123 @@ +test_that("get_molecule_data_list_from_bams works for QuasR bam file", { + quasR_methlist <- get_molecule_data_list_from_bams(bamfiles = "QuasR_test.bam", + samplenames = "quasr_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="allC", + remove_nonunique = F, + clip_until_nbg = 0L, + max_bisC_meth = 1, + min_frag_data_len = 50L, + min_frag_data_dens = 0.05) + + roword <- paste0("f1:test_case_",c("OT","CTOT","OB","CTOB")) + quasR_methlist <- quasR_methlist[match(roword,quasR_methlist$qname),] + + rdat <- lapply(quasR_methlist$allC_fragData_list, + function(x){ + x[is.na(x)] <- 2 + return(x) + }) + + + ## load expected data + dat_reads <- readRDS("QuasR_expected_methylation_reads_list.rds") + ## test if all data points equal expected values + expect_true(all(do.call(c,lapply(1:length(rdat),function(i){ + rdat[[i]] == dat_reads[[i]] + })))) + ## test if all starts equal expected values + expect_true(all(quasR_methlist$start == c(207,207,205,205))) + expect_true(all(quasR_methlist$end == c(498,498,497,497))) + + + ## test if returned Rle encoded data is as expected + data_vec <- get_molecule_data_list_from_bams(bamfiles = "QuasR_test.bam", + samplenames = "quasr_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="GCH", + remove_nonunique = F, + clip_until_nbg = 0L, + max_bisC_meth = 1, + min_frag_data_len = 50L, + min_frag_data_dens = 0.05, + data_as_Rle = FALSE) + data_rle <- get_molecule_data_list_from_bams(bamfiles = "QuasR_test.bam", + samplenames = "quasr_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="GCH", + remove_nonunique = F, + clip_until_nbg = 0L, + max_bisC_meth = 1, + min_frag_data_len = 50L, + min_frag_data_dens = 0.05, + data_as_Rle = TRUE) + + data_vec <- data_vec[match(data_rle$qname,data_vec$qname),] + + + + rdat_vec <- lapply(data_vec$GCH_fragData_list, + function(x){ + x[is.na(x)] <- 2 + return(x) + }) + rdat_rle <- lapply(data_rle$GCH_fragData_list, + function(x){ + x <- as.vector(x) + x[is.na(x)] <- 2 + return(x) + }) + expect_true(all(do.call(c,lapply(1:length(rdat_vec),function(i){ + rdat_vec[[i]] == rdat_rle[[i]] + })))) + +}) + + +test_that("get_molecule_data_list_from_bams works for Bismark bam file with indels and soft clipping", { + + bismark_methlist <- get_molecule_data_list_from_bams(bamfiles = "Bismark_test.bam", + samplenames = "bismark_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="allC", + remove_nonunique = F, + clip_until_nbg = 0L, + max_bisC_meth = 1) + ## substitue NA to 2 + roword <- paste0("f1:test_case_",c("OT","OT_I_D","CTOT","CTOT_I_D","OB","OB_I_D", "CTOB","CTOB_I_D")) + bismark_methlist <- bismark_methlist[match(roword,bismark_methlist$qname),] + rdat <- lapply(bismark_methlist$allC_fragData_list, + function(x){ + x[is.na(x)] <- 2 + return(x) + }) + + ## load expected data + dat_reads <- readRDS("Bismark_expected_methylation_data_read_list.rds") + + ## test if all data points equal expected values + expect_true(all(do.call(c,lapply(1:length(rdat),function(i){ + rdat[[i]] == dat_reads[[i]] + })))) + + ## test if all starts and ends equal expected values + expect_true(all(bismark_methlist$start == c(207,207,207,207,205,205,205,205))) + expect_true(all(bismark_methlist$end == c(498,498,498,498,497,497,497,497))) + + + +}) + + diff --git a/tests/testthat/test-get_protect_stats_from_bams.R b/tests/testthat/test-get_protect_stats_from_bams.R index 63250b1..f9ba4ae 100644 --- a/tests/testthat/test-get_protect_stats_from_bams.R +++ b/tests/testthat/test-get_protect_stats_from_bams.R @@ -1,5 +1,6 @@ test_that("get_protect_stats_from_bams works for QuasR bam file", { #library(GenomicRanges) + quasr_protect_stats <- get_protect_stats_from_bams(bamfiles = "QuasR_test.bam", samplenames = "quasr_pe", regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", @@ -7,6 +8,8 @@ test_that("get_protect_stats_from_bams works for QuasR bam file", { IRanges::IRanges(start = 200,end = 500)), genome = "random_genome_700bp.fa", alignerUsed = "QuasR", + min_frag_data_len = 0L, + min_frag_data_dens = 0.0, remove_nonunique = F, clip_until_nbg = 0L, max_bisC_meth = 1) @@ -29,6 +32,8 @@ test_that("get_protect_stats_from_bams works for Bismark bam file", { IRanges::IRanges(start = 200,end = 500)), genome = "random_genome_700bp.fa", alignerUsed = "Bismark", + min_frag_data_len = 0L, + min_frag_data_dens = 0.0, remove_nonunique = F, clip_until_nbg = 0L, max_bisC_meth = 1) diff --git a/tests/testthat/test_esc_SampleFile.rds b/tests/testthat/test_esc_SampleFile.rds new file mode 100644 index 0000000..2ac70f8 Binary files /dev/null and b/tests/testthat/test_esc_SampleFile.rds differ diff --git a/tests/testthat/test_regions_gr.rds b/tests/testthat/test_regions_gr.rds new file mode 100644 index 0000000..fb5af8b Binary files /dev/null and b/tests/testthat/test_regions_gr.rds differ