Skip to content

Commit

Permalink
add function for fetching data for each molecule. add filtering by de…
Browse files Browse the repository at this point in the history
…nsity of informative positions and length.
  • Loading branch information
EvgOz committed Oct 28, 2024
1 parent c5595a4 commit 28b5bd8
Show file tree
Hide file tree
Showing 28 changed files with 1,882 additions and 818 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: fetchNOMe
Type: Package
Title: Package for fast and convenient retrieval of NOMe-seq data from BAM files
Version: 0.4
Version: 0.5
Date: 2024-05-09
Authors@R: person(given = "Evgeniy A.", family="Ozonov", email = "evgeniy.ozonov@fmi.ch", role = c("aut", "cre"))
Description: This R package is tailored for fast and convenient retrieval of NOMe-seq data from BAM files aligned with QuasR, Bismark and BISCUIT.
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ useDynLib(fetchNOMe, .registration=TRUE)
importFrom(Rcpp, evalCpp)
export(get_data_matrix_from_bams)
export(get_ctable_from_bams)
export(get_protect_stats_from_bams)
export(get_protect_stats_from_bams)
export(get_molecule_data_list_from_bams)
16 changes: 10 additions & 6 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

fetch_cooc_ctable_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) {
.Call(`_fetchNOMe_fetch_cooc_ctable_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)
fetch_molec_data_list_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens, data_as_rle) {
.Call(`_fetchNOMe_fetch_molec_data_list_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens, data_as_rle)
}

fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) {
.Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)
fetch_cooc_ctable_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens) {
.Call(`_fetchNOMe_fetch_cooc_ctable_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens)
}

fetch_protect_stats_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) {
.Call(`_fetchNOMe_fetch_protect_stats_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)
fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens) {
.Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens)
}

fetch_protect_stats_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens) {
.Call(`_fetchNOMe_fetch_protect_stats_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed, min_frag_data_len, min_frag_data_dens)
}

37 changes: 11 additions & 26 deletions R/get_ctable_from_bams.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#' @param regions \linkS4class{GRanges} object for regions of interest.
#' @param genome \code{character} path to fasta file which was used as reference for alignment.
#' @param alignerUsed \code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISCUIT.
#' @param min_frag_data_len \code{integer} ignore fragments that have genomic lengths from most-left to most-right data points less than \code{min_frag_data_len}.
#' @param min_frag_data_dens \code{numeric} ignore fragments that have density of data-containing positions lower than \code{min_frag_data_dens}.
#' @param collapseBySample \code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}.
#' @param max_spacing maximum spacing length for which to collect co-occurrence counts.
#' @param remove_nonunique \code{logical} if \code{TRUE} only unique fragments are analyzed. Uniqueness is defined by fragments start, end and methylation states of all cytosines.
Expand All @@ -22,12 +24,8 @@
#' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency.
#' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis.
#' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis.
#' @param absIsizeMin \code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included.
#' @param absIsizeMax \code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included.
#' @param min_read_size \code{integer}. minimum read length to be included.
#' @param max_read_size \code{integer}. maximum read length to be included.

#' @param max_read_size \code{integer}. maximum read length which is expected in the data. This parameter controls only extension of regions for extracting reference sequence.
#' @param ncores number of cores to use.
#'
#' @return \code{tibble} where each row corresponds to sample - region combination.
Expand Down Expand Up @@ -55,6 +53,8 @@ get_ctable_from_bams <- function(bamfiles,
regions,
genome,
alignerUsed = c("QuasR","Bismark","BISCUIT"),
min_frag_data_len = 50L,
min_frag_data_dens = 0.05,
collapseBySample = TRUE,
max_spacing = 300,
remove_nonunique = TRUE,
Expand All @@ -64,9 +64,6 @@ get_ctable_from_bams <- function(bamfiles,
min_bisC_size = 10,
mapqMin = 0,
mapqMax = 255,
absIsizeMin = NULL,
absIsizeMax = NULL,
min_read_size = 25L,
max_read_size = 1000L,
ncores = 1L){

Expand All @@ -93,28 +90,17 @@ get_ctable_from_bams <- function(bamfiles,

alignerUsed <- match.arg(alignerUsed)

if (is.null(absIsizeMin)) # no tlen filtering
absIsizeMin <- -1L
if (is.null(absIsizeMax))
absIsizeMax <- -1L



message(paste0("Collecting co-occurrence frequencies from BAM files generated by ",alignerUsed,"."))


## load refsequences for regions
## extend regions from left and right
if(absIsizeMax > 0)
ext_len <- absIsizeMax
else
ext_len <- max_read_size

fa_index <- Rsamtools::scanFaIndex(genome)
GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index)
GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)),
seqlengths = GenomicRanges::width(fa_index)))
refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*ext_len,fix="center"))
refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center"))

## trim to remove parts out of chromosomes
refseq_gr <- GenomicRanges::trim(refseq_gr)
Expand Down Expand Up @@ -150,11 +136,10 @@ get_ctable_from_bams <- function(bamfiles,
min_bisC_size = min_bisC_size,
mapqMin = mapqMin,
mapqMax = mapqMax,
absIsizeMin = absIsizeMin,
absIsizeMax = absIsizeMax,
min_read_size = min_read_size,
max_read_size = max_read_size,
alignerUsed = alignerUsed)
alignerUsed = alignerUsed,
min_frag_data_len = min_frag_data_len,
min_frag_data_dens = min_frag_data_dens
)


#coocmatr <- outlist[["CoocCountTable"]]
Expand Down
33 changes: 9 additions & 24 deletions R/get_data_matrix_from_bams.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#' \item{"allC"}{methylation states of all cytosines combined.}
#' }
#' @param alignerUsed \code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISCUIT.
#' @param min_frag_data_len \code{integer} ignore fragments that have genomic lengths from most-left to most-right data points less than \code{min_frag_data_len}.
#' @param min_frag_data_dens \code{numeric} ignore fragments that have density of data-containing positions lower than \code{min_frag_data_dens}.
#' @param collapseBySample \code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}.
#' @param remove_nonunique \code{logical} if \code{TRUE} only unique fragments are analyzed. Uniqueness is defined by fragments start, end and methylation states of all cytosines.
#' @param clip_until_nbg \code{integer} controlling clipping partial footprints at both ends of fragments. Namely, protected states are omitted from each end until \code{clip_until_nbg} unprotected states are met.
Expand All @@ -27,12 +29,7 @@
#' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency.
#' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis.
#' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis.
#' @param absIsizeMin \code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included.
#' @param absIsizeMax \code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included.
#' @param min_read_size \code{integer}. minimum read length to be included.
#' @param max_read_size \code{integer}. maximum read length to be included.
#' @param max_read_size \code{integer}. maximum read length which is expected in the data. This parameter controls only extension of regions for extracting reference sequence.
#' @param ncores number of cores to use.
#' @return \code{tibble} where each row corresponds to sample - region combination.
#' Columns of the returned \code{tibble} represent:
Expand All @@ -59,6 +56,8 @@ get_data_matrix_from_bams <- function(bamfiles,
genome,
whichContext = c("GCH","WCG","bisC","otherC", "allC"),
alignerUsed = c("QuasR","Bismark","BISCUIT"),
min_frag_data_len = 50L,
min_frag_data_dens = 0.05,
collapseBySample = TRUE,
remove_nonunique = TRUE,
clip_until_nbg = 1L,
Expand All @@ -67,9 +66,6 @@ get_data_matrix_from_bams <- function(bamfiles,
min_bisC_size = 10,
mapqMin = 0,
mapqMax = 255,
absIsizeMin = NULL,
absIsizeMax = NULL,
min_read_size = 25L,
max_read_size = 1000L,
ncores = 1L){
if(!inherits(bamfiles,"character"))
Expand All @@ -96,11 +92,6 @@ get_data_matrix_from_bams <- function(bamfiles,

alignerUsed <- match.arg(alignerUsed)

if (is.null(absIsizeMin)) # no tlen filtering
absIsizeMin <- -1L
if (is.null(absIsizeMax))
absIsizeMax <- -1L

message(paste0("Fetching data from BAM files generated by ",alignerUsed,"."))


Expand All @@ -110,15 +101,11 @@ get_data_matrix_from_bams <- function(bamfiles,
## load refsequences for regions

## extend regions from left and right
if(absIsizeMax > 0)
ext_len <- absIsizeMax
else
ext_len <- max_read_size
fa_index <- Rsamtools::scanFaIndex(genome)
GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index)
GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)),
seqlengths = GenomicRanges::width(fa_index)))
refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*ext_len,fix="center"))
refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center"))

## trim to remove parts out of chromosomes
refseq_gr <- GenomicRanges::trim(refseq_gr)
Expand Down Expand Up @@ -156,11 +143,9 @@ get_data_matrix_from_bams <- function(bamfiles,
min_bisC_size = min_bisC_size,
mapqMin = mapqMin,
mapqMax = mapqMax,
absIsizeMin = absIsizeMin,
absIsizeMax = absIsizeMax,
min_read_size = min_read_size,
max_read_size = max_read_size,
alignerUsed = alignerUsed)
alignerUsed = alignerUsed,
min_frag_data_len = min_frag_data_len,
min_frag_data_dens = min_frag_data_dens)

data_mat <- outlist[["DataMatrix"]]
# invert rows if strand is -
Expand Down
Loading

0 comments on commit 28b5bd8

Please sign in to comment.