Skip to content

Commit

Permalink
feat: compute coverage from pileups
Browse files Browse the repository at this point in the history
  • Loading branch information
jlanga committed Apr 19, 2024
1 parent 49a3f35 commit 126895f
Show file tree
Hide file tree
Showing 9 changed files with 62 additions and 13 deletions.
10 changes: 7 additions & 3 deletions config/params.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
# Execution params
---

picard_markduplicates:
memory_gb: 1
preprocess:
markduplicates:
memory_gb: 1
# coverage:
# memory: 1G # Raise this to 10-20G if you have a large genome
# mer_len: 21
# ploidy: 1

popoolation:
find_indels:
Expand Down
3 changes: 2 additions & 1 deletion workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@ samples = pd.read_table("config/samples.tsv", comment="#", dtype="str")


# Other variables
POPULATIONS = samples["population"].drop_duplicates().values.tolist()
POPULATION_LIBRARY = (
samples[["population", "library"]].drop_duplicates().values.tolist()
)
POPULATIONS = list(set(population for population, library in POPULATION_LIBRARY))

PAIRS = ["pe_pe", "pe_se"]
CHROMOSOMES = features["chromosomes"].split(" ")
ENDS = "1 2 u".split(" ")
Expand Down
1 change: 1 addition & 0 deletions workflow/rules/folders.smk
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ PRE_RMDUP = PRE / "rmdup"
PRE_FILT = PRE / "filtered"
PRE_SPLIT = PRE / "split"
PRE_MPILEUP = PRE / "mpileup"
PRE_COV = PRE / "coverage"

# popoolation - sample-wise
POP1 = RESULTS / "popoolation1"
Expand Down
2 changes: 2 additions & 0 deletions workflow/rules/preprocess/__environment__.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ dependencies:
- pigz =2.8
- python =3.12.3
- samtools =1.20
- kmer-jellyfish =2
- genomescope2 =2
13 changes: 12 additions & 1 deletion workflow/rules/preprocess/__functions__.smk
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,18 @@ def get_library_files_from_sample(wildcards):
population = wildcards.population
libraries = samples[samples["population"] == population]["library"].values.tolist()
files = [
PRE_SPLIT / f"{population}.{library}" / f"{chromosome}.cram"
PRE_SPLIT / f"{population}.{library}" / f"{chromosome}.bam"
for library in libraries
]
return files


# coverage
def get_files_for_jellyfish(wildcards):
population = wildcards.population
files = [
READS / f"{population}.{library}_{end}.fq.gz"
for library in samples[samples["population"] == population]["library"]
for end in ["1", "2"]
]
return files
2 changes: 2 additions & 0 deletions workflow/rules/preprocess/__main__.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ include: "__functions__.smk"
include: "index.smk"
include: "map.smk"
include: "mpileup.smk"
include: "coverage.smk"


rule preprocess:
input:
rules.preprocess__mpileup.input,
rules.preprocess__coverage.input,
28 changes: 28 additions & 0 deletions workflow/rules/preprocess/coverage.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
rule preprocess__coverage__compute_hist__:
input:
mpileups=lambda w: [
PRE_MPILEUP / w.population / f"{chromosome}.mpileup.gz"
for chromosome in CHROMOSOMES
],
output:
hist=PRE_COV / "{population}.hist",
log:
PRE_COV / "{population}.hist.log",
conda:
"__environment__.yml"
params:
population=lambda w: w.population,
shell:
"""
( gzip -dc {input.mpileups} \
| awk
'{{hist[$4]++}} END {{for (i in hist) print "{params.population}" "\\t" i "\\t" hist[i]}}' \
| sort -n -k2,2 \
> {output.hist} \
) 2> {log}
"""


rule preprocess__coverage:
input:
[PRE_COV / f"{population}.hist" for population in POPULATIONS],
12 changes: 6 additions & 6 deletions workflow/rules/preprocess/map.smk
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ rule preprocess__map__rmdup__:
PRE_RMDUP / "{population}.{library}.log",
conda:
"__environment__.yml"
resources:
memory_gb=params["preprocess"]["markduplicates"]["memory_gb"],
shell:
"""
gatk MarkDuplicates \
gatk --java-options "-Xmx{resources.memory_gb}g" MarkDuplicates \
--INPUT {input.cram} \
--OUTPUT {output.bam} \
--METRICS_FILE {output.dupstats} \
Expand Down Expand Up @@ -75,8 +77,6 @@ rule preprocess__map__filter__:
bam=temp(PRE_FILT / "{population}.{library}.bam"),
log:
PRE_FILT / "{population}.{library}.log",
resources:
memory_gb=params["picard_markduplicates"]["memory_gb"],
conda:
"__environment__.yml"
shell:
Expand All @@ -102,7 +102,7 @@ rule preprocess__map__split__:
reference=REFERENCE / f"{REFERENCE_NAME}.fa.gz",
fai=REFERENCE / f"{REFERENCE_NAME}.fa.gz.fai",
output:
cram=PRE_SPLIT / "{population}.{library}" / "{chromosome}.cram",
bam=temp(PRE_SPLIT / "{population}.{library}" / "{chromosome}.bam"),
params:
chromosome=lambda w: w.chromosome,
log:
Expand All @@ -114,7 +114,7 @@ rule preprocess__map__split__:
samtools view \
--uncompressed \
--reference {input.reference} \
--output {output.cram} \
--output {output.bam} \
{input.bam} \
{params.chromosome} \
2> {log}
Expand All @@ -124,7 +124,7 @@ rule preprocess__map__split__:
rule preprocess__map:
input:
[
PRE_SPLIT / f"{population}.{library}" / f"{chromosome}.cram"
PRE_SPLIT / f"{population}.{library}" / f"{chromosome}.bam"
for population, library in POPULATION_LIBRARY
for chromosome in CHROMOSOMES
],
4 changes: 2 additions & 2 deletions workflow/rules/preprocess/mpileup.smk
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rule preprocess__mpileup__:
- bcftools mpileup produces VCF format
"""
input:
cram=get_library_files_from_sample,
bams=get_library_files_from_sample,
fa=REFERENCE / f"{REFERENCE_NAME}.fa.gz",
fai=REFERENCE / f"{REFERENCE_NAME}.fa.gz.fai",
output:
Expand All @@ -22,7 +22,7 @@ rule preprocess__mpileup__:
-u \
--reference {input.fa} \
- \
{input.cram} \
{input.bams} \
| samtools mpileup \
-a \
--no-BAQ \
Expand Down

0 comments on commit 126895f

Please sign in to comment.