-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathcurate.smk
129 lines (117 loc) · 4.32 KB
/
curate.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
This part of the workflow handles the data transformation and curation.

Expects different inputs for GISAID vs GenBank:

    GISAID:
        ndjson = "data/gisaid.ndjson"
        metadata = "data/gisaid/metadata.tsv" (read by flag_metadata)

    GenBank:
        ndjson = "data/genbank.ndjson"
        biosample = "data/biosample.ndjson"
        rki_ndjson = "data/rki.ndjson"
        cog_uk_accessions = "data/cog_uk_accessions.tsv"
        cog_uk_metadata = "data/cog_uk_metadata.csv.gz"

Produces different output files for GISAID vs GenBank:

    GISAID:
        fasta = "data/gisaid/sequences.fasta"
        metadata = "data/gisaid/metadata_transformed.tsv"
        flagged_annotations = temp("data/gisaid/flagged-annotations")
        additional_info = "data/gisaid/additional_info.tsv"
        duplicate_biosample = "data/gisaid/duplicate_biosample.txt"
        flagged_metadata = "data/gisaid/flagged_metadata.txt"

    GenBank:
        fasta = "data/genbank/sequences.fasta"
        metadata = "data/genbank/metadata_transformed.tsv"
        biosample = "data/genbank/biosample.tsv"
        flagged_annotations = temp("data/genbank/flagged-annotations")
        duplicate_biosample = "data/genbank/duplicate_biosample.txt"
"""
# Run ./bin/transform-rki on the RKI NDJSON file, writing a FASTA file and a
# transformed metadata TSV. Both outputs are consumed by `merge_open_data`.
rule transform_rki_data:
    input:
        ndjson = "data/rki.ndjson"
    output:
        fasta = "data/rki_sequences.fasta",
        metadata = "data/rki_metadata_transformed.tsv"
    params:
        # NOTE(review): read from the workflow config (default False) but not
        # referenced by the shell command below — confirm whether it is still
        # needed before removing it.
        subsampled = config.get("subsampled", False)
    shell:
        """
        ./bin/transform-rki \
            {input.ndjson} \
            --output-fasta {output.fasta} \
            --output-metadata {output.metadata}
        """
# Run ./bin/transform-biosample on the BioSample NDJSON file and write the
# result to a TSV, which `transform_genbank_data` takes as an input.
rule transform_biosample:
    input:
        biosample="data/biosample.ndjson",
    output:
        biosample="data/genbank/biosample.tsv",
    shell:
        """
        ./bin/transform-biosample {input.biosample} \
            --output {output.biosample}
        """
# Run ./bin/transform-genbank on the GenBank NDJSON file together with the
# BioSample TSV and the two COG-UK files.
#
# The FASTA and metadata outputs are written to the intermediate
# "data/genbank_*" paths; `merge_open_data` turns them into the final
# "data/genbank/*" files.
rule transform_genbank_data:
    input:
        biosample="data/genbank/biosample.tsv",
        ndjson="data/genbank.ndjson",
        cog_uk_accessions="data/cog_uk_accessions.tsv",
        cog_uk_metadata="data/cog_uk_metadata.csv.gz",
    output:
        fasta="data/genbank_sequences.fasta",
        metadata="data/genbank_metadata_transformed.tsv",
        # Captured from the script's stdout (see the redirect below) and
        # declared as a temporary file.
        flagged_annotations=temp("data/genbank/flagged-annotations"),
        duplicate_biosample="data/genbank/duplicate_biosample.txt",
    benchmark:
        "benchmarks/transform_genbank_data.txt"
    shell:
        """
        ./bin/transform-genbank {input.ndjson} \
            --biosample {input.biosample} \
            --duplicate-biosample {output.duplicate_biosample} \
            --cog-uk-accessions {input.cog_uk_accessions} \
            --cog-uk-metadata {input.cog_uk_metadata} \
            --output-metadata {output.metadata} \
            --output-fasta {output.fasta} > {output.flagged_annotations}
        """
# Run ./bin/merge-open on the GenBank and RKI intermediate files (metadata and
# sequences), writing the merged "data/genbank/*" metadata and FASTA files.
rule merge_open_data:
    input:
        genbank_metadata = "data/genbank_metadata_transformed.tsv",
        rki_metadata = "data/rki_metadata_transformed.tsv",
        rki_sequences = "data/rki_sequences.fasta",
        genbank_sequences = "data/genbank_sequences.fasta"
    output:
        metadata = "data/genbank/metadata_transformed.tsv",
        sequences = "data/genbank/sequences.fasta"
    shell:
        """
        ./bin/merge-open \
            --input-genbank-metadata {input.genbank_metadata} \
            --input-rki-metadata {input.rki_metadata} \
            --input-genbank-sequences {input.genbank_sequences} \
            --input-rki-sequences {input.rki_sequences} \
            --output-metadata {output.metadata} \
            --output-sequences {output.sequences}
        """
# Run ./bin/transform-gisaid on the GISAID NDJSON file, writing the FASTA,
# the transformed metadata TSV and an additional-info TSV.
rule transform_gisaid_data:
    input:
        ndjson="data/gisaid.ndjson",
    output:
        fasta="data/gisaid/sequences.fasta",
        metadata="data/gisaid/metadata_transformed.tsv",
        # Captured from the script's stdout (see the redirect below) and
        # declared as a temporary file.
        flagged_annotations=temp("data/gisaid/flagged-annotations"),
        additional_info="data/gisaid/additional_info.tsv",
    shell:
        """
        ./bin/transform-gisaid {input.ndjson} \
            --output-metadata {output.metadata} \
            --output-fasta {output.fasta} \
            --output-additional-info {output.additional_info} \
            --output-unix-newline > {output.flagged_annotations};
        """
# Run ./bin/flag-metadata on the GISAID metadata TSV and store the script's
# stdout as the flagged-metadata report.
rule flag_metadata:
    ### only applicable for GISAID
    input:
        # NOTE(review): no rule in this section produces this file —
        # presumably it is created elsewhere in the workflow; confirm.
        metadata="data/gisaid/metadata.tsv",
    output:
        metadata="data/gisaid/flagged_metadata.txt",
    resources:
        # Memory use scales primarily with the size of the metadata file.
        mem_mb=20000,
    shell:
        """
        ./bin/flag-metadata {input.metadata} > {output.metadata}
        """