Skip to content

Commit 6156d75

Browse files
authored
Merge pull request #107 from Joon-Klaps/65-select-best-ref
65 select best ref
2 parents 2394605 + 578432a commit 6156d75

32 files changed

+1184
-186
lines changed

README.md

+11-8
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,19 @@
5151
- [`mmseqs-linclust`](https://github.com/soedinglab/MMseqs2/wiki#linear-time-clustering-using-mmseqs-linclust)
5252
- [`mmseqs-cluster`](https://github.com/soedinglab/MMseqs2/wiki#cascaded-clustering)
5353
- [`vRhyme`](https://github.com/AnantharamanLab/vRhyme)
54-
- [`mash`](https://github.com/marbl/Mash)
54+
- [`Mash`](https://github.com/marbl/Mash)
5555
8. Scaffolding of contigs to centroid ([`Minimap2`](https://github.com/lh3/minimap2), [`iVar-consensus`](https://andersen-lab.github.io/ivar/html/manualpage.html))
5656
9. [Optional] Annotate 0-depth regions with external reference `custom-script`.
57-
10. Mapping filtered reads to supercontig and mapping constrains([`BowTie2`](http://bowtie-bio.sourceforge.net/bowtie2/),[`BWAmem2`](https://github.com/bwa-mem2/bwa-mem2) and [`BWA`](https://github.com/lh3/bwa))
58-
11. [Optional] Deduplicate reads ([`Picard`](https://broadinstitute.github.io/picard/) or if UMI's are used [`UMI-tools`](https://umi-tools.readthedocs.io/en/latest/QUICK_START.html))
59-
12. Variant calling and filtering ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html),[`iVar`](https://andersen-lab.github.io/ivar/html/manualpage.html))
60-
13. Create consensus genome ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html),[`iVar`](https://andersen-lab.github.io/ivar/html/manualpage.html))
61-
14. Repeat step 10-13 multiple times for the denovo contig route
62-
15. Consensus evaluation and annotation ([`QUAST`](http://quast.sourceforge.net/quast),[`CheckV`](https://bitbucket.org/berkeleylab/checkv/src/master/),[`blastn`](https://blast.ncbi.nlm.nih.gov/Blast.cgi), [`mmseqs-search`](https://github.com/soedinglab/MMseqs2/wiki#batch-sequence-searching-using-mmseqs-search))
63-
16. Result summary visualisation for raw read, alignment, assembly, variant calling and consensus calling results ([`MultiQC`](http://multiqc.info/))
57+
10. [Optional] Select best reference from `--mapping_constrains`:
58+
- [`Mash sketch`](https://github.com/marbl/Mash)
59+
- [`Mash screen`](https://github.com/marbl/Mash)
60+
11. Mapping filtered reads to supercontig and mapping constrains ([`BowTie2`](http://bowtie-bio.sourceforge.net/bowtie2/), [`BWAmem2`](https://github.com/bwa-mem2/bwa-mem2) and [`BWA`](https://github.com/lh3/bwa))
61+
12. [Optional] Deduplicate reads ([`Picard`](https://broadinstitute.github.io/picard/) or if UMI's are used [`UMI-tools`](https://umi-tools.readthedocs.io/en/latest/QUICK_START.html))
62+
13. Variant calling and filtering ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html),[`iVar`](https://andersen-lab.github.io/ivar/html/manualpage.html))
63+
14. Create consensus genome ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html),[`iVar`](https://andersen-lab.github.io/ivar/html/manualpage.html))
64+
15. Repeat step 11-14 multiple times for the denovo contig route
65+
16. Consensus evaluation and annotation ([`QUAST`](http://quast.sourceforge.net/quast),[`CheckV`](https://bitbucket.org/berkeleylab/checkv/src/master/),[`blastn`](https://blast.ncbi.nlm.nih.gov/Blast.cgi), [`mmseqs-search`](https://github.com/soedinglab/MMseqs2/wiki#batch-sequence-searching-using-mmseqs-search))
66+
17. Result summary visualisation for raw read, alignment, assembly, variant calling and consensus calling results ([`MultiQC`](http://multiqc.info/))
6467

6568
## Usage
6669

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
id,species,segment,samples,definition,sequence
2-
MN908947,SARS-COV-2,Wuhan-hu-1,SRR11140744;SRR11140748,Original Wuhan strain sequenced in 2019,https://raw.githubusercontent.com/Joon-Klaps/nextclade_data/old_datasets/data/nextstrain/sars-cov-2/MN908947/reference.fasta
3-
BA2,,,SRR11140744,,https://github.com/Joon-Klaps/nextclade_data/raw/old_datasets/data/nextstrain/sars-cov-2/BA.2/reference.fasta
4-
BA24,,,,,https://github.com/Joon-Klaps/nextclade_data/raw/old_datasets/data/nextstrain/sars-cov-2/BA.2/reference.fasta
1+
id,species,segment,samples,definition,selection,sequence
2+
MN908947,SARS-COV-2,Wuhan-hu-1,SRR11140744;SRR11140748,Original Wuhan strain sequenced in 2019,false,https://raw.githubusercontent.com/Joon-Klaps/nextclade_data/old_datasets/data/nextstrain/sars-cov-2/MN908947/reference.fasta
3+
BA2,,,SRR11140744,,false,https://github.com/Joon-Klaps/nextclade_data/raw/old_datasets/data/nextstrain/sars-cov-2/BA.2/reference.fasta
4+
BA24,,,,,true,https://github.com/Joon-Klaps/nextclade_data/raw/old_datasets/data/nextstrain/sars-cov-2/MN908947/sequences.fasta

assets/schemas/mapping_constrains.json

+6
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,12 @@
6464
}
6565
]
6666
},
67+
"selection": {
68+
"errorMessage": "Selection can only be true or false",
69+
"meta": ["selection"],
70+
"type": "boolean",
71+
"default": false
72+
},
6773
"definition": {
6874
"errorMessage": "Give a definition of the sequence for metadata annotation purposes only",
6975
"meta": ["definition"],

bin/select_reference.py

+137
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
#!/usr/bin/env python
2+
3+
"""Provide a command line tool to filter blast results."""
4+
5+
import argparse
6+
import logging
7+
import sys
8+
from pathlib import Path
9+
10+
import pandas as pd
11+
from Bio import SeqIO
12+
13+
logger = logging.getLogger()
14+
15+
16+
def parse_args(argv=None):
    """Define and immediately parse command line arguments.

    Args:
        argv (list[str] | None): Argument strings to parse; defaults to
            ``sys.argv[1:]`` when ``None``.

    Returns:
        argparse.Namespace: Parsed arguments with attributes ``mash``,
        ``references``, ``prefix`` and ``log_level``.
    """
    # BUGFIX: description and epilog were copy-pasted from blast_filter.py and
    # described the wrong tool; they now match this script's actual purpose.
    parser = argparse.ArgumentParser(
        description="Select the best matching reference from a mash screen result.",
        epilog="Example: python select_reference.py -i screen.tsv -r references.fa -p prefix",
    )

    parser.add_argument(
        "-i",
        "--mash",
        metavar="MASH FILE",
        type=Path,
        help="Mash screen result file.",
    )

    parser.add_argument(
        "-r",
        "--references",
        metavar="REFERENCE FILE",
        type=Path,
        help="Contig sequence file was screened against",
    )

    parser.add_argument(
        "-p",
        "--prefix",
        metavar="PREFIX",
        type=str,
        help="Output file prefix",
    )

    parser.add_argument(
        "-l",
        "--log-level",
        help="The desired log level (default WARNING).",
        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"),
        default="INFO",
    )
    return parser.parse_args(argv)
55+
56+
def to_dict_remove_dups(sequences):
    """Index sequence records by id, keeping the last record seen per id."""
    indexed = {}
    for record in sequences:
        indexed[record.id] = record
    return indexed
58+
59+
60+
def extract_hits(df, references, prefix):
    """
    Extract the reference sequences hit by mash and write them to a FASTA file.

    Args:
        df (pandas.DataFrame): Hit table; must contain a "query-ID" column.
        references (str | Path): Path to the reference pool in FASTA format.
        prefix (str): Prefix for the "<prefix>_reference.fa" output file.

    Returns:
        None
    """
    # BUGFIX(doc): the previous docstring documented a nonexistent `contigs`
    # parameter; the function only takes (df, references, prefix).
    try:
        ref_records = SeqIO.to_dict(SeqIO.parse(references, "fasta"))
    except ValueError as e:
        # SeqIO.to_dict raises ValueError on duplicate ids; fall back to a
        # last-wins index so the analysis can continue.
        # BUGFIX: "occurence" -> "occurrence" in the warning message.
        logger.warning(
            "Indexing the reference pool file causes an error: %s \n Make sure all fasta headers are unique and it is in fasta format! \n AUTOFIX: Taking last occurrence of duplicates to continue analysis",
            e,
        )
        ref_records = to_dict_remove_dups(SeqIO.parse(references, "fasta"))
    with open(f"{prefix}_reference.fa", "w") as f:
        # File opened in "w" mode, so this is the start of an empty file; used
        # below to detect whether anything was written.
        init_position = f.tell()
        for hit in df["query-ID"].unique():
            hit_name = hit.split(" ")[0]
            if hit_name in ref_records:
                # Sometimes reads can have illegal characters in the header
                ref_records[hit_name].id = ref_records[hit_name].id.replace("\\", "_")
                ref_records[hit_name].description = ref_records[hit_name].description.replace("\\", "_")
                SeqIO.write(ref_records[hit_name], f, "fasta")
        if f.tell() == init_position:
            # NOTE(review): this message says "Exiting..." but the function
            # only logs and returns normally — confirm whether a sys.exit /
            # raised error is intended here.
            logger.error("No reference sequences found in the hits. Exiting...")
92+
93+
94+
def read_mash_screen(file):
    """
    Read a mash screen result and return the single best hit.

    File format (tab separated, no header):
        [identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment]
        0.996341 3786/4000 121 0 USANYPRL230901_81A112023
        0.997144 3832/4000 124 0 EnglandQEUH3267E6482022

    Args:
        file (str | Path): Path to the mash screen output file.

    Returns:
        pandas.DataFrame: One-row DataFrame holding the hit with the highest
        identity, ties broken by the number of shared hashes.
    """
    logger.info("Reading in the mash screen file...")
    df = pd.read_csv(file, sep="\t", header=None)
    df.columns = ["identity", "shared-hashes", "median-multiplicity", "p-value", "query-ID", "query-comment"]

    # BUGFIX: the log previously claimed duplicates were removed, but no
    # deduplication happens here — only a sort and a head-1 selection.
    logger.info("Sorting by identity and shared-hashes...")
    # "3786/4000" -> 3786.0, a sortable numeric tie-breaker.
    df['shared-hashes_num'] = df['shared-hashes'].str.split('/').str[0].astype(float)
    df = df.sort_values(by=["identity", "shared-hashes_num"], ascending=False)
    df = df.drop(columns=['shared-hashes_num'])

    # iloc[[0]] (not iloc[0]) keeps the result a one-row DataFrame.
    return df.iloc[[0]]
115+
116+
117+
def main(argv=None):
    """Coordinate argument parsing and program execution."""
    args = parse_args(argv)
    logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")

    # Guard clauses: both input files must exist before any work starts.
    for required in (args.mash, args.references):
        if not required.is_file():
            logger.error(f"The given input file {required} was not found!")
            sys.exit(2)

    best_hit = read_mash_screen(args.mash)

    extract_hits(best_hit, args.references, args.prefix)

    best_hit.to_json(f"{args.prefix}.json", orient="records", lines=True)

    return 0
135+
136+
if __name__ == "__main__":
    # SystemExit carries main()'s return value as the process exit status,
    # exactly as sys.exit(main()) would.
    raise SystemExit(main())

conf/modules.config

+43-2
Original file line numberDiff line numberDiff line change
@@ -595,8 +595,10 @@ process {
595595

596596
withName: MASH_DIST{
597597
ext.args = [
598-
"-i", // sketch sequence not file, **don't change will make pipeline fail**
599-
"-t" // table format **don't change will make pipeline fail**
598+
"-i", // sketch sequence not file, **don't change will make pipeline fail**
599+
"-t", // table format **don't change will make pipeline fail**
600+
"-s ${params.mash_sketch_size}", // sketch size
601+
"-k ${params.mash_sketch_kmer_size}", // k-mer size
600602
].join(' ').trim()
601603
publishDir =[
602604
[
@@ -722,6 +724,45 @@ process {
722724
}
723725
}
724726

727+
withName: CAT_CAT_READS {
728+
publishDir = [
729+
enabled: false
730+
]
731+
}
732+
733+
734+
withName: MASH_SKETCH {
735+
ext.args = [
736+
"-s ${params.mash_sketch_size}", // sketch size
737+
"-k ${params.mash_sketch_kmer_size}", // k-mer size
738+
"-i ", // Sketch individual sequences DON'T CHANGE
739+
].join(' ').trim()
740+
publishDir = [
741+
path: { "${params.outdir}/variants/mapping-info/mash/sketch" },
742+
mode: params.publish_dir_mode,
743+
pattern: '*.msh',
744+
saveAs: { filename -> params.prefix || params.global_prefix ? "${params.global_prefix}-$filename" : filename }
745+
]
746+
}
747+
748+
withName: MASH_SCREEN {
749+
publishDir = [
750+
path: { "${params.outdir}/variants/mapping-info/mash/screen" },
751+
mode: params.publish_dir_mode,
752+
pattern: '*.screen',
753+
saveAs: { filename -> params.prefix || params.global_prefix ? "${params.global_prefix}-$filename" : filename }
754+
]
755+
}
756+
757+
withName: SELECT_REFERENCE {
758+
publishDir = [
759+
path: { "${params.outdir}/variants/mapping-info/mash/select-ref" },
760+
mode: params.publish_dir_mode,
761+
pattern: '*.json',
762+
saveAs: { filename -> params.prefix || params.global_prefix ? "${params.global_prefix}-$filename" : filename }
763+
]
764+
}
765+
725766
//
726767
// Iterative mapping
727768
// use the '$meta.iteration' variable to create a new directory for each iteration when publishing the dir's of the modules
Loading

0 commit comments

Comments
 (0)