diff --git a/Snakefile b/Snakefile index 41617d0..d0ae4b2 100644 --- a/Snakefile +++ b/Snakefile @@ -25,7 +25,56 @@ genome_list = [ line for line in genome_list if line ] # remove empty lines genome_dir = config['genome_dir'].rstrip('/') output_dir = config['output_dir'].rstrip('/') -# read in provided lineages, if any. +### verification / strict mode + +scaled = config['scaled'] +try: + scaled = int(scaled) + if scaled < 1 or scaled > 100000: + raise ValueError +except (TypeError, ValueError): + print('** ERROR: scaled should be a number between 1 and 100000') + print('** (it must also match the query database scaled value)') + if strict_mode: + sys.exit(-1) + +ksize = config['ksize'] +try: + ksize = int(ksize) + if ksize < 15 or ksize > 101: + raise ValueError +except (TypeError, ValueError): + print('** ERROR: ksize should be a number between 15 and 101.') + print('** (it must also match the query database ksize value)') + if strict_mode: + sys.exit(-1) + +# verify that all genome files exist - +for filename in genome_list: + fullpath = os.path.join(genome_dir, filename) + if not os.path.exists(fullpath): + print(f'** ERROR: genome file {filename} does not exist in {genome_dir}') + if strict_mode: + print('** exiting.') + sys.exit(-1) + +# verify that all query databases exist -- +for filename in config['gather_db']: + if not os.path.exists(filename): + print(f'** ERROR: database {filename} does not exist.') + if strict_mode: + print('** exiting.') + sys.exit(-1) + +# does lineage csv exist? +filename = config['lineages_csv'] +if not os.path.exists(filename): + print(f'** ERROR: lineage CSV {filename} does not exist.') + if strict_mode: + print('** exiting.') + sys.exit(-1) + +# read in provided lineages, if any, and verify file. 
provided_lineages_file = config.get('provided_lineages', '') provided_lineages = {} if provided_lineages_file: @@ -43,6 +92,9 @@ if provided_lineages_file: print(f'** read {len(provided_lineages)} provided lineages') +print('** config file checks PASSED!') +print('** from here on out, it\'s all snakemake...') + ### utility functions def output_files(filename_template, **kw): return expand(output_dir + filename_template, **kw) @@ -57,7 +109,9 @@ def get_provided_lineage(w): else: return "NA" +### ### rules! +### wildcard_constraints: size="\d+" @@ -119,8 +173,6 @@ rule contigs_clean_just_taxonomy: --report {output.report} --summary {output.csv} \ --lineage {params.lineage:q} {params.force} """ -# --lineage 'Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales;Shewanellaceae;Shewanella' - rule combined_summary: input: diff --git a/test-data/00-test.conf b/test-data/00-test.conf index d44ceca..6292d65 100644 --- a/test-data/00-test.conf +++ b/test-data/00-test.conf @@ -1,28 +1,37 @@ -# check and validate config settings strictly. +# check and validate config settings & filenames strictly. strict: 1 -# continue past survivable errors +# continue past survivable errors in decontamination force: 0 +### +### project specific settings +### + +# location for all generated files +output_dir: 'output.test' + # list of genome filenames to decontaminate genome_list: test-data/genome_list.txt +# directory in which genome filenames live +genome_dir: test-data/genomes/ + # (optional) list of lineages for input genomes. comment out or leave # blank if none. 
provided_lineages: test-data/provided-lineages.csv -# directory in which genome filenames live -genome_dir: test-data/genomes/ - -# location for all generated files -output_dir: 'output.test' +### +### installation/system-wide configuration +### # sourmash query databases for contamination (SBTs, LCAs, or signatures) gather_db: - test-data/podar-ref.lca.json.gz - test-data/LoombaR_2017__SID1050_bax__bin.11.fa.gz.gather-matches.sig.gz -# lineages CSV (see `sourmash lca index`) for signatures in query databases +# lineages CSV containing reference lineages in query database. +# Must correspond to signatures in query databases (e.g. gtdb.csv). lineages_csv: test-data/podar-lineage.csv # scaled and ksize at which to construct genome/contig signatures.