dib-lab · ctb · May 13, 2020 · May 13, 2020 · May 13, 2020
diff --git a/Snakefile b/Snakefile
@@ -25,7 +25,56 @@ genome_list = [ line for line in genome_list if line ]   # remove empty lines
 genome_dir = config['genome_dir'].rstrip('/')
 output_dir = config['output_dir'].rstrip('/')
 
-# read in provided lineages, if any.
+### verification / strict mode
+
+scaled = config['scaled']  # validated below: must be an integer in [1, 100000]
+try:
+    scaled = int(scaled)
+    if scaled < 1 or scaled > 100000:
+        raise ValueError
+except (TypeError, ValueError):  # TypeError: int() of None or a list/dict value
+    print('** ERROR: scaled should be a number between 1 and 100000')
+    print('** (it must also match the query database scaled value)')
+    if strict_mode:  # outside strict mode the error is only reported
+        sys.exit(-1)
+
+ksize = config['ksize']  # validated below: must be an integer in [15, 101]
+try:
+    ksize = int(ksize)
+    if ksize < 15 or ksize > 101:
+        raise ValueError
+except (TypeError, ValueError):  # TypeError: int() of None or a list/dict value
+    print('** ERROR: ksize should be a number between 15 and 101.')
+    print('** (it must also match the query database ksize value)')
+    if strict_mode:  # outside strict mode the error is only reported
+        sys.exit(-1)
+
+# verify that every file named in genome_list exists under genome_dir.
+for filename in genome_list:
+    fullpath = os.path.join(genome_dir, filename)
+    if not os.path.exists(fullpath):
+        print(f'** ERROR: genome file {filename} does not exist in {genome_dir}')
+        if strict_mode:  # outside strict mode, report and keep checking
+            print('** exiting.')
+            sys.exit(-1)
+
+# verify that every query database path listed in config['gather_db'] exists.
+for filename in config['gather_db']:
+    if not os.path.exists(filename):
+        print(f'** ERROR: database {filename} does not exist.')
+        if strict_mode:  # outside strict mode, report and keep checking
+            print('** exiting.')
+            sys.exit(-1)
+
+# verify that the lineage CSV named by config['lineages_csv'] exists.
+filename = config['lineages_csv']
+if not os.path.exists(filename):
+    print(f'** ERROR: lineage CSV {filename} does not exist.')
+    if strict_mode:  # outside strict mode the error is only reported
+        print('** exiting.')
+        sys.exit(-1)
+
+# read in provided lineages, if any, and verify file.
 provided_lineages_file = config.get('provided_lineages', '')
 provided_lineages = {}
 if provided_lineages_file:
@@ -43,6 +92,9 @@ if provided_lineages_file:
 
     print(f'** read {len(provided_lineages)} provided lineages')
 
+print('** config file checks PASSED!')  # NOTE(review): also printed when non-strict checks above reported errors
+print('** from here on out, it\'s all snakemake...')  # marks the end of the config checks
+
 ### utility functions
 def output_files(filename_template, **kw):
     return expand(output_dir + filename_template, **kw)
@@ -57,7 +109,9 @@ def get_provided_lineage(w):
     else:
         return "NA"
 
+###
 ### rules!
+###
 
 wildcard_constraints:
     size="\d+"
@@ -119,8 +173,6 @@ rule contigs_clean_just_taxonomy:
             --report {output.report} --summary {output.csv} \
             --lineage {params.lineage:q} {params.force}
     """
-#            --lineage 'Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales;Shewanellaceae;Shewanella'
-
 
 rule combined_summary:
     input:

diff --git a/test-data/00-test.conf b/test-data/00-test.conf
@@ -1,28 +1,37 @@
-# check and validate config settings strictly.
+# check and validate config settings & filenames strictly.
 strict: 1
 
-# continue past survivable errors
+# continue past survivable errors in decontamination
 force: 0
 
+###
+### project specific settings
+###
+
+# location for all generated files
+output_dir: 'output.test'
+
 # list of genome filenames to decontaminate
 genome_list: test-data/genome_list.txt
 
+# directory in which genome filenames live
+genome_dir: test-data/genomes/
+
 # (optional) list of lineages for input genomes. comment out or leave
 # blank if none.
 provided_lineages: test-data/provided-lineages.csv
 
-# directory in which genome filenames live
-genome_dir: test-data/genomes/
-
-# location for all generated files
-output_dir: 'output.test'
+###
+### installation/system-wide configuration
+###
 
 # sourmash query databases for contamination (SBTs, LCAs, or signatures)
 gather_db:
 - test-data/podar-ref.lca.json.gz
 - test-data/LoombaR_2017__SID1050_bax__bin.11.fa.gz.gather-matches.sig.gz
 
-# lineages CSV (see `sourmash lca index`) for signatures in query databases
+# lineages CSV containing reference lineages in query database.
+# Must correspond to signatures in query databases (e.g. gtdb.csv).
 lineages_csv: test-data/podar-lineage.csv
 
 # scaled and ksize at which to construct genome/contig signatures.