Skip to content

Commit ed31d06

Browse files
authored
Merge pull request #5 from marithetland/develop
Merge develop with main
2 parents d018153 + 0e28540 commit ed31d06

File tree

1 file changed

+60
-105
lines changed

1 file changed

+60
-105
lines changed

susCovONT.py

+60-105
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,10 @@
5151
])
5252

5353
BARCODING = collections.OrderedDict([
54-
('native_1-12', ['--arrangements_files "barcode_arrs_nb12.cfg "']),
55-
('native_13-24', ['--arrangements_files "barcode_arrs_nb24.cfg" ']),
56-
('native_1-24', ['--arrangements_files "barcode_arrs_nb12.cfg barcode_arrs_nb24.cfg" ']),
57-
('native_1-96', ['--arrangements_files "barcode_arrs_nb96.cfg" ']),
58-
#('rapid_1-12', ['--barcode_kit "SQK-RBK004" --trim_barcodes ']),
54+
('native_1-12', ['--barcode_kits', 'EXP-NBD104 ']),
55+
('native_13-24', ['--barcode_kits', 'EXP-NBD114 ']),
56+
('native_1-24', ['--barcode_kits', 'EXP-NBD104 EXP-NBD114 ']),
57+
('native_1-96', ['--barcode_kits', 'EXP-NBD196 ']),
5958
('none', [])
6059
])
6160

@@ -64,7 +63,7 @@ def parse_args():
6463
"""Set arguments"""
6564
#Version
6665
parser = ArgumentParser(description='susCovONT')
67-
parser.add_argument('-v','--version', action='version', version='%(prog)s ' + 'v.1.0.1')
66+
parser.add_argument('-v','--version', action='version', version='%(prog)s ' + 'v.1.0.2')
6867

6968
#Argsgroups
7069
input_args = parser.add_argument_group('Input options (required)')
@@ -74,8 +73,8 @@ def parse_args():
7473
#output_args = parser.add_argument_group('Output options (required)')
7574

7675
#Input
77-
input_args.add_argument('-i', '--input_dir', type=pathlib.Path, required=True, help='Input directory, which should contain "fast5_pass" directory fast5-files.')
78-
input_args.add_argument('-s', '--sample_names', type=pathlib.Path, required=True, help='Provide a comma-separated list showing which barcode corresponds to which sample (for final report)')
76+
input_args.add_argument('-i', '--input_dir', type=pathlib.Path, required=True, help='Input directory, which should contain "fast5_pass" directory with fast5-files.')
77+
input_args.add_argument('-s', '--sample_names', type=pathlib.Path, required=True, help='Provide a comma-separated file showing which barcode corresponds to which sample (for final report)')
7978

8079
#Options to perform basecalling and/or demultiplexing before running artic guppyplex and minion
8180
optional_args.add_argument('-b', '--basecalling_model', type=str, required=False, choices=["r9.4_fast","r9.4_hac","r10_fast","r10_hac"], help='Use flag to perform basecalling before running the artic pipeline. Indicate which basecalling mode to use. In most cases you want to use a HAC option.')
@@ -280,6 +279,7 @@ def check_input(args):
280279
##Check that the fast5_pass folder with files (recursive) exists in the input dir
281280
fast5_pass=os.path.join(outdir,"fast5_pass/")
282281
fast5_pass_alt=os.path.join(outdir,"001_rawData/fast5_pass/")
282+
283283
if not os.path.exists(fast5_pass):
284284
if os.path.exists(fast5_pass_alt):
285285
len_fast5=fileCount(fast5_pass_alt, '.fast5')
@@ -299,84 +299,60 @@ def check_input(args):
299299
##Check guppy parameters if basecalling/demultiplexing
300300
model_choices = list(BASECALLING.keys())
301301
barcode_choices = list(BARCODING.keys())
302-
if args.basecalling_model:
302+
if args.basecalling_model and args.barcode_kit:
303303
check_guppy_version(outdir)
304304
if args.basecalling_model not in model_choices:
305305
sys.exit('Error: valid --model choices are: {}'.format(join_with_or(model_choices)))
306-
if args.barcode_kit:
307-
check_guppy_version(outdir)
308306
if args.barcode_kit not in barcode_choices:
309307
sys.exit('Error: valid --barcodes choices are: {}'.format(join_with_or(barcode_choices)))
310308

311309
##Check FASTQ files
312310
#If not basecalling or demultiplexing, FASTQ files must be present for artic minion to run
313-
fastq_pass=os.path.join(outdir,"fastq_pass") #FASTQ demultiplexed on ONT (guppy), stored in run_name dir
314-
fastq_pass_undem=os.path.join(outdir,"001_rawData/fastq_pass_notDemultiplexed") #FASTQ PASS from guppy basecaller, not demultiplexed. Exists together with fastq_pass under 001_rawData/
315-
fastq_pass_dem=os.path.join(outdir,"001_rawData/fastq_pass") #FASTQ PASS from guppy basecaller, demultiplexed
316-
sequencing_summary=""
311+
fastq_pass=os.path.join(outdir,"fastq_pass/")
312+
fastq_pass_alt=os.path.join(outdir,"001_rawData/fastq_pass/")
313+
sequencing_summary="" #TODO: Remove this?
317314

318-
##If not demultiplexing or basecalling:
319-
if not args.barcode_kit and not args.basecalling_model:
315+
if args.basecalling_model and not args.barcode_kit:
316+
sys.exit("Error: Cannot perform basecalling and downstream artic analysis without demultiplexing. Please also specify --barcode_kit /-k in your command.")
317+
if not args.basecalling_model and args.barcode_kit:
318+
sys.exit("Error: Cannot perform demultiplexing and downstream artic analysis without also basecalling. Please also specify --basecalling_model /-b in your command.")
319+
320+
if not args.basecalling_model and not args.barcode_kit:
321+
#Do not perform basecalling and demultiplexing, proceed to artic nf
320322
#The fastq_pass folder must exist as we are running nextflow directly on this + fast5.
321-
if os.path.exists(fastq_pass_dem):
322-
fastq_pass_dem_path=fastq_pass_dem
323-
if os.path.exists(fastq_pass_undem):
324-
sequencing_summary=(fastq_pass_undem+"/sequencing_summary.txt")
325-
fastq_pass_path=fastq_pass_undem
326-
elif not os.path.exists(fastq_pass_undem):
327-
fastq_pass_path=fastq_pass_dem
328-
if os.path.exists(os.path.join(fastq_pass_dem,"sequencing_summary.txt")):
329-
sequencing_summary=(os.path.join(fastq_pass_dem,"sequencing_summary.txt"))
323+
if not os.path.exists(fastq_pass):
324+
if os.path.exists(fastq_pass_alt):
325+
len_fastq=fileCount(fastq_pass_alt, '.fastq')
326+
if len_fastq != 0:
327+
fastq_pass_path=fastq_pass_alt
328+
sequencing_summary=(os.path.join(fastq_pass_alt,"sequencing_summary.txt"))
330329
else:
331-
sequencing_summary=(outdir+"sequencing_summary*.txt")
330+
sys.exit('Error: Found no .fastq files. Please check that your input directory is correct or perform basecalling with -b and -k flags.')
331+
else:
332+
sys.exit('Error: {} is not a directory'.format(fastq_pass))
333+
if os.path.exists(fastq_pass):
334+
len_fastq=fileCount(fastq_pass, '.fastq')
335+
if len_fastq != 0:
336+
fastq_pass_path=fastq_pass
337+
sequencing_summary=(os.path.join(fastq_pass,"sequencing_summary.txt"))
338+
if not os.path.exists(sequencing_summary):
339+
sequencing_summary=(outdir+"sequencing_summary*.txt") #TODO: split run name to the last two "_"s to get this name properly
332340
seq_summary=os.path.abspath(sequencing_summary)
333341
seq_summary_exist2=[os.path.abspath(x) for x in glob.glob(sequencing_summary)]
334342
sequencing_summary=listToString(seq_summary_exist2)
335343
else:
336-
sys.exit('Error: Could not find sequencing_summary.txt file to match ' + fastq_pass_dem)
337-
elif os.path.exists(fastq_pass):
338-
#If already basecalled AND demultiplexed on the GridION/MinIT:
339-
fastq_pass_path=fastq_pass
340-
fastq_pass_dem_path=fastq_pass
341-
sequencing_summary=(os.path.join(fastq_pass,"sequencing_summary.txt"))
342-
if not os.path.exists(sequencing_summary):
343-
sequencing_summary=(outdir+"sequencing_summary*.txt") #TODO: split run name to the last two "_"s to get this name properly
344-
seq_summary=os.path.abspath(sequencing_summary)
345-
seq_summary_exist2=[os.path.abspath(x) for x in glob.glob(sequencing_summary)]
346-
sequencing_summary=listToString(seq_summary_exist2)
347-
##IF demultiplexing
348-
if args.barcode_kit:
349-
#Check that there isn't already a folder called fastq_pass_demultiplexed
350-
if os.path.exists(fastq_pass_dem):
351-
sys.exit('Error: The folder {} already exists. Please delete/move this folder if you want to perform demultiplexing.'.format(str(fastq_pass_dem)))
344+
sys.exit('Error: Found no .fastq files. Please check that your input directory is correct or perform basecalling with -b and -k flags.')
352345

353-
##If basecalling (and therefore demultiplexing)
354-
if args.basecalling_model and not args.barcode_kit:
355-
sys.exit("Error: Cannot perform basecalling and downstream artic analysis without demultiplexing. Please also specify --barcode_kit /-k in your command.")
356-
if args.barcode_kit and args.basecalling_model:
346+
if args.basecalling_model and args.barcode_kit:
347+
#Perform basecalling and demultiplexing in joint command
348+
#The fastq_pass folder must not exist as it will be created
357349
if os.path.exists(fastq_pass):
358350
sys.exit('Error: The folder {} already exists. Please delete/move this folder if you want to perform basecalling.'.format(str(fastq_pass)))
359-
elif os.path.exists(fastq_pass_dem):
360-
sys.exit('Error: The folder {} already exists. Please delete/move this folder if you want to perform basecalling.'.format(str(fastq_pass_dem)))
361-
elif os.path.exists(fastq_pass_undem):
362-
sys.exit('Error: The folder {} already exists. Please delete/move this folder if you want to perform basecalling.'.format(str(fastq_pass_undem)))
351+
elif os.path.exists(fastq_pass_alt):
352+
sys.exit('Error: The folder {} already exists. Please delete/move this folder if you want to perform basecalling.'.format(str(fastq_pass_alt)))
363353
else:
364-
fastq_pass_path=fastq_pass_undem
365-
fastq_pass_dem_path=fastq_pass_dem
366-
sequencing_summary=(fastq_pass_undem+"/sequencing_summary.txt")
367-
368-
##If demultiplexing but not basecalling
369-
if args.barcode_kit and not args.basecalling_model:
370-
#If basecalling not specified, then the basecalled fastq folder must already exist
371-
if os.path.exists(fastq_pass_undem):
372-
fastq_pass_path=fastq_pass_undem
373-
sequencing_summary=(fastq_pass_undem+"/sequencing_summary.txt")
374-
fastq_pass_dem_path=fastq_pass_dem
375-
elif os.path.exists(fastq_pass):
376-
fastq_pass_dem_path=fastq_pass
377-
sequencing_summary=(outdir+"sequencing_summary*.txt")
378-
else:
379-
sys.exit("Error: Did not find a fastq_pass folder in {}/fastq_pass or {}/001_rawData/fastq_pass. Have you basecalled your fast5s or did you forget to specify basecalling (--basecalling_model)?").format(str(outdir))
354+
fastq_pass_path=fastq_pass_alt
355+
sequencing_summary=os.path.join(fastq_pass_alt,"sequencing_summary.txt")
380356

381357
#Check that the specified sequencing summary actually exists
382358
if not args.basecalling_model or not args.barcode_kit:
@@ -392,44 +368,28 @@ def check_input(args):
392368
#Check sample file and get df
393369
sample_df=get_sample_names(args.sample_names)
394370

371+
return run_name, outdir, fast5_pass_path, fastq_pass_path, seq_summary, sample_df
395372

396-
return run_name, outdir, fast5_pass_path, fastq_pass_path, fastq_pass_dem_path, seq_summary, sample_df
397-
398-
def get_guppy_basecalling_command(input_dir, save_dir, basecalling_model, resume, cpu):
373+
def get_guppy_basecalling_command(input_dir, save_dir, basecalling_model, barcode_kit, resume, cpu):
399374
"""Get the command used for running guppy basecaller"""
400375
basecalling_command = ['guppy_basecaller ',
401376
'--input_path ', input_dir,
402377
'--recursive ',
378+
'--require_barcodes_both_ends ',
403379
'--save_path ', save_dir]
404380
basecalling_command += BASECALLING[basecalling_model]
381+
basecalling_command += BARCODING[barcode_kit]
405382
if cpu:
406-
basecalling_command += ['--num_callers 6'] #change to auto / add option to use CPU or GPU with options
383+
basecalling_command += [' --num_callers 6 '] #change to auto / add option to use CPU or GPU with options
407384
else:
408-
basecalling_command += ['-x ', 'auto'] #change to auto / add option to use CPU or GPU with options
385+
basecalling_command += [' -x ', 'auto '] #change to auto / add option to use CPU or GPU with options
409386

410387
if resume:
411-
basecalling_command += ['--resume'] #add option to resume
388+
basecalling_command += [' --resume '] #add option to resume
412389

413390
print(listToString(basecalling_command))
414391
return basecalling_command
415392

416-
def get_guppy_barcoder_command(input_dir, save_dir, barcode_kit,resume, cpu):
417-
"""Get the command used for running guppy barcoder (demultiplexing)"""
418-
barcoding_command = ['guppy_barcoder ',
419-
'--require_barcodes_both_ends '
420-
'--input_path ', input_dir,
421-
'--save_path ', save_dir]
422-
barcoding_command += BARCODING[barcode_kit]
423-
if cpu:
424-
barcoding_command += ['--num_callers 6'] #change to auto / add option to use CPU or GPU with options
425-
else:
426-
barcoding_command += ['-x ', 'auto'] #change to auto / add option to use CPU or GPU with options
427-
428-
if resume:
429-
barcoding_command += ['--resume'] #add option to resume
430-
431-
print(listToString(barcoding_command))
432-
return barcoding_command
433393

434394
def get_nextflow_command(demultiplexed_fastq, fast5_pass, sequencing_summary,nf_outdir,run_name,nf_dir_location,conda_location,schemeRepoURL,offline):
435395
"""Get the command used for running artic via nextflow"""
@@ -625,7 +585,7 @@ def split_consensusFasta(run_name,final_report_name,consensus_dir): #TODO: Make
625585
return
626586

627587

628-
def move_input_files(outdir,raw_data_path,fast5_pass_path,fastq_pass_path,fastq_pass_dem_path):
588+
def move_input_files(outdir,raw_data_path):
629589
"""At the end of the pipeline, move input dirs to subdir 001_rawData"""
630590
#Make 001_rawDAta if not exists
631591
if not os.path.isdir(raw_data_path):
@@ -779,15 +739,16 @@ def main():
779739
##Set up log to stdout
780740
logfile = None
781741
logging.basicConfig(filename=logfile,level=logging.DEBUG,filemode='w',format='%(asctime)s %(message)s',datefmt='%m-%d-%Y %H:%M:%S')
782-
logging.info('Running susCovONT v.1.0.1') #Print program version
742+
743+
logging.info('Running susCovONT v.1.0.2') #Print program version
783744
logging.info('command line: {0}'.format(' '.join(sys.argv))) #Print input command
784745

785746
##Set config variables and check that required tools exist
786747
nf_dir_location, conda_location, schemeRepoURL = set_config_variables(args)
787748

788749
##Check input and set variables
789-
run_name, outdir, fast5_pass_path, fastq_pass_path, fastq_pass_dem_path, sequencing_summary, sample_df = check_input(args)
790-
750+
run_name, outdir, fast5_pass_path, fastq_pass_path, sequencing_summary, sample_df = check_input(args)
751+
791752
##Set output subdirectories
792753
#outdir=outdir #TODO:Set optional (parental) outdir
793754
raw_data_path=os.path.join(outdir,'001_rawData/')
@@ -839,20 +800,14 @@ def main():
839800

840801
###Run pipeline
841802
##Guppy basecalling
842-
if args.basecalling_model:
843-
basecalling_command=(get_guppy_basecalling_command(fast5_pass_path, fastq_pass_path, args.basecalling_model.lower(), args.guppy_resume_basecalling, args.guppy_use_cpu))
803+
if args.basecalling_model and args.barcode_kit:
804+
basecalling_command=(get_guppy_basecalling_command(fast5_pass_path, fastq_pass_path, args.basecalling_model.lower(), args.barcode_kit.lower(), args.guppy_resume_basecalling, args.guppy_use_cpu))
844805
if not args.dry_run:
845806
run_command([listToString(basecalling_command)], shell=True)
846-
847-
##Guppy demultiplexing
848-
if args.basecalling_model or args.barcode_kit:
849-
demultiplexing_command=(get_guppy_barcoder_command(fastq_pass_path, fastq_pass_dem_path, args.barcode_kit.lower(), args.guppy_resume_basecalling, args.guppy_use_cpu))
850-
if not args.dry_run:
851-
run_command([listToString(demultiplexing_command)], shell=True)
852-
807+
853808
#Artic guppyplex and artic minion via PHW's nextflow pipeline
854809
if not args.generate_report_only and not args.no_artic:
855-
nextflow_command=(get_nextflow_command(fastq_pass_dem_path, fast5_pass_path, sequencing_summary,nf_outdir,run_name,nf_dir_location,conda_location, schemeRepoURL,args.offline))
810+
nextflow_command=(get_nextflow_command(fastq_pass_path, fast5_pass_path, sequencing_summary,nf_outdir,run_name,nf_dir_location,conda_location, schemeRepoURL,args.offline))
856811
if not args.dry_run and not args.generate_report_only:
857812
run_command([listToString(nextflow_command)], shell=True)
858813

@@ -885,7 +840,7 @@ def main():
885840

886841
#Move input files to 001_rawData directory
887842
if not args.no_move_files or not args.dry_run:
888-
move_input_files(outdir,raw_data_path,fast5_pass_path,fastq_pass_path,fastq_pass_dem_path)
843+
move_input_files(outdir,raw_data_path)
889844

890845
#Pipeline complete
891846
total_time = time.time() - start_time

0 commit comments

Comments
 (0)