From 231930df58e975e1b3c6d1fbbc4c06e64a3e83bc Mon Sep 17 00:00:00 2001 From: Bryan Briney Date: Thu, 4 May 2023 11:59:56 -0700 Subject: [PATCH 1/2] remove unused imports --- abstar/core/abstar.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/abstar/core/abstar.py b/abstar/core/abstar.py index b27e48e..82d7434 100644 --- a/abstar/core/abstar.py +++ b/abstar/core/abstar.py @@ -73,7 +73,7 @@ get_parquet_dtypes, ) -from ..utils.output import PARQUET_INCOMPATIBLE +# from ..utils.output import PARQUET_INCOMPATIBLE from ..utils.queue.celery import celery @@ -99,7 +99,6 @@ ##################################################################### - def create_parser() -> ArgumentParser: parser = ArgumentParser( prog="abstar", @@ -390,7 +389,9 @@ def __init__( temp=None, sequences=None, chunksize=500, - output_type=["json",], + output_type=[ + "json", + ], assigner="blastn", merge=False, pandaseq_algo="simple_bayesian", @@ -423,7 +424,13 @@ def __init__( self.log = os.path.abspath(log) if log is not None else log self.temp = os.path.abspath(temp) if temp is not None else temp self.chunksize = int(chunksize) - self.output_type = [output_type,] if output_type in STR_TYPES else output_type + self.output_type = ( + [ + output_type, + ] + if output_type in STR_TYPES + else output_type + ) self.parquet = parquet self.assigner = assigner self.merge = True if basespace else merge @@ -726,7 +733,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args): for temp_file in temp_files: with open(temp_file, "rb") as f: shutil.copyfileobj( - f, out_file, length=16 * 1024 ** 2 + f, out_file, length=16 * 1024**2 ) # Increasing buffer size to 16MB for faster transfer # For file formats with headers, only keep headers from the first file elif output_type in ["imgt", "tabular", "airr"]: @@ -737,7 +744,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args): out_file.write(line) elif j >= 1: out_file.write(line) - + if args.parquet: logger.info("Converting concatenated output to parquet format") # Make clear the output format from which the parquet file is generated. @@ -764,7 +771,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args): df.to_parquet( pfile, engine="pyarrow", compression="snappy", write_index=False ) - + ofiles.append(ofile) return ofiles @@ -1544,5 +1551,6 @@ def run_main(arg_list: Optional[Iterable[str]] = None): output_dir = main(args) sys.stdout.write("\n\n") + if __name__ == "__main__": run_main() From 426b7bf637cda8fd306ec683dd7d94f6d74f7f99 Mon Sep 17 00:00:00 2001 From: Bryan Briney Date: Thu, 4 May 2023 12:15:30 -0700 Subject: [PATCH 2/2] move preprocess to utils --- abstar/__init__.py | 2 +- abstar/preprocess/umi.py | 81 ++++++++-------- abstar/{ => utils}/preprocess.py | 156 ++++++++++++++++++------------- 3 files changed, 130 insertions(+), 109 deletions(-) rename abstar/{ => utils}/preprocess.py (73%) diff --git a/abstar/__init__.py b/abstar/__init__.py index 27ae239..b1cda2f 100644 --- a/abstar/__init__.py +++ b/abstar/__init__.py @@ -4,6 +4,6 @@ warnings.simplefilter("ignore", BiopythonWarning) from .core.abstar import run, run_standalone, main, create_parser, validate_args -from .preprocess import fastqc, adapter_trim, quality_trim +from .utils.preprocess import fastqc, adapter_trim, quality_trim from .version import __version__ diff --git a/abstar/preprocess/umi.py b/abstar/preprocess/umi.py index 6bb7089..e1100c7 100644 --- a/abstar/preprocess/umi.py +++ b/abstar/preprocess/umi.py @@ -35,9 +35,7 @@ class UMI: - """ - - """ + """ """ def __init__( self, @@ -47,9 +45,7 @@ def __init__( ignore_strand: bool = False, extra_length_for_alignment: int = 25, ): - """ - - """ + """ """ self.length = length self.pattern = pattern.strip().upper() self.ignore_strand = ignore_strand @@ -138,7 +134,7 @@ def align( def get_mismatches(self) -> Optional[int]: """ - Returns the total number of mismatches between the pattern(s) + Returns the total number of mismatches between the pattern(s) and the input sequence. """ total = len(self.pattern.replace("[UMI]", "")) @@ -202,54 +198,54 @@ def parse_umis( 4. a list of lists/tuples, of the format ``[sequence_id, sequence]`` output_file : str, default=None - Path to an output file. Required if `sequences` is not a - file. If `sequences` is a file and `output` is not provided, UMI parsing is + Path to an output file. Required if `sequences` is not a + file. If `sequences` is a file and `output` is not provided, UMI parsing is done in-place and the `sequences` file is updated. pattern : str or iterable, default=None - Pattern (or iterable of patterns) for identifying the location of the UMI, - or the name of a built-in pattern. Built-in options include ``"smartseq-human-bcr"``. - Patterns may optionally contain leading and/or trailing conserved regions, with - the UMI position within the pattern represented by ``"[UMI]"``. As an example, - the built-in pattern for SmartSeq-BCR UMIs is:: - - "[UMI]TCAGCGGGAAGACATT" - - which would be a UMI sequence followed immediately by ``"TCAGCGGGAAGACATT"``. - By default, the pattern is matched on the 5' -> 3' strand. This allows - users to more easily construct patterns from their amplification primers without - needing to worry about reverse-complementing patterns for UMIs at the 3' end of the - input sequence. To override this, set `ignore_strand` to ``True``. If `pattern` - is not provided, UMIs will be parsed using only `length`, starting at the start - or end of the sequence. + Pattern (or iterable of patterns) for identifying the location of the UMI, + or the name of a built-in pattern. Built-in options include ``"smartseq-human-bcr"``. + Patterns may optionally contain leading and/or trailing conserved regions, with + the UMI position within the pattern represented by ``"[UMI]"``. As an example, + the built-in pattern for SmartSeq-BCR UMIs is:: + + "[UMI]TCAGCGGGAAGACATT" + + which would be a UMI sequence followed immediately by ``"TCAGCGGGAAGACATT"``. + By default, the pattern is matched on the 5' -> 3' strand. This allows + users to more easily construct patterns from their amplification primers without + needing to worry about reverse-complementing patterns for UMIs at the 3' end of the + input sequence. To override this, set `ignore_strand` to ``True``. If `pattern` + is not provided, UMIs will be parsed using only `length`, starting at the start + or end of the sequence. .. note:: - The UMIs for all `patterns` that meet the `allowed_mismatches` criteria in - the conserved leading/trailing portions will be concatenated into the final + The UMIs for all `patterns` that meet the `allowed_mismatches` criteria in + the conserved leading/trailing portions will be concatenated into the final UMI. This allows the use of multiple `patterns` for either: * different patterns designed to match one of several different types of - sequences in a heterogeneous sample. For example, if heavy, kappa and - lambda primers each have different conserved regions flanking the UMI - and the input file contains a mix of heavy, kappa and lambda chains, + sequences in a heterogeneous sample. For example, if heavy, kappa and + lambda primers each have different conserved regions flanking the UMI + and the input file contains a mix of heavy, kappa and lambda chains, supplying all patterns (assuming they're sufficiently different from each other) will allow parsing of UMIs from all chains. - * different patterns for sequences that contain multiple UMIs, either - at opposite ends of the sequence or on the same end of the sequence + * different patterns for sequences that contain multiple UMIs, either + at opposite ends of the sequence or on the same end of the sequence but in different locations and with different conserved flanking regions. length : int or iterable, default=None - Length of the UMI sequence, or iterable of UMI lengths. If multiple lengths are - provided, there must be an equal number of `patterns`, and they must be in the - same order (the first `pattern` should correspond to the first UMI `length`). If - `length` is positive, the UMI will be parsed from the start of the sequence. If - `length` is negative, the UMI will be parsed from the end of the sequence. If multiple - `patterns` are provided with a single `length`, that `length` will be used for all - `patterns`. Required if `pattern` does not have a conserved trailing region. - If `length` is not provided and a trailing conserved region is present in `pattern`, - the entire portion of `sequence` preceeding the trailing region will be parsed as the - UMI. Ignored if both leading and trailing sequences are present in `pattern`, - as the entire region between the conserved flanking regions will be parsed as + Length of the UMI sequence, or iterable of UMI lengths. If multiple lengths are + provided, there must be an equal number of `patterns`, and they must be in the + same order (the first `pattern` should correspond to the first UMI `length`). If + `length` is positive, the UMI will be parsed from the start of the sequence. If + `length` is negative, the UMI will be parsed from the end of the sequence. If multiple + `patterns` are provided with a single `length`, that `length` will be used for all + `patterns`. Required if `pattern` does not have a conserved trailing region. + If `length` is not provided and a trailing conserved region is present in `pattern`, + the entire portion of `sequence` preceeding the trailing region will be parsed as the + UMI. Ignored if both leading and trailing sequences are present in `pattern`, + as the entire region between the conserved flanking regions will be parsed as the UMI regardless of what `length` is provided. @@ -460,4 +456,3 @@ def _get_allowed_mismatches(name): "allowed_mismatches": SMARTSEQ_HUMAN_BCR_MISMATCH, }, } - diff --git a/abstar/preprocess.py b/abstar/utils/preprocess.py similarity index 73% rename from abstar/preprocess.py rename to abstar/utils/preprocess.py index f1a8916..afb2855 100644 --- a/abstar/preprocess.py +++ b/abstar/utils/preprocess.py @@ -31,21 +31,30 @@ from Bio import SeqIO -from .utils.pandaseq import pair_files +from .pandaseq import pair_files from abutils.utils.log import get_logger from abutils.utils.pipeline import list_files, make_dir -logger = get_logger('preprocess') - - -def quality_trim(input_directory=None, output_directory=None, - quality_cutoff=20, length_cutoff=50, - quality_type='sanger', compress_output=True, file_pairs=None, - singles_directory=None, nextseq=False, paired_reads=True, - allow_5prime_trimming=False, print_debug=False): - ''' +logger = get_logger("preprocess") + + +def quality_trim( + input_directory=None, + output_directory=None, + quality_cutoff=20, + length_cutoff=50, + quality_type="sanger", + compress_output=True, + file_pairs=None, + singles_directory=None, + nextseq=False, + paired_reads=True, + allow_5prime_trimming=False, + print_debug=False, +): + """ Performs quality trimming with sickle. Args: @@ -101,10 +110,10 @@ def quality_trim(input_directory=None, output_directory=None, Returns: str: Path to the output directory - ''' + """ if input_directory is None and any([file_pairs is None, output_directory is None]): - err = '\nERROR: Either an input_directory must be provided or ' - err += 'both file_pairs and an output_directory must be provided.\n' + err = "\nERROR: Either an input_directory must be provided or " + err += "both file_pairs and an output_directory must be provided.\n" print(err) sys.exit(1) if file_pairs: @@ -113,7 +122,7 @@ def quality_trim(input_directory=None, output_directory=None, input_directory = os.path.normpath(input_directory) if output_directory is None: oparent = os.path.dirname(input_directory) - output_directory = os.path.join(oparent, 'quality_trimmed') + output_directory = os.path.join(oparent, "quality_trimmed") make_dir(output_directory) if paired_reads: files = list_files(input_directory) @@ -128,48 +137,52 @@ def quality_trim(input_directory=None, output_directory=None, elif len(f) == 1: paired_end = False else: - err = 'ERROR: Each batch of files must contain either 1 (single-end reads) or ' - err += '2 (paired-end reads) files. This batch contains {} files:\n{}'.format( - len(f), '\n'.join(f)) - err2 += 'If you have paired-end reads that do not follow the Illumina naming scheme, ' - err2 += 'you can pass pairs of filenames (a list of lists/tuples) with the option. ' - err2 += 'If using , the output directory must also be provided.' + err = "ERROR: Each batch of files must contain either 1 (single-end reads) or " + err += ( + "2 (paired-end reads) files. This batch contains {} files:\n{}".format( + len(f), "\n".join(f) + ) + ) + err2 += "If you have paired-end reads that do not follow the Illumina naming scheme, " + err2 += "you can pass pairs of filenames (a list of lists/tuples) with the option. " + err2 += "If using , the output directory must also be provided." logger.info(err) logger.info(err2) continue f.sort() # set basic sickle cmd options - sickle = 'sickle pe' if paired_end else 'sickle se' - sickle += ' -t {}'.format(quality_type) - sickle += ' -l {}'.format(length_cutoff) - sickle += ' -q {}'.format(quality_cutoff) + sickle = "sickle pe" if paired_end else "sickle se" + sickle += " -t {}".format(quality_type) + sickle += " -l {}".format(length_cutoff) + sickle += " -q {}".format(quality_cutoff) if compress_output: - sickle += ' -g' + sickle += " -g" if not allow_5prime_trimming: - sickle += ' -x' + sickle += " -x" # compute input/output filenames, add to sickle cmd - sickle += ' -f {}'.format(f[0]) - o1_basename = os.path.basename(f[0]).rstrip('.gz') + sickle += " -f {}".format(f[0]) + o1_basename = os.path.basename(f[0]).rstrip(".gz") if compress_output: - o1_basename += '.gz' - sickle += ' -o {}'.format(os.path.join(output_directory, o1_basename)) + o1_basename += ".gz" + sickle += " -o {}".format(os.path.join(output_directory, o1_basename)) if paired_end: - sickle += ' -r {}'.format(f[1]) - o2_basename = os.path.basename(f[1]).rstrip('.gz') + sickle += " -r {}".format(f[1]) + o2_basename = os.path.basename(f[1]).rstrip(".gz") if compress_output: - o2_basename += '.gz' - sickle += ' -p {}'.format(os.path.join(output_directory, o2_basename)) + o2_basename += ".gz" + sickle += " -p {}".format(os.path.join(output_directory, o2_basename)) # compute singles output filename, add to sickle cmd if paired_end: if singles_directory is not None: - sfilename = '{}_{}_singles.fastq'.format( - o1_basename.rstrip('.gz').rstrip('.fastq').rstrip('.fq'), - o2_basename.rstrip('.gz').rstrip('.fastq').rstrip('.fq')) + sfilename = "{}_{}_singles.fastq".format( + o1_basename.rstrip(".gz").rstrip(".fastq").rstrip(".fq"), + o2_basename.rstrip(".gz").rstrip(".fastq").rstrip(".fq"), + ) if compress_output: - sfilename += '.gz' - sickle += ' -s {}'.format(os.path.join(singles_directory, sfilename)) + sfilename += ".gz" + sickle += " -s {}".format(os.path.join(singles_directory, sfilename)) else: - sickle += ' -s /dev/null' + sickle += " -s /dev/null" if print_debug: print(sickle) # run sickle @@ -179,17 +192,23 @@ def quality_trim(input_directory=None, output_directory=None, logger.debug(stderr) if print_debug: print(stdout) - print('') + print("") print(stderr) - print('') + print("") return output_directory -def adapter_trim(input_directory, output_directory=None, - adapter_5prime=None, adapter_3prime=None, - adapter_5prime_anchored=None, adapter_3prime_anchored=None, - adapter_both=None, compress_output=True): - ''' +def adapter_trim( + input_directory, + output_directory=None, + adapter_5prime=None, + adapter_3prime=None, + adapter_5prime_anchored=None, + adapter_3prime_anchored=None, + adapter_both=None, + compress_output=True, +): + """ Trims adapters with cutadapt. Args: @@ -226,37 +245,43 @@ def adapter_trim(input_directory, output_directory=None, Returns: str: Path to the output directory - ''' + """ input_directory = os.path.normpath(input_directory) if output_directory is None: oparent = os.path.dirname(input_directory) - output_directory = os.path.join(oparent, 'adapter_trimmed') + output_directory = os.path.join(oparent, "adapter_trimmed") make_dir(output_directory) files = list_files(input_directory) # parse adapter FASTA files, compile adapter option list adapters = [] - opts = ['-g', '-a', '-b'] + opts = ["-g", "-a", "-b"] adapt_files = [adapter_5prime, adapter_3prime, adapter_both] for o, a in zip(opts, adapt_files): if a is None: continue - adapts = [str(s.seq) for s in SeqIO.parse(open(a, 'r'), 'fasta')] - adapters += [' '.join(z) for z in zip([o] * len(adapts), adapts)] + adapts = [str(s.seq) for s in SeqIO.parse(open(a, "r"), "fasta")] + adapters += [" ".join(z) for z in zip([o] * len(adapts), adapts)] if adapter_5prime_anchored is not None: - adapts = ['^{}'.format(str(s.seq)) for s in SeqIO.parse(open(adapter_5prime_anchored, 'r'), 'fasta')] - adapters += ['-g {}'.format(a) for a in adapts] + adapts = [ + "^{}".format(str(s.seq)) + for s in SeqIO.parse(open(adapter_5prime_anchored, "r"), "fasta") + ] + adapters += ["-g {}".format(a) for a in adapts] if adapter_3prime_anchored is not None: - adapts = ['{}$'.format(str(s.seq)) for s in SeqIO.parse(open(adapter_3prime_anchored, 'r'), 'fasta')] - adapters += ['-a {}'.format(a) for a in adapts] + adapts = [ + "{}$".format(str(s.seq)) + for s in SeqIO.parse(open(adapter_3prime_anchored, "r"), "fasta") + ] + adapters += ["-a {}".format(a) for a in adapts] # process input files for ifile in files: - oname = os.path.basename(ifile).rstrip('.gz') + oname = os.path.basename(ifile).rstrip(".gz") if compress_output: - oname += '.gz' + oname += ".gz" ofile = os.path.join(output_directory, oname) # set up cutadapt command - adapter_string = ' '.join(adapters) - cutadapt = 'cutadapt -o {} {} {}'.format(ofile, adapter_string, ifile) + adapter_string = " ".join(adapters) + cutadapt = "cutadapt -o {} {} {}".format(ofile, adapter_string, ifile) # run cutadapt p = Popen(cutadapt, stdout=PIPE, stderr=PIPE, shell=True) stdout, stderr = p.communicate() @@ -266,7 +291,7 @@ def adapter_trim(input_directory, output_directory=None, def fastqc(input_directory, output_directory=None, threads=-1): - ''' + """ Performs FASTQC analysis on raw NGS data. @@ -287,17 +312,18 @@ def fastqc(input_directory, output_directory=None, threads=-1): Returns: str: path to the output directory - ''' + """ input_directory = os.path.normpath(input_directory) if output_directory is None: oparent = os.path.dirname(input_directory) - output_directory = os.path.join(oparent, 'fastqc_reports') + output_directory = os.path.join(oparent, "fastqc_reports") make_dir(output_directory) files = list_files(input_directory) if threads < 0: threads = cpu_count() - fastqc_cmd = 'fastqc --noextract -o={} -t={} {}'.format(output_directory, - threads, ' '.join(files)) + fastqc_cmd = "fastqc --noextract -o={} -t={} {}".format( + output_directory, threads, " ".join(files) + ) p = Popen(fastqc_cmd, stdout=PIPE, stderr=PIPE, shell=True) stdout, stderr = p.communicate() logger.debug(stdout)