From 231930df58e975e1b3c6d1fbbc4c06e64a3e83bc Mon Sep 17 00:00:00 2001
From: Bryan Briney <bryanbriney@gmail.com>
Date: Thu, 4 May 2023 11:59:56 -0700
Subject: [PATCH 1/2] remove unused imports

---
 abstar/core/abstar.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/abstar/core/abstar.py b/abstar/core/abstar.py
index b27e48e..82d7434 100644
--- a/abstar/core/abstar.py
+++ b/abstar/core/abstar.py
@@ -73,7 +73,7 @@
     get_parquet_dtypes,
 )
 
-from ..utils.output import PARQUET_INCOMPATIBLE
+# from ..utils.output import PARQUET_INCOMPATIBLE
 from ..utils.queue.celery import celery
 
 
@@ -99,7 +99,6 @@
 #####################################################################
 
 
-
 def create_parser() -> ArgumentParser:
     parser = ArgumentParser(
         prog="abstar",
@@ -390,7 +389,9 @@ def __init__(
         temp=None,
         sequences=None,
         chunksize=500,
-        output_type=["json",],
+        output_type=[
+            "json",
+        ],
         assigner="blastn",
         merge=False,
         pandaseq_algo="simple_bayesian",
@@ -423,7 +424,13 @@ def __init__(
         self.log = os.path.abspath(log) if log is not None else log
         self.temp = os.path.abspath(temp) if temp is not None else temp
         self.chunksize = int(chunksize)
-        self.output_type = [output_type,] if output_type in STR_TYPES else output_type
+        self.output_type = (
+            [
+                output_type,
+            ]
+            if output_type in STR_TYPES
+            else output_type
+        )
         self.parquet = parquet
         self.assigner = assigner
         self.merge = True if basespace else merge
@@ -726,7 +733,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args):
                 for temp_file in temp_files:
                     with open(temp_file, "rb") as f:
                         shutil.copyfileobj(
-                            f, out_file, length=16 * 1024 ** 2
+                            f, out_file, length=16 * 1024**2
                         )  # Increasing buffer size to 16MB for faster transfer
             # For file formats with headers, only keep headers from the first file
             elif output_type in ["imgt", "tabular", "airr"]:
@@ -737,7 +744,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args):
                                 out_file.write(line)
                             elif j >= 1:
                                 out_file.write(line)
-                                
+
         if args.parquet:
             logger.info("Converting concatenated output to parquet format")
             # Make clear the output format from which the parquet file is generated.
@@ -764,7 +771,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args):
                 df.to_parquet(
                     pfile, engine="pyarrow", compression="snappy", write_index=False
                 )
-                
+
         ofiles.append(ofile)
     return ofiles
 
@@ -1544,5 +1551,6 @@ def run_main(arg_list: Optional[Iterable[str]] = None):
     output_dir = main(args)
     sys.stdout.write("\n\n")
 
+
 if __name__ == "__main__":
     run_main()

From 426b7bf637cda8fd306ec683dd7d94f6d74f7f99 Mon Sep 17 00:00:00 2001
From: Bryan Briney <bryanbriney@gmail.com>
Date: Thu, 4 May 2023 12:15:30 -0700
Subject: [PATCH 2/2] move preprocess to utils

---
 abstar/__init__.py               |   2 +-
 abstar/preprocess/umi.py         |  81 ++++++++--------
 abstar/{ => utils}/preprocess.py | 156 ++++++++++++++++++-------------
 3 files changed, 130 insertions(+), 109 deletions(-)
 rename abstar/{ => utils}/preprocess.py (73%)

diff --git a/abstar/__init__.py b/abstar/__init__.py
index 27ae239..b1cda2f 100644
--- a/abstar/__init__.py
+++ b/abstar/__init__.py
@@ -4,6 +4,6 @@
 warnings.simplefilter("ignore", BiopythonWarning)
 
 from .core.abstar import run, run_standalone, main, create_parser, validate_args
-from .preprocess import fastqc, adapter_trim, quality_trim
+from .utils.preprocess import fastqc, adapter_trim, quality_trim
 
 from .version import __version__
diff --git a/abstar/preprocess/umi.py b/abstar/preprocess/umi.py
index 6bb7089..e1100c7 100644
--- a/abstar/preprocess/umi.py
+++ b/abstar/preprocess/umi.py
@@ -35,9 +35,7 @@
 
 
 class UMI:
-    """
-    
-    """
+    """ """
 
     def __init__(
         self,
@@ -47,9 +45,7 @@ def __init__(
         ignore_strand: bool = False,
         extra_length_for_alignment: int = 25,
     ):
-        """
-        
-        """
+        """ """
         self.length = length
         self.pattern = pattern.strip().upper()
         self.ignore_strand = ignore_strand
@@ -138,7 +134,7 @@ def align(
 
     def get_mismatches(self) -> Optional[int]:
         """
-        Returns the total number of mismatches between the pattern(s) 
+        Returns the total number of mismatches between the pattern(s)
         and the input sequence.
         """
         total = len(self.pattern.replace("[UMI]", ""))
@@ -202,54 +198,54 @@ def parse_umis(
             4. a list of lists/tuples, of the format ``[sequence_id, sequence]``
 
     output_file : str, default=None
-        Path to an output file. Required if `sequences` is not a 
-        file. If `sequences` is a file and `output` is not provided, UMI parsing is 
+        Path to an output file. Required if `sequences` is not a
+        file. If `sequences` is a file and `output` is not provided, UMI parsing is
         done in-place and the `sequences` file is updated.
 
     pattern : str or iterable, default=None
-        Pattern (or iterable of patterns) for identifying the location of the UMI, 
-        or the name of a built-in pattern. Built-in options include ``"smartseq-human-bcr"``. 
-        Patterns may optionally contain leading and/or trailing conserved regions, with 
-        the UMI position within the pattern represented by ``"[UMI]"``. As an example, 
-        the built-in pattern for SmartSeq-BCR UMIs is::  
-
-            "[UMI]TCAGCGGGAAGACATT"  
-        
-        which would be a UMI sequence followed immediately by ``"TCAGCGGGAAGACATT"``. 
-        By default, the pattern is matched on the 5' -> 3' strand. This allows 
-        users to more easily construct patterns from their amplification primers without 
-        needing to worry about reverse-complementing patterns for UMIs at the 3' end of the 
-        input sequence. To override this, set `ignore_strand` to ``True``. If `pattern` 
-        is not provided, UMIs will be parsed using only `length`, starting at the start 
-        or end of the sequence.  
+        Pattern (or iterable of patterns) for identifying the location of the UMI,
+        or the name of a built-in pattern. Built-in options include ``"smartseq-human-bcr"``.
+        Patterns may optionally contain leading and/or trailing conserved regions, with
+        the UMI position within the pattern represented by ``"[UMI]"``. As an example,
+        the built-in pattern for SmartSeq-BCR UMIs is::
+
+            "[UMI]TCAGCGGGAAGACATT"
+
+        which would be a UMI sequence followed immediately by ``"TCAGCGGGAAGACATT"``.
+        By default, the pattern is matched on the 5' -> 3' strand. This allows
+        users to more easily construct patterns from their amplification primers without
+        needing to worry about reverse-complementing patterns for UMIs at the 3' end of the
+        input sequence. To override this, set `ignore_strand` to ``True``. If `pattern`
+        is not provided, UMIs will be parsed using only `length`, starting at the start
+        or end of the sequence.
 
         .. note::
 
-            The UMIs for all `patterns` that meet the `allowed_mismatches` criteria in 
-            the conserved leading/trailing portions will be concatenated into the final 
+            The UMIs for all `patterns` that meet the `allowed_mismatches` criteria in
+            the conserved leading/trailing portions will be concatenated into the final
             UMI. This allows the use of multiple `patterns` for either:
               * different patterns designed to match one of several different types of
-                sequences in a heterogeneous sample. For example, if heavy, kappa and 
-                lambda primers each have different conserved regions flanking the UMI 
-                and the input file contains a mix of heavy, kappa and lambda chains, 
+                sequences in a heterogeneous sample. For example, if heavy, kappa and
+                lambda primers each have different conserved regions flanking the UMI
+                and the input file contains a mix of heavy, kappa and lambda chains,
                 supplying all patterns (assuming they're sufficiently different from
                 each other) will allow parsing of UMIs from all chains.
-              * different patterns for sequences that contain multiple UMIs, either 
-                at opposite ends of the sequence or on the same end of the sequence 
+              * different patterns for sequences that contain multiple UMIs, either
+                at opposite ends of the sequence or on the same end of the sequence
                 but in different locations and with different conserved flanking regions.
 
     length : int or iterable, default=None
-        Length of the UMI sequence, or iterable of UMI lengths. If multiple lengths are 
-        provided, there must be an equal number of `patterns`, and they must be in the 
-        same order (the first `pattern` should correspond to the first UMI `length`). If 
-        `length` is positive, the UMI will be parsed from the start of the sequence. If 
-        `length` is negative, the UMI will be parsed from the end of the sequence. If multiple 
-        `patterns` are provided with a single `length`, that `length` will be used for all 
-        `patterns`. Required if `pattern` does not have a conserved trailing region. 
-        If `length` is not provided and a trailing conserved region is present in `pattern`, 
-        the entire portion of `sequence` preceeding the trailing region will be parsed as the 
-        UMI. Ignored if both leading and trailing sequences are present in `pattern`, 
-        as the entire region between the conserved flanking regions will be parsed as 
+        Length of the UMI sequence, or iterable of UMI lengths. If multiple lengths are
+        provided, there must be an equal number of `patterns`, and they must be in the
+        same order (the first `pattern` should correspond to the first UMI `length`). If
+        `length` is positive, the UMI will be parsed from the start of the sequence. If
+        `length` is negative, the UMI will be parsed from the end of the sequence. If multiple
+        `patterns` are provided with a single `length`, that `length` will be used for all
+        `patterns`. Required if `pattern` does not have a conserved trailing region.
+        If `length` is not provided and a trailing conserved region is present in `pattern`,
+        the entire portion of `sequence` preceeding the trailing region will be parsed as the
+        UMI. Ignored if both leading and trailing sequences are present in `pattern`,
+        as the entire region between the conserved flanking regions will be parsed as
         the UMI regardless of what `length` is provided.
 
 
@@ -460,4 +456,3 @@ def _get_allowed_mismatches(name):
         "allowed_mismatches": SMARTSEQ_HUMAN_BCR_MISMATCH,
     },
 }
-
diff --git a/abstar/preprocess.py b/abstar/utils/preprocess.py
similarity index 73%
rename from abstar/preprocess.py
rename to abstar/utils/preprocess.py
index f1a8916..afb2855 100644
--- a/abstar/preprocess.py
+++ b/abstar/utils/preprocess.py
@@ -31,21 +31,30 @@
 
 from Bio import SeqIO
 
-from .utils.pandaseq import pair_files
+from .pandaseq import pair_files
 
 from abutils.utils.log import get_logger
 from abutils.utils.pipeline import list_files, make_dir
 
 
-logger = get_logger('preprocess')
-
-
-def quality_trim(input_directory=None, output_directory=None,
-        quality_cutoff=20, length_cutoff=50,
-        quality_type='sanger', compress_output=True, file_pairs=None,
-        singles_directory=None, nextseq=False, paired_reads=True,
-        allow_5prime_trimming=False, print_debug=False):
-    '''
+logger = get_logger("preprocess")
+
+
+def quality_trim(
+    input_directory=None,
+    output_directory=None,
+    quality_cutoff=20,
+    length_cutoff=50,
+    quality_type="sanger",
+    compress_output=True,
+    file_pairs=None,
+    singles_directory=None,
+    nextseq=False,
+    paired_reads=True,
+    allow_5prime_trimming=False,
+    print_debug=False,
+):
+    """
     Performs quality trimming with sickle.
 
     Args:
@@ -101,10 +110,10 @@ def quality_trim(input_directory=None, output_directory=None,
     Returns:
 
         str: Path to the output directory
-    '''
+    """
     if input_directory is None and any([file_pairs is None, output_directory is None]):
-        err = '\nERROR: Either an input_directory must be provided or '
-        err += 'both file_pairs and an output_directory must be provided.\n'
+        err = "\nERROR: Either an input_directory must be provided or "
+        err += "both file_pairs and an output_directory must be provided.\n"
         print(err)
         sys.exit(1)
     if file_pairs:
@@ -113,7 +122,7 @@ def quality_trim(input_directory=None, output_directory=None,
         input_directory = os.path.normpath(input_directory)
         if output_directory is None:
             oparent = os.path.dirname(input_directory)
-            output_directory = os.path.join(oparent, 'quality_trimmed')
+            output_directory = os.path.join(oparent, "quality_trimmed")
         make_dir(output_directory)
         if paired_reads:
             files = list_files(input_directory)
@@ -128,48 +137,52 @@ def quality_trim(input_directory=None, output_directory=None,
         elif len(f) == 1:
             paired_end = False
         else:
-            err = 'ERROR: Each batch of files must contain either 1 (single-end reads) or '
-            err += '2 (paired-end reads) files. This batch contains {} files:\n{}'.format(
-                len(f), '\n'.join(f))
-            err2 += 'If you have paired-end reads that do not follow the Illumina naming scheme, '
-            err2 += 'you can pass pairs of filenames (a list of lists/tuples) with the <file_pairs> option. '
-            err2 += 'If using <file_pairs>, the output directory must also be provided.'
+            err = "ERROR: Each batch of files must contain either 1 (single-end reads) or "
+            err += (
+                "2 (paired-end reads) files. This batch contains {} files:\n{}".format(
+                    len(f), "\n".join(f)
+                )
+            )
+            err2 += "If you have paired-end reads that do not follow the Illumina naming scheme, "
+            err2 += "you can pass pairs of filenames (a list of lists/tuples) with the <file_pairs> option. "
+            err2 += "If using <file_pairs>, the output directory must also be provided."
             logger.info(err)
             logger.info(err2)
             continue
         f.sort()
         # set basic sickle cmd options
-        sickle = 'sickle pe' if paired_end else 'sickle se'
-        sickle += ' -t {}'.format(quality_type)
-        sickle += ' -l {}'.format(length_cutoff)
-        sickle += ' -q {}'.format(quality_cutoff)
+        sickle = "sickle pe" if paired_end else "sickle se"
+        sickle += " -t {}".format(quality_type)
+        sickle += " -l {}".format(length_cutoff)
+        sickle += " -q {}".format(quality_cutoff)
         if compress_output:
-            sickle += ' -g'
+            sickle += " -g"
         if not allow_5prime_trimming:
-            sickle += ' -x'
+            sickle += " -x"
         # compute input/output filenames, add to sickle cmd
-        sickle += ' -f {}'.format(f[0])
-        o1_basename = os.path.basename(f[0]).rstrip('.gz')
+        sickle += " -f {}".format(f[0])
+        o1_basename = os.path.basename(f[0]).rstrip(".gz")
         if compress_output:
-            o1_basename += '.gz'
-        sickle += ' -o {}'.format(os.path.join(output_directory, o1_basename))
+            o1_basename += ".gz"
+        sickle += " -o {}".format(os.path.join(output_directory, o1_basename))
         if paired_end:
-            sickle += ' -r {}'.format(f[1])
-            o2_basename = os.path.basename(f[1]).rstrip('.gz')
+            sickle += " -r {}".format(f[1])
+            o2_basename = os.path.basename(f[1]).rstrip(".gz")
             if compress_output:
-                o2_basename += '.gz'
-            sickle += ' -p {}'.format(os.path.join(output_directory, o2_basename))
+                o2_basename += ".gz"
+            sickle += " -p {}".format(os.path.join(output_directory, o2_basename))
         # compute singles output filename, add to sickle cmd
         if paired_end:
             if singles_directory is not None:
-                sfilename = '{}_{}_singles.fastq'.format(
-                    o1_basename.rstrip('.gz').rstrip('.fastq').rstrip('.fq'),
-                    o2_basename.rstrip('.gz').rstrip('.fastq').rstrip('.fq'))
+                sfilename = "{}_{}_singles.fastq".format(
+                    o1_basename.rstrip(".gz").rstrip(".fastq").rstrip(".fq"),
+                    o2_basename.rstrip(".gz").rstrip(".fastq").rstrip(".fq"),
+                )
                 if compress_output:
-                    sfilename += '.gz'
-                sickle += ' -s {}'.format(os.path.join(singles_directory, sfilename))
+                    sfilename += ".gz"
+                sickle += " -s {}".format(os.path.join(singles_directory, sfilename))
             else:
-                sickle += ' -s /dev/null'
+                sickle += " -s /dev/null"
         if print_debug:
             print(sickle)
         # run sickle
@@ -179,17 +192,23 @@ def quality_trim(input_directory=None, output_directory=None,
         logger.debug(stderr)
         if print_debug:
             print(stdout)
-            print('')
+            print("")
             print(stderr)
-            print('')
+            print("")
     return output_directory
 
 
-def adapter_trim(input_directory, output_directory=None,
-        adapter_5prime=None, adapter_3prime=None,
-        adapter_5prime_anchored=None, adapter_3prime_anchored=None,
-        adapter_both=None, compress_output=True):
-    '''
+def adapter_trim(
+    input_directory,
+    output_directory=None,
+    adapter_5prime=None,
+    adapter_3prime=None,
+    adapter_5prime_anchored=None,
+    adapter_3prime_anchored=None,
+    adapter_both=None,
+    compress_output=True,
+):
+    """
     Trims adapters with cutadapt.
 
     Args:
@@ -226,37 +245,43 @@ def adapter_trim(input_directory, output_directory=None,
     Returns:
 
         str: Path to the output directory
-    '''
+    """
     input_directory = os.path.normpath(input_directory)
     if output_directory is None:
         oparent = os.path.dirname(input_directory)
-        output_directory = os.path.join(oparent, 'adapter_trimmed')
+        output_directory = os.path.join(oparent, "adapter_trimmed")
     make_dir(output_directory)
     files = list_files(input_directory)
     # parse adapter FASTA files, compile adapter option list
     adapters = []
-    opts = ['-g', '-a', '-b']
+    opts = ["-g", "-a", "-b"]
     adapt_files = [adapter_5prime, adapter_3prime, adapter_both]
     for o, a in zip(opts, adapt_files):
         if a is None:
             continue
-        adapts = [str(s.seq) for s in SeqIO.parse(open(a, 'r'), 'fasta')]
-        adapters += [' '.join(z) for z in zip([o] * len(adapts), adapts)]
+        adapts = [str(s.seq) for s in SeqIO.parse(open(a, "r"), "fasta")]
+        adapters += [" ".join(z) for z in zip([o] * len(adapts), adapts)]
     if adapter_5prime_anchored is not None:
-        adapts = ['^{}'.format(str(s.seq)) for s in SeqIO.parse(open(adapter_5prime_anchored, 'r'), 'fasta')]
-        adapters += ['-g {}'.format(a) for a in adapts]
+        adapts = [
+            "^{}".format(str(s.seq))
+            for s in SeqIO.parse(open(adapter_5prime_anchored, "r"), "fasta")
+        ]
+        adapters += ["-g {}".format(a) for a in adapts]
     if adapter_3prime_anchored is not None:
-        adapts = ['{}$'.format(str(s.seq)) for s in SeqIO.parse(open(adapter_3prime_anchored, 'r'), 'fasta')]
-        adapters += ['-a {}'.format(a) for a in adapts]
+        adapts = [
+            "{}$".format(str(s.seq))
+            for s in SeqIO.parse(open(adapter_3prime_anchored, "r"), "fasta")
+        ]
+        adapters += ["-a {}".format(a) for a in adapts]
     # process input files
     for ifile in files:
-        oname = os.path.basename(ifile).rstrip('.gz')
+        oname = os.path.basename(ifile).rstrip(".gz")
         if compress_output:
-            oname += '.gz'
+            oname += ".gz"
         ofile = os.path.join(output_directory, oname)
         # set up cutadapt command
-        adapter_string = ' '.join(adapters)
-        cutadapt = 'cutadapt -o {} {} {}'.format(ofile, adapter_string, ifile)
+        adapter_string = " ".join(adapters)
+        cutadapt = "cutadapt -o {} {} {}".format(ofile, adapter_string, ifile)
         # run cutadapt
         p = Popen(cutadapt, stdout=PIPE, stderr=PIPE, shell=True)
         stdout, stderr = p.communicate()
@@ -266,7 +291,7 @@ def adapter_trim(input_directory, output_directory=None,
 
 
 def fastqc(input_directory, output_directory=None, threads=-1):
-    '''
+    """
     Performs FASTQC analysis on raw NGS data.
 
 
@@ -287,17 +312,18 @@ def fastqc(input_directory, output_directory=None, threads=-1):
     Returns:
 
         str: path to the output directory
-    '''
+    """
     input_directory = os.path.normpath(input_directory)
     if output_directory is None:
         oparent = os.path.dirname(input_directory)
-        output_directory = os.path.join(oparent, 'fastqc_reports')
+        output_directory = os.path.join(oparent, "fastqc_reports")
     make_dir(output_directory)
     files = list_files(input_directory)
     if threads < 0:
         threads = cpu_count()
-    fastqc_cmd = 'fastqc --noextract -o={} -t={} {}'.format(output_directory,
-        threads, ' '.join(files))
+    fastqc_cmd = "fastqc --noextract -o={} -t={} {}".format(
+        output_directory, threads, " ".join(files)
+    )
     p = Popen(fastqc_cmd, stdout=PIPE, stderr=PIPE, shell=True)
     stdout, stderr = p.communicate()
     logger.debug(stdout)