Skip to content
This repository was archived by the owner on Apr 22, 2025. It is now read-only.

Commit a032b68

Browse files
Production hotfixes
1 parent dd39ff0 commit a032b68

File tree

4 files changed

+120
-10
lines changed

4 files changed

+120
-10
lines changed

sequence_processing_pipeline/Commands.py

+3
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ def demux(id_map, fp, out_d, task, maxtask):
119119
# '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1 BX:Z:TATGACACATGCGGCCCT' # noqa
120120
# '@baz/1
121121

122+
# NB: from 6d794a37-12cd-4f8e-95d6-72a4b8a1ec1c's only-adapter-filtered results: # noqa
123+
# @A00953:244:HYHYWDSXY:3:1101:14082:3740 1:N:0:CCGTAAGA+TCTAACGC
124+
122125
fname_encoded, sid = i.split(delimiter, 1)
123126

124127
if fname_encoded not in openfps:

sequence_processing_pipeline/GenPrepFileJob.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from sequence_processing_pipeline.PipelineError import PipelineError
33
from os import makedirs, symlink
44
from os.path import join, exists, basename
5-
from shutil import copytree
5+
from shutil import copy
66
from functools import partial
77
from collections import defaultdict
88
from metapool import (demux_sample_sheet, parse_prep,
@@ -31,6 +31,11 @@ def __init__(self, run_dir, convert_job_path, qc_job_path, output_path,
3131
self.commands = []
3232
self.has_replicates = False
3333
self.replicate_count = 0
34+
# instead of a directory, reports_path should point to the single file
35+
# currently needed by seqpro. This means reports_path should equal:
36+
# /.../ConvertJob/Reports/Demultiplex_Stats.csv not
37+
# /.../ConvertJob/Reports.
38+
3439
self.reports_path = reports_path
3540

3641
# make the 'root' of your run_directory
@@ -39,14 +44,18 @@ def __init__(self, run_dir, convert_job_path, qc_job_path, output_path,
3944
# run_directory
4045

4146
# This directory will already exist on restarts, hence avoid
42-
# copying.
47+
# copying. To support legacy seqpro, we will copy the single file
48+
# seqpro needs into a clean sub-directory named 'Reports'. This can
49+
# be fixed when seqpro is refactored.
4350
reports_dir = join(self.output_path, self.run_id, 'Reports')
4451

4552
if exists(reports_dir):
4653
self.is_restart = True
4754
else:
4855
self.is_restart = False
49-
copytree(self.reports_path, reports_dir)
56+
57+
makedirs(reports_dir)
58+
copy(self.reports_path, reports_dir)
5059

5160
# extracting from either convert_job_path or qc_job_path should
5261
# produce equal results.

sequence_processing_pipeline/NuQCJob.py

+98-5
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@
88
from shutil import move
99
import logging
1010
from sequence_processing_pipeline.Commands import split_similar_size_bins
11-
from sequence_processing_pipeline.util import iter_paired_files
11+
from sequence_processing_pipeline.util import (iter_paired_files,
12+
determine_orientation)
1213
from jinja2 import Environment
1314
from glob import glob
1415
import re
1516
from sys import executable
17+
from gzip import open as gzip_open
1618

1719

1820
logging.basicConfig(level=logging.DEBUG)
@@ -116,6 +118,63 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
116118

117119
self._validate_project_data()
118120

121+
def hack_helper(self):
    """Return True when raw fastq records carry TellSeq 'BX' metadata.

    Scans R1/R2 fastq.gz files beneath self.root_dir until a non-empty
    file is found, then inspects the metadata portion of its first
    sequence-id line.

    NuQCJob operates across every project in a sample-sheet. If a sheet
    mixed a TellSeq project with a non-TellSeq one, the assumption that
    a single file speaks for all of them would not hold; however,
    TellSeq runs require a dedicated sample-sheet, so all projects in
    one are expected to share the same BX annotation state, and the
    inverse should hold as well.

    :raises ValueError: if a sequence-id line has an unexpected shape,
        or if every candidate file is empty.
    """
    # only R1/R2 reads are representative; ignore other orientations.
    candidates = [fp for fp in glob(self.root_dir + '/*/*.fastq.gz')
                  if determine_orientation(fp) in ['R1', 'R2']]

    apply_bx = None

    for candidate in candidates:
        # read just the first sequence-id line of the compressed file.
        with gzip_open(candidate, 'r') as stream:
            first = stream.readline().decode("utf-8").strip()

        # an empty file tells us nothing; keep looking for a
        # non-empty one.
        if not first:
            continue

        # split the sequence-id line into the id proper plus any
        # trailing metadata element(s).
        parts = first.split(' ')

        if len(parts) == 1:
            # no metadata at all; do not apply 'BX'.
            apply_bx = False
        elif len(parts) == 2:
            # some metadata is present, but it may not be 'BX'.
            apply_bx = parts[-1].startswith('BX')
        else:
            raise ValueError("I don't know how to process '%s'" % parts)

        break

    if apply_bx is None:
        raise ValueError("It seems like all raw files are empty")

    return apply_bx
177+
119178
def _validate_project_data(self):
120179
# Validate project settings in [Bioinformatics]
121180
for project in self.project_data:
@@ -394,15 +453,26 @@ def _generate_mmi_filter_cmds(self, working_dir):
394453

395454
cores_to_allocate = int(self.cores_per_task / 2)
396455

397-
if len(self.additional_fastq_tags) > 0:
456+
# hack_helper is a hack that will scan all of the R1 and R2 files
457+
# in self.root_dir until it finds a non-empty file to read. It will
458+
# read the first line of the compressed fastq file and see if it
459+
# contains optional BX metadata. If not, it will return False;
460+
# otherwise True.
461+
apply_bx = self.hack_helper()
462+
463+
# the default setting.
464+
tags = ""
465+
t_switch = ""
466+
467+
if apply_bx and len(self.additional_fastq_tags) > 0:
398468
# add tags for known metadata types that fastq files may have
399469
# been annotated with. Samtools will safely ignore tags that
400470
# are not present.
471+
# NB: This doesn't appear to be true, actually. if there is
472+
# a metadata element but it does not begin with 'BX', supplying
473+
# '-T BX' will cause an error writing output to disk.
401474
tags = " -T %s" % ','.join(self.additional_fastq_tags)
402475
t_switch = " -y"
403-
else:
404-
tags = ""
405-
t_switch = ""
406476

407477
for count, mmi_db_path in enumerate(self.mmi_file_paths):
408478
if count == 0:
@@ -499,3 +569,26 @@ def _generate_job_script(self, max_bucket_size):
499569
pmls_path=self.pmls_path))
500570

501571
return job_script_path
572+
573+
def parse_logs(self):
    """Return stripped error lines found in this job's '.out' logs.

    The logs directory is assumed to contain only '.out' files and
    zero or more 'seqs.movi.n.txt.gz' files; the latter can appear
    because a successful job's final step renames and moves that file
    into place while the logs directory serves as the job's working
    directory (which keeps slurm.err/slurm.out in a known location).
    Only the '.out' files are scanned here.
    """
    logs_dir = join(self.output_path, 'logs')
    all_files = sorted(glob(join(logs_dir, '*')))

    error_lines = []

    # scan only the '.out' log files for lines mentioning an error.
    for log_file in [fp for fp in all_files if fp.endswith('.out')]:
        with open(log_file, 'r') as handle:
            for raw_line in handle.readlines():
                if 'error:' in raw_line.lower():
                    error_lines.append(raw_line)

    return [entry.strip() for entry in error_lines]

sequence_processing_pipeline/Pipeline.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -850,8 +850,13 @@ def get_project_info(self, short_names=False):
850850
if CONTAINS_REPLICATES_KEY in bi_df.columns.tolist():
851851
# subselect rows in [Bioinformatics] based on whether they
852852
# match the project name.
853-
df = bi_df.loc[bi_df['Sample_Project'] ==
854-
curr_project_info[proj_name_key]]
853+
854+
# whether short_names or full_names are requested in the
855+
# results, the match will always need to be against the
856+
# full project name, which is what's expected to be in
857+
# the Sample_Project column.
858+
sample_project = curr_project_info[PROJECT_FULL_NAME_KEY]
859+
df = bi_df.loc[bi_df['Sample_Project'] == sample_project]
855860
# since only one project can match by definition, convert
856861
# to dict and extract the needed value.
857862
curr_contains_reps = df.iloc[0].to_dict()[

0 commit comments

Comments
 (0)