Skip to content
This repository was archived by the owner on Apr 22, 2025. It is now read-only.

Commit a032b68

Browse files
Production hotfixes
1 parent dd39ff0 commit a032b68

File tree

4 files changed

+120
-10
lines changed

4 files changed

+120
-10
lines changed

sequence_processing_pipeline/Commands.py

+3
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ def demux(id_map, fp, out_d, task, maxtask):
119119
# '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1 BX:Z:TATGACACATGCGGCCCT' # noqa
120120
# '@baz/1
121121

122+
# NB: from 6d794a37-12cd-4f8e-95d6-72a4b8a1ec1c's only-adapter-filtered results: # noqa
123+
# @A00953:244:HYHYWDSXY:3:1101:14082:3740 1:N:0:CCGTAAGA+TCTAACGC
124+
122125
fname_encoded, sid = i.split(delimiter, 1)
123126

124127
if fname_encoded not in openfps:

sequence_processing_pipeline/GenPrepFileJob.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from sequence_processing_pipeline.PipelineError import PipelineError
33
from os import makedirs, symlink
44
from os.path import join, exists, basename
5-
from shutil import copytree
5+
from shutil import copy
66
from functools import partial
77
from collections import defaultdict
88
from metapool import (demux_sample_sheet, parse_prep,
@@ -31,6 +31,11 @@ def __init__(self, run_dir, convert_job_path, qc_job_path, output_path,
3131
self.commands = []
3232
self.has_replicates = False
3333
self.replicate_count = 0
34+
# instead of a directory, reports_path should point to the single file
35+
# currently needed by seqpro. This means reports_path should equal:
36+
# /.../ConvertJob/Reports/Demultiplex_Stats.csv not
37+
# /.../ConvertJob/Reports.
38+
3439
self.reports_path = reports_path
3540

3641
# make the 'root' of your run_directory
@@ -39,14 +44,18 @@ def __init__(self, run_dir, convert_job_path, qc_job_path, output_path,
3944
# run_directory
4045

4146
# This directory will already exist on restarts, hence avoid
42-
# copying.
47+
# copying. To support legacy seqpro, we will copy the single file
48+
# seqpro needs into a clean sub-directory named 'Reports'. This can
49+
# be fixed when seqpro is refactored.
4350
reports_dir = join(self.output_path, self.run_id, 'Reports')
4451

4552
if exists(reports_dir):
4653
self.is_restart = True
4754
else:
4855
self.is_restart = False
49-
copytree(self.reports_path, reports_dir)
56+
57+
makedirs(reports_dir)
58+
copy(self.reports_path, reports_dir)
5059

5160
# extracting from either convert_job_path or qc_job_path should
5261
# produce equal results.

sequence_processing_pipeline/NuQCJob.py

+98-5
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@
88
from shutil import move
99
import logging
1010
from sequence_processing_pipeline.Commands import split_similar_size_bins
11-
from sequence_processing_pipeline.util import iter_paired_files
11+
from sequence_processing_pipeline.util import (iter_paired_files,
12+
determine_orientation)
1213
from jinja2 import Environment
1314
from glob import glob
1415
import re
1516
from sys import executable
17+
from gzip import open as gzip_open
1618

1719

1820
logging.basicConfig(level=logging.DEBUG)
@@ -116,6 +118,63 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
116118

117119
self._validate_project_data()
118120

121+
def hack_helper(self):
    """Return True when raw fastq records carry TellSeq 'BX' metadata.

    Scans R1/R2 fastq.gz files beneath self.root_dir until a non-empty
    file is found, then inspects the metadata portion of its first
    sequence-id line.

    NuQCJob operates across every project in a sample-sheet. If a sheet
    mixed a TellSeq project with a non-TellSeq one, the assumption that
    a single file speaks for all of them would not hold; however,
    TellSeq runs require a dedicated sample-sheet, so all projects in
    one are expected to share the same BX annotation state, and the
    inverse should hold as well.

    :raises ValueError: if a sequence-id line has an unexpected shape,
        or if every candidate file is empty.
    """
    # only R1/R2 reads are representative; ignore other orientations.
    candidates = [fp for fp in glob(self.root_dir + '/*/*.fastq.gz')
                  if determine_orientation(fp) in ['R1', 'R2']]

    apply_bx = None

    for candidate in candidates:
        # read just the first sequence-id line of the compressed file.
        with gzip_open(candidate, 'r') as stream:
            first = stream.readline().decode("utf-8").strip()

        # an empty file tells us nothing; keep looking for a
        # non-empty one.
        if not first:
            continue

        # split the sequence-id line into the id proper plus any
        # trailing metadata element(s).
        parts = first.split(' ')

        if len(parts) == 1:
            # no metadata at all; do not apply 'BX'.
            apply_bx = False
        elif len(parts) == 2:
            # some metadata is present, but it may not be 'BX'.
            apply_bx = parts[-1].startswith('BX')
        else:
            raise ValueError("I don't know how to process '%s'" % parts)

        break

    if apply_bx is None:
        raise ValueError("It seems like all raw files are empty")

    return apply_bx
177+
119178
def _validate_project_data(self):
120179
# Validate project settings in [Bioinformatics]
121180
for project in self.project_data:
@@ -394,15 +453,26 @@ def _generate_mmi_filter_cmds(self, working_dir):
394453

395454
cores_to_allocate = int(self.cores_per_task / 2)
396455

397-
if len(self.additional_fastq_tags) > 0:
456+
# hack_helper is a hack that will scan all of the R1 and R2 files
457+
# in self.root_dir until it finds a non-empty file to read. It will
458+
# read the first line of the compressed fastq file and see if it
459+
# contains optional BX metadata. If not, it will return False;
460+
# otherwise True.
461+
apply_bx = self.hack_helper()
462+
463+
# the default setting.
464+
tags = ""
465+
t_switch = ""
466+
467+
if apply_bx and len(self.additional_fastq_tags) > 0:
398468
# add tags for known metadata types that fastq files may have
399469
# been annotated with. Samtools will safely ignore tags that
400470
# are not present.
471+
# NB: This doesn't appear to be true, actually. if there is
472+
# a metadata element but it does not begin with 'BX', supplying
473+
# '-T BX' will cause an error writing output to disk.
401474
tags = " -T %s" % ','.join(self.additional_fastq_tags)
402475
t_switch = " -y"
403-
else:
404-
tags = ""
405-
t_switch = ""
406476

407477
for count, mmi_db_path in enumerate(self.mmi_file_paths):
408478
if count == 0:
@@ -499,3 +569,26 @@ def _generate_job_script(self, max_bucket_size):
499569
pmls_path=self.pmls_path))
500570

501571
return job_script_path
572+
573+
def parse_logs(self):
    """Return stripped error lines found in this job's '.out' logs.

    The logs directory is assumed to contain only '.out' files and
    zero or more 'seqs.movi.n.txt.gz' files; the latter can appear
    because a successful job's final step renames and moves that file
    into place while the logs directory serves as the job's working
    directory (which keeps slurm.err/slurm.out in a known location).
    Only the '.out' files are scanned here.
    """
    logs_dir = join(self.output_path, 'logs')
    all_files = sorted(glob(join(logs_dir, '*')))

    error_lines = []

    # scan only the '.out' log files for lines mentioning an error.
    for log_file in [fp for fp in all_files if fp.endswith('.out')]:
        with open(log_file, 'r') as handle:
            for raw_line in handle.readlines():
                if 'error:' in raw_line.lower():
                    error_lines.append(raw_line)

    return [entry.strip() for entry in error_lines]

sequence_processing_pipeline/Pipeline.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -850,8 +850,13 @@ def get_project_info(self, short_names=False):
850850
if CONTAINS_REPLICATES_KEY in bi_df.columns.tolist():
851851
# subselect rows in [Bioinformatics] based on whether they
852852
# match the project name.
853-
df = bi_df.loc[bi_df['Sample_Project'] ==
854-
curr_project_info[proj_name_key]]
853+
854+
# whether short_names or full_names are requested in the
855+
# results, the match will always need to be against the
856+
# full project name, which is what's expected to be in
857+
# the Sample_Project column.
858+
sample_project = curr_project_info[PROJECT_FULL_NAME_KEY]
859+
df = bi_df.loc[bi_df['Sample_Project'] == sample_project]
855860
# since only one project can match by definition, convert
856861
# to dict and extract the needed value.
857862
curr_contains_reps = df.iloc[0].to_dict()[

0 commit comments

Comments
 (0)