Skip to content

Commit 1cdf853

Browse files
committed
Fixed up a bunch of print statements to make the reporting a bit more structured.
Also changed output style to match the data loader a bit more closely.
1 parent da1a067 commit 1cdf853

File tree

2 files changed

+55
-52
lines changed

2 files changed

+55
-52
lines changed

verify/AIRR-repertoire-checks.py

+49-44
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def rename_cols(flattened_sub_df, field_name):
156156
elif flag == "json_response":
157157
json_data = self.execute_query("repertoire", "")
158158
else:
159-
print("INVALID FLAG: pick one of 'metadata' or 'json_response'")
159+
print("FAIL: INVALID FLAG - pick one of 'metadata' or 'json_response'")
160160
sys.exit(0)
161161

162162
# Begin flattening
@@ -238,23 +238,24 @@ def identify_file_type(self):
238238

239239
# Metadata is of type JSON
240240
elif "json" in metadata:
241-
florian_json = requests.get(metadata)
242-
florian_json = florian_json.json()
243-
master = self.flatten_json(florian_json)
241+
repsonse = requests.get(metadata)
242+
json_data = repsonse.json()
243+
master = self.flatten_json(json_data)
244244
else:
245-
print("File format provided is not valid")
245+
print("FAIL: File format provided is not valid")
246246
sys.exit(0)
247247

248248
# Check if file is empty
249249
if master.empty:
250-
print("EMPTY DATA FRAME: Cannot find specified study ID\n")
250+
print("FAIL: EMPTY DATA FRAME - Cannot find specified study ID\n")
251251
print(master)
252252
sys.exit(0)
253253

254+
print("PASS: Metadata file %s loaded\n"%(metadata))
254255
return master
255256

256257
except:
257-
print("Warning: Provided wrong type file: cannot read metadata.")
258+
print("FAIL: Provided wrong type file: cannot read metadata.")
258259
sys.exit(0)
259260

260261
def execute_query(self, flag, repertoire_id):
@@ -280,29 +281,26 @@ def execute_query(self, flag, repertoire_id):
280281
query_files = f"{self.facet_json}{self.study_id}/facet_repertoire_id_{repertoire_id}.json"
281282
query_url = self.url_facet_query
282283
else:
283-
print("INVALID FLAG: provide one of 'repertoire' or 'facet'")
284+
print("FAIL: INVALID FLAG - provide one of 'repertoire' or 'facet'")
284285
sys.exit(0)
285286

286-
287-
288287
# Test query is well built, then perform query
289288
try:
290289

291290
# Process json file into JSON structure readable by Python
292291
query_dict = curlairripa.process_json_files(force, verbose, query_files)
293292

294293
# Perform the query. Time it
294+
print("INFO: Sending query to %s" % query_url)
295295
start_time = time.time()
296296
query_json = curlairripa.processQuery(query_url, header_dict, expect_pass, query_dict, verbose, force)
297297
total_time = time.time() - start_time
298+
# Time
299+
print("INFO: Total query time (in seconds): %s" % total_time)
298300

299301
# Parse
300302
parsed_query = json.loads(query_json)
301303

302-
# Time
303-
print("ELAPSED DOWNLOAD TIME (in seconds): %s" % total_time)
304-
print("------------------------------------------------------")
305-
306304
return parsed_query
307305

308306
except:
@@ -356,7 +354,13 @@ def validate_repertoire_data_airr(self, validate):
356354
json.dump(rep_json, outfile)
357355
outfile.close()
358356
# Perform AIRR validation test
359-
airr.load_repertoire(output_dir + filename, validate)
357+
try:
358+
airr.load_repertoire(output_dir + filename, validate)
359+
except Exception as e:
360+
print("FAIL: AIRR Repertoire error: %s\n"%(e))
361+
sys.exit(1)
362+
print("PASS: AIRR Repertoire is valid\n")
363+
360364

361365
def perform_mapping_test(self, repertoire_metadata_df, repertoire_response_df):
362366
"""
@@ -413,23 +417,29 @@ def print_mapping_results(self, field_names_in_mapping_not_in_api, field_names_i
413417
file metadata
414418
:return: None
415419
"""
416-
print_separators()
417-
print("Field names in mapping, ir_adc_api_response, not in API response\n")
420+
print("\nINFO: Checking field names from AIRR mapping for API (column ir_adc_api_response) not found in API response")
418421
# Print items not found in API, skip those reported as NaN or empty string
422+
count = 0
419423
for item in field_names_in_mapping_not_in_api:
420424
if type(item) == float or item == "":
421425
continue
422426
else:
423427
print(item)
428+
count = count + 1
429+
if count == 0:
430+
print("PASS: No fields missing")
424431

425-
print_separators()
426-
print("Field names in mapping, ir_curator, not in metadata fields\n")
432+
print("\nINFO: Checking field names in AIRR mapping for curation (column ir_curator) not found in metadata fields")
427433
# Print items not found in metadata sheet, skip those reported as NaN or empty string
434+
count = 0
428435
for item in field_names_in_mapping_not_in_md:
429436
if type(item) == float or item == "":
430437
continue
431438
else:
432439
print(item)
440+
count = count + 1
441+
if count == 0:
442+
print("PASS: No fields missing")
433443

434444
def annotation_count(self, data_df, repertoire_id, test_type_key):
435445
# Initialize
@@ -549,7 +559,7 @@ def metadata_content_testing(unique_items, json_study_df, data_df, connecting_fi
549559
specific study
550560
:return:
551561
"""
552-
print("Content cross comparison\n")
562+
print("Metadata/API content cross comparison\n")
553563

554564
# Get entries of interest in API response
555565
repertoire_list = json_study_df["repertoire_id"].to_list()
@@ -620,11 +630,11 @@ def print_content_test_results(content_results, details_dir, study_id):
620630
print("Could not find differing results between column content.")
621631
# Not so perfect results
622632
else:
623-
print("Some fields may require attention:")
624-
print("In ADC API: ", content_results["API field"].unique())
625-
print("In metadata: ", content_results["MD field"].unique())
633+
print("WARN: Some fields may require attention:")
634+
print("WARN: In ADC API: ", content_results["API field"].unique())
635+
print("WARN: In metadata: ", content_results["MD field"].unique())
626636
file_name = "".join([details_dir, str(study_id), "_reported_fields_", str(pd.to_datetime('today')), ".csv"])
627-
print(f"For details refer to {file_name}")
637+
print(f"WARN: For details refer to {file_name}")
628638
content_results.to_csv(file_name)
629639

630640

@@ -683,10 +693,10 @@ def assess_test_results(ir_seq_api, sum_all, ir_sec, ir_rea):
683693
test_flag = set([str(ir_seq_api), str(sum_all), str(ir_sec)])
684694
if len(test_flag) == 1:
685695
test_result = True
686-
print(ir_rea + " returned TRUE (test passed), see CSV for details")
696+
print("PASS: Repertoire " + ir_rea + " returned TRUE (test passed), see CSV for details\n")
687697
else:
688698
test_result = False
689-
print(ir_rea + " returned FALSE (test failed), see CSV for details")
699+
print("PASS: Repertoire " + ir_rea + " returned FALSE (test failed), see CSV for details\n")
690700

691701
return test_result
692702

@@ -899,17 +909,16 @@ def main():
899909
sanity_check = SanityCheck(metadata_df=metadata, repertoire_json=json_input, facet_json=facet_json_input,
900910
annotation_dir=annotation_directory, url_api_end_point=query_url,
901911
study_id=study_id, mapping_file=mapping_file, output_directory=details_dir, url_facet_query = facet_query)
902-
# Generate printed report
903-
print_data_validator()
904912

905-
# Read repertoire response from metadata file
913+
# Read metadata file
914+
print_separators()
915+
print("Check Metadata file\n")
906916
master = sanity_check.identify_file_type()
907917
data_df = master
908918

909-
# Report separators
910-
print_separators()
911-
912919
# Read repertoire response from API
920+
print_separators()
921+
print("Check AIRR Mapping against API and Metadata file\n")
913922
concat_version = sanity_check.flatten_json("json_response")
914923
concat_version['study.study_id'] = concat_version['study.study_id'].replace(" ", "", regex=True)
915924
json_study_df = concat_version[concat_version['study.study_id'] == study_id]
@@ -921,10 +930,8 @@ def main():
921930
# Print mapping file test results
922931
sanity_check.print_mapping_results(field_names_in_mapping_not_in_api, field_names_in_mapping_not_in_md)
923932

924-
# Report separators
925-
print_separators()
926-
927933
# Content test
934+
print_separators()
928935
identify_mutual_repertoire_ids_in_data(connecting_field, data_df, json_study_df)
929936
# Select repertoire ids
930937
unique_items = identify_mutual_repertoire_ids_in_data(connecting_field, data_df, json_study_df)
@@ -933,16 +940,14 @@ def main():
933940
# Generate CSV results
934941
print_content_test_results(sanity_test_df, details_dir, study_id)
935942

936-
# Report separators
937-
print_separators()
938-
939943
# Report AIRR validation
940-
print("AIRR FIELD VALIDATION")
944+
print_separators()
945+
print("AIRR field validation\n")
941946
sanity_check.validate_repertoire_data_airr(validate=False)
942947

943948
# Annotation count
944949
print_separators()
945-
print("ANNOTATION COUNT")
950+
print("Annotation count validation (API, file size, curator count)\n")
946951
full_result_suite = []
947952
for item in unique_items:
948953
# Delay queries
@@ -966,9 +971,9 @@ def main():
966971

967972
# Process each according to the tool used
968973
else:
969-
print("Processing annotations using:")
970-
print(" annotation_file_format: %s"%(annotation_file_format))
971-
print(" ir_rearrangement_tool: %s"%(tool))
974+
print("INFO: Processing annotations for Repertoire %s using:"%(item))
975+
print("INFO: annotation_file_format: %s"%(annotation_file_format))
976+
print("INFO: ir_rearrangement_tool: %s"%(tool))
972977
############## CASE 1
973978
if "vquest" in annotation_file_format.lower():
974979
result_iter = sanity_check.annotation_count(rowMD, rowMD['repertoire_id'].to_list()[0], "imgt")
@@ -998,7 +1003,7 @@ def main():
9981003
final_result = pd.concat(full_result_suite)
9991004
count_file_name = str(study_id) + "_Facet_Count_curator_count_Annotation_count_"+str(pd.to_datetime('today')) + ".csv"
10001005
final_result.to_csv(details_dir + count_file_name)
1001-
print("For details on sequence count refer to " + count_file_name)
1006+
print("INFO: For details on sequence count refer to " + count_file_name)
10021007

10031008
if __name__ == '__main__':
10041009
main()

verify/joint_sanity_testing.sh

+6-8
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,22 @@ SCRIPT_DIR=`dirname "$0"`
1313

1414
# ---------------------------------------------------------------------
1515
TIME=`date +%Y-%m-%d_%H-%M-%S`
16-
echo "Starting run at: " ${TIME}
17-
echo "Generate facet queries"
18-
echo ""
16+
echo "INFO: Starting run at: " ${TIME}
1917

2018
# $1 base_url String containing URL to API server (e.g. https://airr-api2.ireceptor.org)
2119
# $2 entry_point Options: string 'rearrangement' or string 'repertoire'
2220
# $3 path_to_json Enter full path to JSON directory where facet JSON query files will be stored
2321
# $4 no_filters Enter full path to JSON query nofilters
2422
# $5 study_id Enter study_id
2523

26-
python3 $SCRIPT_DIR/generate_facet_json.py -v "$1" "$2" "$3" "$4" "$5"
24+
echo "INFO: Generate facet queries"
25+
python3 $SCRIPT_DIR/generate_facet_json.py "$1" "$2" "$3" "$4" "$5"
2726
if [ $? -ne 0 ]
2827
then
2928
echo "ERROR: Could not generate queries correctly."
3029
exit 1
3130
fi
3231

33-
echo ""
34-
echo "Begin sanity check"
35-
echo ""
3632

3733
# $6 mapping_file Indicate the full path to where the mapping file is found
3834
# $1 base_url String containing URL to API server (e.g. https://airr-api2.ireceptor.org)
@@ -45,6 +41,8 @@ echo ""
4541
# $9 details_dir Enter full path where you'd like to store content feedback in CSV format
4642
# $10 annotation_tool Enter the name of the tool used to process sequence data. Choices: igblast, vquest, mixcr
4743

44+
echo ""
45+
echo "INFO: Begin sanity check"
4846
python3 $SCRIPT_DIR/AIRR-repertoire-checks.py "$6" "$1" "$2" "$4" "$7" "$5" "$3" "$8" "$9" "CC-FC" ${10}
4947
TIME=`date +%Y-%m-%d_%H-%M-%S`
50-
echo "Ending run at: " ${TIME}
48+
echo "INFO: Ending run at: " ${TIME}

0 commit comments

Comments
 (0)