@@ -156,7 +156,7 @@ def rename_cols(flattened_sub_df, field_name):
156
156
elif flag == "json_response" :
157
157
json_data = self .execute_query ("repertoire" , "" )
158
158
else :
159
- print ("INVALID FLAG: pick one of 'metadata' or 'json_response'" )
159
+ print ("FAIL: INVALID FLAG - pick one of 'metadata' or 'json_response'" )
160
160
sys .exit (0 )
161
161
162
162
# Begin flattening
@@ -238,23 +238,24 @@ def identify_file_type(self):
238
238
239
239
# Metadata is of type JSON
240
240
elif "json" in metadata :
241
- florian_json = requests .get (metadata )
242
- florian_json = florian_json .json ()
243
- master = self .flatten_json (florian_json )
241
+ repsonse = requests .get (metadata )
242
+ json_data = repsonse .json ()
243
+ master = self .flatten_json (json_data )
244
244
else :
245
- print ("File format provided is not valid" )
245
+ print ("FAIL: File format provided is not valid" )
246
246
sys .exit (0 )
247
247
248
248
# Check if file is empty
249
249
if master .empty :
250
- print ("EMPTY DATA FRAME: Cannot find specified study ID\n " )
250
+ print ("FAIL: EMPTY DATA FRAME - Cannot find specified study ID\n " )
251
251
print (master )
252
252
sys .exit (0 )
253
253
254
+ print ("PASS: Metadata file %s loaded\n " % (metadata ))
254
255
return master
255
256
256
257
except :
257
- print ("Warning : Provided wrong type file: cannot read metadata." )
258
+ print ("FAIL : Provided wrong type file: cannot read metadata." )
258
259
sys .exit (0 )
259
260
260
261
def execute_query (self , flag , repertoire_id ):
@@ -280,29 +281,26 @@ def execute_query(self, flag, repertoire_id):
280
281
query_files = f"{ self .facet_json } { self .study_id } /facet_repertoire_id_{ repertoire_id } .json"
281
282
query_url = self .url_facet_query
282
283
else :
283
- print ("INVALID FLAG: provide one of 'repertoire' or 'facet'" )
284
+ print ("FAIL: INVALID FLAG - provide one of 'repertoire' or 'facet'" )
284
285
sys .exit (0 )
285
286
286
-
287
-
288
287
# Test query is well built, then perform query
289
288
try :
290
289
291
290
# Process json file into JSON structure readable by Python
292
291
query_dict = curlairripa .process_json_files (force , verbose , query_files )
293
292
294
293
# Perform the query. Time it
294
+ print ("INFO: Sending query to %s" % query_url )
295
295
start_time = time .time ()
296
296
query_json = curlairripa .processQuery (query_url , header_dict , expect_pass , query_dict , verbose , force )
297
297
total_time = time .time () - start_time
298
+ # Time
299
+ print ("INFO: Total query time (in seconds): %s" % total_time )
298
300
299
301
# Parse
300
302
parsed_query = json .loads (query_json )
301
303
302
- # Time
303
- print ("ELAPSED DOWNLOAD TIME (in seconds): %s" % total_time )
304
- print ("------------------------------------------------------" )
305
-
306
304
return parsed_query
307
305
308
306
except :
@@ -356,7 +354,13 @@ def validate_repertoire_data_airr(self, validate):
356
354
json .dump (rep_json , outfile )
357
355
outfile .close ()
358
356
# Perform AIRR validation test
359
- airr .load_repertoire (output_dir + filename , validate )
357
+ try :
358
+ airr .load_repertoire (output_dir + filename , validate )
359
+ except Exception as e :
360
+ print ("FAIL: AIRR Repertoire error: %s\n " % (e ))
361
+ sys .exit (1 )
362
+ print ("PASS: AIRR Repertoire is valid\n " )
363
+
360
364
361
365
def perform_mapping_test (self , repertoire_metadata_df , repertoire_response_df ):
362
366
"""
@@ -413,23 +417,29 @@ def print_mapping_results(self, field_names_in_mapping_not_in_api, field_names_i
413
417
file metadata
414
418
:return: None
415
419
"""
416
- print_separators ()
417
- print ("Field names in mapping, ir_adc_api_response, not in API response\n " )
420
+ print ("\n INFO: Checking field names from AIRR mapping for API (column ir_adc_api_response) not found in API response" )
418
421
# Print items not found in API, skip those reported as NaN or empty string
422
+ count = 0
419
423
for item in field_names_in_mapping_not_in_api :
420
424
if type (item ) == float or item == "" :
421
425
continue
422
426
else :
423
427
print (item )
428
+ count = count + 1
429
+ if count == 0 :
430
+ print ("PASS: No fields missing" )
424
431
425
- print_separators ()
426
- print ("Field names in mapping, ir_curator, not in metadata fields\n " )
432
+ print ("\n INFO: Checking field names in AIRR mapping for curation (column ir_curator) not found in metadata fields" )
427
433
# Print items not found in metadata sheet, skip those reported as NaN or empty string
434
+ count = 0
428
435
for item in field_names_in_mapping_not_in_md :
429
436
if type (item ) == float or item == "" :
430
437
continue
431
438
else :
432
439
print (item )
440
+ count = count + 1
441
+ if count == 0 :
442
+ print ("PASS: No fields missing" )
433
443
434
444
def annotation_count (self , data_df , repertoire_id , test_type_key ):
435
445
# Initialize
@@ -549,7 +559,7 @@ def metadata_content_testing(unique_items, json_study_df, data_df, connecting_fi
549
559
specific study
550
560
:return:
551
561
"""
552
- print ("Content cross comparison\n " )
562
+ print ("Metadata/API content cross comparison\n " )
553
563
554
564
# Get entries of interest in API response
555
565
repertoire_list = json_study_df ["repertoire_id" ].to_list ()
@@ -620,11 +630,11 @@ def print_content_test_results(content_results, details_dir, study_id):
620
630
print ("Could not find differing results between column content." )
621
631
# Not so perfect results
622
632
else :
623
- print ("Some fields may require attention:" )
624
- print ("In ADC API: " , content_results ["API field" ].unique ())
625
- print ("In metadata: " , content_results ["MD field" ].unique ())
633
+ print ("WARN: Some fields may require attention:" )
634
+ print ("WARN: In ADC API: " , content_results ["API field" ].unique ())
635
+ print ("WARN: In metadata: " , content_results ["MD field" ].unique ())
626
636
file_name = "" .join ([details_dir , str (study_id ), "_reported_fields_" , str (pd .to_datetime ('today' )), ".csv" ])
627
- print (f"For details refer to { file_name } " )
637
+ print (f"WARN: For details refer to { file_name } " )
628
638
content_results .to_csv (file_name )
629
639
630
640
@@ -683,10 +693,10 @@ def assess_test_results(ir_seq_api, sum_all, ir_sec, ir_rea):
683
693
test_flag = set ([str (ir_seq_api ), str (sum_all ), str (ir_sec )])
684
694
if len (test_flag ) == 1 :
685
695
test_result = True
686
- print (ir_rea + " returned TRUE (test passed), see CSV for details" )
696
+ print ("PASS: Repertoire " + ir_rea + " returned TRUE (test passed), see CSV for details\n " )
687
697
else :
688
698
test_result = False
689
- print (ir_rea + " returned FALSE (test failed), see CSV for details" )
699
+ print ("PASS: Repertoire " + ir_rea + " returned FALSE (test failed), see CSV for details\n " )
690
700
691
701
return test_result
692
702
@@ -899,17 +909,16 @@ def main():
899
909
sanity_check = SanityCheck (metadata_df = metadata , repertoire_json = json_input , facet_json = facet_json_input ,
900
910
annotation_dir = annotation_directory , url_api_end_point = query_url ,
901
911
study_id = study_id , mapping_file = mapping_file , output_directory = details_dir , url_facet_query = facet_query )
902
- # Generate printed report
903
- print_data_validator ()
904
912
905
- # Read repertoire response from metadata file
913
+ # Read metadata file
914
+ print_separators ()
915
+ print ("Check Metadata file\n " )
906
916
master = sanity_check .identify_file_type ()
907
917
data_df = master
908
918
909
- # Report separators
910
- print_separators ()
911
-
912
919
# Read repertoire response from API
920
+ print_separators ()
921
+ print ("Check AIRR Mapping against API and Metadata file\n " )
913
922
concat_version = sanity_check .flatten_json ("json_response" )
914
923
concat_version ['study.study_id' ] = concat_version ['study.study_id' ].replace (" " , "" , regex = True )
915
924
json_study_df = concat_version [concat_version ['study.study_id' ] == study_id ]
@@ -921,10 +930,8 @@ def main():
921
930
# Print mapping file test results
922
931
sanity_check .print_mapping_results (field_names_in_mapping_not_in_api , field_names_in_mapping_not_in_md )
923
932
924
- # Report separators
925
- print_separators ()
926
-
927
933
# Content test
934
+ print_separators ()
928
935
identify_mutual_repertoire_ids_in_data (connecting_field , data_df , json_study_df )
929
936
# Select repertoire ids
930
937
unique_items = identify_mutual_repertoire_ids_in_data (connecting_field , data_df , json_study_df )
@@ -933,16 +940,14 @@ def main():
933
940
# Generate CSV results
934
941
print_content_test_results (sanity_test_df , details_dir , study_id )
935
942
936
- # Report separators
937
- print_separators ()
938
-
939
943
# Report AIRR validation
940
- print ("AIRR FIELD VALIDATION" )
944
+ print_separators ()
945
+ print ("AIRR field validation\n " )
941
946
sanity_check .validate_repertoire_data_airr (validate = False )
942
947
943
948
# Annotation count
944
949
print_separators ()
945
- print ("ANNOTATION COUNT " )
950
+ print ("Annotation count validation (API, file size, curator count) \n " )
946
951
full_result_suite = []
947
952
for item in unique_items :
948
953
# Delay queries
@@ -966,9 +971,9 @@ def main():
966
971
967
972
# Process each according to the tool used
968
973
else :
969
- print ("Processing annotations using:" )
970
- print (" annotation_file_format: %s" % (annotation_file_format ))
971
- print (" ir_rearrangement_tool: %s" % (tool ))
974
+ print ("INFO: Processing annotations for Repertoire %s using:" % ( item ) )
975
+ print ("INFO: annotation_file_format: %s" % (annotation_file_format ))
976
+ print ("INFO: ir_rearrangement_tool: %s" % (tool ))
972
977
############## CASE 1
973
978
if "vquest" in annotation_file_format .lower ():
974
979
result_iter = sanity_check .annotation_count (rowMD , rowMD ['repertoire_id' ].to_list ()[0 ], "imgt" )
@@ -998,7 +1003,7 @@ def main():
998
1003
final_result = pd .concat (full_result_suite )
999
1004
count_file_name = str (study_id ) + "_Facet_Count_curator_count_Annotation_count_" + str (pd .to_datetime ('today' )) + ".csv"
1000
1005
final_result .to_csv (details_dir + count_file_name )
1001
- print ("For details on sequence count refer to " + count_file_name )
1006
+ print ("INFO: For details on sequence count refer to " + count_file_name )
1002
1007
1003
1008
if __name__ == '__main__' :
1004
1009
main ()
0 commit comments