ZonglinY
diff --git a/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_gpt4_5_50.pt
30.9 KB b/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_gpt4_5_50.pt
30.9 KB
diff --git a/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_reasons_gpt4_5_50.pt
642 KB b/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_reasons_gpt4_5_50.pt
642 KB
diff --git a/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/background_inspiration_hypotheses.pt
2.96 MB b/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/background_inspiration_hypotheses.pt
2.96 MB
diff --git a/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_gpt4_5_50.pt
48.2 KB b/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_gpt4_5_50.pt
48.2 KB
diff --git a/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_reasons_gpt4_5_50.pt
1.29 MB b/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_reasons_gpt4_5_50.pt
1.29 MB
diff --git a/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/background_inspiration_hypotheses.pt
8.35 MB b/‎Checkpoints/claude_45bkg_4itr_bkgnoter5_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/background_inspiration_hypotheses.pt
8.35 MB
diff --git a/‎Checkpoints/claude_50bkg_0itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline2_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_gpt4_0_50.pt
22.5 KB b/‎Checkpoints/claude_50bkg_0itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline2_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_gpt4_0_50.pt
22.5 KB
diff --git a/‎Checkpoints/claude_50bkg_0itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline2_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_reasons_gpt4_0_50.pt
141 KB b/‎Checkpoints/claude_50bkg_0itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline2_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_reasons_gpt4_0_50.pt
141 KB
diff --git a/‎Checkpoints/claude_50bkg_0itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline2_survey1_bkgInspPasgSwap0_hypSuggestor0/background_inspiration_hypotheses.pt
867 KB b/‎Checkpoints/claude_50bkg_0itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline2_survey1_bkgInspPasgSwap0_hypSuggestor0/background_inspiration_hypotheses.pt
867 KB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_gpt4_0_5.pt
4.41 KB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_gpt4_0_5.pt
4.41 KB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_reasons_gpt4_0_5.pt
71.8 KB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_reasons_gpt4_0_5.pt
71.8 KB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/background_inspiration_hypotheses.pt
1.2 MB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/background_inspiration_hypotheses.pt
1.2 MB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_gpt4_0_5.pt
4.29 KB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_gpt4_0_5.pt
4.29 KB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_reasons_gpt4_0_5.pt
70.3 KB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/automatic_evaluation_hypotheses_reasons_gpt4_0_5.pt
70.3 KB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/background_inspiration_hypotheses.pt
1.02 MB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0/background_inspiration_hypotheses.pt
1.02 MB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_gpt4_0_5.pt
6.1 KB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_gpt4_0_5.pt
6.1 KB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_reasons_gpt4_0_5.pt
142 KB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_reasons_gpt4_0_5.pt
142 KB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/background_inspiration_hypotheses.pt
1.61 MB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/background_inspiration_hypotheses.pt
1.61 MB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_gpt4_0_5.pt
4.41 KB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_gpt4_0_5.pt
4.41 KB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_reasons_gpt4_0_5.pt
77.6 KB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/automatic_evaluation_hypotheses_reasons_gpt4_0_5.pt
77.6 KB
diff --git a/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/background_inspiration_hypotheses.pt
1.22 MB b/‎Checkpoints/claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1/background_inspiration_hypotheses.pt
1.22 MB
diff --git a/‎__pycache__/evaluate_utils.cpython-38.pyc
-40 Bytes b/‎__pycache__/evaluate_utils.cpython-38.pyc
-40 Bytes
diff --git a/‎__pycache__/evaluator.cpython-38.pyc
157 Bytes b/‎__pycache__/evaluator.cpython-38.pyc
157 Bytes
diff --git a/‎__pycache__/read_from_pdf.cpython-38.pyc
-40 Bytes b/‎__pycache__/read_from_pdf.cpython-38.pyc
-40 Bytes
diff --git a/‎__pycache__/tomato.cpython-38.pyc
840 Bytes b/‎__pycache__/tomato.cpython-38.pyc
840 Bytes
diff --git a/‎__pycache__/utils.cpython-38.pyc
50 Bytes b/‎__pycache__/utils.cpython-38.pyc
50 Bytes
diff --git a/‎compare_score.py
+40-4 b/‎compare_score.py
+40-4
diff --git a/‎evaluate_main.py
+4 b/‎evaluate_main.py
+4
diff --git a/‎evaluate_main.sh
+5-4 b/‎evaluate_main.sh
+5-4
diff --git a/‎evaluator.py
+39-11 b/‎evaluator.py
+39-11
diff --git a/‎main.py
+3-3 b/‎main.py
+3-3
diff --git a/‎main.sh
+7-7 b/‎main.sh
+7-7
@@ -103,8 +103,9 @@ def read_file_find_score_concat_score(model_name, start_end_id_1, num_CoLM_feedb
 
 
 def find_hyperparameter_for_display_results(model_name, method_name):
-    assert method_name == "MOOSE_base" or method_name == "MOOSE" or method_name == "rand_background_baseline" or method_name == "rand_background_rand_inspiration_baseline" or method_name == "rand_background_BM25_inspiration_baseline" or method_name == "gpt35_background_gpt35_inspiration" or method_name == "groundtruth_background_groundtruth_inspiration" or method_name == "MOOSE_wo_ff1" or method_name == "MOOSE_wo_ff2" or method_name == "MOOSE_wo_survey" or method_name == "MOOSE_w_random_corpus"
+    assert method_name == "MOOSE_base" or method_name == "MOOSE" or method_name == "rand_background_baseline" or method_name == "rand_background_rand_inspiration_baseline" or method_name == "rand_background_BM25_inspiration_baseline" or method_name == "gpt35_background_gpt35_inspiration" or method_name == "groundtruth_background_groundtruth_inspiration" or method_name == "MOOSE_wo_ff1" or method_name == "MOOSE_wo_ff2" or method_name == "MOOSE_wo_survey" or method_name == "MOOSE_w_random_corpus" or method_name == "MOOSE_base_claude" or method_name == "MOOSE_based_with_ff1_and_ff2_claude" or method_name == "MOOSE_claude_onlyindirect2" or method_name == "MOOSE_claude_onlyindirect0" or method_name == "MOOSE_baseline2_claude"
 
+    ### chatgpt ckpts
     ## baseline ckpts
     ckpt_baseline1_0_50 = "chatgpt_50bkg_0itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline1_hypEqlInsp_manualTitleSuggester_clearSplit_pastfdbkmodified_hypSuggestor"
     ckpt_baseline2_0_50 = "chatgpt_50bkg_0itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline2_hypEqlInsp_manualTitleSuggester_clearSplit_pastfdbkmodified_hypSuggestor"
@@ -123,6 +124,20 @@ def find_hyperparameter_for_display_results(model_name, method_name):
     ckpt_tomato_pf_0_50_without_selfeval_with_hypSuggestor =    "chatgpt_50bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1_hypEqlInsp_manualTitleSuggester_clearSplit_pastfdbkmodified_hypSuggestor"
     ckpt_tomato_pf_0_50_noSurvey = "chatgpt_50bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban0_baseline0_survey0_hypEqlInsp_manualTitleSuggester_clearSplit_pastfdbkmodified_hypSuggestor"
     ckpt_tomato_pf_0_50_bkg_insp_pasg_swap = "chatgpt_50bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban0_baseline0_survey1_bkgInspPasgSwap1_hypEqlInsp_manualTitleSuggester_clearSplit_pastfdbkmodified_hypSuggestor"
+    ### claude ckpts
+    ## MOOSE-based ckpts
+    ckpt_tomato_base_0_5_claude = "claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0"
+    ckpt_tomato_base_5_50_claude = "claude_45bkg_4itr_bkgnoter5_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0"
+    ## MOOSE-future ckpts
+    ckpt_tomato_base_ff1_ff2_0_5_claude = "claude_5bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1"
+    ## MOOSE-future-past-indirect2 ckpts
+    ckpt_tomato_base_ff1_ff2_past_onlyindirect2_0_5_claude = "claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1"
+    ## MOOSE-future-past-indirect0 ckpts
+    ckpt_tomato_base_ff1_ff2_past_onlyindirect0_0_5_claude = "claude_5bkg_4itr_bkgnoter0_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1"
+    ckpt_tomato_base_ff1_ff2_past_onlyindirect0_5_50_claude = "claude_45bkg_4itr_bkgnoter5_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1"
+    ## MOOSE_baseline2_claude
+    ckpt_baseline2_0_50_claude = "claude_50bkg_0itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline2_survey1_bkgInspPasgSwap0_hypSuggestor0"
+
 
     if method_name == "MOOSE_base":
         start_end_id = [[0,5], [5,25], [25,50]]
@@ -168,6 +183,26 @@ def find_hyperparameter_for_display_results(model_name, method_name):
         start_end_id = [[0, 50]]
         num_CoLM_feedback_times = 4
         ckpt_addr_full = [ckpt_tomato_pf_0_50_bkg_insp_pasg_swap]
+    elif method_name == "MOOSE_base_claude":
+        start_end_id = [[0, 5], [5, 50]]
+        num_CoLM_feedback_times = 4
+        ckpt_addr_full = [ckpt_tomato_base_0_5_claude, ckpt_tomato_base_5_50_claude]
+    elif method_name == "MOOSE_based_with_ff1_and_ff2_claude":
+        start_end_id = [[0, 5]]
+        num_CoLM_feedback_times = 4
+        ckpt_addr_full = [ckpt_tomato_base_ff1_ff2_0_5_claude]
+    elif method_name == "MOOSE_claude_onlyindirect2":
+        start_end_id = [[0, 5]]
+        num_CoLM_feedback_times = 4
+        ckpt_addr_full = [ckpt_tomato_base_ff1_ff2_past_onlyindirect2_0_5_claude]
+    elif method_name == "MOOSE_claude_onlyindirect0":
+        start_end_id = [[0, 5], [5, 50]]
+        num_CoLM_feedback_times = 4
+        ckpt_addr_full = [ckpt_tomato_base_ff1_ff2_past_onlyindirect0_0_5_claude, ckpt_tomato_base_ff1_ff2_past_onlyindirect0_5_50_claude]
+    elif method_name == "MOOSE_baseline2_claude":
+        start_end_id = [[0, 50]]
+        num_CoLM_feedback_times = 0
+        ckpt_addr_full = [ckpt_baseline2_0_50_claude]
     else:
         raise NotImplementedError
 
@@ -179,9 +214,10 @@ def main():
     # 'chatgpt' or 'gpt4'
     model_name = 'gpt4'
     # "MOOSE_base", "rand_background_baseline", "rand_background_rand_inspiration_baseline", "rand_background_BM25_inspiration_baseline", "gpt35_background_gpt35_inspiration", "MOOSE_wo_ff1", "MOOSE_wo_ff2", "MOOSE_wo_survey", "MOOSE_w_random_corpus"
-    method_name1 = "MOOSE_base"
+    # "MOOSE_baseline2_claude", "MOOSE_base_claude", "MOOSE_claude_onlyindirect0"
+    method_name1 = "MOOSE_base_claude"
     # "MOOSE"
-    method_name2 = "MOOSE"
+    method_name2 = "MOOSE_claude_onlyindirect0"
     ## load data and find score
     start_end_id_1, num_CoLM_feedback_times_1, ckpt_addr1_full = find_hyperparameter_for_display_results(model_name, method_name1)
     start_end_id_2, num_CoLM_feedback_times_2, ckpt_addr2_full = find_hyperparameter_for_display_results(model_name, method_name2)
@@ -204,7 +240,7 @@ def main():
     print("ave_score2_w_ind: ", ave_score2_w_ind)
 
     # score_all_itrs
-    if method_name1 == "MOOSE_base" and method_name2 == "MOOSE":
+    if (method_name1 == "MOOSE_base" and method_name2 == "MOOSE") or (method_name1 == "MOOSE_base_claude" and method_name2 == "MOOSE_claude_onlyindirect0"):
         score_all_itrs = np.concatenate((score1_wo_ind_itrs, score2_wo_ind_itrs, score2_w_ind_itrs), axis=1)
         print("\nscore_all_itrs: ", score_all_itrs.shape)
         ave_score_all_itrs = np.nanmean(score_all_itrs, axis=1)
 
@@ -12,6 +12,8 @@ def main():
     parser.add_argument("--num_CoLM_feedback_times", type=int, default=1, help="number of re-generation times given new feedbacks for CoLM")
     parser.add_argument("--start_id", type=int, default=0, help="To evaluate [start_id : end_id] of the Checkpoint file; -1 when not using it")
     parser.add_argument("--end_id", type=int, default=10, help="To evaluate [start_id : end_id] of the Checkpoint file; -1 when not using it")
+    parser.add_argument("--if_indirect_feedback", type=int, default=1, help="whether conduct indirect feedback modules such as inspiration_changer and background_changer; also can be called --if_past_feedback")
+    parser.add_argument("--if_only_indirect_feedback", type=int, default=0, help="0: tomato-base will perform; 1: Do NOT perform tomato-base because tomato-base has been performed in this checkpoint (prev data will be load up); 2: Do NOT perform tomato-base, but at least tomato-base + past feedback")
     # used for prev_eval_output_dir: ~/Outs/Tomato/gpt4_eval_chatgpt_25bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_hypEqlInsp_manualTitleSuggester_clearSplit_pastfdbkmodified_hypSuggestor_5_25.out
     parser.add_argument("--prev_eval_output_dir", type=str, default="", help="In case previous evaluation code has exception, but we don't want to waste money on openai API to re-evaluate the already evaluated hypotheses -- we pick up the previous score from the 'x.out' file")
     parser.add_argument("--if_azure_api", type=int, default=0, help="0: Use openai api from openai website; 1: use openai api from azure")
@@ -21,6 +23,8 @@ def main():
 
     assert args.model_name == 'gpt4' or args.model_name == 'chatgpt'
     assert args.start_id >= -1 and args.end_id >= -1
+    assert args.if_indirect_feedback == 1 or args.if_indirect_feedback == 0
+    assert args.if_only_indirect_feedback == 0 or args.if_only_indirect_feedback == 1 or args.if_only_indirect_feedback == 2
     assert args.if_azure_api == 0 or args.if_azure_api == 1
     assert args.if_groundtruth_hypotheses == 0 or args.if_groundtruth_hypotheses == 1
     if args.start_id == -1 or args.end_id == -1:
 
@@ -3,12 +3,13 @@
 #SBATCH --partition=DGXq
 #SBATCH -w node19
 #SBATCH --gres=gpu:1
-#SBATCH --output /export/home/zonglin001/Outs/Tomato/gpt4_eval_chatgpt_50bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0_hypEqlInsp_manualTitleSuggester_clearSplit_pastfdbkmodified_hypSuggestor.out
+#SBATCH --output /export/home/zonglin001/Outs/Tomato/gpt4_eval_claude_45bkg_4itr_bkgnoter5_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1.out
 
 # chatgpt / gpt4
 python -u evaluate_main.py --if_groundtruth_hypotheses 0 \
         --model_name gpt4 --num_CoLM_feedback_times 4 \
-        --start_id 0 --end_id 50 \
-        --if_azure_api 0 \
-        --output_dir ~/Checkpoints/Tomato/chatgpt_50bkg_4itr_bkgnoter0_indirect1_onlyindirect2_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0_hypEqlInsp_manualTitleSuggester_clearSplit_pastfdbkmodified_hypSuggestor \
+        --if_indirect_feedback 1 --if_only_indirect_feedback 0 \
+        --start_id 5 --end_id 50 \
+        --if_azure_api 1 \
+        --output_dir ~/Checkpoints/Tomato/claude_45bkg_4itr_bkgnoter5_indirect1_onlyindirect0_close0_ban0_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor1 \
         --api_key sk-
@@ -1,6 +1,7 @@
 import os, time, re
 import torch
 import openai
+from openai import AzureOpenAI
 import transformers
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 import numpy as np
@@ -17,10 +18,15 @@ def __init__(self, args):
         if args.if_azure_api == 0:
             openai.api_key = self.args.api_key
         else:
-            openai.api_type = ""
-            openai.api_base = ""
-            openai.api_version = ""
-            openai.api_key = self.args.api_key
+            # openai.api_type = ""
+            # openai.api_base = ""
+            # openai.api_version = "2024-02-15-preview"
+            # openai.api_key = self.args.api_key
+            self.client = AzureOpenAI(
+                azure_endpoint = "https://declaregpt4.openai.azure.com/", 
+                api_key=self.args.api_key,  
+                api_version="2024-02-15-preview"
+            )
-        assert openai.api_key != ""
+        assert self.args.api_key != ""  # validate the CLI key itself: the Azure branch no longer assigns openai.api_key, so checking that global would not catch an empty key there
         # self.hypotheses is a sub-element of self.result
         self.result = None
@@ -95,6 +101,19 @@ def evaluate(self):
         score_reasons = {}
         cnt_finished = 0
         if self.args.if_groundtruth_hypotheses == 0:
+            # num_chunks_with_and_without_past_feedback_per_bkg
+            if self.args.if_indirect_feedback == 0:
+                num_chunks_with_and_without_past_feedback_per_bkg = 1
+            elif self.args.if_indirect_feedback == 1:
+                if self.args.if_only_indirect_feedback == 0 or self.args.if_only_indirect_feedback == 1:
+                    num_chunks_with_and_without_past_feedback_per_bkg = 2
+                elif self.args.if_only_indirect_feedback == 2:
+                    num_chunks_with_and_without_past_feedback_per_bkg = 1
+                else:
+                    raise NotImplementedError
+            else:
+                raise NotImplementedError
+            # start looping
             for cur_id_bkg, cur_bkg_ori in enumerate(self.background):
                 if cur_bkg_ori not in scores:
                     cur_bkg = cur_bkg_ori
@@ -103,16 +122,17 @@ def evaluate(self):
                     score_reasons[cur_bkg] = []
                     cur_bkg = cur_bkg_ori
                     cur_hyp_for_cur_bkg = self.hypotheses[cur_bkg_ori]
-                    # in case a bkg has more than one data item in our dataset
-                    if len(cur_hyp_for_cur_bkg) > 1:
-                        cur_hyp_for_cur_bkg = cur_hyp_for_cur_bkg[:1]
+                    # in case a bkg has more than one data item (annotated publication) in our dataset
+                    if len(cur_hyp_for_cur_bkg) > 1*num_chunks_with_and_without_past_feedback_per_bkg:
+                        cur_hyp_for_cur_bkg = cur_hyp_for_cur_bkg[:1*num_chunks_with_and_without_past_feedback_per_bkg]
                 else:
                     # raise Exception("repeated key in scores: {}; cur_bkg: {}".format(scores, cur_bkg))
+                    assert len(self.hypotheses[cur_bkg_ori]) == 2*num_chunks_with_and_without_past_feedback_per_bkg
                     cur_bkg = cur_bkg_ori + " "
                     assert cur_bkg not in score_reasons
                     scores[cur_bkg] = []
                     score_reasons[cur_bkg] = []
-                    cur_hyp_for_cur_bkg = self.hypotheses[cur_bkg_ori][1:2]
+                    cur_hyp_for_cur_bkg = self.hypotheses[cur_bkg_ori][1*num_chunks_with_and_without_past_feedback_per_bkg:2*num_chunks_with_and_without_past_feedback_per_bkg]
                 if cur_id_bkg == 0:
                     print("len(cur_hyp_for_cur_bkg): ", len(cur_hyp_for_cur_bkg))
                 for cur_id_hyp_direct_or_indirect , cur_hyp_direct_or_indirect in enumerate(cur_hyp_for_cur_bkg):
@@ -225,13 +245,21 @@ def llm_generation(self, input_txt):
                     reply = response["choices"][0]['message']['content']
                     if_api_completed = True
                 else:
-                    response = openai.ChatCompletion.create(
-                    engine=api_model_name,
+                    # response = openai.ChatCompletion.create(
+                    # engine=api_model_name,
+                    # messages=[{"role": "user", "content": input_txt}],
+                    # top_p=0.90,
+                    # temperature=temperature,
+                    # max_tokens=max_tokens)
+                    # reply = response["choices"][0]['message']['content']
+                    # if_api_completed = True
+                    response = self.client.chat.completions.create(
+                    model=api_model_name, 
                     messages=[{"role": "user", "content": input_txt}],
                     top_p=0.90,
                     temperature=temperature,
                     max_tokens=max_tokens)
-                    reply = response["choices"][0]['message']['content']
+                    reply = response.choices[0].message.content
                     if_api_completed = True
             except:
                 print("OpenAI reach its rate limit")
 
@@ -8,7 +8,7 @@
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_name", type=str, default="vicuna",
-                        help="model name: gpt2/llama/vicuna/vicuna13/chatgpt/falcon")
+                        help="model name: gpt2/llama/vicuna/vicuna13/chatgpt/falcon/claude")
     parser.add_argument("--root_data_dir", type=str, default="./Data/")
     parser.add_argument("--survey_data_dir", type=str, default="./Data/Surveys/")
     parser.add_argument("--output_dir", type=str, default="~/Checkpoints/Tomato/try")
@@ -28,7 +28,7 @@ def main():
     args = parser.parse_args()
 
     # check hyper-parameters
-    assert args.model_name == 'llama' or args.model_name == 'vicuna' or args.model_name == 'vicuna13' or args.model_name == 'gpt2' or args.model_name == 'chatgpt' or args.model_name == 'falcon'
+    assert args.model_name == 'llama' or args.model_name == 'vicuna' or args.model_name == 'vicuna13' or args.model_name == 'gpt2' or args.model_name == 'chatgpt' or args.model_name == 'falcon' or args.model_name == "claude"
     assert args.if_indirect_feedback == 1 or args.if_indirect_feedback == 0
     assert args.if_only_indirect_feedback == 0 or args.if_only_indirect_feedback == 1 or args.if_only_indirect_feedback == 2
     assert args.if_close_domain == 1 or args.if_close_domain == 0
@@ -59,7 +59,7 @@ def main():
     # check gpu
     n_gpu = torch.cuda.device_count()
     print("n_gpu: ", n_gpu)
-    if not args.model_name == 'chatgpt':
+    if not (args.model_name == 'chatgpt' or args.model_name == "claude"):
         print_nvidia_smi()
         assert n_gpu >= 1
 
 
@@ -1,18 +1,18 @@
 #!/bin/bash
-#SBATCH -J Tomato
+#SBATCH -J baseline
 #SBATCH --partition=DGXq
 #SBATCH -w node18
 #SBATCH --gres=gpu:1
-#SBATCH --output /export/home/zonglin001/Outs/Tomato/chatgpt_50bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0_hypEqlInsp_manualTitleSuggester_clearSplit_pastfdbkmodified_hypSuggestor.out
+#SBATCH --output /export/home/zonglin001/Outs/Tomato/claude_50bkg_0itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline2_survey1_bkgInspPasgSwap0_hypSuggestor0.out
 
 
-# vicuna / gpt2 / chatgpt / vicuna13 / falcon
-python -u main.py --model_name chatgpt \
-        --output_dir ~/Checkpoints/Tomato/chatgpt_50bkg_4itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline0_survey1_bkgInspPasgSwap0_hypSuggestor0_hypEqlInsp_manualTitleSuggester_clearSplit_pastfdbkmodified_hypSuggestor \
-        --num_background_for_hypotheses 50 --num_CoLM_feedback_times 4 --bkg_corpus_chunk_noter 0 \
+# vicuna / gpt2 / chatgpt / vicuna13 / falcon / claude
+python -u main.py --model_name claude \
+        --output_dir ~/Checkpoints/Tomato/claude_50bkg_0itr_bkgnoter0_indirect0_onlyindirect0_close0_ban1_baseline2_survey1_bkgInspPasgSwap0_hypSuggestor0 \
+        --num_background_for_hypotheses 50 --num_CoLM_feedback_times 0 --bkg_corpus_chunk_noter 0 \
         --if_indirect_feedback 0 --if_only_indirect_feedback 0 \
         --if_close_domain 0 --if_ban_selfeval 1 \
-        --if_baseline 0 \
+        --if_baseline 2 \
         --if_novelty_module_have_access_to_surveys 1 --if_insp_pasg_for_bkg_and_bkg_pasg_included_in_insp 0 \
         --if_hypothesis_suggstor 0 \
         --api_key sk-