diff --git a/evals/absorption/eval_output_schema_absorption_first_letter.json b/evals/absorption/eval_output_schema_absorption_first_letter.json index d1dbc81..b0eef3e 100644 --- a/evals/absorption/eval_output_schema_absorption_first_letter.json +++ b/evals/absorption/eval_output_schema_absorption_first_letter.json @@ -33,10 +33,22 @@ "type": "integer" }, "model_name": { - "default": "pythia-70m-deduped", + "default": "gemma-2-2b", "description": "Model name", "title": "Model Name", "type": "string" + }, + "llm_batch_size": { + "default": 32, + "description": "LLM batch size, inference only", + "title": "LLM Batch Size", + "type": "integer" + }, + "llm_dtype": { + "default": "bfloat16", + "description": "LLM data type", + "title": "LLM Data Type", + "type": "string" + } }, "title": "AbsorptionEvalConfig", diff --git a/evals/autointerp/eval_output_schema_autointerp.json b/evals/autointerp/eval_output_schema_autointerp.json new file mode 100644 index 0000000..c7b454e --- /dev/null +++ b/evals/autointerp/eval_output_schema_autointerp.json @@ -0,0 +1,283 @@ +{ + "$defs": { + "AutoInterpEvalConfig": { + "description": "Controls all parameters for how autointerp will work.\n\nArguments:\n model_name: The name of the model to use\n device: The device to use\n n_latents: The number of latents to use\n override_latents: The latents to use (overrides n_latents if supplied)\n dead_latent_threshold: The log sparsity value below which we consider a latent to be dead\n seed: The seed to use for all randomness\n\n buffer: The size of the buffer to use for scoring\n no_overlap: Whether to prevent overlapping sequences for scoring\n act_threshold_frac: The fraction of the maximum activation to use as the activation threshold\n total_tokens: The total number of tokens we'll gather data for.\n batch_size: The batch size to use for the scoring phase\n scoring: Whether to perform the scoring phase, or just return explanations\n max_tokens_in_explanation: The maximum number of tokens to allow in an explanation\n use_demos_in_explanation: Whether to use demonstrations in the explanation prompt\n\n n_top_ex_for_generation: The number of top activating sequences to use for the generation phase\n n_iw_sampled_ex_for_generation: The number of importance-sampled sequences to use for the generation phase (this\n is a replacement for quantile sampling)\n\n n_top_ex_for_scoring: The number of top sequences to use for scoring\n n_random_ex_for_scoring: The number of random sequences to use for scoring\n n_iw_sampled_ex_for_scoring: The number of importance-sampled sequences to use for scoring", + "properties": { + "model_name": { + "default": "", + "description": "The name of the model to use", + "title": "Model Name", + "type": "string" + }, + "n_latents": { + "default": 1000, + "description": "The number of latents for the LLM judge to interpret", + "title": "Number of Latents", + "type": "integer" + }, + "override_latents": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The latents to use (overrides n_latents if supplied)", + "title": "Override Latents" + }, + "dead_latent_threshold": { + "default": 15, + "description": "Minimum number of required activations", + "title": "Dead Latent Threshold", + "type": "number" + }, + "random_seed": { + "default": 42, + "description": "The seed to use for all randomness", + "title": "Random Seed", + "type": "integer" + }, + "dataset_name": { + "default": "monology/pile-uncopyrighted", +
"description": "The name of the dataset to use", + "title": "Dataset Name", + "type": "string" + }, + "llm_context_size": { + "default": 128, + "description": "The context size to use for the LLM", + "title": "LLM Context Size", + "type": "integer" + }, + "llm_batch_size": { + "default": 512, + "description": "Split up total tokens into batches of this size", + "title": "LLM Batch Size", + "type": "integer" + }, + "llm_dtype": { + "default": "float32", + "description": "The data type to use for the LLM", + "title": "LLM Data Type", + "type": "string" + }, + "buffer": { + "default": 10, + "description": "The size of the buffer to use for scoring", + "title": "Buffer Size", + "type": "integer" + }, + "no_overlap": { + "default": true, + "description": "Whether to allow overlapping sequences for scoring", + "title": "No Overlap", + "type": "boolean" + }, + "act_threshold_frac": { + "default": 0.01, + "description": "The fraction of the maximum activation to use as the activation threshold", + "title": "Activation Threshold Fraction", + "type": "number" + }, + "total_tokens": { + "default": 2000000, + "description": "The total number of tokens we'll gather data for", + "title": "Total Tokens", + "type": "integer" + }, + "scoring": { + "default": true, + "description": "Whether to perform the scoring phase, or just return explanation", + "title": "Scoring", + "type": "boolean" + }, + "max_tokens_in_explanation": { + "default": 30, + "description": "The maximum number of tokens to allow in an explanation", + "title": "Max Tokens in Explanation", + "type": "integer" + }, + "use_demos_in_explanation": { + "default": true, + "description": "Whether to use demonstrations in the explanation prompt", + "title": "Use Demos in Explanation", + "type": "boolean" + }, + "n_top_ex_for_generation": { + "default": 10, + "description": "The number of top activating sequences to use for the generation phase", + "title": "Number of Top Examples for Generation", + "type": "integer" + }, + "n_iw_sampled_ex_for_generation": { + "default": 5, + "description": "The number of importance-sampled sequences to use for the generation phase", + "title": "Number of IW Sampled Examples for Generation", + "type": "integer" + }, + "n_top_ex_for_scoring": { + "default": 2, + "description": "The number of top sequences to use for scoring", + "title": "Number of Top Examples for Scoring", + "type": "integer" + }, + "n_random_ex_for_scoring": { + "default": 10, + "description": "The number of random sequences to use for scoring", + "title": "Number of Random Examples for Scoring", + "type": "integer" + }, + "n_iw_sampled_ex_for_scoring": { + "default": 2, + "description": "The number of importance-sampled sequences to use for scoring", + "title": "Number of IW Sampled Examples for Scoring", + "type": "integer" + } + }, + "title": "AutoInterpEvalConfig", + "type": "object" + }, + "AutoInterpMetricCategories": { + "properties": { + "autointerp": { + "$ref": "#/$defs/AutoInterpMetrics", + "description": "Metrics related to autointerp", + "title": "AutoInterp" + } + }, + "required": [ + "autointerp" + ], + "title": "AutoInterpMetricCategories", + "type": "object" + }, + "AutoInterpMetrics": { + "properties": { + "autointerp_score": { + "description": "AutoInterp detection score, using methodology similar to Eleuther's 'Open Source Automated Interpretability for Sparse Autoencoder Features'", + "title": "AutoInterp Score", + "type": "number", + "ui_default_display": true + } + }, + "required": [ + "autointerp_score" + ], + "title": 
"AutoInterpMetrics", + "type": "object" + }, + "BaseResultDetail": { + "properties": {}, + "title": "BaseResultDetail", + "type": "object" + } + }, + "description": "An evaluation of the interpretability of SAE latents. This evaluation is based on Eleuther's 'Open Source Automated Interpretability for Sparse Autoencoder Features'", + "properties": { + "eval_type_id": { + "default": "autointerp", + "description": "The type of the evaluation", + "title": "Eval Type ID", + "type": "string" + }, + "eval_config": { + "$ref": "#/$defs/AutoInterpEvalConfig", + "description": "The configuration of the evaluation.", + "title": "Eval Config Type" + }, + "eval_id": { + "description": "A unique UUID identifying this specific eval run", + "title": "ID", + "type": "string" + }, + "datetime_epoch_millis": { + "description": "The datetime of the evaluation in epoch milliseconds", + "title": "DateTime (epoch ms)", + "type": "integer" + }, + "eval_result_metrics": { + "$ref": "#/$defs/AutoInterpMetricCategories", + "description": "The metrics of the evaluation, organized by category. Define your own categories and the metrics that go inside them.", + "title": "Result Metrics Categorized" + }, + "eval_result_details": { + "default": null, + "description": "Optional. The details of the evaluation. A list of objects that stores nested or more detailed data, such as details about the absorption of each letter.", + "items": { + "$ref": "#/$defs/BaseResultDetail" + }, + "title": "Result Details", + "type": "array" + }, + "sae_bench_commit_hash": { + "description": "The commit hash of the SAE Bench that ran the evaluation.", + "title": "SAE Bench Commit Hash", + "type": "string" + }, + "sae_lens_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "description": "The ID of the SAE in SAE Lens.", + "title": "SAE Lens ID" + }, + "sae_lens_release_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "description": "The release ID of the SAE in SAE Lens.", + "title": "SAE Lens Release ID" + }, + "sae_lens_version": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "description": "The version of SAE Lens that ran the evaluation.", + "title": "SAE Lens Version" + }, + "eval_result_unstructured": { + "anyOf": [ + {}, + { + "type": "null" + } + ], + "default": null, + "description": "Optional. Any additional outputs that don't fit into the structured eval_result_metrics or eval_result_details fields. 
Since these are unstructured, don't expect this to be easily renderable in UIs, or to contain any titles or descriptions.", + "title": "Unstructured Results" + } + }, + "required": [ + "eval_config", + "eval_id", + "datetime_epoch_millis", + "eval_result_metrics", + "sae_bench_commit_hash", + "sae_lens_id", + "sae_lens_release_id", + "sae_lens_version" + ], + "title": "AutoInterp", + "type": "object" +} \ No newline at end of file diff --git a/evals/core/eval_output_schema_core.json b/evals/core/eval_output_schema_core.json index 4e8c3ed..129869f 100644 --- a/evals/core/eval_output_schema_core.json +++ b/evals/core/eval_output_schema_core.json @@ -8,6 +8,12 @@ "title": "Model Name", "type": "string" }, + "llm_dtype": { + "default": "float32", + "description": "LLM data type", + "title": "LLM Data Type", + "type": "string" + }, "batch_size_prompts": { "default": 16, "description": "Batch size for evaluation prompts", @@ -80,6 +86,12 @@ "title": "Compute Featurewise Weight-Based Metrics", "type": "boolean" }, + "exclude_special_tokens_from_reconstruction": { + "default": false, + "description": "Exclude special tokens like BOS, EOS, PAD from reconstruction", + "title": "Exclude Special Tokens from Reconstruction", + "type": "boolean" + }, "verbose": { "default": false, "description": "Enable verbose output", diff --git a/evals/scr_and_tpp/eval_output_schema_scr.json b/evals/scr_and_tpp/eval_output_schema_scr.json index 35c84aa..b6fadfc 100644 --- a/evals/scr_and_tpp/eval_output_schema_scr.json +++ b/evals/scr_and_tpp/eval_output_schema_scr.json @@ -1,6 +1,6 @@ { "$defs": { - "ShiftAndTppEvalConfig": { + "ScrAndTppEvalConfig": { "properties": { "random_seed": { "default": 42, @@ -9,7 +9,7 @@ "type": "integer" }, "dataset_names": { - "description": "List of dataset names for both the SHIFT and TPP metrics", + "description": "List of dataset names for both the SCR and TPP metrics", "items": { "type": "string" }, @@ -18,7 +18,7 @@ }, "perform_scr": { "default": true, - "description": "If True, the eval will be Spurious Correlation Removal (SCR) using SHIFT. If False, the eval will be TPP.", + "description": "If True, the eval will be Spurious Correlation Removal (SCR). If False, the eval will be TPP.", "title": "Perform Spurious Correlation Removal", "type": "boolean" }, @@ -89,7 +89,7 @@ "type": "integer" }, "llm_dtype": { - "default": "bfloat16", + "default": "float32", "description": "", "title": "LLM Dtype", "type": "string" @@ -125,30 +125,30 @@ }, "type": "array" }, - "description": "Column1 Values apply only to the SHIFT metric. Column1 values represents the class pairs we train the linear probes on. In each case, we will create a perfectly biased dataset, such as all professors are males and all nurses are females.", + "description": "Column1 values apply only to the SCR metric. Column1 values represent the class pairs we train the linear probes on. In each case, we will create a perfectly biased dataset, such as one in which all professors are male and all nurses are female.", "title": "Column 1 Values Lookup", "type": "object" } }, - "title": "ShiftAndTppEvalConfig", + "title": "ScrAndTppEvalConfig", "type": "object" }, - "ShiftMetricCategories": { + "ScrMetricCategories": { "properties": { - "shift_metrics": { - "$ref": "#/$defs/ShiftMetrics", - "description": "SHIFT SCR metrics, calculated for different numbers of ablated features. 
Also includes the results for both correlation removal directions.", - "title": "Shift Metrics", + "scr_metrics": { + "$ref": "#/$defs/ScrMetrics", + "description": "SCR metrics, calculated for different numbers of ablated features. Also includes the results for both correlation removal directions.", + "title": "SCR Metrics", + "ui_default_display": true } }, "required": [ - "shift_metrics" + "scr_metrics" ], - "title": "ShiftMetricCategories", + "title": "ScrMetricCategories", "type": "object" }, - "ShiftMetrics": { + "ScrMetrics": { "properties": { "scr_dir1_threshold_2": { "anyOf": [ @@ -425,10 +425,10 @@ "title": "SCR Dir 2, Top 500 SAE latents" } }, - "title": "ShiftMetrics", + "title": "ScrMetrics", "type": "object" }, - "ShiftResultDetail": { + "ScrResultDetail": { "properties": { "dataset_name": { "description": "", @@ -713,11 +713,11 @@ "required": [ "dataset_name" ], - "title": "ShiftResultDetail", + "title": "ScrResultDetail", "type": "object" } }, - "description": "The SHIFT Spurious Correlation Removal (SCR) evaluation ablates SAE latents to shift the bias of a biased linear probe. The methodology is from `Evaluating Sparse Autoencoders on Targeted Concept Removal Tasks`.", + "description": "The Spurious Correlation Removal (SCR) evaluation ablates SAE latents to shift the bias of a biased linear probe. The methodology is from `Evaluating Sparse Autoencoders on Targeted Concept Removal Tasks`.", "properties": { "eval_type_id": { "default": "scr", @@ -726,7 +726,7 @@ "type": "string" }, "eval_config": { - "$ref": "#/$defs/ShiftAndTppEvalConfig", + "$ref": "#/$defs/ScrAndTppEvalConfig", "description": "The configuration of the evaluation.", "title": "Eval Config Type" }, @@ -741,16 +741,16 @@ "type": "integer" }, "eval_result_metrics": { - "$ref": "#/$defs/ShiftMetricCategories", + "$ref": "#/$defs/ScrMetricCategories", "description": "The metrics of the evaluation, organized by category. Define your own categories and the metrics that go inside them.", "title": "Result Metrics Categorized" }, "eval_result_details": { - "description": "Each object is a stat on the SHIFT SCR results for a single dataset.", + "description": "Each object contains the SCR results for a single dataset.", "items": { - "$ref": "#/$defs/ShiftResultDetail" + "$ref": "#/$defs/ScrResultDetail" }, - "title": "Per-Dataset SHIFT Spurious Correlation Removal (SCR) Results", + "title": "Per-Dataset Spurious Correlation Removal (SCR) Results", "type": "array" }, "sae_bench_commit_hash": { @@ -816,6 +816,6 @@ "sae_lens_release_id", "sae_lens_version" ], - "title": "SHIFT", + "title": "SCR", "type": "object" } \ No newline at end of file diff --git a/evals/scr_and_tpp/eval_output_schema_tpp.json b/evals/scr_and_tpp/eval_output_schema_tpp.json index e34cbc8..f625d83 100644 --- a/evals/scr_and_tpp/eval_output_schema_tpp.json +++ b/evals/scr_and_tpp/eval_output_schema_tpp.json @@ -1,6 +1,6 @@ { "$defs": { - "ShiftAndTppEvalConfig": { + "ScrAndTppEvalConfig": { "properties": { "random_seed": { "default": 42, @@ -9,7 +9,7 @@ "type": "integer" }, "dataset_names": { - "description": "List of dataset names for both the SHIFT and TPP metrics", + "description": "List of dataset names for both the SCR and TPP metrics", "items": { "type": "string" }, @@ -18,7 +18,7 @@ }, "perform_scr": { "default": true, - "description": "If True, the eval will be Spurious Correlation Removal (SCR) using SHIFT. If False, the eval will be TPP.", + "description": "If True, the eval will be Spurious Correlation Removal (SCR). 
If False, the eval will be TPP.", "title": "Perform Spurious Correlation Removal", "type": "boolean" }, @@ -89,7 +89,7 @@ "type": "integer" }, "llm_dtype": { - "default": "bfloat16", + "default": "float32", "description": "", "title": "LLM Dtype", "type": "string" @@ -125,12 +125,12 @@ }, "type": "array" }, - "description": "Column1 Values apply only to the SHIFT metric. Column1 values represents the class pairs we train the linear probes on. In each case, we will create a perfectly biased dataset, such as all professors are males and all nurses are females.", + "description": "Column1 values apply only to the SCR metric. Column1 values represent the class pairs we train the linear probes on. In each case, we will create a perfectly biased dataset, such as one in which all professors are male and all nurses are female.", "title": "Column 1 Values Lookup", "type": "object" } }, - "title": "ShiftAndTppEvalConfig", + "title": "ScrAndTppEvalConfig", "type": "object" }, "TppMetricCategories": { @@ -726,7 +726,7 @@ "type": "string" }, "eval_config": { - "$ref": "#/$defs/ShiftAndTppEvalConfig", + "$ref": "#/$defs/ScrAndTppEvalConfig", "description": "The configuration of the evaluation.", "title": "Eval Config Type" }, diff --git a/evals/sparse_probing/eval_output_schema_sparse_probing.json b/evals/sparse_probing/eval_output_schema_sparse_probing.json index f0c14cf..ae9bd2c 100644 --- a/evals/sparse_probing/eval_output_schema_sparse_probing.json +++ b/evals/sparse_probing/eval_output_schema_sparse_probing.json @@ -531,7 +531,7 @@ "type": "object" } }, - "description": "Sparse probing evaluation description goes here.", + "description": "An evaluation using SAEs to probe for supervised concepts in LLMs. We use sparse probing with the top K SAE latents and probe for over 30 different classes across 5 datasets.", "properties": { "eval_type_id": { "default": "sparse_probing", diff --git a/evals/unlearning/eval_output_schema_unlearning.json b/evals/unlearning/eval_output_schema_unlearning.json index 4b6fc3a..7e94fc9 100644 --- a/evals/unlearning/eval_output_schema_unlearning.json +++ b/evals/unlearning/eval_output_schema_unlearning.json @@ -14,7 +14,7 @@ "type": "integer" }, "dataset_names": { - "description": "List of dataset names", + "description": "List of dataset names. We want to unlearn wmdp-bio while retaining knowledge in other datasets", "items": { "type": "string" }, @@ -23,12 +23,12 @@ }, "intervention_method": { "default": "clamp_feature_activation", - "description": "Intervention method", + "description": "Intervention method. We only support 'clamp_feature_activation' for now", "title": "Intervention Method", "type": "string" }, "retain_thresholds": { - "description": "Retain thresholds", + "description": "We ignore features that activate more than this threshold on the retain dataset", "items": { "type": "number" }, @@ -36,7 +36,7 @@ "type": "array" }, "n_features_list": { - "description": "N features list", + "description": "Each N is the number of features we select and clamp to a negative value", "items": { "type": "integer" }, @@ -44,7 +44,7 @@ "type": "array" }, "multipliers": { - "description": "Multipliers", + "description": "A list of negative values. 
We iterate over this list, clamping the selected features to each value", "items": { "type": "integer" }, @@ -65,37 +65,37 @@ }, "dataset_size": { "default": 1024, - "description": "Dataset size", + "description": "Dataset size we use when calculating feature sparsity", "title": "Dataset Size", "type": "integer" }, "seq_len": { "default": 1024, - "description": "Sequence length", + "description": "Sequence length when calculating feature sparsity", "title": "Sequence Length", "type": "integer" }, "n_batch_loss_added": { "default": 50, - "description": "N batch loss added", + "description": "Number of batches to use when calculating the loss added by an intervention (currently not supported).", "title": "N Batch Loss Added", "type": "integer" }, "target_metric": { "default": "correct", - "description": "Target metric", + "description": "Controls the type of `question_ids` we load. We support 'correct', 'correct-iff-question', and 'correct-no-tricks'", "title": "Target Metric", "type": "string" }, "save_metrics": { "default": true, - "description": "Save metrics", - "title": "Save Metrics", + "description": "If true, we save the metrics for each set of intervention hyperparameters. This is required to be true currently, as the unlearning score is calculated over all results.", + "title": "Save Metrics Flag", "type": "boolean" }, "model_name": { "default": "gemma-2-2b-it", - "description": "Model name", + "description": "Model name. Note that this should be an instruct model.", "title": "Model Name", "type": "string" }, @@ -126,7 +126,7 @@ "UnlearningMetrics": { "properties": { "unlearning_score": { - "description": "Unlearning score", + "description": "Unlearning score, using methodology from `Applying Sparse Autoencoders to Unlearn Knowledge in Language Models`", "title": "Unlearning Score", "type": "number", "ui_default_display": true @@ -139,7 +139,7 @@ "type": "object" } }, - "description": "Unlearning evaluation description goes here.", + "description": "An evaluation of the ability of SAEs to unlearn biology knowledge from LLMs, using methodology from `Applying Sparse Autoencoders to Unlearn Knowledge in Language Models`", "properties": { "eval_type_id": { "default": "unlearning", diff --git a/sae_bench_utils/activation_collection.py b/sae_bench_utils/activation_collection.py index 338af26..3432b8f 100644 --- a/sae_bench_utils/activation_collection.py +++ b/sae_bench_utils/activation_collection.py @@ -14,12 +14,14 @@ LLM_NAME_TO_BATCH_SIZE = { "pythia-70m-deduped": 512, "gemma-2-2b": 32, + "gemma-2-9b": 32, } LLM_NAME_TO_DTYPE = { "pythia-70m-deduped": "float32", "gemma-2-2b": "bfloat16", "gemma-2-2b-it": "bfloat16", + "gemma-2-9b": "bfloat16", }
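For context on the final hunk, here is a minimal sketch of how the two lookup tables added to `sae_bench_utils/activation_collection.py` might be consumed when configuring an eval run. The `resolve_llm_settings` helper and its fallback defaults are hypothetical illustrations, not part of this diff:

```python
import torch

# Mirrors the tables added in sae_bench_utils/activation_collection.py above.
LLM_NAME_TO_BATCH_SIZE = {
    "pythia-70m-deduped": 512,
    "gemma-2-2b": 32,
    "gemma-2-9b": 32,
}

LLM_NAME_TO_DTYPE = {
    "pythia-70m-deduped": "float32",
    "gemma-2-2b": "bfloat16",
    "gemma-2-2b-it": "bfloat16",
    "gemma-2-9b": "bfloat16",
}

STR_TO_TORCH_DTYPE = {
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}


def resolve_llm_settings(model_name: str) -> tuple[int, torch.dtype]:
    """Hypothetical helper: look up the inference batch size and torch dtype
    for a model, falling back to conservative defaults for unknown models."""
    batch_size = LLM_NAME_TO_BATCH_SIZE.get(model_name, 32)
    dtype_str = LLM_NAME_TO_DTYPE.get(model_name, "float32")
    return batch_size, STR_TO_TORCH_DTYPE[dtype_str]


print(resolve_llm_settings("gemma-2-9b"))  # (32, torch.bfloat16)
```

This mirrors why the schema defaults above pair `gemma-2-2b` with `bfloat16` and a batch size of 32, while `pythia-70m-deduped` keeps `float32`.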
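Since the bulk of this diff edits generated JSON Schemas, a quick way to sanity-check an eval output against them is standard JSON Schema validation. A sketch, assuming the third-party `jsonschema` package; the output filename is a hypothetical placeholder:

```python
import json

from jsonschema import Draft202012Validator  # pip install jsonschema

# One of the schema files touched in this diff.
with open("evals/autointerp/eval_output_schema_autointerp.json") as f:
    schema = json.load(f)

# Hypothetical output file produced by an autointerp eval run.
with open("eval_output_autointerp.json") as f:
    output = json.load(f)

# Collect all validation errors rather than stopping at the first.
errors = list(Draft202012Validator(schema).iter_errors(output))
for err in errors:
    location = "/".join(str(p) for p in err.path) or "<root>"
    print(f"{location}: {err.message}")
print("valid" if not errors else f"{len(errors)} validation error(s)")
```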