diff --git a/evals/absorption/eval_output_schema_absorption_first_letter.json b/evals/absorption/eval_output_schema_absorption_first_letter.json index d1dbc81..b0eef3e 100644 --- a/evals/absorption/eval_output_schema_absorption_first_letter.json +++ b/evals/absorption/eval_output_schema_absorption_first_letter.json @@ -33,10 +33,22 @@ "type": "integer" }, "model_name": { - "default": "pythia-70m-deduped", + "default": "gemma-2-2b", "description": "Model name", "title": "Model Name", "type": "string" + }, + "llm_batch_size": { + "default": 32, + "description": "LLM batch size, inference only", + "title": "LLM Batch Size", + "type": "integer" + }, + "llm_dtype": { + "default": "bfloat16", + "description": "LLM data type", + "title": "LLM Data Type", + "type": "string" + } }, "title": "AbsorptionEvalConfig", diff --git a/evals/autointerp/eval_output_schema_autointerp.json b/evals/autointerp/eval_output_schema_autointerp.json new file mode 100644 index 0000000..c7b454e --- /dev/null +++ b/evals/autointerp/eval_output_schema_autointerp.json @@ -0,0 +1,283 @@ +{ + "$defs": { + "AutoInterpEvalConfig": { + "description": "Controls all parameters for how autointerp will work.\n\nArguments:\n model_name: The name of the model to use\n device: The device to use\n n_latents: The number of latents to use\n override_latents: The latents to use (overrides n_latents if supplied)\n dead_latent_threshold: The log sparsity value below which we consider a latent to be dead\n seed: The seed to use for all randomness\n\n buffer: The size of the buffer to use for scoring\n no_overlap: Whether to prevent overlapping sequences for scoring\n act_threshold_frac: The fraction of the maximum activation to use as the activation threshold\n total_tokens: The total number of tokens we'll gather data for.\n batch_size: The batch size to use for the scoring phase\n scoring: Whether to perform the scoring phase, or just return explanations\n max_tokens_in_explanation: The maximum number of tokens to allow in an explanation\n use_demos_in_explanation: Whether to use demonstrations in the explanation prompt\n\n n_top_ex_for_generation: The number of top activating sequences to use for the generation phase\n n_iw_sampled_ex_for_generation: The number of importance-sampled sequences to use for the generation phase (this\n is a replacement for quantile sampling)\n\n n_top_ex_for_scoring: The number of top sequences to use for scoring\n n_random_ex_for_scoring: The number of random sequences to use for scoring\n n_iw_sampled_ex_for_scoring: The number of importance-sampled sequences to use for scoring", + "properties": { + "model_name": { + "default": "", + "description": "The name of the model to use", + "title": "Model Name", + "type": "string" + }, + "n_latents": { + "default": 1000, + "description": "The number of latents for the LLM judge to interpret", + "title": "Number of Latents", + "type": "integer" + }, + "override_latents": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The latents to use (overrides n_latents if supplied)", + "title": "Override Latents" + }, + "dead_latent_threshold": { + "default": 15, + "description": "Minimum number of required activations", + "title": "Dead Latent Threshold", + "type": "number" + }, + "random_seed": { + "default": 42, + "description": "The seed to use for all randomness", + "title": "Random Seed", + "type": "integer" + }, + "dataset_name": { + "default": "monology/pile-uncopyrighted", +
"description": "The name of the dataset to use", + "title": "Dataset Name", + "type": "string" + }, + "llm_context_size": { + "default": 128, + "description": "The context size to use for the LLM", + "title": "LLM Context Size", + "type": "integer" + }, + "llm_batch_size": { + "default": 512, + "description": "Split up total tokens into batches of this size", + "title": "LLM Batch Size", + "type": "integer" + }, + "llm_dtype": { + "default": "float32", + "description": "The data type to use for the LLM", + "title": "LLM Data Type", + "type": "string" + }, + "buffer": { + "default": 10, + "description": "The size of the buffer to use for scoring", + "title": "Buffer Size", + "type": "integer" + }, + "no_overlap": { + "default": true, + "description": "Whether to allow overlapping sequences for scoring", + "title": "No Overlap", + "type": "boolean" + }, + "act_threshold_frac": { + "default": 0.01, + "description": "The fraction of the maximum activation to use as the activation threshold", + "title": "Activation Threshold Fraction", + "type": "number" + }, + "total_tokens": { + "default": 2000000, + "description": "The total number of tokens we'll gather data for", + "title": "Total Tokens", + "type": "integer" + }, + "scoring": { + "default": true, + "description": "Whether to perform the scoring phase, or just return explanation", + "title": "Scoring", + "type": "boolean" + }, + "max_tokens_in_explanation": { + "default": 30, + "description": "The maximum number of tokens to allow in an explanation", + "title": "Max Tokens in Explanation", + "type": "integer" + }, + "use_demos_in_explanation": { + "default": true, + "description": "Whether to use demonstrations in the explanation prompt", + "title": "Use Demos in Explanation", + "type": "boolean" + }, + "n_top_ex_for_generation": { + "default": 10, + "description": "The number of top activating sequences to use for the generation phase", + "title": "Number of Top Examples for Generation", + "type": "integer" + }, + "n_iw_sampled_ex_for_generation": { + "default": 5, + "description": "The number of importance-sampled sequences to use for the generation phase", + "title": "Number of IW Sampled Examples for Generation", + "type": "integer" + }, + "n_top_ex_for_scoring": { + "default": 2, + "description": "The number of top sequences to use for scoring", + "title": "Number of Top Examples for Scoring", + "type": "integer" + }, + "n_random_ex_for_scoring": { + "default": 10, + "description": "The number of random sequences to use for scoring", + "title": "Number of Random Examples for Scoring", + "type": "integer" + }, + "n_iw_sampled_ex_for_scoring": { + "default": 2, + "description": "The number of importance-sampled sequences to use for scoring", + "title": "Number of IW Sampled Examples for Scoring", + "type": "integer" + } + }, + "title": "AutoInterpEvalConfig", + "type": "object" + }, + "AutoInterpMetricCategories": { + "properties": { + "autointerp": { + "$ref": "#/$defs/AutoInterpMetrics", + "description": "Metrics related to autointerp", + "title": "AutoInterp" + } + }, + "required": [ + "autointerp" + ], + "title": "AutoInterpMetricCategories", + "type": "object" + }, + "AutoInterpMetrics": { + "properties": { + "autointerp_score": { + "description": "AutoInterp detection score, using methodology similar to Eleuther's 'Open Source Automated Interpretability for Sparse Autoencoder Features'", + "title": "AutoInterp Score", + "type": "number", + "ui_default_display": true + } + }, + "required": [ + "autointerp_score" + ], + "title": 
"AutoInterpMetrics", + "type": "object" + }, + "BaseResultDetail": { + "properties": {}, + "title": "BaseResultDetail", + "type": "object" + } + }, + "description": "An evaluation of the interpretability of SAE latents. This evaluation is based on Eleuther's 'Open Source Automated Interpretability for Sparse Autoencoder Features'", + "properties": { + "eval_type_id": { + "default": "autointerp", + "description": "The type of the evaluation", + "title": "Eval Type ID", + "type": "string" + }, + "eval_config": { + "$ref": "#/$defs/AutoInterpEvalConfig", + "description": "The configuration of the evaluation.", + "title": "Eval Config Type" + }, + "eval_id": { + "description": "A unique UUID identifying this specific eval run", + "title": "ID", + "type": "string" + }, + "datetime_epoch_millis": { + "description": "The datetime of the evaluation in epoch milliseconds", + "title": "DateTime (epoch ms)", + "type": "integer" + }, + "eval_result_metrics": { + "$ref": "#/$defs/AutoInterpMetricCategories", + "description": "The metrics of the evaluation, organized by category. Define your own categories and the metrics that go inside them.", + "title": "Result Metrics Categorized" + }, + "eval_result_details": { + "default": null, + "description": "Optional. The details of the evaluation. A list of objects that stores nested or more detailed data, such as details about the absorption of each letter.", + "items": { + "$ref": "#/$defs/BaseResultDetail" + }, + "title": "Result Details", + "type": "array" + }, + "sae_bench_commit_hash": { + "description": "The commit hash of the SAE Bench that ran the evaluation.", + "title": "SAE Bench Commit Hash", + "type": "string" + }, + "sae_lens_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "description": "The ID of the SAE in SAE Lens.", + "title": "SAE Lens ID" + }, + "sae_lens_release_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "description": "The release ID of the SAE in SAE Lens.", + "title": "SAE Lens Release ID" + }, + "sae_lens_version": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "description": "The version of SAE Lens that ran the evaluation.", + "title": "SAE Lens Version" + }, + "eval_result_unstructured": { + "anyOf": [ + {}, + { + "type": "null" + } + ], + "default": null, + "description": "Optional. Any additional outputs that don't fit into the structured eval_result_metrics or eval_result_details fields. 
Since these are unstructured, don't expect this to be easily renderable in UIs, or to contain any titles or descriptions.", + "title": "Unstructured Results" + } + }, + "required": [ + "eval_config", + "eval_id", + "datetime_epoch_millis", + "eval_result_metrics", + "sae_bench_commit_hash", + "sae_lens_id", + "sae_lens_release_id", + "sae_lens_version" + ], + "title": "AutoInterp", + "type": "object" +} \ No newline at end of file diff --git a/evals/core/eval_output_schema_core.json b/evals/core/eval_output_schema_core.json index 4e8c3ed..129869f 100644 --- a/evals/core/eval_output_schema_core.json +++ b/evals/core/eval_output_schema_core.json @@ -8,6 +8,12 @@ "title": "Model Name", "type": "string" }, + "llm_dtype": { + "default": "float32", + "description": "LLM data type", + "title": "LLM Data Type", + "type": "string" + }, "batch_size_prompts": { "default": 16, "description": "Batch size for evaluation prompts", @@ -80,6 +86,12 @@ "title": "Compute Featurewise Weight-Based Metrics", "type": "boolean" }, + "exclude_special_tokens_from_reconstruction": { + "default": false, + "description": "Exclude special tokens like BOS, EOS, PAD from reconstruction", + "title": "Exclude Special Tokens from Reconstruction", + "type": "boolean" + }, "verbose": { "default": false, "description": "Enable verbose output", diff --git a/evals/scr_and_tpp/eval_output_schema_scr.json b/evals/scr_and_tpp/eval_output_schema_scr.json index 35c84aa..b6fadfc 100644 --- a/evals/scr_and_tpp/eval_output_schema_scr.json +++ b/evals/scr_and_tpp/eval_output_schema_scr.json @@ -1,6 +1,6 @@ { "$defs": { - "ShiftAndTppEvalConfig": { + "ScrAndTppEvalConfig": { "properties": { "random_seed": { "default": 42, @@ -9,7 +9,7 @@ "type": "integer" }, "dataset_names": { - "description": "List of dataset names for both the SHIFT and TPP metrics", + "description": "List of dataset names for both the SCR and TPP metrics", "items": { "type": "string" }, @@ -18,7 +18,7 @@ }, "perform_scr": { "default": true, - "description": "If True, the eval will be Spurious Correlation Removal (SCR) using SHIFT. If False, the eval will be TPP.", + "description": "If True, the eval will be Spurious Correlation Removal (SCR). If False, the eval will be TPP.", "title": "Perform Spurious Correlation Removal", "type": "boolean" }, @@ -89,7 +89,7 @@ "type": "integer" }, "llm_dtype": { - "default": "bfloat16", + "default": "float32", "description": "", "title": "LLM Dtype", "type": "string" @@ -125,30 +125,30 @@ }, "type": "array" }, - "description": "Column1 Values apply only to the SHIFT metric. Column1 values represents the class pairs we train the linear probes on. In each case, we will create a perfectly biased dataset, such as all professors are males and all nurses are females.", + "description": "Column1 values apply only to the SCR metric. Column1 values represent the class pairs we train the linear probes on. In each case, we will create a perfectly biased dataset, such as one in which all professors are male and all nurses are female.", "title": "Column 1 Values Lookup", "type": "object" } }, - "title": "ShiftAndTppEvalConfig", + "title": "ScrAndTppEvalConfig", "type": "object" }, - "ShiftMetricCategories": { + "ScrMetricCategories": { "properties": { - "shift_metrics": { - "$ref": "#/$defs/ShiftMetrics", - "description": "SHIFT SCR metrics, calculated for different numbers of ablated features. 
Also includes the results for both correlation removal directions.", - "title": "Shift Metrics", + "scr_metrics": { + "$ref": "#/$defs/ScrMetrics", + "description": "SCR metrics, calculated for different numbers of ablated features. Also includes the results for both correlation removal directions.", + "title": "SCR Metrics", + "ui_default_display": true } }, "required": [ - "shift_metrics" + "scr_metrics" ], - "title": "ShiftMetricCategories", + "title": "ScrMetricCategories", "type": "object" }, - "ShiftMetrics": { + "ScrMetrics": { "properties": { "scr_dir1_threshold_2": { "anyOf": [ @@ -425,10 +425,10 @@ "title": "SCR Dir 2, Top 500 SAE latents" } }, - "title": "ShiftMetrics", + "title": "ScrMetrics", "type": "object" }, - "ShiftResultDetail": { + "ScrResultDetail": { "properties": { "dataset_name": { "description": "", @@ -713,11 +713,11 @@ "required": [ "dataset_name" ], - "title": "ShiftResultDetail", + "title": "ScrResultDetail", "type": "object" } }, - "description": "The SHIFT Spurious Correlation Removal (SCR) evaluation ablates SAE latents to shift the bias of a biased linear probe. The methodology is from `Evaluating Sparse Autoencoders on Targeted Concept Removal Tasks`.", + "description": "The Spurious Correlation Removal (SCR) evaluation ablates SAE latents to shift the bias of a biased linear probe. The methodology is from `Evaluating Sparse Autoencoders on Targeted Concept Removal Tasks`.", "properties": { "eval_type_id": { "default": "scr", @@ -726,7 +726,7 @@ "type": "string" }, "eval_config": { - "$ref": "#/$defs/ShiftAndTppEvalConfig", + "$ref": "#/$defs/ScrAndTppEvalConfig", "description": "The configuration of the evaluation.", "title": "Eval Config Type" }, @@ -741,16 +741,16 @@ "type": "integer" }, "eval_result_metrics": { - "$ref": "#/$defs/ShiftMetricCategories", + "$ref": "#/$defs/ScrMetricCategories", "description": "The metrics of the evaluation, organized by category. Define your own categories and the metrics that go inside them.", "title": "Result Metrics Categorized" }, "eval_result_details": { - "description": "Each object is a stat on the SHIFT SCR results for a single dataset.", + "description": "Each object contains the SCR results for a single dataset.", "items": { - "$ref": "#/$defs/ShiftResultDetail" + "$ref": "#/$defs/ScrResultDetail" }, - "title": "Per-Dataset SHIFT Spurious Correlation Removal (SCR) Results", + "title": "Per-Dataset Spurious Correlation Removal (SCR) Results", "type": "array" }, "sae_bench_commit_hash": { @@ -816,6 +816,6 @@ "sae_lens_release_id", "sae_lens_version" ], - "title": "SHIFT", + "title": "SCR", "type": "object" } \ No newline at end of file diff --git a/evals/scr_and_tpp/eval_output_schema_tpp.json b/evals/scr_and_tpp/eval_output_schema_tpp.json index e34cbc8..f625d83 100644 --- a/evals/scr_and_tpp/eval_output_schema_tpp.json +++ b/evals/scr_and_tpp/eval_output_schema_tpp.json @@ -1,6 +1,6 @@ { "$defs": { - "ShiftAndTppEvalConfig": { + "ScrAndTppEvalConfig": { "properties": { "random_seed": { "default": 42, @@ -9,7 +9,7 @@ "type": "integer" }, "dataset_names": { - "description": "List of dataset names for both the SHIFT and TPP metrics", + "description": "List of dataset names for both the SCR and TPP metrics", "items": { "type": "string" }, @@ -18,7 +18,7 @@ }, "perform_scr": { "default": true, - "description": "If True, the eval will be Spurious Correlation Removal (SCR) using SHIFT. If False, the eval will be TPP.", + "description": "If True, the eval will be Spurious Correlation Removal (SCR). 
If False, the eval will be TPP.", "title": "Perform Spurious Correlation Removal", "type": "boolean" }, @@ -89,7 +89,7 @@ "type": "integer" }, "llm_dtype": { - "default": "bfloat16", + "default": "float32", "description": "", "title": "LLM Dtype", "type": "string" @@ -125,12 +125,12 @@ }, "type": "array" }, - "description": "Column1 Values apply only to the SHIFT metric. Column1 values represents the class pairs we train the linear probes on. In each case, we will create a perfectly biased dataset, such as all professors are males and all nurses are females.", + "description": "Column1 values apply only to the SCR metric. Column1 values represent the class pairs we train the linear probes on. In each case, we will create a perfectly biased dataset, such as one in which all professors are male and all nurses are female.", "title": "Column 1 Values Lookup", "type": "object" } }, - "title": "ShiftAndTppEvalConfig", + "title": "ScrAndTppEvalConfig", "type": "object" }, "TppMetricCategories": { @@ -726,7 +726,7 @@ "type": "string" }, "eval_config": { - "$ref": "#/$defs/ShiftAndTppEvalConfig", + "$ref": "#/$defs/ScrAndTppEvalConfig", "description": "The configuration of the evaluation.", "title": "Eval Config Type" }, diff --git a/evals/sparse_probing/eval_output_schema_sparse_probing.json b/evals/sparse_probing/eval_output_schema_sparse_probing.json index f0c14cf..ae9bd2c 100644 --- a/evals/sparse_probing/eval_output_schema_sparse_probing.json +++ b/evals/sparse_probing/eval_output_schema_sparse_probing.json @@ -531,7 +531,7 @@ "type": "object" } }, - "description": "Sparse probing evaluation description goes here.", + "description": "An evaluation using SAEs to probe for supervised concepts in LLMs. We use sparse probing with the top K SAE latents and probe for over 30 different classes across 5 datasets.", "properties": { "eval_type_id": { "default": "sparse_probing", diff --git a/evals/unlearning/eval_output_schema_unlearning.json b/evals/unlearning/eval_output_schema_unlearning.json index 4b6fc3a..7e94fc9 100644 --- a/evals/unlearning/eval_output_schema_unlearning.json +++ b/evals/unlearning/eval_output_schema_unlearning.json @@ -14,7 +14,7 @@ "type": "integer" }, "dataset_names": { - "description": "List of dataset names", + "description": "List of dataset names. We want to unlearn wmdp-bio while retaining knowledge in other datasets", "items": { "type": "string" }, @@ -23,12 +23,12 @@ }, "intervention_method": { "default": "clamp_feature_activation", - "description": "Intervention method", + "description": "Intervention method. We only support 'clamp_feature_activation' for now", "title": "Intervention Method", "type": "string" }, "retain_thresholds": { - "description": "Retain thresholds", + "description": "We ignore features that activate more than this threshold on the retain dataset", "items": { "type": "number" }, @@ -36,7 +36,7 @@ "type": "array" }, "n_features_list": { - "description": "N features list", + "description": "Each N is the number of features we select and clamp to a negative value", "items": { "type": "integer" }, @@ -44,7 +44,7 @@ "type": "array" }, "multipliers": { - "description": "Multipliers", + "description": "A list of negative values. 
We iterate over this list, clamping the selected features to each value", "items": { "type": "integer" }, @@ -65,37 +65,37 @@ }, "dataset_size": { "default": 1024, - "description": "Dataset size", + "description": "Dataset size we use when calculating feature sparsity", "title": "Dataset Size", "type": "integer" }, "seq_len": { "default": 1024, - "description": "Sequence length", + "description": "Sequence length when calculating feature sparsity", "title": "Sequence Length", "type": "integer" }, "n_batch_loss_added": { "default": 50, - "description": "N batch loss added", + "description": "Number of batches to use when calculating the loss added by an intervention (currently not supported).", "title": "N Batch Loss Added", "type": "integer" }, "target_metric": { "default": "correct", - "description": "Target metric", + "description": "Controls the type of `question_ids` we load. We support 'correct', 'correct-iff-question', and 'correct-no-tricks'", "title": "Target Metric", "type": "string" }, "save_metrics": { "default": true, - "description": "Save metrics", - "title": "Save Metrics", + "description": "If true, we save the metrics for each set of intervention hyperparameters. This is required to be true currently, as the unlearning score is calculated over all results.", + "title": "Save Metrics Flag", "type": "boolean" }, "model_name": { "default": "gemma-2-2b-it", - "description": "Model name", + "description": "Model name. Note that this should be an instruct model.", "title": "Model Name", "type": "string" }, @@ -126,7 +126,7 @@ "UnlearningMetrics": { "properties": { "unlearning_score": { - "description": "Unlearning score", + "description": "Unlearning score, using methodology from `Applying Sparse Autoencoders to Unlearn Knowledge in Language Models`", "title": "Unlearning Score", "type": "number", "ui_default_display": true @@ -139,7 +139,7 @@ "type": "object" } }, - "description": "Unlearning evaluation description goes here.", + "description": "An evaluation of the ability of SAEs to unlearn biology knowledge from LLMs, using methodology from `Applying Sparse Autoencoders to Unlearn Knowledge in Language Models`", "properties": { "eval_type_id": { "default": "unlearning", diff --git a/sae_bench_utils/activation_collection.py b/sae_bench_utils/activation_collection.py index 338af26..3432b8f 100644 --- a/sae_bench_utils/activation_collection.py +++ b/sae_bench_utils/activation_collection.py @@ -14,12 +14,14 @@ LLM_NAME_TO_BATCH_SIZE = { "pythia-70m-deduped": 512, "gemma-2-2b": 32, + "gemma-2-9b": 32, } LLM_NAME_TO_DTYPE = { "pythia-70m-deduped": "float32", "gemma-2-2b": "bfloat16", "gemma-2-2b-it": "bfloat16", + "gemma-2-9b": "bfloat16", }
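For context on the final hunk, here is a minimal sketch of how the two lookup tables added to `sae_bench_utils/activation_collection.py` might be consumed when configuring an eval run. The `resolve_llm_settings` helper and its fallback defaults are hypothetical illustrations, not part of this diff:

```python
import torch

# Mirrors the tables added in sae_bench_utils/activation_collection.py above.
LLM_NAME_TO_BATCH_SIZE = {
    "pythia-70m-deduped": 512,
    "gemma-2-2b": 32,
    "gemma-2-9b": 32,
}

LLM_NAME_TO_DTYPE = {
    "pythia-70m-deduped": "float32",
    "gemma-2-2b": "bfloat16",
    "gemma-2-2b-it": "bfloat16",
    "gemma-2-9b": "bfloat16",
}

STR_TO_TORCH_DTYPE = {
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}


def resolve_llm_settings(model_name: str) -> tuple[int, torch.dtype]:
    """Hypothetical helper: look up the inference batch size and torch dtype
    for a model, falling back to conservative defaults for unknown models."""
    batch_size = LLM_NAME_TO_BATCH_SIZE.get(model_name, 32)
    dtype_str = LLM_NAME_TO_DTYPE.get(model_name, "float32")
    return batch_size, STR_TO_TORCH_DTYPE[dtype_str]


print(resolve_llm_settings("gemma-2-9b"))  # (32, torch.bfloat16)
```

This mirrors why the schema defaults above pair `gemma-2-2b` with `bfloat16` and a batch size of 32, while `pythia-70m-deduped` keeps `float32`.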
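Since the bulk of this diff edits generated JSON Schemas, a quick way to sanity-check an eval output against them is standard JSON Schema validation. A sketch, assuming the third-party `jsonschema` package; the output filename is a hypothetical placeholder:

```python
import json

from jsonschema import Draft202012Validator  # pip install jsonschema

# One of the schema files touched in this diff.
with open("evals/autointerp/eval_output_schema_autointerp.json") as f:
    schema = json.load(f)

# Hypothetical output file produced by an autointerp eval run.
with open("eval_output_autointerp.json") as f:
    output = json.load(f)

# Collect all validation errors rather than stopping at the first.
errors = list(Draft202012Validator(schema).iter_errors(output))
for err in errors:
    location = "/".join(str(p) for p in err.path) or "<root>"
    print(f"{location}: {err.message}")
print("valid" if not errors else f"{len(errors)} validation error(s)")
```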