From 2b2a6d36b6a1a1f561cc86ecdd0f425568bd2378 Mon Sep 17 00:00:00 2001 From: Johnny Lin Date: Mon, 27 Jan 2025 12:45:07 -0800 Subject: [PATCH] fix: Update json schema jsons --- ...output_schema_absorption_first_letter.json | 56 ++++++++++++--- .../eval_output_schema_autointerp.json | 8 ++- .../evals/core/eval_output_schema_core.json | 72 ++++++++++++++++++- sae_bench/evals/generate_json_schemas.py | 4 +- .../eval_output_schema_sparse_probing.json | 11 ++- 5 files changed, 134 insertions(+), 17 deletions(-) diff --git a/sae_bench/evals/absorption/eval_output_schema_absorption_first_letter.json b/sae_bench/evals/absorption/eval_output_schema_absorption_first_letter.json index 609c027..ba56f35 100644 --- a/sae_bench/evals/absorption/eval_output_schema_absorption_first_letter.json +++ b/sae_bench/evals/absorption/eval_output_schema_absorption_first_letter.json @@ -74,9 +74,15 @@ }, "AbsorptionMeanMetrics": { "properties": { - "mean_absorption_score": { - "description": "Average of the absorption scores across all letters", - "title": "Mean Absorption Score", + "mean_absorption_fraction_score": { + "description": "Average of the absorption fraction scores across all letters", + "title": "Mean Absorption Fraction Score", + "type": "number", + "ui_default_display": true + }, + "mean_full_absorption_score": { + "description": "Average of the full absorption scores across all letters", + "title": "Mean Full Absorption Score", "type": "number", "ui_default_display": true }, @@ -85,11 +91,33 @@ "title": "Mean Number of Split Features", "type": "number", "ui_default_display": true + }, + "std_dev_absorption_fraction_score": { + "description": "Standard deviation of the absorption fraction scores across all letters", + "title": "Standard Deviation of Absorption Fraction Score", + "type": "number", + "ui_default_display": true + }, + "std_dev_full_absorption_score": { + "description": "Standard deviation of the full absorption scores across all letters", + "title": "Standard Deviation of Full Absorption Score", + "type": "number", + "ui_default_display": true + }, + "std_dev_num_split_features": { + "description": "Standard deviation of the number of split features across all letters", + "title": "Standard Deviation of Number of Split Features", + "type": "number", + "ui_default_display": true } }, "required": [ - "mean_absorption_score", - "mean_num_split_features" + "mean_absorption_fraction_score", + "mean_full_absorption_score", + "mean_num_split_features", + "std_dev_absorption_fraction_score", + "std_dev_full_absorption_score", + "std_dev_num_split_features" ], "title": "AbsorptionMeanMetrics", "type": "object" @@ -116,14 +144,19 @@ "title": "First Letter", "type": "string" }, - "absorption_rate": { + "mean_absorption_fraction": { + "description": "", + "title": "Mean Absorption Fraction", + "type": "number" + }, + "full_absorption_rate": { "description": "", - "title": "Absorption Rate", + "title": "Rate of Full Absorption", "type": "number" }, - "num_absorption": { + "num_full_absorption": { "description": "", - "title": "Num Absorption", + "title": "Num Full Absorption", "type": "integer" }, "num_probe_true_positives": { @@ -139,8 +172,9 @@ }, "required": [ "first_letter", - "absorption_rate", - "num_absorption", + "mean_absorption_fraction", + "full_absorption_rate", + "num_full_absorption", "num_probe_true_positives", "num_split_features" ], diff --git a/sae_bench/evals/autointerp/eval_output_schema_autointerp.json b/sae_bench/evals/autointerp/eval_output_schema_autointerp.json index 3ac70db..df4e31a 100644 --- a/sae_bench/evals/autointerp/eval_output_schema_autointerp.json +++ b/sae_bench/evals/autointerp/eval_output_schema_autointerp.json @@ -164,10 +164,16 @@ "title": "AutoInterp Score", "type": "number", "ui_default_display": true + }, + "autointerp_std_dev": { + "description": "AutoInterp detection score standard deviation over all tested features", + "title": "AutoInterp Standard Deviation", + "type": "number" } }, "required": [ - "autointerp_score" + "autointerp_score", + "autointerp_std_dev" ], "title": "AutoInterpMetrics", "type": "object" diff --git a/sae_bench/evals/core/eval_output_schema_core.json b/sae_bench/evals/core/eval_output_schema_core.json index add9156..47fa525 100644 --- a/sae_bench/evals/core/eval_output_schema_core.json +++ b/sae_bench/evals/core/eval_output_schema_core.json @@ -133,6 +133,16 @@ "description": "Cosine similarity between encoder and decoder weights for each feature", "title": "Encoder-Decoder Cosine Similarity", "type": "number" + }, + "max_decoder_cosine_sim": { + "description": "Maximum cosine similarity with any other feature's decoder weights", + "title": "Max Decoder Cosine Similarity", + "type": "number" + }, + "max_encoder_cosine_sim": { + "description": "Maximum cosine similarity with any other feature's encoder weights", + "title": "Max Encoder Cosine Similarity", + "type": "number" } }, "required": [ @@ -141,7 +151,9 @@ "consistent_activation_heuristic", "encoder_bias", "encoder_norm", - "encoder_decoder_cosine_sim" + "encoder_decoder_cosine_sim", + "max_decoder_cosine_sim", + "max_encoder_cosine_sim" ], "title": "CoreFeatureMetric", "type": "object" @@ -177,6 +189,11 @@ "$ref": "#/$defs/TokenStatsMetrics", "description": "Statistics about the number of tokens used in evaluation", "title": "Token Statistics" + }, + "misc_metrics": { + "$ref": "#/$defs/MiscMetrics", + "description": "Miscellaneous metrics", + "title": "Miscellaneous Metrics" } }, "required": [ @@ -185,11 +202,62 @@ "reconstruction_quality", "shrinkage", "sparsity", - "token_stats" + "token_stats", + "misc_metrics" ], "title": "CoreMetricCategories", "type": "object" }, + "MiscMetrics": { + "properties": { + "freq_over_1_percent": { + "description": "Proportion of tokens that activate each feature more than 1% of the time", + "title": "Activation Frequency Over 1%", + "type": "number" + }, + "freq_over_10_percent": { + "description": "Proportion of tokens that activate each feature more than 10% of the time", + "title": "Activation Frequency Over 10%", + "type": "number" + }, + "normalized_freq_over_1_percent": { + "description": "Sum of > 1% activation frequency probabilities, normalized by the sum of all feature probabilities", + "title": "Normalized Activation Frequency Over 1%", + "type": "number" + }, + "normalized_freq_over_10_percent": { + "description": "Sum of > 10% activation frequency probabilities, normalized by the sum of all feature probabilities", + "title": "Normalized Activation Frequency Over 10%", + "type": "number" + }, + "average_max_encoder_cosine_sim": { + "description": "Average of the maximum cosine similarity with any other feature's encoder weights", + "title": "Average Max Encoder Cosine Similarity", + "type": "number" + }, + "average_max_decoder_cosine_sim": { + "description": "Average of the maximum cosine similarity with any other feature's decoder weights", + "title": "Average Max Decoder Cosine Similarity", + "type": "number" + }, + "frac_alive": { + "description": "Fraction of features that fired at least once during evaluation. This will likely be an underestimation due to a limited amount of tokens", + "title": "Fraction of Alive Features", + "type": "number" + } + }, + "required": [ + "freq_over_1_percent", + "freq_over_10_percent", + "normalized_freq_over_1_percent", + "normalized_freq_over_10_percent", + "average_max_encoder_cosine_sim", + "average_max_decoder_cosine_sim", + "frac_alive" + ], + "title": "MiscMetrics", + "type": "object" + }, "ModelBehaviorPreservationMetrics": { "properties": { "kl_div_score": { diff --git a/sae_bench/evals/generate_json_schemas.py b/sae_bench/evals/generate_json_schemas.py index 695919c..46e83ac 100644 --- a/sae_bench/evals/generate_json_schemas.py +++ b/sae_bench/evals/generate_json_schemas.py @@ -21,7 +21,9 @@ def main(): if file == "eval_output.py": print(file) module_path = os.path.relpath(os.path.join(root, file), base_dir) - module_name = module_path.replace("/", ".").replace(".py", "") + module_name = "sae_bench." + module_path.replace("/", ".").replace( + ".py", "" + ) try: module = __import__(module_name, fromlist=[""]) diff --git a/sae_bench/evals/sparse_probing/eval_output_schema_sparse_probing.json b/sae_bench/evals/sparse_probing/eval_output_schema_sparse_probing.json index 67e0b5c..8cc50b7 100644 --- a/sae_bench/evals/sparse_probing/eval_output_schema_sparse_probing.json +++ b/sae_bench/evals/sparse_probing/eval_output_schema_sparse_probing.json @@ -41,10 +41,17 @@ "type": "integer" }, "llm_batch_size": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], "default": null, "description": "LLM batch size. This is set by default in the main script, or it can be set with a command line argument.", - "title": "LLM Batch Size", - "type": "integer" + "title": "LLM Batch Size" }, "llm_dtype": { "default": "",