From 4b23575cca0f9422c7cdbefac093c6dead6a55eb Mon Sep 17 00:00:00 2001 From: callummcdougall Date: Wed, 16 Oct 2024 19:09:53 +0000 Subject: [PATCH] first commit --- evals/autointerp/README.md | 19 ++ evals/autointerp/config.py | 93 +++++++ evals/autointerp/demo.py | 48 ++++ evals/autointerp/logs_100.txt | 304 +++++++++++++++++++++ evals/autointerp/logs_4.txt | 213 +++++++++++++++ evals/autointerp/main.py | 421 ++++++++++++++++++++++++++++++ sae_bench_utils/indexing_utils.py | 91 +++++++ tests/test_utils.py | 49 +++- 8 files changed, 1237 insertions(+), 1 deletion(-) create mode 100644 evals/autointerp/README.md create mode 100644 evals/autointerp/config.py create mode 100644 evals/autointerp/demo.py create mode 100644 evals/autointerp/logs_100.txt create mode 100644 evals/autointerp/logs_4.txt create mode 100644 evals/autointerp/main.py create mode 100644 sae_bench_utils/indexing_utils.py diff --git a/evals/autointerp/README.md b/evals/autointerp/README.md new file mode 100644 index 0000000..30ade1c --- /dev/null +++ b/evals/autointerp/README.md @@ -0,0 +1,19 @@ +# AutoInterp + +## File structure + +There are 3 Python files in this folder: + +- `config.py` - this contains the config class for AutoInterp. +- `main.py` - this contains the main `AutoInterp` class, as well as the functions which are the interface to the rest of the SAEBench codebase. +- `demo.py` - you can run this via `python demo.py --api_key YOUR_API_KEY` to see an example output & how the function works. It creates & saves a log file (I've left the output of those files in the repo, so you can see what they look like). + +## Summary of how it works + +### Generation phase + +We run a batch through the model & SAE, getting activation values.
We take some number of sequences from the top of the activation distribution, and also sample some number of sequences from the rest of the distribution with sample probability proportional to their activation (this is a stand-in for quantile sampling, which should be more compatible with e.g. Gated models which won't have values in all quantiles). We take these sequences and format the activating token using `<>` syntax, then feed them through the model and ask for an explanation. + +### Scoring phase + +We select some number of top sequences & importance weighting sampled sequences (like the generation phase), but also include some sequences chosen randomly from the rest of the distribution. We'll shuffle these together and give them to the LLM as a numbered list, and we'll ask the LLM to return a comma-separated list of the indices of the sequences which it thinks will activate this feature. \ No newline at end of file diff --git a/evals/autointerp/config.py b/evals/autointerp/config.py new file mode 100644 index 0000000..76ff3b1 --- /dev/null +++ b/evals/autointerp/config.py @@ -0,0 +1,93 @@ +from dataclasses import dataclass + + +@dataclass +class AutoInterpConfig: + """ + Controls all parameters for how autointerp will work. + + Arguments: + model_name: The name of the model to use + device: The device to use + n_latents: The number of latents to use + override_latents: The latents to use (overrides n_latents if supplied) + seed: The seed to use for all randomness + + buffer: The size of the buffer to use for scoring + no_overlap: Whether to allow overlapping sequences for scoring + act_threshold_frac: The fraction of the maximum activation to use as the activation threshold + total_tokens: The total number of tokens we'll gather data for. 
+ batch_size: The batch size to use for the scoring phase + scoring: Whether to perform the scoring phase, or just return explanation + max_tokens_in_explanation: The maximum number of tokens to allow in an explanation + use_demos_in_explanation: Whether to use demonstrations in the explanation prompt + + n_top_ex_for_generation: The number of top activating sequences to use for the generation phase + n_iw_sampled_ex_for_generation: The number of importance-sampled sequences to use for the generation phase (this + is a replacement for quantile sampling) + + n_top_ex_for_scoring: The number of top sequences to use for scoring + n_random_ex_for_scoring: The number of random sequences to use for scoring + n_iw_sampled_ex_for_scoring: The number of importance-sampled sequences to use for scoring + """ + + # Important stuff + model_name: str + n_latents: int | None = None + override_latents: list[int] | None = None + seed: int = 0 + + # Main stuff + buffer: int = 10 + no_overlap: bool = True + act_threshold_frac: float = 0.01 + total_tokens: int = 10_000_000 + batch_size: int = 512 # split up total tokens into batches of this size + scoring: bool = True + max_tokens_in_explanation: int = 30 + use_demos_in_explanation: bool = True + + # Sequences included in generation phase + n_top_ex_for_generation: int = 10 + n_iw_sampled_ex_for_generation: int = 5 + + # Sequences included in scoring phase + n_top_ex_for_scoring: int = 4 + n_random_ex_for_scoring: int = 10 + n_iw_sampled_ex_for_scoring: int = 0 + + def __post_init__(self): + if self.n_latents is None: + assert self.override_latents is not None + self.latents = self.override_latents + self.n_latents = len(self.latents) + else: + assert self.override_latents is None + self.latents = None + + @property + def n_top_ex(self): + """When fetching data, we get the top examples for generation & scoring simultaneously.""" + return self.n_top_ex_for_generation + self.n_top_ex_for_scoring + + @property + def 
import argparse
from pathlib import Path

import torch

from evals.autointerp.config import AutoInterpConfig
from evals.autointerp.main import run_eval


def _parse_args() -> argparse.Namespace:
    """Parse command-line arguments (just the required API key)."""
    parser = argparse.ArgumentParser(description="Run AutoInterp evaluation.")
    parser.add_argument(
        "--api_key", type=str, required=True, help="API key for the evaluation."
    )
    return parser.parse_args()


def main() -> None:
    """Run two AutoInterp demos on a GPT-2 SAE, saving log files next to this script."""
    api_key = _parse_args().api_key  # Use the API key supplied via command line

    # Prefer Apple-silicon MPS, then CUDA, falling back to CPU.
    device = torch.device(
        "mps"
        if torch.backends.mps.is_available()
        else "cuda"
        if torch.cuda.is_available()
        else "cpu"
    )

    selected_saes_dict = {
        "gpt2-small-res-jb": ["blocks.7.hook_resid_pre"],
    }
    torch.set_grad_enabled(False)

    # ! Demo 1: just 4 specially chosen latents
    cfg = AutoInterpConfig(model_name="gpt2-small", override_latents=[9, 11, 15, 16873])
    save_logs_path = Path(__file__).parent / "logs_4.txt"
    save_logs_path.unlink(missing_ok=True)  # start each demo with a fresh log file
    results = run_eval(
        cfg, selected_saes_dict, device, api_key, save_logs_path=save_logs_path
    )
    print(results)

    # ! Demo 2: 100 randomly chosen latents
    cfg = AutoInterpConfig(model_name="gpt2-small", n_latents=100)
    save_logs_path = Path(__file__).parent / "logs_100.txt"
    save_logs_path.unlink(missing_ok=True)
    results = run_eval(
        cfg, selected_saes_dict, device, api_key, save_logs_path=save_logs_path
    )
    print(results)


if __name__ == "__main__":
    # python demo.py --api_key "YOUR_API_KEY"
    main()
impairment and cognitive functions in various contexts │ [3, 5, 14] │ [1, 3, 5, 14] │ 0.928571 │ +│ 11045 │ the surname 'Singh' and other specific names like 'Hussein' and 'Herrera' │ [5, 12] │ [3, 5, 9, 12] │ 0.857143 │ +│ 10566 │ variations of the substring 'ht' and similar letter combinations │ [1, 3, 6, 13] │ [6, 7, 8, 13] │ 0.714286 │ +│ 10764 │ the concepts of knowledge and understanding expressed through various forms of the word "know" │ [5, 6, 12, 13] │ [5, 6, 12, 13] │ 1 │ +│ 12609 │ the phrase 'acts as' in various contexts │ [3, 6] │ [2, 3, 6, 8] │ 0.857143 │ +│ 9805 │ the conjunction 'or' in varied contexts suggesting alternatives or choices │ [1, 2, 8, 10, 11] │ [1, 2, 8, 11] │ 0.928571 │ +│ 12069 │ phrases indicating quantities or counts of people, items, or groups │ [1, 9, 10, 12, 14] │ [1, 10, 12, 14] │ 0.928571 │ +│ 13862 │ phrases related to official duties and responsibilities under oath or accountability │ [6, 12] │ [2, 6, 11, 14] │ 0.714286 │ +│ 12904 │ the word 'newly' followed by various contexts indicating recent developments or changes │ [1, 4, 7, 10] │ [1, 4, 7, 10] │ 1 │ +│ 13040 │ variations of the phrase "just in" and related expressions indicating timing and presence │ [1, 7, 10, 14] │ [1, 7, 10, 14] │ 1 │ +│ 14130 │ the concept of winning titles or elections │ [5, 7, 10, 12] │ [5, 7, 10, 12] │ 1 │ +│ 13868 │ text related to enjoyment and leisure activities such as movies and events │ [6, 10, 11] │ [6, 7, 10, 11] │ 0.928571 │ +│ 13300 │ the phrase 'either' indicating uncertainty or hesitation in statements │ [5, 6, 8, 9] │ [5, 6, 8, 9] │ 1 │ +│ 14798 │ comparisons indicated by the substring "than in" │ [5, 11] │ [1, 5, 7, 11] │ 0.857143 │ +│ 14697 │ specific names, titles, and scientific terms related to biology and media │ [1, 2, 5, 6, 11, 13] │ [1, 8, 10, 11] │ 0.571429 │ +│ 13976 │ variations of the word "clown" and concepts related to funerals or death │ [] │ [1, 10, 11, 13] │ 0.714286 │ +│ 13589 │ forms of the verb "to be" and 
related expressions about possibility and necessity │ [2, 3, 6, 9, 10, 11, 12] │ [1, 4, 6, 11] │ 0.5 │ +│ 11260 │ words describing mildness or positivity such as mild benign and pleasant │ [5, 7, 8, 10] │ [5, 7, 8, 10] │ 1 │ +│ 15987 │ phrases related to subscriptions, payments, and promotional marketing │ [11] │ [3, 5, 9, 11] │ 0.785714 │ +│ 16011 │ phrases related to increasing scores, setting up, or progressing in various contexts │ [2, 7, 11] │ [1, 7, 8, 11] │ 0.785714 │ +│ 15767 │ specific pronouns or demonstratives such as these that indicate focus or emphasis │ [1, 3, 9, 10, 12] │ [1, 8, 10, 14] │ 0.642857 │ +│ 16288 │ names of venues and facilities related to sports and events │ [5, 11, 13] │ [1, 5, 11, 13] │ 0.928571 │ +│ 16233 │ phrases expressing influence or the ability to affect someone's feelings or actions │ [2, 4, 8, 10, 14] │ [2, 4, 13, 14] │ 0.785714 │ +│ 15698 │ the word 'like' followed by examples or analogous phrases throughout the text │ [2, 7, 12, 13, 14] │ [2, 7, 13, 14] │ 0.928571 │ +│ 16309 │ phrases related to terms of agreements or conditions in various contexts │ [7, 9, 11, 13] │ [7, 9, 11, 13] │ 1 │ +│ 16339 │ terms related to scale and levels in various contexts │ [4, 5, 7, 9, 14] │ [4, 7, 9, 14] │ 0.928571 │ +│ 16332 │ terms related to previous studies and research findings │ [3, 7, 13, 14] │ [3, 7, 13, 14] │ 1 │ +│ 17006 │ social media interactions and expressions of admiration or support │ [3, 5, 6, 12] │ [3, 5, 6, 12] │ 1 │ +│ 905 │ the word "I" indicating personal thoughts or opinions │ [1, 3, 5, 13] │ [1, 3, 11, 13] │ 0.857143 │ +│ 257 │ terms related to economic political and business contexts and their implications │ [2, 4, 9, 10, 12, 14] │ [9, 10, 12, 13] │ 0.714286 │ +│ 17079 │ the term "think tank" in various contexts related to research and analysis │ [4, 7, 8, 10] │ [4, 7, 8, 10] │ 1 │ +│ 1379 │ the word 'Thumbnails' and coding syntax in technical contexts │ [] │ [1, 7, 10, 11] │ 0.714286 │ +│ 15021 │ the term 'external' 
in various contexts related to features and influences │ [5, 7, 9, 13] │ [5, 7, 9, 13] │ 1 │ +│ 15532 │ words related to popular culture and various geographic and cultural concepts │ [2, 6, 8, 11, 14] │ [2, 3, 9, 10] │ 0.5 │ +│ 17229 │ references to African Americans and their societal issues and experiences │ [1, 3, 4, 14] │ [1, 3, 4, 14] │ 1 │ +│ 2003 │ variations of the word 'motor' and related automotive terms │ [2, 3, 5, 10] │ [2, 3, 5, 10] │ 1 │ +│ 2092 │ numerical estimates of large groups of people affected by events or actions │ [5, 7, 8, 13] │ [5, 7, 8, 13] │ 1 │ +│ 17786 │ numerical data and section identifiers within various formats and contexts │ [2, 5, 8, 14] │ [2, 5, 8, 14] │ 1 │ +│ 2741 │ words related to various types of clothing and fashion items │ [3, 4, 12, 14] │ [3, 4, 12, 14] │ 1 │ +│ 18377 │ the phrase "wait" in various contexts indicating pauses or anticipation │ [2, 3, 7, 8] │ [2, 3, 7, 8] │ 1 │ +│ 2848 │ character names, titles of works, and specific phrases in creative contexts │ [2, 6, 8, 10] │ [2, 6, 8, 10] │ 1 │ +│ 19165 │ the concept of Las Vegas and related events or locations │ [6, 8, 9, 10] │ [6, 8, 9, 10] │ 1 │ +│ 1239 │ verbs that indicate actions or states of being in various contexts │ [2, 5, 6, 10, 12] │ [3, 5, 7, 14] │ 0.5 │ +│ 18478 │ various abbreviations and specialized terms across different contexts │ [2, 9, 10, 11, 14] │ [2, 6, 10, 14] │ 0.785714 │ +│ 18234 │ terms related to various types of public buildings and expansions or developments involving them │ [8, 10, 12] │ [8, 10, 11, 12] │ 0.928571 │ +│ 18497 │ the word 'keep' and variations of 'their' and 'the' in various contexts │ [3, 4, 6, 9, 10, 11, 14] │ [4, 6, 11, 14] │ 0.785714 │ +│ 3213 │ the phrase 'According to' followed by various sources or reports │ [3, 6, 7, 13] │ [3, 6, 7, 13] │ 1 │ +│ 3257 │ numerical data and performance metrics related to teams and rankings │ [8, 11, 12] │ [2, 9, 11, 12] │ 0.785714 │ +│ 19537 │ variations of the name 'Tress' and 
related terms in multiple contexts │ [4, 6, 12] │ [4, 6, 12, 14] │ 0.928571 │ +│ 19176 │ the substring "mid" in various contexts indicating time periods or midpoints │ [5, 6, 9, 13] │ [5, 6, 9, 13] │ 1 │ +│ 17904 │ words and concepts related to writing, inscriptions, and marks on surfaces │ [4, 5, 11, 14] │ [4, 5, 11, 14] │ 1 │ +│ 2774 │ names of people places and organizations along with specific terms related to language and nationality │ [2, 3, 4, 7, 9] │ [2, 6, 7, 14] │ 0.642857 │ +│ 3428 │ the concept of challenges, difficulties, or measures regarding inequality and resistance │ [9, 11] │ [1, 9, 10, 11] │ 0.857143 │ +│ 2157 │ numerical values and substrings related to statistics and data points │ [5, 6, 8, 10] │ [5, 6, 8, 10] │ 1 │ +│ 20428 │ the concept of allegations and accusations in various contexts │ [4, 6, 7, 9] │ [4, 6, 7, 9] │ 1 │ +│ 20537 │ words related to loss of consciousness and medical emergencies │ [3, 9, 11, 12] │ [3, 9, 11, 12] │ 1 │ +│ 20934 │ various forms of the word 'vote' and related concepts of voting and elections │ [1, 2, 12, 13] │ [1, 2, 12, 13] │ 1 │ +│ 3768 │ the word "then" in conditional statements │ [1, 5, 13, 14] │ [1, 5, 13, 14] │ 1 │ +│ 3801 │ names of individuals or references to notable figures or characters │ [4, 6, 12, 14] │ [7, 9, 10, 13] │ 0.428571 │ +│ 3432 │ the phrase 'impact on' in various contexts │ [6, 12, 14] │ [6, 7, 12, 14] │ 0.928571 │ +│ 3560 │ words containing the letter sequence 'c' and a few specific contexts related to them │ [2, 6, 7, 8, 10, 13, 14] │ [1, 7, 8, 10] │ 0.642857 │ +│ 20809 │ substrings found in names and titles across various contexts │ [3, 6, 13, 14] │ [1, 8, 12, 14] │ 0.571429 │ +│ 3896 │ specific substrings like "ourn", "ount", "ople", "ourners", "grieve", and "present" in │ [7, 13] │ [1, 3, 7, 13] │ 0.857143 │ +│ 20917 │ terms related to possession or ownership, particularly the word "their" in various contexts │ [3, 5, 8, 13] │ [3, 5, 8, 13] │ 1 │ +│ 4073 │ variations of the substring 
"ide," the concept of sides, and context related to nightlife and activities │ [5, 8, 12] │ [5, 8, 9, 12] │ 0.928571 │ +│ 22259 │ references to concrete masonry units and their specifications in construction contexts │ [] │ [8, 11, 12, 13] │ 0.714286 │ +│ 22181 │ threatening language and concepts related to violence and intimidation │ [2, 5, 10, 12] │ [4, 5, 6, 12] │ 0.714286 │ +│ 4862 │ names and expressions of uncertainty or emotional response in dialogues │ [2, 4, 6, 7] │ [4, 5, 6, 7] │ 0.857143 │ +│ 21004 │ demographic statistics and disparities in coverage or representation among different racial groups │ [5, 7, 14] │ [5, 7, 9, 14] │ 0.928571 │ +│ 5458 │ substrings of names and their variations across different contexts │ [1, 3, 4, 6, 9, 11] │ [3, 4, 9, 11] │ 0.857143 │ +│ 22515 │ the substring 'Mo' in various contexts and names │ [4, 7, 9, 14] │ [4, 7, 9, 14] │ 1 │ +│ 21956 │ the word 'source' in various contexts related to origins or causes │ [3, 5, 8, 11] │ [3, 5, 8, 11] │ 1 │ +│ 5100 │ words starting with specific letters and certain phrases related to digital and creative contexts │ [1, 4, 6, 9, 12] │ [3, 6, 11, 14] │ 0.5 │ +│ 22279 │ the word 'contain' and its variations in various contexts │ [1, 3, 5, 13] │ [1, 3, 5, 13] │ 1 │ +│ 22924 │ words related to specific locations or names containing the substring 'ork' │ [2, 12] │ [2, 5, 6, 12] │ 0.857143 │ +│ 5457 │ medical and scientific terms related to illnesses and elements like disease and chemicals │ [1, 3, 11] │ [1, 4, 9, 12] │ 0.642857 │ +│ 4795 │ the word 'if' in context of uncertainty or questioning │ [1, 5, 7, 9] │ [1, 5, 7, 9] │ 1 │ +│ 23003 │ words related to making or creating effects and processes │ [3, 5, 12] │ [2, 3, 5, 12] │ 0.928571 │ +│ 4958 │ the substring 'plan' and its variants in various contexts │ [1, 2, 5, 11] │ [1, 2, 5, 11] │ 1 │ +│ 23409 │ the word 'how' and concepts related to processes and methods │ [6, 8, 9, 13] │ [1, 6, 8, 13] │ 0.857143 │ +│ 24571 │ location names and 
associated events or organizations │ [1, 2, 3, 8, 9, 13] │ [1, 2, 9, 13] │ 0.857143 │ +│ 6289 │ the phrase 'son in law' and related family terms │ [11, 12] │ [1, 5, 11, 12] │ 0.857143 │ +│ 6489 │ references to the Philippines and the political figure Rodrigo Duterte │ [9, 10] │ [1, 9, 10, 12] │ 0.857143 │ +│ 6487 │ the word "Turkey" and variants primarily in political and military contexts │ [5, 6, 7, 11] │ [5, 6, 7, 11] │ 1 │ +│ 6618 │ the word 'form' in various contexts related to documents and processes │ [6, 9, 12] │ [4, 6, 9, 12] │ 0.928571 │ +│ 6608 │ the words related to proof or evidence, such as proven, unproven, and demonstrably │ [4, 6, 8, 13] │ [4, 6, 8, 13] │ 1 │ +│ 6790 │ aggressive actions and physical confrontations in various contexts │ [3, 9, 10, 13] │ [3, 9, 10, 13] │ 1 │ +│ 7243 │ names and proper nouns with specific substring patterns │ [3, 8, 12, 13] │ [3, 8, 12, 13] │ 1 │ +│ 8549 │ the term 'protesters' in various contexts of civic action and demonstrations │ [2, 10, 13, 14] │ [2, 10, 13, 14] │ 1 │ +│ 7213 │ the term 'Civil' related to various civil rights and conflicts │ [4, 8, 9, 12] │ [4, 8, 9, 12] │ 1 │ +│ 24076 │ the word 'interest' in various contexts of engagement and concern │ [4, 5, 8, 9] │ [4, 5, 8, 9] │ 1 │ +│ 7118 │ the substring 'ek' within various words and names │ [1, 6, 9] │ [1, 6, 9, 10] │ 0.928571 │ +│ 8404 │ navigational and interaction elements in online media and articles │ [2, 3, 5, 9] │ [2, 3, 5, 9] │ 1 │ +│ 7074 │ social media platform names and prompts related to sharing content │ [10, 11] │ [7, 9, 10, 11] │ 0.857143 │ +│ 11850 │ names of major companies and utilities related to energy and manufacturing sectors │ [2, 3, 12] │ [2, 3, 5, 12] │ 0.928571 │ +└──────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────────────────┴─────────────────┴──────────┘ + +Worst scoring idx 3801, score = 0.42857142857142855 +Generation phase 
+┌───────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ system │ We're studying neurons in a neural network. Each neuron activates on some particular word/words/substring/concept in a │ +│ │ short document. The activating words in each document are indicated with << ... >>. We will give you a list of documents │ +│ │ on which the neuron activates, in order from most strongly activating to least strongly activating. Look at the parts of │ +│ │ the document the neuron activates for and summarize in a single sentence what the neuron is activating on. Try not to be │ +│ │ overly specific in your explanation. Note that some neurons will activate only on specific words or substrings, but │ +│ │ others will activate on most/all words in a sentence provided that sentence contains some particular concept. Your │ +│ │ explanation should cover most or all activating words (for example, don't give an explanation which is specific to a │ +│ │ single word if all words in a sentence cause the neuron to activate). Pay attention to things like the capitalization │ +│ │ and punctuation of the activating words or concepts, if that seems relevant. Keep the explanation as short and simple as │ +│ │ possible, limited to 20 words or less. Omit punctuation and formatting. You should avoid giving long lists of words. │ +│ │ Some examples: "This neuron activates on the word 'knows' in rhetorical questions", and "This neuron activates on verbs │ +│ │ related to decision-making and preferences", and "This neuron activates on the substring 'Ent' at the start of words", │ +│ │ and "This neuron activates on text about government economic policy". │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ user │ The activating documents are given below: 1. 
the honorary degree of Doctor of Business Administration at the<< │ +│ │ Robert>><< Gordon>> University in Aberdeen, thanks largely to his company 2. scored the Best Film and Screenplay prizes │ +│ │ at the<< David>> di<< Don>>atello awards. The story unfolds at 3. medicine unit at the country's largest hospital, │ +│ │ the<< John>> F<< Kennedy>> Memorial Medical Center in Monrovia. 4. first-come, first-served basis in the<< John>><< │ +│ │ Paul>><< Jones>> Arena, University Hall and<< McC>>ue Center 5. LV basketball team because its offices are located in │ +│ │ the<< Thomas>> &<< Mack>> and its practice facility is located right next 6. the small town of Mayberry on The<< Andy>> │ +│ │ Griffith Show on TV in the 1960s 7. Independent)↵↵—Please make note of The<< Mary>> Sues general comment policy.—↵↵ 8. │ +│ │ is back when he used to do bits on the<< David>> Lettermans show when Letterman was 9. Chuck Na, one of two men who │ +│ │ attended the<< George>> Mason event. I was bullied a lot 10. interred in the Tikhvin Cemetery at the<< Alexander>> │ +│ │ Nevsky Convent,[111] near his favourite 11. aid Cleaves in February and two shows by the<< Glenn>> Miller Orchestra in │ +│ │ early March.↵↵Updated at 12. could look like.↵↵Number26, a<< Peter>> Thiel-backed German startup thats setting 13. │ +│ │ from \( f(x) \) to the<< Taylor>> series \( T_{10}(x) \ 14. get the chance to. Herman will go spot for<< Ryan>> Preece │ +│ │ on occasion like he did on Mother 15. phyle was originally created in the fictional setting of<< Neal>> Stephensons │ +│ │ The<< Diamond>> Age, and David │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ assistant │ This neuron activates on names of individuals or references to notable figures or characters. 
│ +└───────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌───────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Top act │ Sequence │ +├───────────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ 14.672 │ the honorary degree of Doctor of Business Administration at the<< Robert>><< Gordon>> University in Aberdeen, thanks largely to his company │ +│ 14.549 │ scored the Best Film and Screenplay prizes at the<< David>> di<< Don>>atello awards. The story unfolds at │ +│ 14.428 │ medicine unit at the country's largest hospital, the<< John>> F<< Kennedy>> Memorial Medical Center in Monrovia. │ +│ 14.203 │ first-come, first-served basis in the<< John>><< Paul>><< Jones>> Arena, University Hall and<< McC>>ue Center │ +│ 12.835 │ LV basketball team because its offices are located in the<< Thomas>> &<< Mack>> and its practice facility is located right next │ +│ 12.738 │ the small town of Mayberry on The<< Andy>> Griffith Show on TV in the 1960s │ +│ 12.408 │ Independent)↵↵—Please make note of The<< Mary>> Sues general comment policy.—↵↵ │ +│ 12.154 │ is back when he used to do bits on the<< David>> Lettermans show when Letterman was │ +│ 12.095 │ Chuck Na, one of two men who attended the<< George>> Mason event. I was bullied a lot │ +│ 11.963 │ interred in the Tikhvin Cemetery at the<< Alexander>> Nevsky Convent,[111] near his favourite │ +│ 10.736 │ aid Cleaves in February and two shows by the<< Glenn>> Miller Orchestra in early March.↵↵Updated at │ +│ 10.275 │ could look like.↵↵Number26, a<< Peter>> Thiel-backed German startup thats setting │ +│ 4.967 │ from \( f(x) \) to the<< Taylor>> series \( T_{10}(x) \ │ +│ 1.701 │ get the chance to. 
Herman will go spot for<< Ryan>> Preece on occasion like he did on Mother │ +│ 1.615 │ phyle was originally created in the fictional setting of<< Neal>> Stephensons The<< Diamond>> Age, and David │ +└───────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +Scoring phase +┌───────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ system │ We're studying neurons in a neural network. Each neuron activates on some particular word/words/substring/concept in a │ +│ │ short document. You will be given a short explanation of what this neuron activates for, and then be shown 14 example │ +│ │ sequences in random order. You will have to return a comma-separated list of the examples where you think the neuron │ +│ │ should activate at least once, on ANY of the words or substrings in the document. For example, your response might look │ +│ │ like "2, 3, 6, 8". Try not to be overly specific in your interpretation of the explanation. If you think there are no │ +│ │ examples where the neuron will activate, you should just respond with "None". You should include nothing else in your │ +│ │ response other than comma-separated numbers or the word "None" - this is important. │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ user │ Here is the explanation: this neuron fires on names of individuals or references to notable figures or characters. Here │ +│ │ are the examples: 1. PlayStation Vita portable, but early 2013 is obviously a different matter. Sony Japan announced │ +│ │ today that both the Wi 2. africanos y de los países delHubo tal migración que surgió un 3. advocacy organizations in │ +│ │ Southern California.<|endoftext|>Liver regeneration is the process by which the liver is able to replace 4. 
of towns │ +│ │ are going through," Niemeyer says of the budget cuts. "None of these things 5. England no natural left-sided option, │ +│ │ and with them struggling to keep the ball in the centre of the 6. Amaro Jr. insists it's only the media that is making │ +│ │ an issue of this. "Honestly," 7. -American stars in both radio and television. The Steve Harvey Show, Family Feud, and │ +│ │ Little Big 8. the Pixel Visual Core co-processor built into the company's Pixel 2 smartphones for the first time.↵ 9. │ +│ │ to UNLV basketball, the primary tenant of the Thomas & Mack Center: headaches. Lots and lots of 10. bing the influence │ +│ │ of big-money donors in the John A. Wilson Building.↵↵And, happiness 11. of systems and zpk2ss_cascade(z,p,k) — converts │ +│ │ Z 12. , understand the Koran, the hadith (the traditions and habits of Muhammad) or the sunna ( 13. rose.↵↵Study │ +│ │ Limitations↵↵The Roger Adams papers and other documents used in this research provide 14. .↵↵From the beginning, a │ +│ │ conservative bloc of lawmakers has urged leadership to ignore Obamas │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ assistant │ 4, 6, 12, 14 │ +└───────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌───────────┬───────────┬──────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Top act │ Active? │ Predicted? │ Sequence │ +├───────────┼───────────┼──────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ 0.000 │ │ │ PlayStation Vita portable, but early 2013 is obviously a different matter. 
Sony Japan announced today that both the Wi │ +│ 0.000 │ │ │ africanos y de los países delHubo tal migración que surgió un │ +│ 0.000 │ │ │ advocacy organizations in Southern California.<|endoftext|>Liver regeneration is the process by which the liver is able to replace │ +│ 0.000 │ │ Y │ of towns are going through," Niemeyer says of the budget cuts. "None of these things │ +│ 0.000 │ │ │ England no natural left-sided option, and with them struggling to keep the ball in the centre of the │ +│ 0.000 │ │ Y │ Amaro Jr. insists it's only the media that is making an issue of this. "Honestly," │ +│ 12.714 │ Y │ │ -American stars in both radio and television. The Steve Harvey Show, Family Feud, and Little Big │ +│ 0.000 │ │ │ the Pixel Visual Core co-processor built into the company's Pixel 2 smartphones for the first time.↵ │ +│ 13.336 │ Y │ │ to UNLV basketball, the primary tenant of the Thomas & Mack Center: headaches. Lots and lots of │ +│ 14.544 │ Y │ │ bing the influence of big-money donors in the John A. Wilson Building.↵↵And, happiness │ +│ 0.000 │ │ │ of systems and zpk2ss_cascade(z,p,k) — converts Z │ +│ 0.000 │ │ Y │ , understand the Koran, the hadith (the traditions and habits of Muhammad) or the sunna ( │ +│ 12.708 │ Y │ │ rose.↵↵Study Limitations↵↵The Roger Adams papers and other documents used in this research provide │ +│ 0.000 │ │ Y │ .↵↵From the beginning, a conservative bloc of lawmakers has urged leadership to ignore Obamas │ +└───────────┴───────────┴──────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + +Best scoring idx 9496, score = 1.0 +Generation phase +┌───────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ system │ We're studying neurons in a neural network. Each neuron activates on some particular word/words/substring/concept in a │ +│ │ short document. 
The activating words in each document are indicated with << ... >>. We will give you a list of documents │ +│ │ on which the neuron activates, in order from most strongly activating to least strongly activating. Look at the parts of │ +│ │ the document the neuron activates for and summarize in a single sentence what the neuron is activating on. Try not to be │ +│ │ overly specific in your explanation. Note that some neurons will activate only on specific words or substrings, but │ +│ │ others will activate on most/all words in a sentence provided that sentence contains some particular concept. Your │ +│ │ explanation should cover most or all activating words (for example, don't give an explanation which is specific to a │ +│ │ single word if all words in a sentence cause the neuron to activate). Pay attention to things like the capitalization │ +│ │ and punctuation of the activating words or concepts, if that seems relevant. Keep the explanation as short and simple as │ +│ │ possible, limited to 20 words or less. Omit punctuation and formatting. You should avoid giving long lists of words. │ +│ │ Some examples: "This neuron activates on the word 'knows' in rhetorical questions", and "This neuron activates on verbs │ +│ │ related to decision-making and preferences", and "This neuron activates on the substring 'Ent' at the start of words", │ +│ │ and "This neuron activates on text about government economic policy". │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ user │ The activating documents are given below: 1. ↵↵These internal documents show that the SRF<< initiated>> CHD research in │ +│ │ 1965 to protect market share and 2. (or other player covering home plate), or otherwise<< initiate>><< an>> avoidable │ +│ │ collision.↵↵Video 3. 
↵↵In crowd sourcing, the original group that<< initiated>><< the>> project is allowed to keep │ +│ │ intellectual property and distribution 4. of processing and presenting the antigen to T cells to<< initiate>><< an>> │ +│ │ immune response [ 16 ]. The helminth 5. Readers might be surprised to discover that women usually<< initiated>> major │ +│ │ changes in the anti-abortion movement. 6. Our policy is to forward e-mails<< initiated>> from and received by private │ +│ │ e-mail accounts to 7. Robert de Courcy Laffan, that first<< initiated>><< the>> annual birthday procession and laying │ +│ │ of flowers on Shakespeare 8. Major League Baseball confirmed that the collision Rizzo<< initiated>> was illegal, but │ +│ │ then opted not to discipline him 9. Employment Effects of Two Northwest Minimum Wage<< Initi>><>. Economic │ +│ │ Inquiry. 45(1 10. of Nov. 3, 2011, during political demonstrations<< initiated>> by the Occupy Oakland movement, the │ +│ │ complaint 11. After the response was entered, the next trial was<< initiated>> by pressing the space bar. There were 20 │ +│ │ trials 12. said hes met with the department and<< instituted>> more de-escalation training to make 13. .↵↵To that end, │ +│ │ the administration has<< instituted>> several other transparency initiatives. It has followed through on 14. J. Edgar │ +│ │ Hoover ordered the agency to acquire and<< commence>> regular training with automatic shoulder weapons, including the │ +│ │ Thompson 15. people praying as they walk."↵↵RAMha<> in Port-au-Prince tweets: "I │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ assistant │ This neuron activates on variations of the word 'initiate' and related terms regarding the start of actions or │ +│ │ processes. 
│ +└───────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌───────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Top act │ Sequence │ +├───────────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ 33.651 │ ↵↵These internal documents show that the SRF<< initiated>> CHD research in 1965 to protect market share and │ +│ 33.492 │ (or other player covering home plate), or otherwise<< initiate>><< an>> avoidable collision.↵↵Video │ +│ 31.308 │ ↵↵In crowd sourcing, the original group that<< initiated>><< the>> project is allowed to keep intellectual property and distribution │ +│ 31.226 │ of processing and presenting the antigen to T cells to<< initiate>><< an>> immune response [ 16 ]. The helminth │ +│ 30.588 │ Readers might be surprised to discover that women usually<< initiated>> major changes in the anti-abortion movement. │ +│ 30.148 │ Our policy is to forward e-mails<< initiated>> from and received by private e-mail accounts to │ +│ 29.706 │ Robert de Courcy Laffan, that first<< initiated>><< the>> annual birthday procession and laying of flowers on Shakespeare │ +│ 28.958 │ Major League Baseball confirmed that the collision Rizzo<< initiated>> was illegal, but then opted not to discipline him │ +│ 28.577 │ Employment Effects of Two Northwest Minimum Wage<< Initi>><>. Economic Inquiry. 45(1 │ +│ 28.308 │ of Nov. 3, 2011, during political demonstrations<< initiated>> by the Occupy Oakland movement, the complaint │ +│ 25.233 │ After the response was entered, the next trial was<< initiated>> by pressing the space bar. 
There were 20 trials │ +│ 15.639 │ said hes met with the department and<< instituted>> more de-escalation training to make │ +│ 12.571 │ .↵↵To that end, the administration has<< instituted>> several other transparency initiatives. It has followed through on │ +│ 10.688 │ J. Edgar Hoover ordered the agency to acquire and<< commence>> regular training with automatic shoulder weapons, including the Thompson │ +│ 4.830 │ people praying as they walk."↵↵RAMha<> in Port-au-Prince tweets: "I │ +└───────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +Scoring phase +┌───────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ system │ We're studying neurons in a neural network. Each neuron activates on some particular word/words/substring/concept in a │ +│ │ short document. You will be given a short explanation of what this neuron activates for, and then be shown 14 example │ +│ │ sequences in random order. You will have to return a comma-separated list of the examples where you think the neuron │ +│ │ should activate at least once, on ANY of the words or substrings in the document. For example, your response might look │ +│ │ like "4, 5, 7, 12". Try not to be overly specific in your interpretation of the explanation. If you think there are no │ +│ │ examples where the neuron will activate, you should just respond with "None". You should include nothing else in your │ +│ │ response other than comma-separated numbers or the word "None" - this is important. │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ user │ Here is the explanation: this neuron fires on variations of the word 'initiate' and related terms regarding the start of │ +│ │ actions or processes. Here are the examples: 1. 
the first spacecraft due to management errors, NASA had initiated a │ +│ │ crash review of the Mars Polar Lander to 2. 't repay. We need tourism to come back, we need the economy to come back and │ +│ │ we need a 3. and/or transmit any story or audio content published on this site under the terms of this license, but 4. │ +│ │ pull a protester forward and begin beating him over the head with sticks. Theres video of it 5. of factual errors │ +│ │ regarding the National Broadband Network project initiated by Labor, which has been continued as the Coalition 6. that │ +│ │ there was "no adequate explanation" for the black-white IQ gap.[162][163] In 7. while only 21 percent do. Among │ +│ │ moderates, the numbers are even worse with 70 percent reporting they lack confidence 8. legitimacy to two opposing │ +│ │ viewpoints. On the other hand, some integrative complexity scores are driven by elaborative 9. Sherpao of QWP objected │ +│ │ over the delay, saying that the assembly had passed resolutions asking the federal 10. force.↵↵The city has already │ +│ │ paid out millions in police misconduct lawsuits.↵↵Ortega 11. downe/Leinster), Paul OConnell (Young Munster) (capt), │ +│ │ Peter 12. Rizzo crosses the baseline to his left and initiates contact, deviating from his path to the 13. he told │ +│ │ reporters at the legislature Tuesday as the NDP government released the final year-end numbers.↵↵ 14. 
One can only │ +│ │ infer that because the Cultural Revolution was initiated by Mao--rather than a mass popular movement from │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ assistant │ 1, 5, 12, 14 │ +└───────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌───────────┬───────────┬──────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Top act │ Active? │ Predicted? │ Sequence │ +├───────────┼───────────┼──────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ 30.255 │ Y │ Y │ the first spacecraft due to management errors, NASA had initiated a crash review of the Mars Polar Lander to │ +│ 0.000 │ │ │ 't repay. We need tourism to come back, we need the economy to come back and we need a │ +│ 0.000 │ │ │ and/or transmit any story or audio content published on this site under the terms of this license, but │ +│ 0.000 │ │ │ pull a protester forward and begin beating him over the head with sticks. Theres video of it │ +│ 28.859 │ Y │ Y │ of factual errors regarding the National Broadband Network project initiated by Labor, which has been continued as the Coalition │ +│ 0.000 │ │ │ that there was "no adequate explanation" for the black-white IQ gap.[162][163] In │ +│ 0.000 │ │ │ while only 21 percent do. Among moderates, the numbers are even worse with 70 percent reporting they lack confidence │ +│ 0.000 │ │ │ legitimacy to two opposing viewpoints. 
On the other hand, some integrative complexity scores are driven by elaborative │ +│ 0.000 │ │ │ Sherpao of QWP objected over the delay, saying that the assembly had passed resolutions asking the federal │ +│ 0.000 │ │ │ force.↵↵The city has already paid out millions in police misconduct lawsuits.↵↵Ortega │ +│ 0.000 │ │ │ downe/Leinster), Paul OConnell (Young Munster) (capt), Peter │ +│ 35.404 │ Y │ Y │ Rizzo crosses the baseline to his left and initiates contact, deviating from his path to the │ +│ 0.000 │ │ │ he told reporters at the legislature Tuesday as the NDP government released the final year-end numbers.↵↵ │ +│ 28.650 │ Y │ Y │ One can only infer that because the Cultural Revolution was initiated by Mao--rather than a mass popular movement from │ +└───────────┴───────────┴──────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ \ No newline at end of file diff --git a/evals/autointerp/logs_4.txt b/evals/autointerp/logs_4.txt new file mode 100644 index 0000000..c007ba8 --- /dev/null +++ b/evals/autointerp/logs_4.txt @@ -0,0 +1,213 @@ +Summary table: +┌──────────┬───────────────────────────────────────────────────────────────────────────┬────────────────┬────────────────┬──────────┐ +│ latent │ explanation │ predictions │ correct seqs │ score │ +├──────────┼───────────────────────────────────────────────────────────────────────────┼────────────────┼────────────────┼──────────┤ +│ 9 │ the phrase 'new' in various contexts of rules, policies, and measures │ [4, 6, 7, 10] │ [4, 6, 7, 10] │ 1 │ +│ 11 │ variations of the substring 'rel' within different contexts and concepts │ [4, 7, 8, 11] │ [4, 7, 8, 11] │ 1 │ +│ 15 │ phrases related to debit and credit card transactions and associated fees │ [4, 9, 11, 14] │ [4, 9, 11, 14] │ 1 │ +│ 16873 │ texts related to belief, righteousness, and spiritual guidance │ [2, 7, 10] │ [2, 7, 8, 10] │ 0.928571 │ 
+└──────────┴───────────────────────────────────────────────────────────────────────────┴────────────────┴────────────────┴──────────┘ + +Worst scoring idx 16873, score = 0.9285714285714286 +Generation phase +┌───────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ system │ We're studying neurons in a neural network. Each neuron activates on some particular word/words/substring/concept in a │ +│ │ short document. The activating words in each document are indicated with << ... >>. We will give you a list of documents │ +│ │ on which the neuron activates, in order from most strongly activating to least strongly activating. Look at the parts of │ +│ │ the document the neuron activates for and summarize in a single sentence what the neuron is activating on. Try not to be │ +│ │ overly specific in your explanation. Note that some neurons will activate only on specific words or substrings, but │ +│ │ others will activate on most/all words in a sentence provided that sentence contains some particular concept. Your │ +│ │ explanation should cover most or all activating words (for example, don't give an explanation which is specific to a │ +│ │ single word if all words in a sentence cause the neuron to activate). Pay attention to things like the capitalization │ +│ │ and punctuation of the activating words or concepts, if that seems relevant. Keep the explanation as short and simple as │ +│ │ possible, limited to 20 words or less. Omit punctuation and formatting. You should avoid giving long lists of words. │ +│ │ Some examples: "This neuron activates on the word 'knows' in rhetorical questions", and "This neuron activates on verbs │ +│ │ related to decision-making and preferences", and "This neuron activates on the substring 'Ent' at the start of words", │ +│ │ and "This neuron activates on text about government economic policy". 
│ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ user │ The activating documents are given below: 1. Testament:<< Ver>><><<,>><< ver>><><<,>><< I>><< say>><< │ +│ │ unto>><< you>><<,>><< Except>><< a>><< corn>><< of>><< wheat>><< fall>><< into>><< the>> 2. <<↵>><><< do>><< │ +│ │ not>><< marry>><< poly>>the<><< women>><< until>><< they>><< believe>><<.>><< And>><< a>><< believing>><< │ +│ │ slave>><< woman>><< is>><< better>><< than>> 3. << testimony>><< to>><< all>><< nations>><<,>><< and>><< then>><< │ +│ │ the>><< end>><< will>><< come>><<">>↵↵I have played this CD for record 4. << thou>><< to>><< me>><><< And>><< │ +│ │ Jesus>><< answering>><< said>><< unto>><< him>><<,>><< S>><><< it>><< to>><< be>><< so>><< now>><<:>><< for>><< │ +│ │ thus>> 5. << ">><><< John>><< forb>><><< him>><<,>><< saying>><<,>><< I>><< have>><< a>><< need>><< to>><< │ +│ │ be>><< bapt>><><< of>><< thee>><<,>><< and>> 6. of Matthew 3:14–15:<< ">><><< John>><< forb>><><< │ +│ │ him>><<,>><< saying>><<,>><< I>><< have>><< a>><< need>> 7. << it>><< to>><< be>><< so>><< now>><<:>><< for>><< thus>><< │ +│ │ it>><< bec>><><><< us>><< to>><< fulfil>><< all>><< righteousness>><<",>> and<< he>><< finished>> 8. << │ +│ │ need>><< to>><< be>><< bapt>><><< of>><< thee>><<,>><< and>><< com>><><< thou>><< to>><< me>><><< And>><< │ +│ │ Jesus>><< answering>><< said>><< unto>><< him>> 9. << men>><< [>><><< your>><< women>><<]>><< until>><< they>><< │ +│ │ believe>><<.>><< And>><< a>><< believing>><< slave>><< is>><< better>><< than>><< a>> poly<><> 10. << to>><< │ +│ │ Paradise>><< and>><< to>><< forgiveness>><<,>><< by>><< His>><< permission>><<.>><< And>><< He>><< makes>><< clear>><< │ +│ │ His>><< verses>><< to>><< the>><< people>><< that>><< perhaps>> 11. 
<><<,>><< even>><< though>><< she>><< might>><< │ +│ │ please>><< you>><<.>><< And>><< do>><< not>><< marry>><< poly>><><><< men>><< [>><><< your>><< women>> │ +│ │ 12. <> alone<<:>><< but>><< if>><< it>><< dies>><<,>><< it>><< bring>><><< forth>><< much>><< fruit>><<.>><< │ +│ │ John>><< 12>><<:>><<24>><<↵>><<↵>> 13. << you>><<.>><< Those>><< invite>><< [>><><<]>><< to>><< the>><< │ +│ │ Fire>><<,>><< but>><< Allah>><< invites>><< to>><< Paradise>><< and>><< to>><< forgiveness>><<,>><< by>> 14. << your>><< │ +│ │ dreams>><< and>><< fresh>><< hopes>><< for>><< all>><< of>><< us>><< and>><< may>><< the>><< light>><< guide>><< us>><< │ +│ │ safely>> home<<.>><< We>><< have>><< much>> 15. Ami Shah, and was also<< greeted>> and congratulated<< by>> │ +│ │ Biden.↵↵Also present were Obama's old │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ assistant │ This neuron activates on texts related to belief, righteousness, and spiritual guidance. 
│ +└───────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌───────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Top act │ Sequence │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ 15.723 │ Testament:<< Ver>><><<,>><< ver>><><<,>><< I>><< say>><< unto>><< you>><<,>><< Except>><< a>><< corn>><< of>><< wheat>><< fall>><< into>><< the>> │ +│ 15.400 │ <<↵>><><< do>><< not>><< marry>><< poly>>the<><< women>><< until>><< they>><< believe>><<.>><< And>><< a>><< believing>><< slave>><< woman>><< is>><< better>><< than>> │ +│ 14.111 │ << testimony>><< to>><< all>><< nations>><<,>><< and>><< then>><< the>><< end>><< will>><< come>><<">>↵↵I have played this CD for record │ +│ 14.069 │ << thou>><< to>><< me>><><< And>><< Jesus>><< answering>><< said>><< unto>><< him>><<,>><< S>><><< it>><< to>><< be>><< so>><< now>><<:>><< for>><< thus>> │ +│ 13.461 │ << ">><><< John>><< forb>><><< him>><<,>><< saying>><<,>><< I>><< have>><< a>><< need>><< to>><< be>><< bapt>><><< of>><< thee>><<,>><< and>> │ +│ 13.461 │ of Matthew 3:14–15:<< ">><><< John>><< forb>><><< him>><<,>><< saying>><<,>><< I>><< have>><< a>><< need>> │ +│ 12.801 │ << it>><< to>><< be>><< so>><< now>><<:>><< for>><< thus>><< it>><< bec>><><><< us>><< to>><< fulfil>><< all>><< righteousness>><<",>> and<< he>><< finished>> │ +│ 12.377 │ << need>><< to>><< be>><< bapt>><><< of>><< thee>><<,>><< and>><< com>><><< thou>><< to>><< me>><><< And>><< Jesus>><< answering>><< said>><< unto>><< him>> │ +│ 11.644 │ << men>><< [>><><< your>><< women>><<]>><< until>><< they>><< believe>><<.>><< And>><< a>><< believing>><< slave>><< 
is>><< better>><< than>><< a>> poly<><> │ +│ 10.594 │ << to>><< Paradise>><< and>><< to>><< forgiveness>><<,>><< by>><< His>><< permission>><<.>><< And>><< He>><< makes>><< clear>><< His>><< verses>><< to>><< the>><< people>><< that>><< perhaps>> │ +│ 10.487 │ <><<,>><< even>><< though>><< she>><< might>><< please>><< you>><<.>><< And>><< do>><< not>><< marry>><< poly>><><><< men>><< [>><><< your>><< women>> │ +│ 10.154 │ <> alone<<:>><< but>><< if>><< it>><< dies>><<,>><< it>><< bring>><><< forth>><< much>><< fruit>><<.>><< John>><< 12>><<:>><<24>><<↵>><<↵>> │ +│ 10.066 │ << you>><<.>><< Those>><< invite>><< [>><><<]>><< to>><< the>><< Fire>><<,>><< but>><< Allah>><< invites>><< to>><< Paradise>><< and>><< to>><< forgiveness>><<,>><< by>> │ +│ 5.072 │ << your>><< dreams>><< and>><< fresh>><< hopes>><< for>><< all>><< of>><< us>><< and>><< may>><< the>><< light>><< guide>><< us>><< safely>> home<<.>><< We>><< have>><< much>> │ +│ 1.930 │ Ami Shah, and was also<< greeted>> and congratulated<< by>> Biden.↵↵Also present were Obama's old │ +└───────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +Scoring phase +┌───────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ system │ We're studying neurons in a neural network. Each neuron activates on some particular word/words/substring/concept in a │ +│ │ short document. You will be given a short explanation of what this neuron activates for, and then be shown 14 example │ +│ │ sequences in random order. You will have to return a comma-separated list of the examples where you think the neuron │ +│ │ should activate at least once, on ANY of the words or substrings in the document. For example, your response might look │ +│ │ like "1, 2, 6, 13". 
Try not to be overly specific in your interpretation of the explanation. If you think there are no │ +│ │ examples where the neuron will activate, you should just respond with "None". You should include nothing else in your │ +│ │ response other than comma-separated numbers or the word "None" - this is important. │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ user │ Here is the explanation: this neuron fires on texts related to belief, righteousness, and spiritual guidance. Here are │ +│ │ the examples: 1. said. A number of tornadoes have careered across prairie farmland, a large hailstorm forced 2. aqara │ +│ │ 2:221 =↵↵And do not marry polytheistic women until they believe. And 3. strategy will be released later this year.↵↵The │ +│ │ company also claimed that this is the perfect time AMD 4. Hornets are in a win-now-or-everybody-might-be-fired mode. │ +│ │ Cho 5. baby girl is ready for a good time, Freshman daughter drop off, 6. expressed as floating-point numbers are a │ +│ │ really lousy way to analyze and evaluate polynomials unless the 7. marry polytheistic women until they believe. And a │ +│ │ believing slave woman is better than a polytheist 8. says: "And this gospel of the kingdom will be preached in the │ +│ │ whole world as a testimony to all 9. ? Make sure youre in the loop – take five seconds to sign up for our FREE Agg 10. │ +│ │ Suffer it to be so now: for thus it becometh us to fulfil all righteousness", and 11. entities.↵↵José Manuel Barroso, │ +│ │ President of the European Commission, has already endorsed 12. locally.↵↵In total, Lee Unkrich and Adrian Molinas Coco │ +│ │ plucked 13. drops to historic low in new poll↵↵They are part of a remarkably picturesque and politically purple patch │ +│ │ 14. 
and can act as competitive differentiation in the market.↵↵A third type of balance to seek is across │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ assistant │ 2, 7, 10 │ +└───────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌───────────┬───────────┬──────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Top act │ Active? │ Predicted? │ Sequence │ +├───────────┼───────────┼──────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ 0.000 │ │ │ said. A number of tornadoes have careered across prairie farmland, a large hailstorm forced │ +│ 15.400 │ Y │ Y │ aqara 2:221 =↵↵And do not marry polytheistic women until they believe. And │ +│ 0.000 │ │ │ strategy will be released later this year.↵↵The company also claimed that this is the perfect time AMD │ +│ 0.000 │ │ │ Hornets are in a win-now-or-everybody-might-be-fired mode. Cho │ +│ 0.000 │ │ │ baby girl is ready for a good time, Freshman daughter drop off, │ +│ 0.000 │ │ │ expressed as floating-point numbers are a really lousy way to analyze and evaluate polynomials unless the │ +│ 14.124 │ Y │ Y │ marry polytheistic women until they believe. And a believing slave woman is better than a polytheist │ +│ 18.166 │ Y │ │ says: "And this gospel of the kingdom will be preached in the whole world as a testimony to all │ +│ 0.000 │ │ │ ? 
Make sure youre in the loop – take five seconds to sign up for our FREE Agg │ +│ 12.874 │ Y │ Y │ Suffer it to be so now: for thus it becometh us to fulfil all righteousness", and │ +│ 0.000 │ │ │ entities.↵↵José Manuel Barroso, President of the European Commission, has already endorsed │ +│ 0.000 │ │ │ locally.↵↵In total, Lee Unkrich and Adrian Molinas Coco plucked │ +│ 0.000 │ │ │ drops to historic low in new poll↵↵They are part of a remarkably picturesque and politically purple patch │ +│ 0.000 │ │ │ and can act as competitive differentiation in the market.↵↵A third type of balance to seek is across │ +└───────────┴───────────┴──────────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + +Best scoring idx 9, score = 1.0 +Generation phase +┌───────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ system │ We're studying neurons in a neural network. Each neuron activates on some particular word/words/substring/concept in a │ +│ │ short document. The activating words in each document are indicated with << ... >>. We will give you a list of documents │ +│ │ on which the neuron activates, in order from most strongly activating to least strongly activating. Look at the parts of │ +│ │ the document the neuron activates for and summarize in a single sentence what the neuron is activating on. Try not to be │ +│ │ overly specific in your explanation. Note that some neurons will activate only on specific words or substrings, but │ +│ │ others will activate on most/all words in a sentence provided that sentence contains some particular concept. Your │ +│ │ explanation should cover most or all activating words (for example, don't give an explanation which is specific to a │ +│ │ single word if all words in a sentence cause the neuron to activate). 
Pay attention to things like the capitalization │ +│ │ and punctuation of the activating words or concepts, if that seems relevant. Keep the explanation as short and simple as │ +│ │ possible, limited to 20 words or less. Omit punctuation and formatting. You should avoid giving long lists of words. │ +│ │ Some examples: "This neuron activates on the word 'knows' in rhetorical questions", and "This neuron activates on verbs │ +│ │ related to decision-making and preferences", and "This neuron activates on the substring 'Ent' at the start of words", │ +│ │ and "This neuron activates on text about government economic policy". │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ user │ The activating documents are given below: 1. .↵↵Airline industry↵↵Under the<< new>> rules, payment surcharges will have │ +│ │ to reflect the 2. The government did not however give details on how the<< new>> system would work.↵↵In Uruguay about $ │ +│ │ 3. School Committee chairwoman, Rebecca Stone, said the<< new>> policy was approved in the spring after discussion │ +│ │ began over 4. of Maywoods firehouses. The<< new>> flag mysteriously disappeared early Aug. 23. The order 5. │ +│ │ 145m.↵↵Enforcement↵↵The<< new>> rules are being brought in earlier than the rest of 6. require building cases for │ +│ │ criminal prosecution.↵↵The<< new>> rules, which were passed by the City Council in 7. be gradual, Gatens added.↵↵The<< │ +│ │ new>> Roadmap lays out a three-phase plan. 8. such a crime.↵↵Sun explained that the<< new>> criteria would convict the │ +│ │ offenders not only for the damage 9. its privacy policy this year.↵↵But the<< new>> Microsoft policy does allow for │ +│ │ such targeted advertising. Microsoft 10. in the U.K. / Germany), the<< new>> version is similar to the existing │ +│ │ incarnation in most respects 11. 
a "largely complementary" footprint and that the<< new>> company will "take its place │ +│ │ as one of the 12. Con International, Matt will both write and pencil the<< new>> Dark Horse monthly Dept. H, a murder │ +│ │ mystery 13. Kriss Kringle in his place. The<< new>> guy was doing just fine - but the women missed 14. ↵Griffin picked │ +│ │ up about 10 pounds for the<< new>> role -- a feat his brother said wasn't difficult 15. pre-criminal space.↵↵These<< │ +│ │ new>> measures have a terrifying potential to be used zealously │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ assistant │ This neuron activates on the phrase 'new' in various contexts of rules, policies, and measures. │ +└───────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌───────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Top act │ Sequence │ +├───────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ 43.451 │ .↵↵Airline industry↵↵Under the<< new>> rules, payment surcharges will have to reflect the │ +│ 42.861 │ The government did not however give details on how the<< new>> system would work.↵↵In Uruguay about $ │ +│ 42.159 │ School Committee chairwoman, Rebecca Stone, said the<< new>> policy was approved in the spring after discussion began over │ +│ 41.990 │ of Maywoods firehouses. The<< new>> flag mysteriously disappeared early Aug. 23. The order │ +│ 41.921 │ 145m.↵↵Enforcement↵↵The<< new>> rules are being brought in earlier than the rest of │ +│ 41.077 │ require building cases for criminal prosecution.↵↵The<< new>> rules, which were passed by the City Council in │ +│ 40.424 │ be gradual, Gatens added.↵↵The<< new>> Roadmap lays out a three-phase plan. 
│ +│ 40.388 │ such a crime.↵↵Sun explained that the<< new>> criteria would convict the offenders not only for the damage │ +│ 40.284 │ its privacy policy this year.↵↵But the<< new>> Microsoft policy does allow for such targeted advertising. Microsoft │ +│ 40.114 │ in the U.K. / Germany), the<< new>> version is similar to the existing incarnation in most respects │ +│ 35.471 │ a "largely complementary" footprint and that the<< new>> company will "take its place as one of the │ +│ 30.504 │ Con International, Matt will both write and pencil the<< new>> Dark Horse monthly Dept. H, a murder mystery │ +│ 28.822 │ Kriss Kringle in his place. The<< new>> guy was doing just fine - but the women missed │ +│ 25.595 │ ↵Griffin picked up about 10 pounds for the<< new>> role -- a feat his brother said wasn't difficult │ +│ 25.591 │ pre-criminal space.↵↵These<< new>> measures have a terrifying potential to be used zealously │ +└───────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +Scoring phase +┌───────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ system │ We're studying neurons in a neural network. Each neuron activates on some particular word/words/substring/concept in a │ +│ │ short document. You will be given a short explanation of what this neuron activates for, and then be shown 14 example │ +│ │ sequences in random order. You will have to return a comma-separated list of the examples where you think the neuron │ +│ │ should activate at least once, on ANY of the words or substrings in the document. For example, your response might look │ +│ │ like "1, 4, 11, 14". Try not to be overly specific in your interpretation of the explanation. If you think there are no │ +│ │ examples where the neuron will activate, you should just respond with "None". 
You should include nothing else in your │ +│ │ response other than comma-separated numbers or the word "None" - this is important. │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ user │ Here is the explanation: this neuron fires on the phrase 'new' in various contexts of rules, policies, and measures. │ +│ │ Here are the examples: 1. . In the Islamic Statement, women are given the role not only of the wife of a 2. were │ +│ │ opened too far."↵↵"This had to do with doing maintenance," Loughmiller said 3. that shed get a whopping majority, only │ +│ │ to discover once the votes were in that she 4. areas, including the Mediterranean.↵↵Although the new US rules apply to │ +│ │ all types of boat built from 5. identifiable, decomposing body on the beach.↵↵About the 2008 death of Brad Renfro 6. o │ +│ │ planned before the crisis in public finances. The new policy announced in April by the unlamented former 7. in five │ +│ │ California cities.↵↵The new division shakes out this way:↵↵Ana 8. the mainstream media, academia and the Democratic │ +│ │ Party -- that the Trump election had unleashed an unprecedented amount of anti 9. expression of additional genes [ 13 │ +│ │ 32 ]. Furthermore, IL-4 and IL-13 lead to an increased 10. the Canadian Centre for Policy Alternatives, said the new │ +│ │ TPP agreement is the same as the old flawed one 11. surprisingly powerful machine, with 512MB of RAM and a CPU faster │ +│ │ than the original Pi. Pi Zero is 12. those choices we should never talk down or attack the fundamental philosophies on │ +│ │ which progressive Scottish politics are based - the 13. was designated Single Of The Week from NME, and did well on the │ +│ │ Indie singles charts, as did 14. webpage, this is something I wanted to avoid. 
I wanted the consumers of the component │ +│ │ to not even need │ +├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ assistant │ 4, 6, 7, 10 │ +└───────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌───────────┬───────────┬──────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Top act │ Active? │ Predicted? │ Sequence │ +├───────────┼───────────┼──────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ 0.000 │ │ │ . In the Islamic Statement, women are given the role not only of the wife of a │ +│ 0.000 │ │ │ were opened too far."↵↵"This had to do with doing maintenance," Loughmiller said │ +│ 0.000 │ │ │ that shed get a whopping majority, only to discover once the votes were in that she │ +│ 42.104 │ Y │ Y │ areas, including the Mediterranean.↵↵Although the new US rules apply to all types of boat built from │ +│ 0.000 │ │ │ identifiable, decomposing body on the beach.↵↵About the 2008 death of Brad Renfro │ +│ 42.177 │ Y │ Y │ o planned before the crisis in public finances. The new policy announced in April by the unlamented former │ +│ 40.274 │ Y │ Y │ in five California cities.↵↵The new division shakes out this way:↵↵Ana │ +│ 0.000 │ │ │ the mainstream media, academia and the Democratic Party -- that the Trump election had unleashed an unprecedented amount of anti │ +│ 0.000 │ │ │ expression of additional genes [ 13 32 ]. Furthermore, IL-4 and IL-13 lead to an increased │ +│ 41.242 │ Y │ Y │ the Canadian Centre for Policy Alternatives, said the new TPP agreement is the same as the old flawed one │ +│ 0.000 │ │ │ surprisingly powerful machine, with 512MB of RAM and a CPU faster than the original Pi. 
Pi Zero is │ +│ 0.000 │ │ │ those choices we should never talk down or attack the fundamental philosophies on which progressive Scottish politics are based - the │ +│ 0.000 │ │ │ was designated Single Of The Week from NME, and did well on the Indie singles charts, as did │ +│ 0.000 │ │ │ webpage, this is something I wanted to avoid. I wanted the consumers of the component to not even need │ +└───────────┴───────────┴──────────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ \ No newline at end of file diff --git a/evals/autointerp/main.py b/evals/autointerp/main.py new file mode 100644 index 0000000..a2f7c1e --- /dev/null +++ b/evals/autointerp/main.py @@ -0,0 +1,421 @@ +import asyncio +import random +import time +from concurrent.futures import ThreadPoolExecutor +from dataclasses import asdict +from pathlib import Path +from typing import Any, Iterator, Literal, TypeAlias + +import torch +from openai import OpenAI +from sae_lens import SAE, ActivationsStore, HookedSAETransformer +from sae_lens.toolkit.pretrained_saes_directory import get_pretrained_saes_directory +from tabulate import tabulate +from torch import Tensor +from tqdm import tqdm + +from evals.autointerp.config import AutoInterpConfig +from sae_bench_utils.indexing_utils import get_iw_sample_indices, get_k_largest_indices, index_with_buffer + +Messages: TypeAlias = list[dict[Literal["role", "content"], str]] + + +def display_messages(messages: Messages) -> str: + return tabulate([m.values() for m in messages], tablefmt="simple_grid", maxcolwidths=[None, 120]) + + +def str_bool(b: bool) -> str: + return "Y" if b else "" + + +class Example: + """ + Data for a single example sequence. 
+ """ + + def __init__( + self, + toks: list[int], + acts: list[float], + act_threshold: float, + model: HookedSAETransformer, + ): + self.toks = toks + self.str_toks = model.to_str_tokens(torch.tensor(self.toks)) + self.acts = acts + self.act_threshold = act_threshold + self.toks_are_active = [act > act_threshold for act in self.acts] + self.is_active = any(self.toks_are_active) # this is what we predict in the scoring phase + + def to_str(self, mark_toks: bool = False) -> str: + return ( + "".join( + f"<<{tok}>>" if (mark_toks and is_active) else tok + for tok, is_active in zip(self.str_toks, self.toks_are_active) + ) + .replace("�", "") + .replace("\n", "↵") + # .replace(">><<", "") + ) + + +class Examples: + """ + Data for multiple example sequences. Includes methods for shuffling seuqences, and displaying them. + """ + + def __init__(self, examples: list[Example], shuffle: bool = False) -> None: + self.examples = examples + if shuffle: + random.shuffle(self.examples) + else: + self.examples = sorted(self.examples, key=lambda x: max(x.acts), reverse=True) + + def display(self, predictions: list[int] | None = None) -> str: + """ + Displays the list of sequences. If `predictions` is provided, then it'll include a column for both "is_active" + and these predictions of whether it's active. If not, then neither of those columns will be included. 
+ """ + return tabulate( + [ + [max(ex.acts), ex.to_str(mark_toks=True)] + if predictions is None + else [ + max(ex.acts), + str_bool(ex.is_active), + str_bool(i + 1 in predictions), + ex.to_str(mark_toks=False), + ] + for i, ex in enumerate(self.examples) + ], + headers=["Top act"] + ([] if predictions is None else ["Active?", "Predicted?"]) + ["Sequence"], + tablefmt="simple_outline", + floatfmt=".3f", + ) + + def __len__(self) -> int: + return len(self.examples) + + def __iter__(self) -> Iterator[Example]: + return iter(self.examples) + + def __getitem__(self, i: int) -> Example: + return self.examples[i] + + +class AutoInterp: + """ + This is a start-to-end class for generating explanations and optionally scores. It's easiest to implement it as a + single class for the time being because there's data we'll need to fetch that'll be used in both the generation and + scoring phases. + """ + + def __init__(self, cfg: AutoInterpConfig, model: HookedSAETransformer, sae: SAE, device: str, api_key: str): + self.cfg = cfg + self.model = model + self.sae = sae + self.device = device + self.api_key = api_key + self.batch_size = cfg.total_tokens // model.cfg.n_ctx + self.act_store = ActivationsStore.from_sae( + model=model, sae=sae, streaming=True, store_batch_size_prompts=self.batch_size, device=str(self.device) + ) + if cfg.latents is not None: + self.latents = cfg.latents + else: + assert self.cfg.n_latents is not None + self.latents = random.sample(range(self.sae.cfg.d_sae), k=self.cfg.n_latents) + self.n_latents = len(self.latents) + + async def run(self, explanations_override: dict[int, str] = {}) -> tuple[dict[int, dict[str, Any]], dict[int, str]]: + """ + Runs both generation & scoring phases. 
Returns a dict where keys are latent indices, and values are dicts with: + + "explanation": str, the explanation generated for this latent + "predictions": list[int], the predicted activating indices + "correct seqs": list[int], the true activating indices + "score": float, the fraction of correct predictions (including positive and negative) + "logs": str, the logs for this latent + """ + generation_examples, scoring_examples = self.gather_data() + latents_with_data = sorted(generation_examples.keys()) + n_dead = self.n_latents - len(latents_with_data) + if n_dead > 0: + print(f"Found data for {len(latents_with_data)}/{self.n_latents} alive latents; {n_dead} dead") + + with ThreadPoolExecutor(max_workers=10) as executor: + tasks = [ + self.run_single_feature( + executor, + latent, + generation_examples[latent], + scoring_examples[latent], + explanations_override.get(latent, None), + ) + for latent in latents_with_data + ] + results = {} + for future in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Calling API (for gen & scoring)"): + result = await future + if result: + results[result["latent"]] = result + + return results + + async def run_single_feature( + self, + executor: ThreadPoolExecutor, + latent: int, + generation_examples: Examples, + scoring_examples: Examples, + explanation_override: str | None = None, + ) -> dict[str, Any] | None: + # Generation phase + gen_prompts = self.get_generation_prompts(generation_examples) + explanation_raw, logs = await asyncio.get_event_loop().run_in_executor( + executor, self.get_api_response, gen_prompts, self.cfg.max_tokens_in_explanation + ) + explanation = self.parse_explanation(explanation_raw) + results = { + "latent": latent, + "explanation": explanation, + "logs": f"Generation phase\n{logs}\n{generation_examples.display()}", + } + + # Scoring phase + if self.cfg.scoring: + scoring_prompts = self.get_scoring_prompts( + explanation=explanation_override or explanation, + scoring_examples=scoring_examples, 
+ ) + predictions_raw, logs = await asyncio.get_event_loop().run_in_executor( + executor, self.get_api_response, scoring_prompts, self.cfg.max_tokens_in_prediction + ) + predictions = self.parse_predictions(predictions_raw) + if predictions is None: + return None + score = self.score_predictions(predictions, scoring_examples) + results |= { + "predictions": predictions, + "correct seqs": [i for i, ex in enumerate(scoring_examples, start=1) if ex.is_active], + "score": score, + "logs": results["logs"] + f"\nScoring phase\n{logs}\n{scoring_examples.display(predictions)}", + } + + return results + + def parse_explanation(self, explanation: str) -> str: + return explanation.split("activates on")[-1].rstrip(".").strip() + + def parse_predictions(self, predictions: str) -> list[int] | None: + predictions_split = predictions.strip().rstrip(".").replace("and", ",").replace("None", "").split(",") + predictions_list = [i.strip() for i in predictions_split if i.strip() != ""] + if predictions_list == []: + return [] + if not all(pred.strip().isdigit() for pred in predictions_list): + return None + predictions = [int(pred.strip()) for pred in predictions_list] + return predictions + + def score_predictions(self, predictions: list[str], scoring_examples: Examples) -> float: + classifications = [i in predictions for i in range(1, len(scoring_examples) + 1)] + correct_classifications = [ex.is_active for ex in scoring_examples] + return sum([c == cc for c, cc in zip(classifications, correct_classifications)]) / len(classifications) + + def get_api_response( + self, + messages: list[dict], + max_tokens: int, + n_completions: int = 1, + ) -> tuple[str | list[str], str]: + """Generic API usage function for OpenAI""" + for message in messages: + assert message.keys() == {"content", "role"} + assert message["role"] in ["system", "user", "assistant"] + + client = OpenAI(api_key=self.api_key) + + result = client.chat.completions.create( + model="gpt-4o-mini", + messages=messages, + 
n=n_completions, + max_tokens=max_tokens, + stream=False, + ) + response = [choice.message.content.strip() for choice in result.choices] + + logs = tabulate( + [m.values() for m in messages + [{"role": "assistant", "content": response[0]}]], + tablefmt="simple_grid", + maxcolwidths=[None, 120], + ) + response = response[0] if n_completions == 1 else response + + return response, logs + + def get_generation_prompts(self, generation_examples: Examples) -> Messages: + assert len(generation_examples) > 0, "No generation examples found" + + examples_as_str = "\n".join([f"{i+1}. {ex.to_str(mark_toks=True)}" for i, ex in enumerate(generation_examples)]) + + SYSTEM_PROMPT = """We're studying neurons in a neural network. Each neuron activates on some particular word/words/substring/concept in a short document. The activating words in each document are indicated with << ... >>. We will give you a list of documents on which the neuron activates, in order from most strongly activating to least strongly activating. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is activating on. Try not to be overly specific in your explanation. Note that some neurons will activate only on specific words or substrings, but others will activate on most/all words in a sentence provided that sentence contains some particular concept. Your explanation should cover most or all activating words (for example, don't give an explanation which is specific to a single word if all words in a sentence cause the neuron to activate). Pay attention to things like the capitalization and punctuation of the activating words or concepts, if that seems relevant. Keep the explanation as short and simple as possible, limited to 20 words or less. Omit punctuation and formatting. 
You should avoid giving long lists of words.""" + if self.cfg.use_demos_in_explanation: + SYSTEM_PROMPT += """ Some examples: "This neuron activates on the word 'knows' in rhetorical questions", and "This neuron activates on verbs related to decision-making and preferences", and "This neuron activates on the substring 'Ent' at the start of words", and "This neuron activates on text about government economic policy".""" + else: + SYSTEM_PROMPT += """Your response should be in the form "This neuron activates on...".""" + USER_PROMPT = f"""The activating documents are given below:\n\n{examples_as_str}""" + + return [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": USER_PROMPT}, + ] + + def get_scoring_prompts(self, explanation: str, scoring_examples: Examples) -> Messages: + assert len(scoring_examples) > 0, "No scoring examples found" + + examples_as_str = "\n".join([f"{i+1}. {ex.to_str(mark_toks=False)}" for i, ex in enumerate(scoring_examples)]) + + example_response = sorted( + random.sample(range(1, 1 + self.cfg.n_ex_for_scoring), k=self.cfg.n_correct_for_scoring) + ) + example_response_str = ", ".join([str(i) for i in example_response]) + SYSTEM_PROMPT = f"""We're studying neurons in a neural network. Each neuron activates on some particular word/words/substring/concept in a short document. You will be given a short explanation of what this neuron activates for, and then be shown {self.cfg.n_ex_for_scoring} example sequences in random order. You will have to return a comma-separated list of the examples where you think the neuron should activate at least once, on ANY of the words or substrings in the document. For example, your response might look like "{example_response_str}". Try not to be overly specific in your interpretation of the explanation. If you think there are no examples where the neuron will activate, you should just respond with "None". 
You should include nothing else in your response other than comma-separated numbers or the word "None" - this is important.""" + USER_PROMPT = f"Here is the explanation: this neuron fires on {explanation}.\n\nHere are the examples:\n\n{examples_as_str}" + + return [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": USER_PROMPT}, + ] + + def gather_data(self) -> tuple[dict[int, Examples], dict[int, Examples]]: + """ + Stores top acts / random seqs data, which is used for generation & scoring respectively. + """ + # Get all activations, split up into batches + tokens = self.act_store.get_batch_tokens() + batch_size, seq_len = tokens.shape + acts = torch.empty((0, seq_len, self.n_latents), device=self.device) + for _tokens in tqdm( + tokens.split(split_size=self.cfg.batch_size, dim=0), desc="Forward passes to get activation values" + ): + sae_in = self.act_store.get_activations(_tokens).squeeze(2).to(self.device) + acts = torch.concat([acts, self.sae.encode(sae_in, latents=self.latents)], dim=0) + + generation_examples = {} + scoring_examples = {} + + for i, latent in enumerate(self.latents): + # (1/3) Get random examples + rand_indices = torch.stack( + [ + torch.randint(0, batch_size, (self.cfg.n_random_ex_for_scoring,)), + torch.randint(self.cfg.buffer, seq_len - self.cfg.buffer, (self.cfg.n_random_ex_for_scoring,)), + ], + dim=-1, + ) + rand_toks = index_with_buffer(tokens, rand_indices, buffer=self.cfg.buffer) + + # (2/3) Get top-scoring examples (and their values) + top_indices = get_k_largest_indices( + acts[..., i], k=self.cfg.n_top_ex, buffer=self.cfg.buffer, no_overlap=self.cfg.no_overlap + ) + top_toks = index_with_buffer(tokens, top_indices, buffer=self.cfg.buffer) + top_values = index_with_buffer(acts[..., i], top_indices, buffer=self.cfg.buffer) + act_threshold = self.cfg.act_threshold_frac * top_values.max().item() + + # (3/3) Get importance-weighted examples (and their values), using a threshold so they're disjoint from top 
values + # Also, if we don't have enough values, then break - assume this is a dead feature + threshold = top_values[:, self.cfg.buffer].min().item() + if torch.where(acts[..., i] < threshold, acts[..., i], 0.0).max() < 1e-6: + continue + iw_indices = get_iw_sample_indices( + acts[..., i], k=self.cfg.n_iw_sampled_ex, buffer=self.cfg.buffer, threshold=threshold + ) + iw_toks = index_with_buffer(tokens, iw_indices, buffer=self.cfg.buffer) + iw_values = index_with_buffer(acts[..., i], iw_indices, buffer=self.cfg.buffer) + + # Get random values to use for splitting + rand_top_ex_split_indices = torch.randperm(self.cfg.n_top_ex) + top_gen_indices = rand_top_ex_split_indices[: self.cfg.n_top_ex_for_generation] + top_scoring_indices = rand_top_ex_split_indices[self.cfg.n_top_ex_for_generation :] + rand_iw_split_indices = torch.randperm(self.cfg.n_iw_sampled_ex) + iw_gen_indices = rand_iw_split_indices[: self.cfg.n_iw_sampled_ex_for_generation] + iw_scoring_indices = rand_iw_split_indices[self.cfg.n_iw_sampled_ex_for_generation :] + + def create_examples(all_toks: Tensor, all_acts: Tensor | None = None) -> Examples: + if all_acts is None: + all_acts = torch.zeros_like(all_toks).float() + return [ + Example(toks=toks, acts=acts, act_threshold=act_threshold, model=self.model) + for (toks, acts) in zip(all_toks.tolist(), all_acts.tolist()) + ] + + # Get the generation & scoring examples + generation_examples[latent] = Examples( + create_examples(top_toks[top_gen_indices], top_values[top_gen_indices]) + + create_examples(iw_toks[iw_gen_indices], iw_values[iw_gen_indices]), + ) + scoring_examples[latent] = Examples( + create_examples(top_toks[top_scoring_indices], top_values[top_scoring_indices]) + + create_examples(iw_toks[iw_scoring_indices], iw_values[iw_scoring_indices]) + + create_examples(rand_toks), + shuffle=True, + ) + + return generation_examples, scoring_examples + + +def run_eval( + config: AutoInterpConfig, + selected_saes_dict: dict[str, list[str]], # dict of SAE 
release name: list of SAE names to evaluate + device: str, + api_key: str, + save_logs_path: str | Path | None = None, +) -> dict[str, Any]: + """ + Runs autointerp eval. Returns results as a dict with the following structure: + + custom_eval_config - dict of config parameters used for this evaluation + custom_eval_results - nested dict of {sae_name: {"score": score}} + """ + results_dict = {} + + random.seed(config.seed) + torch.manual_seed(config.seed) + + results_dict = {"custom_eval_results": {}, "custom_eval_config": asdict(config)} + + model: HookedSAETransformer = HookedSAETransformer.from_pretrained(config.model_name, device=device) + + for release, sae_names in selected_saes_dict.items(): + saes_map = get_pretrained_saes_directory()[release].saes_map + for sae_name in sae_names: + # Load in SAE, and randomly choose a number of latents to use for this autointerp instance + sae_id = saes_map[sae_name] + sae = SAE.from_pretrained(release, sae_id, device=str(device))[0] + + # Get autointerp results + autointerp = AutoInterp(cfg=config, model=model, sae=sae, api_key=api_key, device=device) + results = asyncio.run(autointerp.run()) + + if save_logs_path is not None: + # Get summary results for all latents, as well logs for the best and worst-scoring latents + headers = ["latent", "explanation", "predictions", "correct seqs", "score"] + logs = "Summary table:\n" + tabulate( + [[results[latent][h] for h in headers] for latent in results], + headers=headers, + tablefmt="simple_outline", + ) + worst_result = min(results.values(), key=lambda x: x["score"]) + best_result = max(results.values(), key=lambda x: x["score"]) + logs += f"\n\nWorst scoring idx {worst_result['latent']}, score = {worst_result['score']}\n{worst_result['logs']}" + logs += f"\n\nBest scoring idx {best_result['latent']}, score = {best_result['score']}\n{best_result['logs']}" + # Save the results to a file + with open(save_logs_path, "a") as f: + f.write(logs) + + # Put important results into the 
def get_k_largest_indices(
    x: "Float[Tensor, 'batch seq']",
    k: int,
    buffer: int = 0,
    no_overlap: bool = False,
) -> "Int[Tensor, 'k 2']":
    """
    Returns the (batch, seq) positions of the k largest elements of a 2D tensor.

    Args:
        x: The 2D tensor to get the top k largest elements from.
        k: The number of top elements to get.
        buffer: We won't choose any elements within `buffer` from the start or end of their seq (this helps if we
            want more context around the chosen tokens).
        no_overlap: If True, this ensures that no 2 top-activating tokens are in the same seq and within `buffer` of
            each other.

    Returns:
        indices: The index positions of the top k largest elements, shape (k, 2).
    """
    # BUG FIX: `x[:, buffer:-buffer]` is the empty slice [0:0] when buffer == 0 (the
    # default), silently discarding every element. Slice with an explicit end instead.
    x = x[:, buffer : x.size(1) - buffer]
    indices = x.flatten().argsort(-1, descending=True)
    rows = indices // x.size(1)
    cols = indices % x.size(1) + buffer  # shift back to positions in the unsliced tensor

    if no_overlap:
        unique_indices = []
        seen_positions = set()
        for row, col in zip(rows.tolist(), cols.tolist()):
            if (row, col) not in seen_positions:
                unique_indices.append((row, col))
                # Block out every position within `buffer` of the chosen one (same seq only).
                for offset in range(-buffer, buffer + 1):
                    seen_positions.add((row, col + offset))
                if len(unique_indices) == k:
                    break
        rows, cols = torch.tensor(
            unique_indices, dtype=torch.int64, device=x.device
        ).unbind(dim=-1)

    return torch.stack((rows, cols), dim=1)[:k]
def index_with_buffer(
    x: "Float[Tensor, 'batch seq']",
    indices: "Int[Tensor, 'k 2']",
    buffer: int = 0,
) -> "Float[Tensor, 'k buffer_x2_plus1']":
    """
    Indexes into `x` with (row, col) pairs, taking a +-buffer window around each
    column index: output[i] == x[row_i, col_i - buffer : col_i + buffer + 1],
    giving a result of shape (k, 2*buffer + 1).

    For example, if `indices` is a list of the top activating tokens (returned by
    `get_k_largest_indices`), then this function can get you the sequence context.

    Callers must ensure each col_i is at least `buffer` from either end of the seq
    (as `get_k_largest_indices` does), otherwise the window wraps/overflows.
    """
    assert indices.ndim == 2, "indices must have 2 dimensions"
    assert indices.shape[1] == 2, "indices must have 2 columns"
    rows, cols = indices.unbind(dim=-1)
    # (k, 1) row/col indices broadcast against a (2*buffer + 1,) window of offsets;
    # advanced indexing expands them to the (k, 2*buffer + 1) result directly, so the
    # third-party einops.repeat calls are unnecessary.
    offsets = torch.arange(-buffer, buffer + 1, device=cols.device)
    return x[rows.unsqueeze(-1), cols.unsqueeze(-1) + offsets]
8, + 9, + 10 + 50, + 11 + 100, + 12, + 13, + 14, + ] # highest value in the middle + assert x_top_values_with_context[1].tolist() == [ + 7, + 8, + 9, + 10 + 50, + 11 + 100, + 12, + 13, + ] # 2nd highest value in the middle